### Sentence Tokenizer

In [1]:
sents='''Hello friends!
How are you? Welcome to Python Programming.'''

In [2]:
from nltk.tokenize import sent_tokenize

In [3]:
sent_list= sent_tokenize(sents)

In [4]:
sent_list

['Hello friends!', 'How are you?', 'Welcome to Python Programming.']

### Word Tokenizer

In [5]:
from nltk.tokenize import word_tokenize

In [6]:
words= word_tokenize(sents)

In [7]:
words

['Hello',
 'friends',
 '!',
 'How',
 'are',
 'you',
 '?',
 'Welcome',
 'to',
 'Python',
 'Programming',
 '.']

In [8]:
import string

In [9]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [10]:
## What percentage of the tokens in the given text are punctuation symbols?

In [11]:
# Percentage of the tokens in `words` that are punctuation.
# A token counts as punctuation only when it is non-empty and every one of its
# characters is in string.punctuation. A plain `token in string.punctuation`
# is a substring test on a str, so it would accept the empty string and miss
# multi-character punctuation tokens such as '...'.
p_cnt=sum(
    1 for token in words
    if token and all(ch in string.punctuation for ch in token)
)
# Guard against an empty token list instead of raising ZeroDivisionError.
per=p_cnt/len(words)*100 if words else 0.0
per

25.0

In [12]:
# Tokenize once and reuse the result for both the filter and the ratio.
tokens=word_tokenize(sents)
# Keep tokens made up entirely of punctuation characters. A plain
# `token in string.punctuation` is a substring test, so it would accept ''
# and reject multi-character punctuation tokens such as '...'.
puncts=[token for token in tokens
        if token and all(ch in string.punctuation for ch in token)]
# Fraction (not percentage) of tokens that are punctuation; 0.0 for empty input.
len(puncts)/len(tokens) if tokens else 0.0

0.25

In [13]:
puncts

['!', '?', '.']

### Whitespace Tokenizer

In [14]:
from nltk.tokenize import WhitespaceTokenizer

In [15]:
tk=WhitespaceTokenizer()

In [16]:
sents='''Hello friends!
How are you? Welcome to Python Programming.'''

In [17]:
tk.tokenize(sents)

['Hello',
 'friends!',
 'How',
 'are',
 'you?',
 'Welcome',
 'to',
 'Python',
 'Programming.']

### Space Tokenizer

In [18]:
from nltk.tokenize import SpaceTokenizer  # Only the space character is used as the separator

In [19]:
tk=SpaceTokenizer()

In [20]:
tk.tokenize(sents)

['Hello',
 'friends!\nHow',
 'are',
 'you?',
 'Welcome',
 'to',
 'Python',
 'Programming.']

## Line Tokenizer

In [21]:
from nltk.tokenize import LineTokenizer

In [22]:
tk=LineTokenizer()

In [23]:
tk.tokenize(sents)

['Hello friends!', 'How are you? Welcome to Python Programming.']

### Tab Tokenizer

In [24]:
sents='''Hello friends!
How are you? Welcome to\tPython Programming.'''

In [25]:
print(sents)

Hello friends!
How are you? Welcome to	Python Programming.


In [28]:
from nltk.tokenize import TabTokenizer
tk=TabTokenizer()
tk.tokenize(sents)

['Hello friends!\nHow are you? Welcome to', 'Python Programming.']

### Tweet Tokenizer

In [29]:
from nltk.tokenize import TweetTokenizer

In [37]:
sent='''Hello friends! :)
How are you? <3 Welcometo #python Programming :D
Check my web : https://python.org'''

In [38]:
print(sent)

Hello friends! :)
How are you? <3 Welcometo #python Programming :D
Check my web : https://python.org


In [39]:
tk =TweetTokenizer()
tk.tokenize(sent)

['Hello',
 'friends',
 '!',
 ':)',
 'How',
 'are',
 'you',
 '?',
 '<3',
 'Welcometo',
 '#python',
 'Programming',
 ':D',
 'Check',
 'my',
 'web',
 ':',
 'https://python.org']

In [43]:
sent='''Hello friends! 游놓
How are you? <3 Welcometo #python Programming游놐
Check my web : https://python.org'''

In [44]:
print(sent)

Hello friends! 游놓
How are you? <3 Welcometo #python Programming游놐
Check my web : https://python.org


In [45]:
tk =TweetTokenizer()
tk.tokenize(sent)

['Hello',
 'friends',
 '!',
 '游놓',
 'How',
 'are',
 'you',
 '?',
 '<3',
 'Welcometo',
 '#python',
 'Programming',
 '游놐',
 'Check',
 'my',
 'web',
 ':',
 'https://python.org']

In [46]:
word_tokenize(sent)

['Hello',
 'friends',
 '!',
 '游놓',
 'How',
 'are',
 'you',
 '?',
 '<',
 '3',
 'Welcometo',
 '#',
 'python',
 'Programming游놐',
 'Check',
 'my',
 'web',
 ':',
 'https',
 ':',
 '//python.org']

### Multi-word Expression Tokenizer

In [47]:
from nltk.tokenize import MWETokenizer

In [48]:
sent="Van Rossom is in pune today. We welcomed Van Rossom here"

In [51]:
# Build a multi-word-expression tokenizer and register ('Van', 'Rossom') so that
# the two adjacent tokens are merged into a single token; with no separator
# argument the parts are joined with '_' (see the 'Van_Rossom' output below).
tk=MWETokenizer()
tk.add_mwe(('Van','Rossom'))

In [52]:
tk.tokenize(word_tokenize(sent))

['Van_Rossom',
 'is',
 'in',
 'pune',
 'today',
 '.',
 'We',
 'welcomed',
 'Van_Rossom',
 'here']

In [56]:
# Rebuild the tokenizer with separator=' ' so the merged expression is joined
# with a space, then register the ('Van', 'Rossom') expression again.
tk=MWETokenizer(separator=' ')
tk.add_mwe(('Van','Rossom'))

In [57]:
tk.tokenize(word_tokenize(sent))

['Van Rossom',
 'is',
 'in',
 'pune',
 'today',
 '.',
 'We',
 'welcomed',
 'Van Rossom',
 'here']

## Custom Tokenizer

In [64]:
import re

In [76]:
def custom_tokenizer(text, keep_empty=True):
    """Split ``text`` on runs of delimiter characters.

    The delimiters are ``. , ; ? # !`` and any whitespace character; a run of
    consecutive delimiters is treated as a single separator.

    Parameters
    ----------
    text : str
        The text to split.
    keep_empty : bool, default True
        ``re.split`` produces an empty string when ``text`` starts or ends
        with a delimiter (or is empty). The default keeps those strings,
        matching the original behaviour; pass ``False`` to drop them.

    Returns
    -------
    list of str
        The tokens in order of appearance.
    """
    tokens = re.split(r"[.,;?#!\s]+", text)  # \s matches any whitespace character
    if keep_empty:
        return tokens
    return [token for token in tokens if token]

In [77]:
sents='''Hello friends!
How are you? Wel.come to\tPython Progra#mming.'''

In [78]:
custom_tokenizer(sents)

['Hello',
 'friends',
 'How',
 'are',
 'you',
 'Wel',
 'come',
 'to',
 'Python',
 'Progra',
 'mming',
 '']

In [79]:
## Dataset source: https://mitu.co.in/dataset
## File used below: student3.tsv (tab-separated values; originally noted here as student3.csv)

In [88]:
# Open the student dataset for reading (open() defaults to text mode).
f=open("student3.tsv")

In [89]:
# Read the entire file into one string, then close the handle so the file
# descriptor is not left open for the rest of the session.
data=f.read()
f.close()

In [90]:
print(data)

roll	name	class	marks	age
1	anil	TE	56.77	22
2	amit	TE	59.77	21
3	aniket	BE	76.88	19
4	ajinkya	TE	69.66	20
5	asha	TE	63.28	20
6	ayesha	BE	49.55	20
7	amar	BE	65.34	19
8	amita	BE	68.33	23
9	amol	TE	56.75	20
10	anmol	BE	78.66	21



In [101]:
data

'roll\tname\tclass\tmarks\tage\n1\tanil\tTE\t56.77\t22\n2\tamit\tTE\t59.77\t21\n3\taniket\tBE\t76.88\t19\n4\tajinkya\tTE\t69.66\t20\n5\tasha\tTE\t63.28\t20\n6\tayesha\tBE\t49.55\t20\n7\tamar\tBE\t65.34\t19\n8\tamita\tBE\t68.33\t23\n9\tamol\tTE\t56.75\t20\n10\tanmol\tBE\t78.66\t21\n'

In [108]:
tk=LineTokenizer()

In [109]:
tab_tk=TabTokenizer()

In [110]:
# Split the raw text into lines, then split each line on tabs, producing one
# list of field strings per line.
rec=[tab_tk.tokenize(line) for line in tk.tokenize(data)]

In [111]:
rec

[['roll', 'name', 'class', 'marks', 'age'],
 ['1', 'anil', 'TE', '56.77', '22'],
 ['2', 'amit', 'TE', '59.77', '21'],
 ['3', 'aniket', 'BE', '76.88', '19'],
 ['4', 'ajinkya', 'TE', '69.66', '20'],
 ['5', 'asha', 'TE', '63.28', '20'],
 ['6', 'ayesha', 'BE', '49.55', '20'],
 ['7', 'amar', 'BE', '65.34', '19'],
 ['8', 'amita', 'BE', '68.33', '23'],
 ['9', 'amol', 'TE', '56.75', '20'],
 ['10', 'anmol', 'BE', '78.66', '21']]

In [112]:
# Find the name of the student with the highest marks.
# rec[0] is the header row; in every other row index 1 holds the name and
# index 3 the marks. The marks are strings, so they are converted with float()
# before comparing (string comparison is lexicographic, e.g. '9.5' > '10.0').
# The running best is updated on every improvement so `ind` ends at the true
# maximum, and the built-in `max` is no longer shadowed by a variable.
best_marks=float(rec[1][3])
ind=rec[1][1]
for row in rec[2:]:
    marks=float(row[3])
    if marks>best_marks:
        best_marks=marks
        ind=row[1]
print(ind)

anmol
