### In this tutorial, we will look at some of the tokenizers available in NLTK, followed by a brief look at its lemmatizer and stemmers

In [1]:
## Tokenization using NLTK
# Compare NLTK's word_tokenize with plain str.split() on the same text.
import nltk
from nltk.tokenize import word_tokenize

s = "Good muffins cost $3.80 in New York.\nDr. Ram Please buy me two of them.\nThanks."
print("Sentence: \n\n" + s)

# word_tokenize splits off the currency symbol and sentence-final periods,
# but keeps the abbreviation "Dr." in one piece.
print("\nword_tokenize output")
nltk_tokens = word_tokenize(s)
print(nltk_tokens)

# str.split() only breaks on whitespace, so punctuation stays glued to words.
print("\nsplit tokenize output")
whitespace_tokens = s.split()
print(whitespace_tokens)
print("\n")

Sentence: 

Good muffins cost $3.80 in New York.
Dr. Ram Please buy me two of them.
Thanks.

word_tokenize output
['Good', 'muffins', 'cost', '$', '3.80', 'in', 'New', 'York', '.', 'Dr.', 'Ram', 'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.']

split tokenize output
['Good', 'muffins', 'cost', '$3.80', 'in', 'New', 'York.', 'Dr.', 'Ram', 'Please', 'buy', 'me', 'two', 'of', 'them.', 'Thanks.']




In [2]:
# wordpunct_tokenize
import nltk
from nltk.tokenize import wordpunct_tokenize

s = "Good muffins cost $3.80 in New York.\nDr. Ram Please buy me two of them.\nThanks."
print("Sentence: \n\n" + s)

# Every run of punctuation becomes its own token here, so "$3.80" and "Dr."
# are both broken apart.
print("\nwordpunct_tokenize output")
punct_tokens = wordpunct_tokenize(s)
print(punct_tokens)
print("\n")

Sentence: 

Good muffins cost $3.80 in New York.
Dr. Ram Please buy me two of them.
Thanks.

wordpunct_tokenize output
['Good', 'muffins', 'cost', '$', '3', '.', '80', 'in', 'New', 'York', '.', 'Dr', '.', 'Ram', 'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.']




In [3]:
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize

# Note there is no space after "them.", so sent_tokenize keeps
# "them.Thanks." inside a single sentence.
s2 = "Good muffins cost $3.80 in New York. Dr. Ram Please buy me two of them.Thanks."
print("Sentence: \n\n" + s2)

# Split into sentences once and reuse the list for both displays.
print("\nsent_tokenize output")
sentences = sent_tokenize(s2)
print(sentences)

# Word-tokenize each detected sentence separately.
print("\nword_tokenize output")
for sentence in sentences:
    print(word_tokenize(sentence))
print("\n")

Sentence: 

Good muffins cost $3.80 in New York. Dr. Ram Please buy me two of them.Thanks.

sent_tokenize output
['Good muffins cost $3.80 in New York.', 'Dr. Ram Please buy me two of them.Thanks.']

word_tokenize output
['Good', 'muffins', 'cost', '$', '3.80', 'in', 'New', 'York', '.']
['Dr.', 'Ram', 'Please', 'buy', 'me', 'two', 'of', 'them.Thanks', '.']




In [4]:
# LineTokenizer
import nltk
from nltk.tokenize import LineTokenizer

LineTokenizer splits a string into lines at newline characters; by default, blank lines are discarded

In [5]:
s = "I love kites.\nI like cricket.\nI like football.\n"

print("Sentences: ")
print(s)

# Tokenize into lines once; the same list feeds both printouts below.
print("LineTokenizer...")
lines = LineTokenizer().tokenize(s)
print(lines)

# word_tokenize comes from the nltk.tokenize import in an earlier cell.
print("\nword_tokenizer... ")
for line in lines:
    print(word_tokenize(line))

Sentences: 
I love kites.
I like cricket.
I like football.

LineTokenizer...
['I love kites.', 'I like cricket.', 'I like football.']

word_tokenizer... 
['I', 'love', 'kites', '.']
['I', 'like', 'cricket', '.']
['I', 'like', 'football', '.']


In [6]:
from nltk.tokenize import RegexpTokenizer

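RegexpTokenizer splits a string using a regular expression. By default the pattern matches the tokens themselves; with `gaps=True` it matches the delimiters between tokens instead.
The material between the tokens is discarded. 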

In [7]:
s = "Petrol price has gone upto Rs.75.89 on 01/02/2017. John and Mrs. Thomas are thinking of using electric scooters."
# Use a raw string so the regex escapes (\. and \d) reach the regex engine
# untouched; in a normal string literal they are invalid escape sequences and
# recent Python versions emit a warning for them.
# Pattern: a literal "Rs." followed by digits, a literal dot, and more digits.
tokenizer = RegexpTokenizer(r'Rs\.\d+\.\d+')
print("Sentence: "+s)
print("\nRegexpTokenizer...")
# Only text matching the pattern is returned; everything else is dropped.
print(tokenizer.tokenize(s))
print("\n")

Sentence: Petrol price has gone upto Rs.75.89 on 01/02/2017. John and Mrs. Thomas are thinking of using electric scooters.

RegexpTokenizer...
['Rs.75.89']




In [8]:
# Let us say we want to extract all words beginning with an uppercase character.
# - Raw string: keeps \b and \S as regex escapes instead of (invalid) string escapes.
# - \b anchors the match at the start of a word, so a capital in the middle of
#   a word (e.g. the "P" in "iPhone") is not picked up.
# - \S* (rather than \S+) lets a one-letter word such as "I" match too.
# Trailing punctuation attached to the word (e.g. "Mrs.") is kept in the token.
tokenizer = RegexpTokenizer(r'\b[A-Z]\S*')
print(tokenizer.tokenize(s))

['Petrol', 'Rs.75.89', 'John', 'Mrs.', 'Thomas']


#### SExprTokenizer : Tokenizes parenthesized expressions in a string 

In [9]:
from nltk.tokenize import SExprTokenizer

In [10]:
s = '?(a(b c)d)ef(g(h(i)))'
print("Sentence: " + s)

# Each balanced parenthesized group comes back as a single token; the text
# outside the parentheses ("?" and "ef") becomes separate tokens.
print("\nSExprTokenizer...")
sexpr_tokens = SExprTokenizer().tokenize(s)
print(sexpr_tokens)
print("\n")

Sentence: ?(a(b c)d)ef(g(h(i)))

SExprTokenizer...
['?', '(a(b c)d)', 'ef', '(g(h(i)))']




#### TreebankWordTokenizer is a widely used standard tokenizer and does a decent job

In [12]:
#TreebankWordTokenizer
from nltk.tokenize import TreebankWordTokenizer

In [13]:
s = "Good muffins cost $3.80 in New York. Dr. Ram Please buy me two of them. Thanks."
print("Sentence: " + s)

# The whole multi-sentence string is tokenized in one call; in the output only
# the very last period is split off, while "York." and "them." keep theirs.
print("\nTreebankWordTokenizer...")
treebank_tokens = TreebankWordTokenizer().tokenize(s)
print(treebank_tokens)
print("\n")

Sentence: Good muffins cost $3.80 in New York. Dr. Ram Please buy me two of them. Thanks.

TreebankWordTokenizer...
['Good', 'muffins', 'cost', '$', '3.80', 'in', 'New', 'York.', 'Dr.', 'Ram', 'Please', 'buy', 'me', 'two', 'of', 'them.', 'Thanks', '.']




In [14]:
# A tweet-like string with a handle, a hashtag, emoticons and ASCII arrows.
s = "@Nikes: This is a cooool #dummysmiley: :-) :-P <3 and some arrows < > -> <--"
print("\nSentence: " + s)

# TreebankWordTokenizer breaks the handle, hashtag, emoticons and arrows
# into separate pieces.
treebank_tokens = TreebankWordTokenizer().tokenize(s)
print(treebank_tokens)


Sentence: @Nikes: This is a cooool #dummysmiley: :-) :-P <3 and some arrows < > -> <--
['@', 'Nikes', ':', 'This', 'is', 'a', 'cooool', '#', 'dummysmiley', ':', ':', '-', ')', ':', '-P', '<', '3', 'and', 'some', 'arrows', '<', '>', '-', '>', '<', '--']


#### The previous tokenizers fail badly on tweets; TweetTokenizer can be used to tokenize tweets instead

In [15]:
from nltk.tokenize import TweetTokenizer

In [16]:
# Build a TweetTokenizer with its default settings.
tknzr = TweetTokenizer()
# Nearly the same tweet as in the TreebankWordTokenizer cell above
# (the handle is "@Nike" here and "@Nikes" there).
s0 = "@Nike: This is a cooool #dummysmiley: :-) :-P <3 and some arrows < > -> <--"
# Last expression of the cell, so the notebook displays the token list:
# the handle, hashtag, emoticons and arrows each stay in one token.
tknzr.tokenize(s0)

['@Nike',
 ':',
 'This',
 'is',
 'a',
 'cooool',
 '#dummysmiley',
 ':',
 ':-)',
 ':-P',
 '<3',
 'and',
 'some',
 'arrows',
 '<',
 '>',
 '->',
 '<--']

In [18]:
# **WordNet Lemmatizer**
# Lemmatize using WordNet's built-in morphy function. Returns the input word
# unchanged if it cannot be found in WordNet.
from nltk.stem import WordNetLemmatizer

wnl = WordNetLemmatizer()

# Without a part-of-speech tag "computed" comes back unchanged; tagging it as
# a verb ('v') yields "compute".
print(wnl.lemmatize('computed'))
print(wnl.lemmatize('computed', pos='v'))

# "nationality" is returned as-is.
print(wnl.lemmatize('nationality'))

computed
compute
nationality


In [19]:
# **SnowballStemmer**
# The Snowball stemmer, based on the Snowball stemming algorithm, can be used
# in NLTK like this:
from nltk.stem import SnowballStemmer

# Unpacking the language names prints them separated by single spaces.
print(*SnowballStemmer.languages)

danish dutch english finnish french german hungarian italian norwegian porter portuguese romanian russian spanish swedish


In [20]:
# English Snowball stemmer (SnowballStemmer is imported in the previous cell).
snowball_stemmer = SnowballStemmer('english')

# Stem each sample word in turn; other words to try: 'maximum', 'presumably'.
for word in ('computing', 'nationality'):
    print(snowball_stemmer.stem(word))

comput
nation


In [21]:
# Language-specific Snowball stemmers can also be imported directly.
from nltk.stem.snowball import GermanStemmer
stemmer = GermanStemmer()
# Last expression of the cell, so the notebook displays the stem
# ('autobahn' for the plural "Autobahnen").
stemmer.stem("Autobahnen")

'autobahn'

In [70]:
# For more details and examples see https://www.nltk.org/api/nltk.tokenize.html