### In this tutorial, we will look at some of the tokenizers available in NLTK, followed by a brief look at its lemmatizer and stemmers

In [1]:
## Tokenization using NLTK
# word_tokenize: split a piece of text into word and punctuation tokens.
import nltk
from nltk.tokenize import word_tokenize

s = "Good muffins cost Rs.3.80.\nin New York.  Please buy me two of them.\n\nThanks."
print("Sentence: \n\n" + s)

print("\nword_tokenize output")
# Keep the token list in a named variable so it can be inspected after printing.
word_tokens = word_tokenize(s)
print(word_tokens)
print("\n")

Sentence: 

Good muffins cost Rs.3.80.
in New York.  Please buy me two of them.

Thanks.

word_tokenize output
['Good', 'muffins', 'cost', 'Rs.3.80', '.', 'in', 'New', 'York', '.', 'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.']




In [2]:
# wordpunct_tokenize: splits punctuation away from alphanumeric runs,
# so a price such as $3.88 is broken into several tokens.
import nltk
from nltk.tokenize import wordpunct_tokenize

s = "Good muffins cost $3.88\nin New York.  Please buy me two of them.\n\nThanks."
print("Sentence: \n\n" + s)

print("wordpunct_tokenize output")
wordpunct_tokens = wordpunct_tokenize(s)
print(wordpunct_tokens)
print("\n")

Sentence: 

Good muffins cost $3.88
in New York.  Please buy me two of them.

Thanks.
wordpunct_tokenize output
['Good', 'muffins', 'cost', '$', '3', '.', '88', 'in', 'New', 'York', '.', 'Please', 'buy', 'me', 'two', 'of', 'them', '.', 'Thanks', '.']




In [6]:
# sent_tokenize splits the text into sentences; each sentence is then
# passed to word_tokenize separately.
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize

s2 = "abc is the context-based approach for morph-analysis. The accuracy of the approach is 77.80%"
print("Sentence: \n\n" + s2)

print("\nsent_tokenize output")
# Split into sentences once and reuse the result for both displays.
sentences = sent_tokenize(s2)
print(sentences)

print("\nword_tokenize output")
for t in sentences:
    sentence_tokens = word_tokenize(t)
    print(sentence_tokens)
print("\n")

Sentence: 

abc is the context-based approach for morph-analysis. The accuracy of the approach is 77.80%

sent_tokenize output
['abc is the context-based approach for morph-analysis.', 'The accuracy of the approach is 77.80%']

word_tokenize output
['abc', 'is', 'the', 'context-based', 'approach', 'for', 'morph-analysis', '.']
['The', 'accuracy', 'of', 'the', 'approach', 'is', '77.80', '%']




In [7]:
## Tokenization using NLTK

# LineTokenizer
import nltk
from nltk.tokenize import LineTokenizer

LineTokenizer can be used to split strings containing newline characters

In [13]:
s = "I love kites\nI like cricket\nI like football\n"

print("Sentences: ")
print(s)

print("LineTokenizer...")
# One tokenizer instance and one tokenize call serve both displays below.
line_tokenizer = LineTokenizer()
lines = line_tokenizer.tokenize(s)
print(lines)

print("\nword_tokenizer... ")
# Word-tokenize every line produced by the LineTokenizer.
for sent in lines:
    print(word_tokenize(sent))

Sentences: 
I love kites
I like cricket
I like football

LineTokenizer...
['I love kites', 'I like cricket', 'I like football']

word_tokenizer... 
['I', 'love', 'kites']
['I', 'like', 'cricket']
['I', 'like', 'football']


In [14]:
from nltk.tokenize import RegexpTokenizer

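RegexpTokenizer splits a string using a regular expression. By default the pattern describes the tokens themselves, and the material between the matches is discarded.
Pass `gaps=True` to make the pattern describe the delimiters between tokens instead.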

In [38]:
s = "Petrol price has gone upto Rs.75.89. 01/02/2017 I,John and Mrs. Thomas are thinking of using electric scooters."

# The patterns are written as raw strings so that regex escapes such as \. \d \S \w
# reach the regex engine untouched. In a normal string literal they are invalid
# escape sequences, which newer Python versions warn about.
# This pattern matches the tokens themselves (not the gaps): "Rs." followed by
# digits/dots and any trailing non-space characters, so the final "." is included.
tokenizer = RegexpTokenizer(r'Rs\.[\d\.]+\S+')
print("Sentence: "+s)
print("\nRegexpTokenizer...")
print(tokenizer.tokenize(s))
print("\n")

# Extract tokens that begin with an uppercase character.
# Note: the trailing \S+ requires at least one more non-space character and also
# absorbs attached punctuation, so the matches include 'I,John' and 'Mrs.'.
capword_tokenizer = RegexpTokenizer(r'[A-Z]\w*\S+')
print(capword_tokenizer.tokenize(s))

Sentence: Petrol price has gone upto Rs.75.89. 01/02/2017 I,John and Mrs. Thomas are thinking of using electric scooters.

RegexpTokenizer...
['Rs.75.89.']


['Petrol', 'Rs.75.89.', 'I,John', 'Mrs.', 'Thomas']


#### SExprTokenizer : Tokenizes parenthesized expressions in a string 

In [17]:
from nltk.tokenize import SExprTokenizer

In [18]:
# A string made of a stray character followed by one nested parenthesized expression.
s = '?((a(b c)d)ef(g(h(i))))'
print("Sentence: " + s)

print("\nSExprTokenizer...")
sexpr_tokenizer = SExprTokenizer()
sexpr_tokens = sexpr_tokenizer.tokenize(s)
print(sexpr_tokens)
print("\n")

Sentence: ?((a(b c)d)ef(g(h(i))))

SExprTokenizer...
['?', '((a(b c)d)ef(g(h(i))))']




#### TreebankWordTokenizer is a standard, widely used tokenizer and does a decent job on ordinary text

In [19]:
#TreebankWordTokenizer
from nltk.tokenize import TreebankWordTokenizer

In [20]:
# A single tokenizer instance is reused for both example sentences.
treebank_tokenizer = TreebankWordTokenizer()

# Example 1: ordinary prose.
s = "Petrol price has gone upto Rs.75.89 I,John and Mrs. Thomas are thinking of using electric scooters."
print("Sentence: " + s)
print("\nTreebankWordTokenizer...")
prose_tokens = treebank_tokenizer.tokenize(s)
print(prose_tokens)
print("\n")

# Example 2: tweet-like text. The handle, hashtag, emoticons and arrows are
# broken into separate pieces by this tokenizer.
s = "@Nikes: This is a cooool #dummysmiley: :-) :-P <3 and some arrows < > -> <--"
print("\nSentence: " + s)
tweet_like_tokens = treebank_tokenizer.tokenize(s)
print(tweet_like_tokens)

Sentence: Petrol price has gone upto Rs.75.89 I,John and Mrs. Thomas are thinking of using electric scooters.

TreebankWordTokenizer...
['Petrol', 'price', 'has', 'gone', 'upto', 'Rs.75.89', 'I', ',', 'John', 'and', 'Mrs.', 'Thomas', 'are', 'thinking', 'of', 'using', 'electric', 'scooters', '.']



Sentence: @Nikes: This is a cooool #dummysmiley: :-) :-P <3 and some arrows < > -> <--
['@', 'Nikes', ':', 'This', 'is', 'a', 'cooool', '#', 'dummysmiley', ':', ':', '-', ')', ':', '-P', '<', '3', 'and', 'some', 'arrows', '<', '>', '-', '>', '<', '--']


#### The previous tokenizers fail badly on tweets; TweetTokenizer can be used to tokenize tweets instead

In [21]:
from nltk.tokenize import TweetTokenizer

In [22]:
# Build a TweetTokenizer with its default settings; it is reused by later cells.
tknzr = TweetTokenizer()
s0 = "@Nike: This is a cooool #dummysmiley: :-) :-P <3 and some arrows < > -> <--"
# The last expression is shown as the cell output. Here the handle, the hashtag,
# the emoticons and the arrows each stay a single token.
tknzr.tokenize(s0)

['@Nike',
 ':',
 'This',
 'is',
 'a',
 'cooool',
 '#dummysmiley',
 ':',
 ':-)',
 ':-P',
 '<3',
 'and',
 'some',
 'arrows',
 '<',
 '>',
 '->',
 '<--']

In [23]:
# A plain literal is already a str, so no str(...) wrapper is needed.
# Each \xNN escape in this str literal is a separate character, not one byte of a
# UTF-8 sequence, which is why the output shows four characters where a single
# emoji was intended.
tweet = "@someone did you check out this #superawesome!! it's very cool \xF0\x9F\x98\x81 http://t.co/ydfY2"
tweet_tokens = tknzr.tokenize(tweet)
print(tweet_tokens)

s = "@Nike's quest to break the 2-hour marathon barrier is LIVE on Twitter. #Breaking2"
#tknzr.tokenize(s)

['@someone', 'did', 'you', 'check', 'out', 'this', '#superawesome', '!', '!', "it's", 'very', 'cool', 'ð', '\x9f', '\x98', '\x81', 'http://t.co/ydfY2']


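Which Python version is in use matters here. In Python 2.x `tweet` is a bytestring, whereas NLTK methods should be given unicode. In Python 3 the `\xF0\x9F\x98\x81` escapes inside a `str` literal become four separate characters rather than the UTF-8 bytes of a single emoji, which is why the output above contains 'ð', '\x9f', '\x98', '\x81'. Decoding a bytes literal instead, as in the next cell, gives the intended text:
https://github.com/nltk/nltk/issues/1155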

In [24]:
# For Python 3 (written against 3.5)
# Write the tweet as a bytes literal, so the \xF0\x9F\x98\x81 escapes really are
# the four UTF-8 bytes of the emoji, then decode it into a str.
raw_tweet = b"@someone did you check out this #superawesome!! it's very cool \xF0\x9F\x98\x81 http://t.co/ydfY2"
tweet = raw_tweet.decode('utf-8')
# `tweet` is now a proper str and can be passed to tknzr.tokenize(tweet).

#### Usually we want to have phrases like New_York as a single word. This might be beneficial to certain downstream applications

In [25]:
from nltk.tokenize import MWETokenizer
from nltk.collocations import *
from nltk.corpus import gutenberg

In [26]:
# Load the Gutenberg corpus: collect the sentences of every file into one flat list.
corpus = [
    sentence
    for fileid in gutenberg.fileids()
    for sentence in gutenberg.sents(fileid)
]

In [27]:
# Number of sentences collected from all Gutenberg files.
print(len(corpus))

98552


Let us load Bigram Association measures

In [28]:
# Create the BigramAssocMeasures object; it provides the association-scoring
# functions (such as `dice`) used to rank candidate bigrams.
bigram_measures = nltk.collocations.BigramAssocMeasures()

In [31]:
# Collect the bigram statistics from the corpus.
# `corpus` is the list of sentences built from gutenberg.sents(); each one is
# passed to from_documents as a separate document.
# NOTE(review): BigramCollocationFinder presumably comes from the wildcard
# `from nltk.collocations import *` — confirm and consider importing it explicitly.
finder = BigramCollocationFinder.from_documents(corpus)

In [33]:
# Rank the bigrams with the Dice coefficient and keep the 200 best candidates
# (another association measure, e.g. bigram_measures.chi_sq, could be used instead).
collocs = finder.nbest(bigram_measures.dice, 200)
# Preview only the first 20 candidates.
print(collocs[:20])

[('(~),', 'asterisk'), ('1000', '1997'), ('10000', '2004'), ('1500', '1998'), ('1739', 'University'), ('217', 'Peabody'), ('26th', 'ult'), ('9000', '2003'), ('AH', 'SUNFLOWER'), ('AN', 'INTERLUDE'), ('ANCIENT', 'BARD'), ('ANNUS', 'MIRABILIS'), ('AUGUST', '3d'), ('AUTUMN', 'RIVULETS'), ('Abhorred', 'Styx'), ('Adders', 'Forke'), ('Adult', 'Reformatory'), ('Agnus', 'Dei'), ('Alexandrian', 'Pharos'), ('Ally', 'Sloper')]


In [34]:
#Import MWETokenizer
from nltk.tokenize import MWETokenizer

In [None]:
# Initialize the multi-word-expression tokenizer with the collocations collected
# above, so that those word pairs can be merged into single tokens.
tokenizer = MWETokenizer(collocs)

In [None]:
sentence = 'The Tower of Hercules, near A Coruña in Spain, a 2nd century AD Roman lighthouse, is closely modelled on the Alexandrian Pharos'
# MWETokenizer works on a list of tokens, so split on whitespace first.
words = sentence.split()
tokenizer.tokenize(words)

In [None]:
# We want "2nd century AD" to be a single token as well.
century_mwe = ('2nd', 'century', 'AD')
# `collocs` is shared with earlier cells; guard the append so that re-running
# this cell does not keep adding duplicate entries to it.
if century_mwe not in collocs:
    collocs.append(century_mwe)

# Rebuild the tokenizer so it knows about the new multi-word expression.
tokenizer = MWETokenizer(collocs)
sentence = 'The Tower of Hercules, near A Coruña in Spain, a 2nd century AD Roman lighthouse, is closely modelled on the Alexandrian Pharos'
tokenizer.tokenize(sentence.split())

In [116]:
# WordNet Lemmatizer
# Lemmatizes using WordNet's built-in morphy function. A word that cannot be
# found in WordNet is returned unchanged.
from nltk.stem import WordNetLemmatizer

wnl = WordNetLemmatizer()
lemma = wnl.lemmatize('nationalism')
print(lemma)
# Other examples to try, passing the part of speech as the second argument:
#   wnl.lemmatize('loving', 'v')
#   wnl.lemmatize('went', 'v')

nationalism


In [35]:
# SnowballStemmer
# NLTK's SnowballStemmer is based on the Snowball stemming algorithm.
# List the languages it supports:
from nltk.stem import SnowballStemmer

supported_languages = SnowballStemmer.languages
print(" ".join(supported_languages))

danish dutch english finnish french german hungarian italian norwegian porter portuguese romanian russian spanish swedish


In [36]:
# Build an English Snowball stemmer.
snowball_stemmer = SnowballStemmer('english')
# Other words to try:
#snowball_stemmer.stem('maximum')
#snowball_stemmer.stem('presumably')
# The stemmer reduces 'nationalism' to 'nation', whereas the WordNet lemmatizer
# above returned the word unchanged.
snowball_stemmer.stem('nationalism')

'nation'

In [37]:
# Language-specific stemmers can also be imported directly from nltk.stem.snowball.
from nltk.stem.snowball import GermanStemmer
stemmer = GermanStemmer()
# The result is lowercased and the plural ending is removed ('autobahn').
stemmer.stem("Autobahnen")

'autobahn'

In [None]:
# More details and examples here:
# https://www.nltk.org/api/nltk.tokenize.html