### 1. Tokenizing words and sentences

In [1]:
import nltk
from nltk.tokenize import sent_tokenize,word_tokenize

In [2]:
# Sample text: the "Mr." abbreviation checks that the sentence tokenizer does
# not mistake its period for a sentence boundary.
example_text = (
    "Hello Mr. Smith, how are you doing today? "
    "The Weather is great."
)

# Break the text into a list of sentences.
print(sent_tokenize(example_text))

['Hello Mr. Smith, how are you doing today?', 'The Weather is great.']


In [3]:
# Split the same text into word tokens; punctuation marks become tokens too.
print(word_tokenize(example_text))

['Hello', 'Mr.', 'Smith', ',', 'how', 'are', 'you', 'doing', 'today', '?', 'The', 'Weather', 'is', 'great', '.']


### 2. Stop Words

In [4]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [5]:
# Sentence used to demonstrate stop-word removal.
example_sentence = "This is an example showing off stop words filtration."

# NLTK's English stop-word list, stored in a set for fast membership tests.
stop_words = set(stopwords.words("english"))

In [6]:
# Show the stop words. A set is unordered, so the printed order is arbitrary.
print(stop_words)

{'am', 'during', 'is', 'y', 'out', 'weren', 'doesn', 'up', 'for', 'when', 'ain', 'because', 'be', 'needn', 'a', 'here', 'we', 'i', 'between', 'same', 'can', 'ours', 'which', 'his', 'from', 'mustn', 'do', 'over', 'some', 'few', 'yourselves', 'once', 'hers', 'he', 's', 'couldn', 'down', 'where', 'as', 'hadn', 'has', 'below', 'this', 'having', 're', 'above', 'did', 'it', 'o', 'these', 'their', 'and', 'own', 'but', 'in', 'into', 'does', 't', 'hasn', 'shouldn', 'off', 'why', 'aren', 'shan', 'other', 'him', 've', 'yourself', 'was', 'she', 'while', 'only', 'ma', 'or', 'too', 'her', 'mightn', 'if', 'further', 'not', 'being', 'will', 'should', 'on', 'had', 'whom', 'then', 'who', 'all', 'isn', 'what', 'theirs', 'itself', 'your', 'both', 'my', 'wouldn', 'at', 'that', 'its', 'now', 'no', 'won', 'ourselves', 'been', 'about', 'haven', 'so', 'the', 'them', 'to', 'just', 'more', 'wasn', 'very', 'are', 'how', 'were', 'an', 'doing', 'don', 'herself', 'of', 'those', 'me', 'didn', 'd', 'they', 'll', 'such

In [7]:
words = word_tokenize(example_sentence)
filtered_sentence = []
for w in words:
    # The stop-word list is all lowercase, so compare case-insensitively;
    # otherwise capitalised stop words such as "This" slip through.
    if w.lower() not in stop_words:
        filtered_sentence.append(w)
print(filtered_sentence)

['example', 'showing', 'stop', 'words', 'filtration', '.']


In [8]:
# Stop-word filter written as a list comprehension. Each token is lower-cased
# before the lookup because the stop-word list is all lowercase; without that
# a capitalised stop word such as "This" would be kept.
filtered_sentence = [w for w in words if w.lower() not in stop_words]
print(filtered_sentence)

['example', 'showing', 'stop', 'words', 'filtration', '.']


### 3. Stemming

In [9]:
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

In [10]:
# Porter stemmer: strips suffixes so that related word forms share one stem.
ps = PorterStemmer()

example_words = ["python", "pythonic", "pythoning"]

# Print the stem of each example word, one per line.
for stem in map(ps.stem, example_words):
    print(stem)

python
python
python


### 4.  Part of Speech Tagging

In [11]:
import nltk
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer

In [12]:
# Two speeches from the state_union corpus: the 2005 text is handed to the
# Punkt tokenizer as training text, the 2006 text is the one that gets split.
train_text = state_union.raw("2005-GWBush.txt")
sample_text = state_union.raw("2006-GWBush.txt")

custom_sent_tokenizer = PunktSentenceTokenizer(train_text)

# Sentences of the 2006 speech; the cells below iterate over this list.
tokenized = custom_sent_tokenizer.tokenize(sample_text)

In [13]:
def process_content(sentences=None):
    """Print the part-of-speech tags of every sentence.

    Args:
        sentences: Iterable of sentence strings to tag. When omitted, the
            notebook-level ``tokenized`` list is used, which is what the
            zero-argument call has always done.

    Returns:
        None. The tagged token list of each sentence is printed.

    Any exception raised while tokenizing or tagging is caught and its
    message printed; processing stops at the first failure.
    """
    if sentences is None:
        sentences = tokenized
    try:
        for sentence in sentences:
            tagged = nltk.pos_tag(nltk.word_tokenize(sentence))
            print(tagged)
    except Exception as e:
        print(str(e))

#process_content()        

```
POS tag list:

CC	coordinating conjunction
CD	cardinal digit
DT	determiner
EX	existential there (like: "there is" ... think of it like "there exists")
FW	foreign word
IN	preposition/subordinating conjunction
JJ	adjective	'big'
JJR	adjective, comparative	'bigger'
JJS	adjective, superlative	'biggest'
LS	list marker	1)
MD	modal	could, will
NN	noun, singular 'desk'
NNS	noun plural	'desks'
NNP	proper noun, singular	'Harrison'
NNPS	proper noun, plural	'Americans'
PDT	predeterminer	'all the kids'
POS	possessive ending	parent's
PRP	personal pronoun	I, he, she
PRP$	possessive pronoun	my, his, hers
RB	adverb	very, silently,
RBR	adverb, comparative	better
RBS	adverb, superlative	best
RP	particle	give up
TO	to	go 'to' the store.
UH	interjection	errrrrrrrm
VB	verb, base form	take
VBD	verb, past tense	took
VBG	verb, gerund/present participle	taking
VBN	verb, past participle	taken
VBP	verb, sing. present, non-3rd person	take
VBZ	verb, 3rd person sing. present	takes
WDT	wh-determiner	which
WP	wh-pronoun	who, what
WP$	possessive wh-pronoun	whose
WRB	wh-adverb	where, when
```

### 5.  Chunking with Regular Expression

<img src="files/regxcheatsheet.png">

In [14]:
# Noun-phrase chunk grammar with two rules:
#   1. an optional determiner or possessive pronoun, any number of adjectives,
#      then a singular noun
#   2. one or more consecutive proper nouns
# Possessive pronouns carry the tag PRP$ in the tag set produced by
# nltk.pos_tag (see the POS tag list), and tag patterns are case-sensitive,
# so the pattern has to spell it PRP\$.
grammer=r"""NP: {<DT|PRP\$>?<JJ>*<NN>}
                {<NNP>+}"""
cp = nltk.RegexpParser(grammer)
# tokenized[1:2] selects only the second sentence of the speech.
for i in tokenized[1:2]:
    words = nltk.word_tokenize(i)
    # discard stop words
    words_nonstop = [w for w in words if w not in stop_words]
    tagged = nltk.pos_tag(words_nonstop)
    print(cp.parse(tagged))

(S
  (NP Mr./NNP Speaker/NNP)
  ,/,
  (NP Vice/NNP President/NNP Cheney/NNP)
  ,/,
  members/NNS
  (NP Congress/NNP)
  ,/,
  members/NNS
  (NP Supreme/NNP Court/NNP)
  (NP diplomatic/JJ corps/NN)
  ,/,
  distinguished/JJ
  guests/NNS
  ,/,
  fellow/JJ
  citizens/NNS
  :/:
  (NP Today/NN)
  (NP nation/NN)
  lost/VBD
  beloved/VBN
  ,/,
  graceful/JJ
  ,/,
  (NP courageous/JJ woman/NN)
  called/VBN
  (NP America/NNP)
  founding/VBG
  ideals/NNS
  carried/VBD
  (NP noble/JJ dream/NN)
  ./.)


### 6. Chinking
<p>Chinking is a part of the chunking process. A chink is a sequence of tokens that we wish to remove from a chunk.</p>

In [15]:
# Chinking grammar with two rules:
#   {<.*>+}            first chunk every token of the sentence into an NP
#   }<VB.?|IN|DT>+{    then chink (cut back out) every run of tokens tagged
#                      VB/VB?, IN or DT, splitting the chunk around them
grammer=r"""NP: {<.*>+}
                }<VB.?|IN|DT>+{"""
cp = nltk.RegexpParser(grammer)
# tokenized[1:2] selects only the second sentence of the speech.
for i in tokenized[1:2]:
    words = nltk.word_tokenize(i)
    # discard stop words
    words_nonstop = [w for w in words if w not in stop_words]
    tagged = nltk.pos_tag(words_nonstop)
    # Parse tree: NP subtrees hold what is left after chinking.
    chuncked = cp.parse(tagged)
    print(chuncked)

(S
  (NP
    Mr./NNP
    Speaker/NNP
    ,/,
    Vice/NNP
    President/NNP
    Cheney/NNP
    ,/,
    members/NNS
    Congress/NNP
    ,/,
    members/NNS
    Supreme/NNP
    Court/NNP
    diplomatic/JJ
    corps/NN
    ,/,
    distinguished/JJ
    guests/NNS
    ,/,
    fellow/JJ
    citizens/NNS
    :/:
    Today/NN
    nation/NN)
  lost/VBD
  beloved/VBN
  (NP ,/, graceful/JJ ,/, courageous/JJ woman/NN)
  called/VBN
  (NP America/NNP)
  founding/VBG
  (NP ideals/NNS)
  carried/VBD
  (NP noble/JJ dream/NN ./.))


### 7. Named Entity Recognition
<p>Named entity recognition is useful for quickly finding out what the subjects of a discussion are.</p>

In [16]:
# Named-entity chunking of the second sentence. nltk.ne_chunk works directly
# on the POS-tagged tokens, so no RegexpParser grammar is needed in this cell.
for i in tokenized[1:2]:
    words = nltk.word_tokenize(i)
    # discard stop words
    words_nonstop = [w for w in words if w not in stop_words]
    tagged = nltk.pos_tag(words_nonstop)
    nameEnt = nltk.ne_chunk(tagged)
    print(nameEnt)

(S
  (PERSON Mr./NNP Speaker/NNP)
  ,/,
  Vice/NNP
  President/NNP
  (PERSON Cheney/NNP)
  ,/,
  members/NNS
  (ORGANIZATION Congress/NNP)
  ,/,
  members/NNS
  (ORGANIZATION Supreme/NNP Court/NNP)
  diplomatic/JJ
  corps/NN
  ,/,
  distinguished/JJ
  guests/NNS
  ,/,
  fellow/JJ
  citizens/NNS
  :/:
  Today/NN
  nation/NN
  lost/VBD
  beloved/VBN
  ,/,
  graceful/JJ
  ,/,
  courageous/JJ
  woman/NN
  called/VBN
  (GPE America/NNP)
  founding/VBG
  ideals/NNS
  carried/VBD
  noble/JJ
  dream/NN
  ./.)


### 8. Lemmatizing
Lemmatizing is a very similar operation to stemming. The major difference is that stemming can often create non-existent words, whereas a lemma is always a real word.
Lemmatizing **groups the different forms of a word together** under that base word.

In [17]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
# pos="a" asks for the adjective reading of "better".
print(lemmatizer.lemmatize("better",pos="a"))

good


### 9. Corpora

In [18]:
from nltk.corpus import gutenberg
# Raw text of "bible-kjv.txt" from the Gutenberg corpus.
# NOTE: this rebinds sample_text, which previously held the 2006 speech.
sample_text = gutenberg.raw("bible-kjv.txt")
# Sentence-split it with sent_tokenize (not the custom Punkt tokenizer).
tokenize = sent_tokenize(sample_text)
#print(tokenize[:20])

### 10. WordNet
With WordNet we can look up words and their meanings according to their parts of speech, and we can find synonyms, antonyms, and even examples of the word in use.

In [19]:
from nltk.corpus import wordnet

In [20]:
# Every WordNet synset (one per sense) that the word "program" belongs to.
syns = wordnet.synsets("program")
print(syns)

[Synset('plan.n.01'), Synset('program.n.02'), Synset('broadcast.n.02'), Synset('platform.n.02'), Synset('program.n.05'), Synset('course_of_study.n.01'), Synset('program.n.07'), Synset('program.n.08'), Synset('program.v.01'), Synset('program.v.02')]


In [21]:
print(syns[0].lemmas()[0].name()) # name of the first lemma of the first synset

plan


In [22]:
print(syns[0].definition()) # definition of the first synset

a series of steps to be carried out or goals to be accomplished


In [23]:
print(syns[0].examples()) # example sentences for the first synset

['they drew up a six-step plan', 'they discussed plans for a new bond issue']


In [24]:
# Gather synonyms and antonyms of "good" across all of its synsets.
synonyms = []
antonyms = []
for synset in wordnet.synsets("good"):
    for lemma in synset.lemmas():
        # Every lemma name in a synset of "good" counts as a synonym.
        synonyms.append(lemma.name())
        # Only the first antonym of a lemma is recorded, when it has any.
        opposites = lemma.antonyms()
        if opposites:
            antonyms.append(opposites[0].name())
# Convert to sets to drop the duplicates before printing.
print(set(synonyms))
print(set(antonyms))

{'in_force', 'honorable', 'goodness', 'safe', 'undecomposed', 'thoroughly', 'full', 'upright', 'serious', 'secure', 'dear', 'salutary', 'skillful', 'proficient', 'commodity', 'good', 'unspoiled', 'near', 'just', 'practiced', 'sound', 'beneficial', 'honest', 'skilful', 'dependable', 'unspoilt', 'in_effect', 'adept', 'expert', 'well', 'soundly', 'respectable', 'estimable', 'trade_good', 'right', 'ripe', 'effective'}
{'evilness', 'evil', 'bad', 'badness', 'ill'}


In [25]:
# Similarity comparison between two words: look up the first noun sense of
# "ship" and of "boat", then print their Wu-Palmer similarity score.
w1 = wordnet.synset("ship.n.01")
w2 = wordnet.synset("boat.n.01")
print(w1.wup_similarity(w2))

0.9090909090909091


### 11. Text Classification 
We are going to classify the movie reviews into `pos` and `neg` categories based on their word distributions.

It's a naive algorithm for now!!!

In [26]:
import nltk
import random
from nltk.corpus import movie_reviews

# Build one (word sequence, category) pair per review file, category by category.
documents = []
for category in movie_reviews.categories():
    for fileid in movie_reviews.fileids(category):
        documents.append((movie_reviews.words(fileid), category))

# Mix the categories together.
# NOTE(review): no random seed is set in this cell, so the order (and the
# document printed below) differs from run to run.
random.shuffle(documents)
print(documents[1])

(['we', 'could', 'paraphrase', 'michelle', 'pfieffer', ...], 'neg')


In [27]:
# Frequency distribution over every lower-cased token in the review corpus,
# counted straight from a generator.
all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())
print(all_words.most_common(15))

[(',', 77717), ('the', 76529), ('.', 65876), ('a', 38106), ('and', 35576), ('of', 34123), ('to', 31937), ("'", 30585), ('is', 25195), ('in', 21822), ('s', 18513), ('"', 17612), ('it', 16107), ('that', 15924), ('-', 15595)]


The most common tokens listed above are mostly punctuation and stop words, which are useless as features; we'll improve this in later sections.

In [28]:
# Number of times the token "stupid" occurs in the whole corpus.
print(all_words["stupid"])

253


### 12. Word as Features

In [29]:
# Vocabulary = the 3000 most frequent tokens. FreqDist.keys() is not ordered by
# count, so slicing it would pick an arbitrary 3000 words; most_common(3000)
# yields (word, count) pairs in descending count order.
word_features = [word for word, _count in all_words.most_common(3000)]

For each of the top 3000 words in the review corpus, check whether it appears in the given document.

In [30]:
def find_features(document, vocabulary=None):
    """Map each vocabulary word to whether it occurs in ``document``.

    Args:
        document: Iterable of word tokens, e.g. the words of one review.
        vocabulary: Iterable of words to test for. When omitted, the
            notebook-level ``word_features`` list is used.

    Returns:
        dict: one ``{word: bool}`` entry per vocabulary word; the value is
        True when the word appears at least once in ``document``.
    """
    if vocabulary is None:
        vocabulary = word_features
    # A set gives constant-time membership tests for the lookups below.
    present = set(document)
    return {word: word in present for word in vocabulary}
#print(find_features(movie_reviews.words('neg/cv000_29416.txt')))


In [31]:
# One (feature dict, category label) pair per entry of documents.
featuresets = [(find_features(review), label) for review, label in documents]
#print(featuresets[:10])

### 13. Naive Bayes