## Video 1 : tokenizing

In [1]:
from nltk.tokenize import sent_tokenize, word_tokenize

In [2]:
example_txt = "Hello Mr. Smith, how are you doing today? It is good to see you after so long. Get ready by 9:00 AM. We are going to a meeting with Miss. Francis."

In [20]:
print(sent_tokenize(example_txt))

['Hello Mr. Smith, how are you doing today?', 'It is good to see you after so long.', 'Get ready by 9:00 AM.', 'We are going to a meeting with Miss.', 'Francis.']


In [21]:
print(word_tokenize(example_txt))

['Hello', 'Mr.', 'Smith', ',', 'how', 'are', 'you', 'doing', 'today', '?', 'It', 'is', 'good', 'to', 'see', 'you', 'after', 'so', 'long', '.', 'Get', 'ready', 'by', '9:00', 'AM', '.', 'We', 'are', 'going', 'to', 'a', 'meeting', 'with', 'Miss', '.', 'Francis', '.']


## Video 2 : Stopwords

In [22]:
from nltk.corpus import stopwords

In [23]:
ex = "This is an example sentence for testing stop words."

In [24]:
stop_words = set(stopwords.words("english"))
words = word_tokenize(ex)

In [25]:
filtered_sentence = []
for w in words:
    if w not in stop_words:
        filtered_sentence.append(w)
print(filtered_sentence)

['This', 'example', 'sentence', 'testing', 'stop', 'words', '.']


In [26]:
fil_sen = [w for w in words if not w in stop_words]
print(fil_sen)

['This', 'example', 'sentence', 'testing', 'stop', 'words', '.']


## Video 3 : Stemming

In [27]:
from nltk.stem import PorterStemmer

In [28]:
ps = PorterStemmer()

In [29]:
ex = ["python", "pythoning", "pythoner"]

In [30]:
for w in ex:
    print(ps.stem(w))

python
python
python


In [31]:
new = "It is very important to do pythoning while learning python and be pythonly good in it."

In [32]:
words = word_tokenize(new)

In [33]:
for w in words:
    print(ps.stem(w))

It
is
veri
import
to
do
python
while
learn
python
and
be
pythonli
good
in
it
.


## Video 4 : PoS Tagging

In [42]:
import nltk
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer

In [35]:
train_set = state_union.raw("2005-GWBush.txt")

In [36]:
sample_set = state_union.raw("2006-GWBush.txt")

In [37]:
custom_sent_tokenizer = PunktSentenceTokenizer(train_set)

0.009965034965034964 0.006688963210702341 0.010145729570190002 5720 299 57 2
0.00017482517482517483 0.0033444816053511705 0.0 5720 299 1 1
0.0013986013986013986 0.0033444816053511705 0.0012912746725696365 5720 299 8 1
0.001048951048951049 0.0033444816053511705 0.0009223390518354548 5720 299 6 1
0.0012237762237762239 0.016722408026755852 0.00036893562073418186 5720 299 7 5
0.04020979020979021 0.07357859531772576 0.03836930455635491 5720 299 230 22
0.00017482517482517483 0.0033444816053511705 0.0 5720 299 1 1
0.0015734265734265735 0.006688963210702341 0.0012912746725696365 5720 299 9 2
0.00017482517482517483 0.0033444816053511705 0.0 5720 299 1 1
0.0026223776223776225 0.010033444816053512 0.002213613724405091 5720 299 15 3
0.00017482517482517483 0.0033444816053511705 0.0 5720 299 1 1
0.0008741258741258741 0.006688963210702341 0.0005534034311012728 5720 299 5 2
0.0022727272727272726 0.0033444816053511705 0.002213613724405091 5720 299 13 1
0.003146853146853147 0.013377926421404682 0.002582

In [39]:
tokenized = custom_sent_tokenizer.tokenize(sample_set)

In [43]:
def process_pos():
    try:
        for i in tokenized[:5]:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)
            print(tagged)
    except Exception as e:
        print(str(e))

In [44]:
process_pos()

[('PRESIDENT', 'NNP'), ('GEORGE', 'NNP'), ('W.', 'NNP'), ('BUSH', 'NNP'), ("'S", 'POS'), ('ADDRESS', 'NNP'), ('BEFORE', 'IN'), ('A', 'NNP'), ('JOINT', 'NNP'), ('SESSION', 'NNP'), ('OF', 'IN'), ('THE', 'NNP'), ('CONGRESS', 'NNP'), ('ON', 'NNP'), ('THE', 'NNP'), ('STATE', 'NNP'), ('OF', 'IN'), ('THE', 'NNP'), ('UNION', 'NNP'), ('January', 'NNP'), ('31', 'CD'), (',', ','), ('2006', 'CD'), ('THE', 'NNP'), ('PRESIDENT', 'NNP'), (':', ':'), ('Thank', 'NNP'), ('you', 'PRP'), ('all', 'DT'), ('.', '.')]
[('Mr.', 'NNP'), ('Speaker', 'NNP'), (',', ','), ('Vice', 'NNP'), ('President', 'NNP'), ('Cheney', 'NNP'), (',', ','), ('members', 'NNS'), ('of', 'IN'), ('Congress', 'NNP'), (',', ','), ('members', 'NNS'), ('of', 'IN'), ('the', 'DT'), ('Supreme', 'NNP'), ('Court', 'NNP'), ('and', 'CC'), ('diplomatic', 'JJ'), ('corps', 'NN'), (',', ','), ('distinguished', 'JJ'), ('guests', 'NNS'), (',', ','), ('and', 'CC'), ('fellow', 'JJ'), ('citizens', 'NNS'), (':', ':'), ('Today', 'VB'), ('our', 'PRP$'), ('nat

## Video 5 : Chunking

In [49]:
def process_chunk():
    try:
        for i in tokenized[:5]:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)
            chunk = r"""Chunk: {<RB.?>*<VB.?>*<NNP><NN>?}"""
            chunk_parser = nltk.RegexpParser(chunk)
            chunked = chunk_parser.parse(tagged)
            print(chunked)
    except Exception as e:
        print(str(e))

In [50]:
process_chunk()

(S
  (Chunk PRESIDENT/NNP)
  (Chunk GEORGE/NNP)
  (Chunk W./NNP)
  (Chunk BUSH/NNP)
  'S/POS
  (Chunk ADDRESS/NNP)
  BEFORE/IN
  (Chunk A/NNP)
  (Chunk JOINT/NNP)
  (Chunk SESSION/NNP)
  OF/IN
  (Chunk THE/NNP)
  (Chunk CONGRESS/NNP)
  (Chunk ON/NNP)
  (Chunk THE/NNP)
  (Chunk STATE/NNP)
  OF/IN
  (Chunk THE/NNP)
  (Chunk UNION/NNP)
  (Chunk January/NNP)
  31/CD
  ,/,
  2006/CD
  (Chunk THE/NNP)
  (Chunk PRESIDENT/NNP)
  :/:
  (Chunk Thank/NNP)
  you/PRP
  all/DT
  ./.)
(S
  (Chunk Mr./NNP)
  (Chunk Speaker/NNP)
  ,/,
  (Chunk Vice/NNP)
  (Chunk President/NNP)
  (Chunk Cheney/NNP)
  ,/,
  members/NNS
  of/IN
  (Chunk Congress/NNP)
  ,/,
  members/NNS
  of/IN
  the/DT
  (Chunk Supreme/NNP)
  (Chunk Court/NNP)
  and/CC
  diplomatic/JJ
  corps/NN
  ,/,
  distinguished/JJ
  guests/NNS
  ,/,
  and/CC
  fellow/JJ
  citizens/NNS
  :/:
  Today/VB
  our/PRP$
  nation/NN
  lost/VBD
  a/DT
  beloved/VBN
  ,/,
  graceful/JJ
  ,/,
  courageous/JJ
  woman/NN
  who/WP
  (Chunk called/VBD America/NNP)

## Video 6 : Chinking

In [51]:
def process_chink():
    try:
        for i in tokenized[:5]:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)
            chunk = r"""Chunk: {<.*>+}
                    }<VB.?|IN|DT|TO>+{"""
            chunk_parser = nltk.RegexpParser(chunk)
            chunked = chunk_parser.parse(tagged)
            print(chunked)
    except Exception as e:
        print(str(e))

In [52]:
process_chink()

(S
  (Chunk PRESIDENT/NNP GEORGE/NNP W./NNP BUSH/NNP 'S/POS ADDRESS/NNP)
  BEFORE/IN
  (Chunk A/NNP JOINT/NNP SESSION/NNP)
  OF/IN
  (Chunk THE/NNP CONGRESS/NNP ON/NNP THE/NNP STATE/NNP)
  OF/IN
  (Chunk
    THE/NNP
    UNION/NNP
    January/NNP
    31/CD
    ,/,
    2006/CD
    THE/NNP
    PRESIDENT/NNP
    :/:
    Thank/NNP
    you/PRP)
  all/DT
  (Chunk ./.))
(S
  (Chunk
    Mr./NNP
    Speaker/NNP
    ,/,
    Vice/NNP
    President/NNP
    Cheney/NNP
    ,/,
    members/NNS)
  of/IN
  (Chunk Congress/NNP ,/, members/NNS)
  of/IN
  the/DT
  (Chunk
    Supreme/NNP
    Court/NNP
    and/CC
    diplomatic/JJ
    corps/NN
    ,/,
    distinguished/JJ
    guests/NNS
    ,/,
    and/CC
    fellow/JJ
    citizens/NNS
    :/:)
  Today/VB
  (Chunk our/PRP$ nation/NN)
  lost/VBD
  a/DT
  beloved/VBN
  (Chunk ,/, graceful/JJ ,/, courageous/JJ woman/NN who/WP)
  called/VBD
  (Chunk America/NNP)
  to/TO
  (Chunk its/PRP$ founding/NN ideals/NNS and/CC)
  carried/VBD
  on/IN
  a/DT
  (Chunk noble/

## Video 7 : Named Entity Recognition

In [57]:
def process_ner():
    try:
        for i in tokenized[:5]:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)
            namedEnt = nltk.ne_chunk(tagged)#, binary=True)
            print(namedEnt)
    except Exception as e:
        print(str(e))

In [58]:
process_ner()

(S
  PRESIDENT/NNP
  (PERSON GEORGE/NNP W./NNP BUSH/NNP)
  'S/POS
  (ORGANIZATION ADDRESS/NNP)
  BEFORE/IN
  A/NNP
  (ORGANIZATION JOINT/NNP)
  SESSION/NNP
  OF/IN
  (ORGANIZATION THE/NNP)
  (ORGANIZATION CONGRESS/NNP)
  ON/NNP
  THE/NNP
  (ORGANIZATION STATE/NNP OF/IN)
  (ORGANIZATION THE/NNP)
  (ORGANIZATION UNION/NNP)
  January/NNP
  31/CD
  ,/,
  2006/CD
  (ORGANIZATION THE/NNP)
  PRESIDENT/NNP
  :/:
  Thank/NNP
  you/PRP
  all/DT
  ./.)
(S
  (PERSON Mr./NNP Speaker/NNP)
  ,/,
  Vice/NNP
  President/NNP
  (PERSON Cheney/NNP)
  ,/,
  members/NNS
  of/IN
  (ORGANIZATION Congress/NNP)
  ,/,
  members/NNS
  of/IN
  the/DT
  (ORGANIZATION Supreme/NNP Court/NNP)
  and/CC
  diplomatic/JJ
  corps/NN
  ,/,
  distinguished/JJ
  guests/NNS
  ,/,
  and/CC
  fellow/JJ
  citizens/NNS
  :/:
  Today/VB
  our/PRP$
  nation/NN
  lost/VBD
  a/DT
  beloved/VBN
  ,/,
  graceful/JJ
  ,/,
  courageous/JJ
  woman/NN
  who/WP
  called/VBD
  (GPE America/NNP)
  to/TO
  its/PRP$
  founding/NN
  ideals/NNS
  

## Video 8 : Lemmetizing

In [59]:
from nltk.stem import WordNetLemmatizer

In [60]:
lem = WordNetLemmatizer()

In [61]:
print(lem.lemmatize("cats"))

cat


In [62]:
print(lem.lemmatize("cacti"))

cactus


In [63]:
print(lem.lemmatize("better"))

better


In [64]:
print(lem.lemmatize("better", pos='a'))

good


In [66]:
print(lem.lemmatize("run", pos='a'))

run


## Video 9 : NLTK Corpora

In [67]:
from nltk.corpus import gutenberg
from nltk.tokenize import sent_tokenize

In [68]:
sample_corp = gutenberg.raw("bible-kjv.txt")

In [69]:
tok = sent_tokenize(sample_corp)

In [71]:
print(tok[5:15])

['1:5 And God called the light Day, and the darkness he called Night.', 'And the evening and the morning were the first day.', '1:6 And God said, Let there be a firmament in the midst of the waters,\nand let it divide the waters from the waters.', '1:7 And God made the firmament, and divided the waters which were\nunder the firmament from the waters which were above the firmament:\nand it was so.', '1:8 And God called the firmament Heaven.', 'And the evening and the\nmorning were the second day.', '1:9 And God said, Let the waters under the heaven be gathered together\nunto one place, and let the dry land appear: and it was so.', '1:10 And God called the dry land Earth; and the gathering together of\nthe waters called he Seas: and God saw that it was good.', '1:11 And God said, Let the earth bring forth grass, the herb yielding\nseed, and the fruit tree yielding fruit after his kind, whose seed is\nin itself, upon the earth: and it was so.', '1:12 And the earth brought forth grass, and

## Video 10 : WordNet

In [72]:
from nltk.corpus import wordnet

In [73]:
sysns = wordnet.synsets("program")

In [74]:
print(sysns)

[Synset('plan.n.01'), Synset('program.n.02'), Synset('broadcast.n.02'), Synset('platform.n.02'), Synset('program.n.05'), Synset('course_of_study.n.01'), Synset('program.n.07'), Synset('program.n.08'), Synset('program.v.01'), Synset('program.v.02')]


In [75]:
print(sysns[0].lemmas())

[Lemma('plan.n.01.plan'), Lemma('plan.n.01.program'), Lemma('plan.n.01.programme')]


In [76]:
print(sysns[0].lemmas()[0])

Lemma('plan.n.01.plan')


In [77]:
print(sysns[0].lemmas()[0].name())

plan


In [78]:
print(sysns[0].definition())

a series of steps to be carried out or goals to be accomplished


In [79]:
print(sysns[0].examples())

['they drew up a six-step plan', 'they discussed plans for a new bond issue']


In [80]:
synonym = []
antonym = []

In [88]:
for sysns in wordnet.synsets("good"):
    for l in sysns.lemmas():
        synonym.append(l.name())
        if l.antonyms():
            antonym.append(l.antonyms()[0].name())
print("SYNONYMS : "+str(set(synonym)))
print("ANTONYMS : "+str(set(antonym)))

SYNONYMS : {'respectable', 'unspoiled', 'dear', 'honorable', 'secure', 'soundly', 'proficient', 'ripe', 'near', 'honest', 'expert', 'practiced', 'in_force', 'full', 'well', 'just', 'upright', 'skillful', 'right', 'good', 'sound', 'skilful', 'trade_good', 'estimable', 'commodity', 'effective', 'serious', 'adept', 'goodness', 'salutary', 'unspoilt', 'thoroughly', 'in_effect', 'safe', 'dependable', 'beneficial', 'undecomposed'}
ANTONYMS : {'bad', 'evilness', 'ill', 'badness', 'evil'}


In [89]:
# Semantic similarity
w1 = wordnet.synset("ship.n.01")
w2 = wordnet.synset("boat.n.01")

In [90]:
print(w1.wup_similarity(w2))

0.9090909090909091


In [91]:
w3 = wordnet.synset("cat.n.01")

In [92]:
print(w1.wup_similarity(w3))

0.32


## Video 11 : Text Classification

In [93]:
import random
from nltk.corpus import movie_reviews

In [96]:
docs = [(list(movie_reviews.words(fileids)), category) for category in movie_reviews.categories() for fileids in movie_reviews.fileids(category)]

In [97]:
random.shuffle(docs)

In [98]:
print(docs[1])

(['woody', 'allen', 'is', 'one', 'of', 'the', 'most', 'successful', 'artist', '-', 'directors', 'in', 'hollywood', ',', 'but', 'he', 'is', 'becoming', 'less', 'and', 'less', 'reliable', 'as', 'a', 'filmmaker', '.', 'in', 'his', 'early', 'years', 'of', 'film', '-', 'making', 'he', 'mastered', 'the', 'simple', 'comedy', '.', 'from', 'there', 'he', 'went', 'into', 'a', 'second', 'phase', 'and', 'took', 'risks', 'experimenting', 'with', 'different', 'approaches', 'and', 'styles', '.', 'some', 'of', 'these', 'work', 'better', 'than', 'others', '.', 'zelig', 'and', 'crimes', 'and', 'misdemeanors', 'are', 'the', 'work', 'of', 'a', 'creative', 'and', 'intelligent', 'artist', '.', 'deconstructing', 'harry', 'goes', 'to', 'the', 'other', 'extreme', 'and', 'is', 'a', 'bizarre', 'experiment', 'demanding', 'more', 'of', 'the', 'viewer', 'than', 'it', 'gives', 'back', '.', 'harry', 'block', '(', 'allen', ')', 'has', 'in', 'his', 'life', 'only', 'two', 'drives', '.', 'he', 'wants', 'to', 'have', 'sex

In [99]:
all_words = []

In [101]:
for w in movie_reviews.words():
    all_words.append(w.lower())

In [102]:
all_words = nltk.FreqDist(all_words)

In [103]:
print(all_words.most_common(15))

[(',', 77717), ('the', 76529), ('.', 65876), ('a', 38106), ('and', 35576), ('of', 34123), ('to', 31937), ("'", 30585), ('is', 25195), ('in', 21822), ('s', 18513), ('"', 17612), ('it', 16107), ('that', 15924), ('-', 15595)]


In [104]:
print(all_words["stupid"])

253


## Video 12 : Words as features for learning

In [105]:
word_features = list(all_words.keys())[:1000]

In [106]:
def find_features(doc):
    words = set(doc)
    features = {}
    for w in word_features:
        features[w] = (w in words)
    return features

In [107]:
print(find_features(movie_reviews.words('neg/cv000_29416.txt')))

{'pinter': False, 'jeanine': False, 'barreled': False, 'lowers': False, 'unmistakeable': False, 'smug': False, 'castro': False, 'nuts': False, 'flicks': True, 'eighth': False, 'represents': False, 'concede': False, 'orient': False, 'ws': False, 'sonnenfelds': False, 'souk': False, 'tinder': False, 'engagements': False, 'conehead': False, 'harming': False, 'accessorize': False, 'ratchets': False, 'langella': False, 'comfortably': False, 'weightiness': False, 'ditzism': False, 'thelma': False, 'kotto': False, 'escapades': False, 'damed': False, 'unpleasantness': False, 'rhythms': False, 'sheila': False, 'saunders': False, 'popitti': False, 'reshoot': False, '48': False, 'attractively': False, 'otherness': False, 'roads': False, 'notables': False, 'thrillerism': False, 'mindlessness': False, 'apparatus': False, 'unwanted': False, 'sorossy': False, 'millimeter': False, 'fluffy': False, 'mediciney': False, 'jointly': False, 'nora': False, 'gobbledegook': False, 'takehiro': False, 'tightly':

In [109]:
feature_set = [(find_features(rev), category) for (rev, category) in docs]

## Video 13 : Naive Bayes algorithm

In [111]:
train = feature_set[:1900]
test = feature_set[1900:]

In [112]:
classifiers = nltk.NaiveBayesClassifier.train(train)

In [114]:
print("Accuracy of naive bayes : ", (nltk.classify.accuracy(classifiers, test))*100)

Accuracy of naive bayes :  69.0


In [115]:
classifiers.show_most_informative_features(15)

Most Informative Features
                  denial = True              pos : neg    =      7.7 : 1.0
                  subtly = True              pos : neg    =      5.7 : 1.0
                 regards = True              pos : neg    =      5.7 : 1.0
               collector = True              pos : neg    =      5.7 : 1.0
              rightfully = True              pos : neg    =      5.0 : 1.0
              unoriginal = True              neg : pos    =      4.7 : 1.0
          anthropologist = True              neg : pos    =      4.3 : 1.0
                   spins = True              pos : neg    =      4.3 : 1.0
            terrifically = True              pos : neg    =      4.3 : 1.0
                  wilder = True              pos : neg    =      4.3 : 1.0
                  rubber = True              neg : pos    =      4.2 : 1.0
              skillfully = True              pos : neg    =      4.1 : 1.0
                peaceful = True              pos : neg    =      3.9 : 1.0