In [1]:
import nltk
# nltk.download()

In [2]:
from nltk.tokenize import sent_tokenize, word_tokenize

example_text = 'Hello Mr. Smith, How are you doing? Today\'s weather is awesome and Python is good. See you soon.'
print(sent_tokenize(example_text))
print(word_tokenize(example_text))

['Hello Mr. Smith, How are you doing?', "Today's weather is awesome and Python is good.", 'See you soon.']
['Hello', 'Mr.', 'Smith', ',', 'How', 'are', 'you', 'doing', '?', 'Today', "'s", 'weather', 'is', 'awesome', 'and', 'Python', 'is', 'good', '.', 'See', 'you', 'soon', '.']


In [3]:
from nltk.corpus import stopwords

example_sentence = 'This is an example showing off stop word filteration.'
stop_words = set(stopwords.words("english"))

words = word_tokenize(example_sentence)

filtered_sentence = [w for w in words if not w in stop_words]
print(filtered_sentence)

['This', 'example', 'showing', 'stop', 'word', 'filteration', '.']


In [4]:
from nltk.stem import PorterStemmer

ps = PorterStemmer()

example_sentence = "It is very important to be pythonly while you are pythoning with python. All pythoners have pythoned poorly atleat once."

stemmed_sentence = [ps.stem(w) for w in word_tokenize(example_sentence)]
print(stemmed_sentence)

['It', 'is', 'veri', 'import', 'to', 'be', 'pythonli', 'while', 'you', 'are', 'python', 'with', 'python', '.', 'all', 'python', 'have', 'python', 'poorli', 'atleat', 'onc', '.']


In [5]:
from nltk.corpus import state_union

sample_text = state_union.raw("2006-GWBush.txt")
tokenized = sent_tokenize(sample_text[0:600])
print(sample_text[0:200])
for i in tokenized:
    print(nltk.pos_tag(nltk.word_tokenize(i)))

PRESIDENT GEORGE W. BUSH'S ADDRESS BEFORE A JOINT SESSION OF THE CONGRESS ON THE STATE OF THE UNION
 
January 31, 2006

THE PRESIDENT: Thank you all. Mr. Speaker, Vice President Cheney, members of Con
[('PRESIDENT', 'NNP'), ('GEORGE', 'NNP'), ('W.', 'NNP'), ('BUSH', 'NNP'), ("'S", 'POS'), ('ADDRESS', 'NNP'), ('BEFORE', 'IN'), ('A', 'NNP'), ('JOINT', 'NNP'), ('SESSION', 'NNP'), ('OF', 'IN'), ('THE', 'NNP'), ('CONGRESS', 'NNP'), ('ON', 'NNP'), ('THE', 'NNP'), ('STATE', 'NNP'), ('OF', 'IN'), ('THE', 'NNP'), ('UNION', 'NNP'), ('January', 'NNP'), ('31', 'CD'), (',', ','), ('2006', 'CD'), ('THE', 'NNP'), ('PRESIDENT', 'NNP'), (':', ':'), ('Thank', 'NNP'), ('you', 'PRP'), ('all', 'DT'), ('.', '.')]
[('Mr.', 'NNP'), ('Speaker', 'NNP'), (',', ','), ('Vice', 'NNP'), ('President', 'NNP'), ('Cheney', 'NNP'), (',', ','), ('members', 'NNS'), ('of', 'IN'), ('Congress', 'NNP'), (',', ','), ('members', 'NNS'), ('of', 'IN'), ('the', 'DT'), ('Supreme', 'NNP'), ('Court', 'NNP'), ('and', 'CC'), ('diplomati

In [6]:
for i in tokenized:
    pos = nltk.pos_tag(nltk.word_tokenize(i))
    chunkGram = r"""Chunk: {<RB.?>*<VB.?>*<NNP>+<NN>?}"""
    chunkParser = nltk.RegexpParser(chunkGram)
    chunked = chunkParser. parse(pos)
    print(chunked)
    #chunked.draw()

(S
  (Chunk PRESIDENT/NNP GEORGE/NNP W./NNP BUSH/NNP)
  'S/POS
  (Chunk ADDRESS/NNP)
  BEFORE/IN
  (Chunk A/NNP JOINT/NNP SESSION/NNP)
  OF/IN
  (Chunk THE/NNP CONGRESS/NNP ON/NNP THE/NNP STATE/NNP)
  OF/IN
  (Chunk THE/NNP UNION/NNP January/NNP)
  31/CD
  ,/,
  2006/CD
  (Chunk THE/NNP PRESIDENT/NNP)
  :/:
  (Chunk Thank/NNP)
  you/PRP
  all/DT
  ./.)
(S
  (Chunk Mr./NNP Speaker/NNP)
  ,/,
  (Chunk Vice/NNP President/NNP Cheney/NNP)
  ,/,
  members/NNS
  of/IN
  (Chunk Congress/NNP)
  ,/,
  members/NNS
  of/IN
  the/DT
  (Chunk Supreme/NNP Court/NNP)
  and/CC
  diplomatic/JJ
  corps/NN
  ,/,
  distinguished/JJ
  guests/NNS
  ,/,
  and/CC
  fellow/JJ
  citizens/NNS
  :/:
  Today/VB
  our/PRP$
  nation/NN
  lost/VBD
  a/DT
  beloved/VBN
  ,/,
  graceful/JJ
  ,/,
  courageous/JJ
  woman/NN
  who/WP
  (Chunk called/VBD America/NNP)
  to/TO
  its/PRP$
  founding/NN
  ideals/NNS
  and/CC
  carried/VBD
  on/IN
  a/DT
  noble/JJ
  dream/NN
  ./.)
(S
  Tonight/NN
  we/PRP
  are/VBP
  comforted

In [7]:
for i in tokenized:
    pos = nltk.pos_tag(nltk.word_tokenize(i))
    chunkGram = r"""Chunk: {<.*>+} 
                           }<VB.?|IN|DT|TO>+{"""
    chunkParser = nltk.RegexpParser(chunkGram)
    chunked = chunkParser. parse(pos)
    print(chunked)
    #chunked.draw()

(S
  (Chunk PRESIDENT/NNP GEORGE/NNP W./NNP BUSH/NNP 'S/POS ADDRESS/NNP)
  BEFORE/IN
  (Chunk A/NNP JOINT/NNP SESSION/NNP)
  OF/IN
  (Chunk THE/NNP CONGRESS/NNP ON/NNP THE/NNP STATE/NNP)
  OF/IN
  (Chunk
    THE/NNP
    UNION/NNP
    January/NNP
    31/CD
    ,/,
    2006/CD
    THE/NNP
    PRESIDENT/NNP
    :/:
    Thank/NNP
    you/PRP)
  all/DT
  (Chunk ./.))
(S
  (Chunk
    Mr./NNP
    Speaker/NNP
    ,/,
    Vice/NNP
    President/NNP
    Cheney/NNP
    ,/,
    members/NNS)
  of/IN
  (Chunk Congress/NNP ,/, members/NNS)
  of/IN
  the/DT
  (Chunk
    Supreme/NNP
    Court/NNP
    and/CC
    diplomatic/JJ
    corps/NN
    ,/,
    distinguished/JJ
    guests/NNS
    ,/,
    and/CC
    fellow/JJ
    citizens/NNS
    :/:)
  Today/VB
  (Chunk our/PRP$ nation/NN)
  lost/VBD
  a/DT
  beloved/VBN
  (Chunk ,/, graceful/JJ ,/, courageous/JJ woman/NN who/WP)
  called/VBD
  (Chunk America/NNP)
  to/TO
  (Chunk its/PRP$ founding/NN ideals/NNS and/CC)
  carried/VBD
  on/IN
  a/DT
  (Chunk noble/

In [8]:
for i in tokenized:
    pos = nltk.pos_tag(nltk.word_tokenize(i))
    chunked = nltk.ne_chunk(pos)
    print(chunked)
    #chunked.draw()

(S
  PRESIDENT/NNP
  (PERSON GEORGE/NNP W./NNP BUSH/NNP)
  'S/POS
  (ORGANIZATION ADDRESS/NNP)
  BEFORE/IN
  A/NNP
  (ORGANIZATION JOINT/NNP)
  SESSION/NNP
  OF/IN
  (ORGANIZATION THE/NNP)
  (ORGANIZATION CONGRESS/NNP)
  ON/NNP
  THE/NNP
  (ORGANIZATION STATE/NNP OF/IN)
  (ORGANIZATION THE/NNP)
  (ORGANIZATION UNION/NNP)
  January/NNP
  31/CD
  ,/,
  2006/CD
  (ORGANIZATION THE/NNP)
  PRESIDENT/NNP
  :/:
  Thank/NNP
  you/PRP
  all/DT
  ./.)
(S
  (PERSON Mr./NNP Speaker/NNP)
  ,/,
  Vice/NNP
  President/NNP
  (PERSON Cheney/NNP)
  ,/,
  members/NNS
  of/IN
  (ORGANIZATION Congress/NNP)
  ,/,
  members/NNS
  of/IN
  the/DT
  (ORGANIZATION Supreme/NNP Court/NNP)
  and/CC
  diplomatic/JJ
  corps/NN
  ,/,
  distinguished/JJ
  guests/NNS
  ,/,
  and/CC
  fellow/JJ
  citizens/NNS
  :/:
  Today/VB
  our/PRP$
  nation/NN
  lost/VBD
  a/DT
  beloved/VBN
  ,/,
  graceful/JJ
  ,/,
  courageous/JJ
  woman/NN
  who/WP
  called/VBD
  (GPE America/NNP)
  to/TO
  its/PRP$
  founding/NN
  ideals/NNS
  

In [9]:
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
print(lemmatizer.lemmatize("better",'a'))

good


In [10]:
nltk.__file__

'C:\\Users\\TeddyBear\\anaconda3\\envs\\py3-TF2\\lib\\site-packages\\nltk\\__init__.py'

In [11]:
from nltk.corpus import wordnet

syns = wordnet.synsets("program")
print(syns[0].lemmas()[1].name())
print(syns[1].definition())
print(syns[1].examples())

program
a system of projects or services intended to meet a public need
['he proposed an elaborate program of public works', 'working mothers rely on the day care program']


In [12]:
synonyms = []
antonyms = []

for syn in wordnet.synsets("good"):
    for l in syn.lemmas():
        synonyms.append(l.name())
        if l.antonyms():
            antonyms.append(l.antonyms()[0].name())
            
print(synonyms)
print(antonyms)

['good', 'good', 'goodness', 'good', 'goodness', 'commodity', 'trade_good', 'good', 'good', 'full', 'good', 'good', 'estimable', 'good', 'honorable', 'respectable', 'beneficial', 'good', 'good', 'good', 'just', 'upright', 'adept', 'expert', 'good', 'practiced', 'proficient', 'skillful', 'skilful', 'good', 'dear', 'good', 'near', 'dependable', 'good', 'safe', 'secure', 'good', 'right', 'ripe', 'good', 'well', 'effective', 'good', 'in_effect', 'in_force', 'good', 'good', 'serious', 'good', 'sound', 'good', 'salutary', 'good', 'honest', 'good', 'undecomposed', 'unspoiled', 'unspoilt', 'good', 'well', 'good', 'thoroughly', 'soundly', 'good']
['evil', 'evilness', 'bad', 'badness', 'bad', 'evil', 'ill']


In [13]:
w1 = wordnet.synset("ship.n.01")
w2 = wordnet.synset("boat.n.01")
print(w1.wup_similarity(w2))

0.9090909090909091


In [14]:
import random
from nltk.corpus import movie_reviews

documents = [(list(movie_reviews.words(fileid)), category)
                for category in movie_reviews.categories()
                for fileid in movie_reviews.fileids(category)]
# print(documents[1])
random.shuffle(documents)
# print(documents[2])
all_words = []
count = 0
for w in movie_reviews.words():
    all_words.append(w.lower())
all_words = nltk.FreqDist(all_words)
print(all_words["stupid"])

253


In [15]:
word_features = list(all_words.keys())[0:3000]

def find_features(document):
    words = set(document)
    features = {}
    for w in word_features:
        features[w] = (w in words)
    return features

# print(find_features(movie_reviews.words('neg/cv000_29416.txt')))
feature_sets = [(find_features(rev), category) for (rev, category) in documents]

In [16]:
training_set = feature_sets[:1900]
test_set = feature_sets[1900:]

classifier = nltk.NaiveBayesClassifier.train(training_set)
print("Accuracy: ", nltk.classify.accuracy(classifier, test_set))

Accuracy:  0.81


In [17]:
classifier.show_most_informative_features(15)

Most Informative Features
              schumacher = True              neg : pos    =     11.1 : 1.0
                   sucks = True              neg : pos    =      9.1 : 1.0
                 frances = True              pos : neg    =      8.9 : 1.0
                  annual = True              pos : neg    =      8.9 : 1.0
             silverstone = True              neg : pos    =      7.7 : 1.0
           unimaginative = True              neg : pos    =      7.7 : 1.0
                   groan = True              neg : pos    =      7.7 : 1.0
                  shoddy = True              neg : pos    =      7.0 : 1.0
                 idiotic = True              neg : pos    =      6.8 : 1.0
               atrocious = True              neg : pos    =      6.6 : 1.0
                  regard = True              pos : neg    =      6.6 : 1.0
                  turkey = True              neg : pos    =      6.4 : 1.0
                  suvari = True              neg : pos    =      6.4 : 1.0

In [18]:
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB

In [19]:
the_classifier = SklearnClassifier(BernoulliNB())
classifier_ = the_classifier.train(training_set)
print("Accuracy Bernoulli: ", nltk.classify.accuracy(classifier_, test_set))

Accuracy Bernoulli:  0.81
