In [1]:
from nltk.tokenize import sent_tokenize, word_tokenize

In [2]:
text = "Hi, I am Vasanth. How are you? I am coding now, will call you later."

In [3]:
# Sentence tokenizing
print(sent_tokenize(text))

['Hi, I am Vasanth.', 'How are you?', 'I am coding now, will call you later.']


In [4]:
# Word tokenizing
print(word_tokenize(text))

['Hi', ',', 'I', 'am', 'Vasanth', '.', 'How', 'are', 'you', '?', 'I', 'am', 'coding', 'now', ',', 'will', 'call', 'you', 'later', '.']


In [5]:
# Remove stop words - unwanted words
from nltk.corpus import stopwords
print(set(stopwords.words('english')))

{'t', 'this', 'how', 'be', 'there', 'just', "hasn't", 'your', "don't", 'on', 'above', 'd', 'hers', 'doesn', 'wouldn', 'hasn', 'me', 'very', 'he', 'here', "hadn't", 'shan', 'did', 'my', 'couldn', 'are', 'other', 'while', 'a', 'into', 'weren', 'why', 'is', 'was', 'same', 'herself', 'as', 'we', "isn't", 'itself', "weren't", 'isn', "you'd", 'some', 'wasn', 'who', 'does', 'at', 'only', 'should', 'those', 'will', "mightn't", "wasn't", 'until', 'own', 'for', 'over', 'most', 'against', 'had', 'the', 'but', 'then', 'didn', 'now', 'and', "mustn't", 'hadn', 'won', 'him', 'were', 'out', 'y', 'than', 'being', 'yourself', 'don', 'll', 'after', "you're", 'have', "you'll", 'has', "couldn't", 'between', 'with', 'or', 'few', 'mustn', 'needn', "you've", "didn't", "she's", 'before', 'ma', 'too', 'so', 'below', 'about', 'where', 'i', "aren't", 'to', 'if', 'by', 're', 'all', 'under', 'haven', 'from', 'which', 've', "shouldn't", "it's", 'such', "haven't", 'her', "that'll", 'through', 'down', 'aren', 'their',

In [6]:
example = "This is some sample sentence, showing of some stop words."

stop_words = set(stopwords.words('english'))

word_tokens = word_tokenize(example)

filtered_sentence = [w for w in word_tokens if not w in stop_words]
print(filtered_sentence)

['This', 'sample', 'sentence', ',', 'showing', 'stop', 'words', '.']


In [7]:
# Stemming words with nltk
from nltk.stem import PorterStemmer

ps = PorterStemmer()

example_words = ['ride','riding','rider','rides']

for w in example_words:
    print(ps.stem(w))

ride
ride
rider
ride


In [8]:
# Stemming an entire sentence

e = "When riders are riding their horses, they often think of how cowboys rode their horses."

words = word_tokenize(e)

for w in words:
    print(ps.stem(w))

when
rider
are
ride
their
hors
,
they
often
think
of
how
cowboy
rode
their
hors
.


In [9]:
from nltk.corpus import udhr
print(udhr.raw('English-Latin1'))

Universal Declaration of Human Rights
Preamble
Whereas recognition of the inherent dignity and of the equal and inalienable rights of all members of the human family is the foundation of freedom, justice and peace in the world, 

Whereas disregard and contempt for human rights have resulted in barbarous acts which have outraged the conscience of mankind, and the advent of a world in which human beings shall enjoy freedom of speech and belief and freedom from fear and want has been proclaimed as the highest aspiration of the common people, 

Whereas it is essential, if man is not to be compelled to have recourse, as a last resort, to rebellion against tyranny and oppression, that human rights should be protected by the rule of law, 

Whereas it is essential to promote the development of friendly relations between nations, 

Whereas the peoples of the United Nations have in the Charter reaffirmed their faith in fundamental human rights, in the dignity and worth of the human person and in

In [10]:
from nltk.corpus import state_union
from nltk.tokenize import PunktSentenceTokenizer

train_text = state_union.raw('2005-GWBush.txt')
sample_text = state_union.raw('2006-GWBush.txt')

In [11]:
print(train_text)

PRESIDENT GEORGE W. BUSH'S ADDRESS BEFORE A JOINT SESSION OF THE CONGRESS ON THE STATE OF THE UNION
 
February 2, 2005


9:10 P.M. EST 

THE PRESIDENT: Mr. Speaker, Vice President Cheney, members of Congress, fellow citizens: 

As a new Congress gathers, all of us in the elected branches of government share a great privilege: We've been placed in office by the votes of the people we serve. And tonight that is a privilege we share with newly-elected leaders of Afghanistan, the Palestinian Territories, Ukraine, and a free and sovereign Iraq. (Applause.) 

Two weeks ago, I stood on the steps of this Capitol and renewed the commitment of our nation to the guiding ideal of liberty for all. This evening I will set forth policies to advance that ideal at home and around the world. 

Tonight, with a healthy, growing economy, with more Americans going back to work, with our nation an active force for good in the world -- the state of our union is confident and strong. (Applause.) 

Our generati

In [12]:
# Train PunktSentence Tokenizer

custom_sent_tokenizer = PunktSentenceTokenizer(train_text)

In [13]:
tokenized = custom_sent_tokenizer.tokenize(sample_text)

In [14]:
print(tokenized)

["PRESIDENT GEORGE W. BUSH'S ADDRESS BEFORE A JOINT SESSION OF THE CONGRESS ON THE STATE OF THE UNION\n \nJanuary 31, 2006\n\nTHE PRESIDENT: Thank you all.", 'Mr. Speaker, Vice President Cheney, members of Congress, members of the Supreme Court and diplomatic corps, distinguished guests, and fellow citizens: Today our nation lost a beloved, graceful, courageous woman who called America to its founding ideals and carried on a noble dream.', 'Tonight we are comforted by the hope of a glad reunion with the husband who was taken so long ago, and we are grateful for the good life of Coretta Scott King.', '(Applause.)', 'President George W. Bush reacts to applause during his State of the Union Address at the Capitol, Tuesday, Jan.', '31, 2006.', "White House photo by Eric DraperEvery time I'm invited to this rostrum, I'm humbled by the privilege, and mindful of the history we've seen together.", 'We have gathered under this Capitol dome in moments of national mourning and national achievemen

In [15]:
# Define function for part of speech tagging
import nltk

def process_content():
    try:
        for i in tokenized[:5]:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)
            print(tagged)
    except Exception as e:
        print(str(e))

process_content()

[('PRESIDENT', 'NNP'), ('GEORGE', 'NNP'), ('W.', 'NNP'), ('BUSH', 'NNP'), ("'S", 'POS'), ('ADDRESS', 'NNP'), ('BEFORE', 'IN'), ('A', 'NNP'), ('JOINT', 'NNP'), ('SESSION', 'NNP'), ('OF', 'IN'), ('THE', 'NNP'), ('CONGRESS', 'NNP'), ('ON', 'NNP'), ('THE', 'NNP'), ('STATE', 'NNP'), ('OF', 'IN'), ('THE', 'NNP'), ('UNION', 'NNP'), ('January', 'NNP'), ('31', 'CD'), (',', ','), ('2006', 'CD'), ('THE', 'NNP'), ('PRESIDENT', 'NNP'), (':', ':'), ('Thank', 'NNP'), ('you', 'PRP'), ('all', 'DT'), ('.', '.')]
[('Mr.', 'NNP'), ('Speaker', 'NNP'), (',', ','), ('Vice', 'NNP'), ('President', 'NNP'), ('Cheney', 'NNP'), (',', ','), ('members', 'NNS'), ('of', 'IN'), ('Congress', 'NNP'), (',', ','), ('members', 'NNS'), ('of', 'IN'), ('the', 'DT'), ('Supreme', 'NNP'), ('Court', 'NNP'), ('and', 'CC'), ('diplomatic', 'JJ'), ('corps', 'NN'), (',', ','), ('distinguished', 'JJ'), ('guests', 'NNS'), (',', ','), ('and', 'CC'), ('fellow', 'JJ'), ('citizens', 'NNS'), (':', ':'), ('Today', 'VB'), ('our', 'PRP$'), ('nat

In [16]:
nltk.help.upenn_tagset()

$: dollar
    $ -$ --$ A$ C$ HK$ M$ NZ$ S$ U.S.$ US$
'': closing quotation mark
    ' ''
(: opening parenthesis
    ( [ {
): closing parenthesis
    ) ] }
,: comma
    ,
--: dash
    --
.: sentence terminator
    . ! ?
:: colon or ellipsis
    : ; ...
CC: conjunction, coordinating
    & 'n and both but either et for less minus neither nor or plus so
    therefore times v. versus vs. whether yet
CD: numeral, cardinal
    mid-1890 nine-thirty forty-two one-tenth ten million 0.5 one forty-
    seven 1987 twenty '79 zero two 78-degrees eighty-four IX '60s .025
    fifteen 271,124 dozen quintillion DM2,000 ...
DT: determiner
    all an another any both del each either every half la many much nary
    neither no some such that the them these this those
EX: existential there
    there
FW: foreign word
    gemeinschaft hund ich jeux habeas Haementeria Herr K'ang-si vous
    lutihaw alai je jour objets salutaris fille quibusdam pas trop Monte
    terram fiche oui corporis ...
IN: preposition or

In [18]:
# Chunking with NLTK

train_text = state_union.raw('2005-GWBush.txt')
sample_text = state_union.raw('2006-GWBush.txt')

custom_sent_tokenizer = PunktSentenceTokenizer(train_text)
tokenized = custom_sent_tokenizer.tokenize(sample_text)

def process_content():
    try:
        for i in tokenized[:3]:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)
            
            # Combine part of speech tag with regular expression
            chunkGram = r"""Chunk: {<.*>+}
                                          }<VB.?|IN|DT|TO>+{"""
            chunkparser = nltk.RegexpParser(chunkGram)
            chunked = chunkparser.parse(tagged)
            
            # Main difference in brackets }{
            
            # Print the nltk tree
            for subtree in chunked.subtrees(filter = lambda t: t.label() == "Chunk"):
                print(subtree)
            
            # Draw chunks with nltk
            #chunked.draw()
            
    except Exception as e:
        print(str(e))

process_content()

(Chunk PRESIDENT/NNP GEORGE/NNP W./NNP BUSH/NNP)
(Chunk ADDRESS/NNP)
(Chunk A/NNP JOINT/NNP SESSION/NNP)
(Chunk THE/NNP CONGRESS/NNP ON/NNP THE/NNP STATE/NNP)
(Chunk THE/NNP UNION/NNP January/NNP)
(Chunk THE/NNP PRESIDENT/NNP)
(Chunk Thank/NNP)
(Chunk Mr./NNP Speaker/NNP)
(Chunk Vice/NNP President/NNP Cheney/NNP)
(Chunk Congress/NNP)
(Chunk Supreme/NNP Court/NNP)
(Chunk called/VBD America/NNP)
(Chunk Coretta/NNP Scott/NNP King/NNP)


In [19]:
# Chinking with NLTK

train_text = state_union.raw('2005-GWBush.txt')
sample_text = state_union.raw('2006-GWBush.txt')

custom_sent_tokenizer = PunktSentenceTokenizer(train_text)
tokenized = custom_sent_tokenizer.tokenize(sample_text)

def process_content():
    try:
        for i in tokenized[:3]:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)
            
            # Combine part of speech tag with regular expression
            chunkGram = r"""Chunk: {<RB.?>*<VB.>*<NNP>+<NN>?}"""
            chunkparser = nltk.RegexpParser(chunkGram)
            chunked = chunkparser.parse(tagged)
            
            # Print the nltk tree
            for subtree in chunked.subtrees(filter = lambda t: t.label() == "Chunk"):
                print(subtree)
            
            # Draw chunks with nltk
            #chunked.draw()
            
    except Exception as e:
        print(str(e))

process_content()

(Chunk PRESIDENT/NNP GEORGE/NNP W./NNP BUSH/NNP)
(Chunk ADDRESS/NNP)
(Chunk A/NNP JOINT/NNP SESSION/NNP)
(Chunk THE/NNP CONGRESS/NNP ON/NNP THE/NNP STATE/NNP)
(Chunk THE/NNP UNION/NNP January/NNP)
(Chunk THE/NNP PRESIDENT/NNP)
(Chunk Thank/NNP)
(Chunk Mr./NNP Speaker/NNP)
(Chunk Vice/NNP President/NNP Cheney/NNP)
(Chunk Congress/NNP)
(Chunk Supreme/NNP Court/NNP)
(Chunk called/VBD America/NNP)
(Chunk Coretta/NNP Scott/NNP King/NNP)


In [21]:
def process_content():
    try:
        for i in tokenized[:3]:
            words = nltk.word_tokenize(i)
            tagged = nltk.pos_tag(words)
            named_ENT = nltk.ne_chunk(tagged, binary = False)
            
            # Draw NER with nltk
            named_ENT.draw()
            
    except Exception as e:
        print(str(e))

process_content()

In [22]:
import random
from nltk.corpus import movie_reviews

In [23]:
# Build list od documents
document = [(list(movie_reviews.words(fileid)), category)
           for category in movie_reviews.categories()
           for fileid in movie_reviews.fileids(category)]

# Shuffle the document
random.shuffle(document)

print("Number of documents: ",len(document))
print(document[0])

all_words = []
for w in movie_reviews.words():
    all_words.append(w.lower())

all_words = nltk.FreqDist(all_words)

print("Most Common Words: ",all_words.most_common(5))
print("All words happy: ",all_words['happy'])

Number of documents:  2000
(['"', 'if', 'there', "'", 's', 'a', 'beast', 'in', 'men', ',', 'it', 'meets', 'its', 'match', 'in', 'women', ',', 'too', '.', '"', 'starring', 'sarah', 'patterson', ',', 'angela', 'lansbury', ',', 'tusse', 'silberg', ',', 'david', 'warner', 'directed', 'by', 'neil', 'jordan', 'written', 'by', 'jordan', 'and', 'angela', 'carter', ',', 'from', 'a', 'story', 'by', 'carter', 'cinematography', 'by', 'bryan', 'loftus', 'in', 'recent', 'years', ',', 'there', 'has', 'been', 'a', 'trend', 'in', 'the', 'field', 'of', 'fantasy', ':', 'writers', 'have', 'been', 'revisiting', 'the', 'fertile', 'world', 'of', 'myth', 'and', 'fairy', 'tale', 'and', 'reclaiming', 'that', 'world', ',', 'investing', 'it', 'with', 'new', 'life', 'and', 'energy', '.', 'in', 'modern', 'times', ',', 'fairy', 'tales', 'have', 'become', 'disneyfied', 'and', 'debased', ';', 'they', 'have', 'become', 'trite', 'adventures', 'involving', 'leering', 'witches', ',', 'friendly', 'dwarves', ',', 'and', 'cu

In [24]:
print(len(all_words))

39768


In [25]:
# We will use only 4000 most common words as features
word_features = list(all_words.keys())[:4000]

In [27]:
# Build a find feature function to find words present in the review

def find_features(doc):
    words = set(doc)
    features = {}
    
    for w in word_features:
        features[w] = (w in words)
    
    return features

# Example of negative review
ex = find_features(movie_reviews.words('neg/cv000_29416.txt'))
for key,value in ex.items():
    if value == True:
        print(key)

plot
:
two
teen
couples
go
to
a
church
party
,
drink
and
then
drive
.
they
get
into
an
accident
one
of
the
guys
dies
but
his
girlfriend
continues
see
him
in
her
life
has
nightmares
what
'
s
deal
?
watch
movie
"
sorta
find
out
critique
mind
-
fuck
for
generation
that
touches
on
very
cool
idea
presents
it
bad
package
which
is
makes
this
review
even
harder
write
since
i
generally
applaud
films
attempt
break
mold
mess
with
your
head
such
(
lost
highway
&
memento
)
there
are
good
ways
making
all
types
these
folks
just
didn
t
snag
correctly
seem
have
taken
pretty
neat
concept
executed
terribly
so
problems
well
its
main
problem
simply
too
jumbled
starts
off
normal
downshifts
fantasy
world
you
as
audience
member
no
going
dreams
characters
coming
back
from
dead
others
who
look
like
strange
apparitions
disappearances
looooot
chase
scenes
tons
weird
things
happen
most
not
explained
now
personally
don
trying
unravel
film
every
when
does
give
me
same
clue
over
again
kind
fed
up
after
while
biggest


In [28]:
print(ex)



In [29]:
# Now lets do for all documents

featuresets = [(find_features(rev), category) for rev,category in document]

In [31]:
from sklearn.model_selection import train_test_split

training,testing = train_test_split(featuresets, test_size = 0.25)

In [32]:
print(len(training))
print(len(testing))

1500
500


In [33]:
# Use sklearn in nltk

from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.svm import SVC

In [34]:
model = SklearnClassifier(SVC(kernel = 'linear'))

In [35]:
# Train the model
model.train(training)

<SklearnClassifier(SVC(kernel='linear'))>

In [36]:
# Test the model
accuracy = nltk.classify.accuracy(model,testing)
print("Accuracy: ",accuracy)

Accuracy:  0.822
