In [1]:
# Libraries

# NLKT - Natural Language Toolki

import nltk


In [2]:
nltk.download()

showing info https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml


True

## Tokenization

In [3]:
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize

# Dataset
dataset = 'Hello Everyone. Welcome to this course. We are studying NLP.'
print(dataset)

Hello Everyone. Welcome to this course. We are studying NLP.


In [4]:
# Tokenizing sentences

print(sent_tokenize(text=dataset, language='english'))

['Hello Everyone.', 'Welcome to this course.', 'We are studying NLP.']


In [5]:
for i in sent_tokenize(text=dataset, language='english'):
    print(i)

Hello Everyone.
Welcome to this course.
We are studying NLP.


In [6]:
# Tokenizing words

print(word_tokenize(text=dataset, language='english'))

['Hello', 'Everyone', '.', 'Welcome', 'to', 'this', 'course', '.', 'We', 'are', 'studying', 'NLP', '.']


In [7]:
for i in word_tokenize(text=dataset, language='english'):
    print(i)

Hello
Everyone
.
Welcome
to
this
course
.
We
are
studying
NLP
.


## Stemming

In [8]:
from nltk.stem import PorterStemmer

dataset = ['love', 'loving', 'lover', 'loved', 'lovingly']
print(dataset)

['love', 'loving', 'lover', 'loved', 'lovingly']


In [9]:
# Apply stemming

ps = PorterStemmer()

for i in dataset:
    print(ps.stem(i))

love
love
lover
love
lovingli


In [10]:
# Another example - tokenization and stemming

new_dataset = """It feels very special when you are loving someone.
                We care for our loved ones.
                Specially when we love each other unconditionally."""

words = word_tokenize(text=new_dataset)

for w in words:
    print(ps.stem(w))

It
feel
veri
special
when
you
are
love
someon
.
We
care
for
our
love
one
.
special
when
we
love
each
other
uncondit
.


## Lemmatization

Process of converting a word into its dictionary form. e.g: Feet > Foot

In [11]:
from nltk.stem import WordNetLemmatizer

wnl = WordNetLemmatizer()

wnl.lemmatize('churches')

'church'

In [12]:
wnl.lemmatize('feet')

'foot'

In [13]:
# With an adjective 

wnl.lemmatize('better', pos='a')

'good'

## Stop Words

In [14]:
from nltk.corpus import stopwords

adataset = """ Hello Mr. Watson, how are you doing today? 
             The weather is awesome. The garden is Green. 
             We should go out for a walk."""
print(adataset)

 Hello Mr. Watson, how are you doing today? 
             The weather is awesome. The garden is Green. 
             We should go out for a walk.


In [15]:
# Create set of English Stop Words

stop_words = set(stopwords.words('english'))
print(stop_words)

{'yourself', 'further', 'we', 'be', 'mightn', 'he', 'their', "won't", 'from', 'didn', 'weren', 'against', 'can', "doesn't", 'whom', 'too', 'my', "weren't", 'himself', 'of', 'will', 'such', 'do', 'so', 'she', 'each', "it's", 'most', 'does', 'myself', 'until', 'in', 'shan', 'down', 'with', 'if', 'at', 'm', "should've", 'had', 'needn', 'who', 'by', 'but', 'theirs', 'been', "she's", "wouldn't", 'same', "hadn't", 'under', "mustn't", 'you', 'ourselves', "don't", 'all', 'haven', 'were', 'below', 'the', 'has', 'after', 'while', 'because', 'should', "needn't", 'did', "you're", 'her', 'just', 'on', 'and', "that'll", 're', 'where', 'yours', 'not', "haven't", 'when', 'or', 'isn', "shan't", 'doesn', 'ours', 'herself', 'here', 'ain', 'ma', 'don', 'as', 'itself', 'y', 'into', 'them', 's', 't', 'mustn', 'which', 'll', 'few', "didn't", "mightn't", 'hadn', 'during', 'doing', 'very', "couldn't", 'o', "aren't", 'through', 'more', 've', 'a', "isn't", 'this', 'hers', 'other', 'that', 'aren', 'having', 'no',

In [16]:
# Tokenize words

word_tokenize = word_tokenize(text=adataset)

In [17]:
# Remove Stop Words from dataset

filtered_sentences = []

for w in word_tokenize:
    if w not in stop_words:
        filtered_sentences.append(w)
        
print(filtered_sentences)

['Hello', 'Mr.', 'Watson', ',', 'today', '?', 'The', 'weather', 'awesome', '.', 'The', 'garden', 'Green', '.', 'We', 'go', 'walk', '.']


## POS Tagging - Part of Speech

e.g. 'Paper' > 'Noun'

In [18]:
from nltk.tag import pos_tag

another_dataset = """Taj Mahal is one of the world's most celebrated structures
            in the world.
            It is a stunning symbol of Indian rich history."""
print(another_dataset)

Taj Mahal is one of the world's most celebrated structures
            in the world.
            It is a stunning symbol of Indian rich history.


In [21]:
# word_tokenize = word_tokenize(text=another_dataset)

In [22]:
# Applying POS tagging

pos_tag(word_tokenize)

[('Hello', 'NNP'),
 ('Mr.', 'NNP'),
 ('Watson', 'NNP'),
 (',', ','),
 ('how', 'WRB'),
 ('are', 'VBP'),
 ('you', 'PRP'),
 ('doing', 'VBG'),
 ('today', 'NN'),
 ('?', '.'),
 ('The', 'DT'),
 ('weather', 'NN'),
 ('is', 'VBZ'),
 ('awesome', 'JJ'),
 ('.', '.'),
 ('The', 'DT'),
 ('garden', 'NN'),
 ('is', 'VBZ'),
 ('Green', 'JJ'),
 ('.', '.'),
 ('We', 'PRP'),
 ('should', 'MD'),
 ('go', 'VB'),
 ('out', 'RP'),
 ('for', 'IN'),
 ('a', 'DT'),
 ('walk', 'NN'),
 ('.', '.')]

In [24]:
# Tag set - meaning for each of the tags above
nltk.help.upenn_tagset()

$: dollar
    $ -$ --$ A$ C$ HK$ M$ NZ$ S$ U.S.$ US$
'': closing quotation mark
    ' ''
(: opening parenthesis
    ( [ {
): closing parenthesis
    ) ] }
,: comma
    ,
--: dash
    --
.: sentence terminator
    . ! ?
:: colon or ellipsis
    : ; ...
CC: conjunction, coordinating
    & 'n and both but either et for less minus neither nor or plus so
    therefore times v. versus vs. whether yet
CD: numeral, cardinal
    mid-1890 nine-thirty forty-two one-tenth ten million 0.5 one forty-
    seven 1987 twenty '79 zero two 78-degrees eighty-four IX '60s .025
    fifteen 271,124 dozen quintillion DM2,000 ...
DT: determiner
    all an another any both del each either every half la many much nary
    neither no some such that the them these this those
EX: existential there
    there
FW: foreign word
    gemeinschaft hund ich jeux habeas Haementeria Herr K'ang-si vous
    lutihaw alai je jour objets salutaris fille quibusdam pas trop Monte
    terram fiche oui corporis ...
IN: preposition or

## Chunking

In [26]:
from nltk.chunk import RegexpParser

pos_tagging = pos_tag(word_tokenize)

print(pos_tagging)

[('Hello', 'NNP'), ('Mr.', 'NNP'), ('Watson', 'NNP'), (',', ','), ('how', 'WRB'), ('are', 'VBP'), ('you', 'PRP'), ('doing', 'VBG'), ('today', 'NN'), ('?', '.'), ('The', 'DT'), ('weather', 'NN'), ('is', 'VBZ'), ('awesome', 'JJ'), ('.', '.'), ('The', 'DT'), ('garden', 'NN'), ('is', 'VBZ'), ('Green', 'JJ'), ('.', '.'), ('We', 'PRP'), ('should', 'MD'), ('go', 'VB'), ('out', 'RP'), ('for', 'IN'), ('a', 'DT'), ('walk', 'NN'), ('.', '.')]


In [27]:
# Define sequence of chunk

sequence_chunk = """
chunk:
    {<NNP>+}
    {<NN>+}
    {<JJ>+} """

In [29]:
# Create an object with regular expression
chunk = RegexpParser(sequence_chunk)

# Apply chunking
chunk_result = chunk.parse(pos_tagging)
print(chunk_result)

(S
  (chunk Hello/NNP Mr./NNP Watson/NNP)
  ,/,
  how/WRB
  are/VBP
  you/PRP
  doing/VBG
  (chunk today/NN)
  ?/.
  The/DT
  (chunk weather/NN)
  is/VBZ
  (chunk awesome/JJ)
  ./.
  The/DT
  (chunk garden/NN)
  is/VBZ
  (chunk Green/JJ)
  ./.
  We/PRP
  should/MD
  go/VB
  out/RP
  for/IN
  a/DT
  (chunk walk/NN)
  ./.)


## Named Entity Recognition - NER

In [30]:
from nltk.chunk import ne_chunk

dataset = """Abraham Lincoln was an American stateman and lawyer
            who served as the 16th President of the United States."""

print(dataset)

Abraham Lincoln was an American stateman and lawyer
            who served as the 16th President of the United States.


In [32]:
# Tokenization and POS tagging
# dataset_tag = pos_tag(word_tokenize(dataset))

pos_tagging = pos_tag(word_tokenize)

print(pos_tagging)

[('Hello', 'NNP'), ('Mr.', 'NNP'), ('Watson', 'NNP'), (',', ','), ('how', 'WRB'), ('are', 'VBP'), ('you', 'PRP'), ('doing', 'VBG'), ('today', 'NN'), ('?', '.'), ('The', 'DT'), ('weather', 'NN'), ('is', 'VBZ'), ('awesome', 'JJ'), ('.', '.'), ('The', 'DT'), ('garden', 'NN'), ('is', 'VBZ'), ('Green', 'JJ'), ('.', '.'), ('We', 'PRP'), ('should', 'MD'), ('go', 'VB'), ('out', 'RP'), ('for', 'IN'), ('a', 'DT'), ('walk', 'NN'), ('.', '.')]


In [34]:
# Applying NER

dataset_ner = ne_chunk(pos_tagging)
print(dataset_ner)

(S
  (PERSON Hello/NNP)
  (PERSON Mr./NNP Watson/NNP)
  ,/,
  how/WRB
  are/VBP
  you/PRP
  doing/VBG
  today/NN
  ?/.
  The/DT
  weather/NN
  is/VBZ
  awesome/JJ
  ./.
  The/DT
  garden/NN
  is/VBZ
  (GPE Green/JJ)
  ./.
  We/PRP
  should/MD
  go/VB
  out/RP
  for/IN
  a/DT
  walk/NN
  ./.)


In [35]:
# Tree Diagram
dataset_ner.draw()

## Text Classification

In [36]:
import random
from nltk.corpus import movie_reviews

In [37]:
# Create a list of tuples
documents = []

In [38]:
for category in movie_reviews.categories():
    for fileid in movie_reviews.fileids(category):
        documents.append((list(movie_reviews.words(fileid)), category))

In [39]:
# Shuffle the documents
random.shuffle(documents)
print(documents[0])

(['a', 'follow', '-', 'up', 'to', 'disney', "'", 's', 'live', '-', 'action', '"', '101', 'dalmatians', '"', 'that', "'", 's', 'better', ',', 'more', 'entertaining', 'than', 'the', 'first', '?', 'just', 'as', 'unlikely', '.', 'with', '"', '102', 'dalmatians', ',', '"', 'the', 'disney', 'studios', 'have', 'proven', 'that', 'when', 'it', 'comes', 'to', 'going', 'to', 'the', 'dogs', ',', 'more', 'is', 'definitely', '*', 'not', '*', 'the', 'merrier', '.', '1996', "'", 's', '"', '101', 'dalmatians', '"', 'certainly', 'wasn', "'", 't', 'the', 'greatest', 'movie', '-', 'going', 'experience', 'of', 'all', 'time', ',', 'but', 'it', 'did', 'feature', 'glenn', 'close', 'in', 'an', 'outrageous', ',', 'larger', '-', 'than', '-', 'life', 'performance', '.', 'in', 'addition', ',', 'we', 'had', 'two', 'amiable', 'leads', 'in', 'the', 'form', 'of', 'jeff', 'daniels', 'and', 'joely', 'richardson', ',', 'and', 'lots', 'and', 'lots', 'of', 'adorable', 'spotted', 'puppies', '.', 'this', 'time', 'around', 't

In [40]:
# Normalize the dataset
all_words = []
for w in movie_reviews.words():
    all_words.append(w.lower())

In [42]:
# NLTK frequency distribution
all_words = nltk.FreqDist(all_words)
print(all_words.most_common(15))

[(',', 77717), ('the', 76529), ('.', 65876), ('a', 38106), ('and', 35576), ('of', 34123), ('to', 31937), ("'", 30585), ('is', 25195), ('in', 21822), ('s', 18513), ('"', 17612), ('it', 16107), ('that', 15924), ('-', 15595)]


In [43]:
print(all_words['love'])

1119


In [44]:
# Limit the words
word_features = list(all_words.keys())[:3000]

In [47]:
def find_features(document):
    words = set(document)
    features = {}
    for w in word_features:
        features[w] = (w in words)
    return features

print((find_features(movie_reviews.words('pos/cv000_29590.txt'))))



In [48]:
featuresets = [(find_features(rev), category) for (rev, category) in documents]

In [49]:
# Split the dataset into Train and Test set
train_set = featuresets[:1900]
test_set = featuresets[1900:]

In [50]:
# Training the Classifier
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [51]:
# Test Accuracy
print('Accuracy:', 
      (nltk.classify.accuracy(classifier, test_set))*100)

Accuracy: 83.0


In [52]:
classifier.show_most_informative_features(15)

Most Informative Features
              schumacher = True              neg : pos    =     11.2 : 1.0
                   sucks = True              neg : pos    =     10.4 : 1.0
                  annual = True              pos : neg    =      8.8 : 1.0
                 frances = True              pos : neg    =      8.8 : 1.0
             silverstone = True              neg : pos    =      7.8 : 1.0
                  shoddy = True              neg : pos    =      7.1 : 1.0
                 idiotic = True              neg : pos    =      7.1 : 1.0
                  suvari = True              neg : pos    =      7.1 : 1.0
                    mena = True              neg : pos    =      7.1 : 1.0
           unimaginative = True              neg : pos    =      7.1 : 1.0
                  regard = True              pos : neg    =      6.9 : 1.0
               atrocious = True              neg : pos    =      6.7 : 1.0
                  turkey = True              neg : pos    =      6.4 : 1.0