## Natural Language Processing 

In [2]:
pip install nltk

Note: you may need to restart the kernel to use updated packages.


In [3]:
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize

###  Tokenization

In [4]:
corpus = """Hello, I am Nikhil, This is NLP practical. 
There are many different techniques ! involved in this concept."""

In [5]:
documents = sent_tokenize(corpus)

In [6]:
for sentence in documents:
    print(sentence)

Hello, I am Nikhil, This is NLP practical.
There are many different techniques !
involved in this concept.


In [7]:
word_tokenize(corpus)

['Hello',
 ',',
 'I',
 'am',
 'Nikhil',
 ',',
 'This',
 'is',
 'NLP',
 'practical',
 '.',
 'There',
 'are',
 'many',
 'different',
 'techniques',
 '!',
 'involved',
 'in',
 'this',
 'concept',
 '.']

In [8]:
from nltk.tokenize import wordpunct_tokenize

In [9]:
wordpunct_tokenize(corpus)

['Hello',
 ',',
 'I',
 'am',
 'Nikhil',
 ',',
 'This',
 'is',
 'NLP',
 'practical',
 '.',
 'There',
 'are',
 'many',
 'different',
 'techniques',
 '!',
 'involved',
 'in',
 'this',
 'concept',
 '.']

In [10]:
from nltk.tokenize import TreebankWordTokenizer

In [11]:
tokenizer = TreebankWordTokenizer()
tokenizer.tokenize(corpus)

['Hello',
 ',',
 'I',
 'am',
 'Nikhil',
 ',',
 'This',
 'is',
 'NLP',
 'practical.',
 'There',
 'are',
 'many',
 'different',
 'techniques',
 '!',
 'involved',
 'in',
 'this',
 'concept',
 '.']

In [12]:
## Stemming and Its Types 
# Reduces a word to its word stem that affixes, suffixes and prefixes to the roots of the words known as lemma.
words = ['eats', "eaten", "eating", "wrote", "written", "writing", "dancer", "dancing", "woken","awake", "waking","programming","programs","finally","finalize"]

In [13]:
# PorterStemmer
from nltk.stem import PorterStemmer
ps = PorterStemmer()

In [14]:
# Major disadvantage: Changes meaning of the word 
for word in words:
    print(word +'---------->'+ ps.stem(word))

eats---------->eat
eaten---------->eaten
eating---------->eat
wrote---------->wrote
written---------->written
writing---------->write
dancer---------->dancer
dancing---------->danc
woken---------->woken
awake---------->awak
waking---------->wake
programming---------->program
programs---------->program
finally---------->final
finalize---------->final


In [15]:
# RegexpStemmer
from nltk.stem import RegexpStemmer
reg_stemmer = RegexpStemmer('ing$|s$|e$|able$|en$', min=4)

In [16]:
for word in words:
    print(word +'---------->'+ reg_stemmer.stem(word))

eats---------->eat
eaten---------->eat
eating---------->eat
wrote---------->wrot
written---------->writt
writing---------->writ
dancer---------->dancer
dancing---------->danc
woken---------->wok
awake---------->awak
waking---------->wak
programming---------->programm
programs---------->program
finally---------->finally
finalize---------->finaliz


In [17]:
## Snowball Stemmer
# Performs better than porter stemmer 
from nltk.stem import SnowballStemmer
snowball_stemmer = SnowballStemmer('english')

In [18]:
for word in words:
    print(word+'------>'+snowball_stemmer.stem(word))

eats------>eat
eaten------>eaten
eating------>eat
wrote------>wrote
written------>written
writing------>write
dancer------>dancer
dancing------>danc
woken------>woken
awake------>awak
waking------>wake
programming------>program
programs------>program
finally------>final
finalize------>final


In [19]:
## Lemmatization
# Overcomes the cons of stemmers 
# Wordnet Lemmatizer:output is known as lemma which is a root word rather than root stem

In [20]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

In [21]:
"""
POS- Noun -n,
Verb - v,
adjective - a,
adverb - r
"""
lemmatizer.lemmatize("writing", pos='v')

'write'

In [22]:
for word in words:
    print(word+'--->'+lemmatizer.lemmatize(word, pos='v'))

eats--->eat
eaten--->eat
eating--->eat
wrote--->write
written--->write
writing--->write
dancer--->dancer
dancing--->dance
woken--->wake
awake--->awake
waking--->wake
programming--->program
programs--->program
finally--->finally
finalize--->finalize


In [23]:
# Stopwords, partofspeech, named entity recognition
paragraph = """This is our hope. This is the faith that I go back to the South with. With this faith, we will be able to hew out of the mountain of despair a stone of hope. With this faith we will be able to transform the jangling discords of our nation into a beautiful symphony of brotherhood. With this faith we will be able to work together, to pray together, to struggle together, to go to jail together, to stand up for freedom together, knowing that we will be free one day.

This will be the day when all of God's children will be able to sing with new meaning: My country, 'tis of thee, sweet land of liberty, of thee I sing. Land where my fathers died, land of the pilgrims' pride, from every mountainside, let freedom ring.

And if America is to be a great nation, this must become true. And so let freedom ring from the prodigious hilltops of New Hampshire. Let freedom ring from the mighty mountains of New York. Let freedom ring from the heightening Alleghenies of Pennsylvania. Let freedom ring from the snowcapped Rockies of Colorado. Let freedom ring from the curvaceous slopes of California. But not only that, let freedom ring from Stone Mountain of Georgia. Let freedom ring from Lookout Mountain of Tennessee. Let freedom ring from every hill and molehill of Mississippi. From every mountainside, let freedom ring.

And when this happens, and when we allow freedom ring, when we let it ring from every village and every hamlet, from every state and every city, we will be able to speed up that day when all of God's children, Black men and white men, Jews and Gentiles, Protestants and Catholics, will be able to join hands and sing in the words of the old Negro spiritual: Free at last. Free at last. Thank God almighty, we are free at last."""

In [24]:
paragraph

"This is our hope. This is the faith that I go back to the South with. With this faith, we will be able to hew out of the mountain of despair a stone of hope. With this faith we will be able to transform the jangling discords of our nation into a beautiful symphony of brotherhood. With this faith we will be able to work together, to pray together, to struggle together, to go to jail together, to stand up for freedom together, knowing that we will be free one day.\n\nThis will be the day when all of God's children will be able to sing with new meaning: My country, 'tis of thee, sweet land of liberty, of thee I sing. Land where my fathers died, land of the pilgrims' pride, from every mountainside, let freedom ring.\n\nAnd if America is to be a great nation, this must become true. And so let freedom ring from the prodigious hilltops of New Hampshire. Let freedom ring from the mighty mountains of New York. Let freedom ring from the heightening Alleghenies of Pennsylvania. Let freedom ring 

In [25]:
from nltk.stem import PorterStemmer
import nltk
from nltk.corpus import stopwords 
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\nikhi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [26]:
stopwords.words('english')

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [27]:
stemmer = PorterStemmer()
sentences = nltk.sent_tokenize(paragraph)

In [28]:
snowball_stemmer = SnowballStemmer('english')

In [29]:
# Apply stopwords and filter and then apply stemming
for i in range(len(sentences)):
    words = nltk.word_tokenize(sentences[i])
    words = [snowball_stemmer.stem(word) for word in words if word not in set(stopwords.words('english'))]
    sentences[i] = " ".join(words)

In [30]:
nltk.download('averaged_perceptron_tagger')


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\nikhi\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [31]:
# Apply stopwords and filter and then apply lemmatizer
for i in range(len(sentences)):
    words = nltk.word_tokenize(sentences[i])
    words = [lemmatizer.lemmatize(word.lower(), pos='v') for word in words if word not in set(stopwords.words('english'))]
    sentences[i] = " ".join(words)
    # Pos_Tag
    pos_tag = nltk.pos_tag(words)
    print(pos_tag)

[('hope', 'NN'), ('.', '.')]
[('faith', 'NN'), ('go', 'VB'), ('back', 'RB'), ('south', 'RB'), ('.', '.')]
[('faith', 'NN'), (',', ','), ('abl', 'JJ'), ('hew', 'NN'), ('mountain', 'NN'), ('despair', 'NN'), ('stone', 'NN'), ('hope', 'NN'), ('.', '.')]
[('faith', 'NN'), ('abl', 'NN'), ('transform', 'NN'), ('jangl', 'NN'), ('discord', 'NN'), ('nation', 'NN'), ('beauti', 'NN'), ('symphoni', 'NN'), ('brotherhood', 'NN'), ('.', '.')]
[('faith', 'NN'), ('abl', 'NN'), ('work', 'NN'), ('togeth', 'NN'), (',', ','), ('pray', 'NN'), ('togeth', 'NN'), (',', ','), ('struggl', 'NN'), ('togeth', 'NN'), (',', ','), ('go', 'VB'), ('jail', 'NN'), ('togeth', 'NN'), (',', ','), ('stand', 'VBP'), ('freedom', 'NN'), ('togeth', 'NNS'), (',', ','), ('know', 'VBP'), ('free', 'JJ'), ('one', 'CD'), ('day', 'NN'), ('.', '.')]
[('day', 'NN'), ('god', 'NN'), ("'s", 'POS'), ('children', 'NNS'), ('abl', 'VBP'), ('sing', 'VBG'), ('new', 'JJ'), ('mean', 'NN'), (':', ':'), ('countri', 'NN'), (',', ','), ("'t", "''"), ('th

In [32]:
sentences

['hope .',
 'faith go back south .',
 'faith , abl hew mountain despair stone hope .',
 'faith abl transform jangl discord nation beauti symphoni brotherhood .',
 'faith abl work togeth , pray togeth , struggl togeth , go jail togeth , stand freedom togeth , know free one day .',
 "day god 's children abl sing new mean : countri , 't thee , sweet land liberti , thee sing .",
 "land father die , land pilgrim ' pride , everi mountainsid , let freedom ring .",
 'america great nation , must becom true .',
 'let freedom ring prodigi hilltop new hampshir .',
 'let freedom ring mighti mountain new york .',
 'let freedom ring heighten allegheni pennsylvania .',
 'let freedom ring snowcap rocki colorado .',
 'let freedom ring curvac slope california .',
 ', let freedom ring stone mountain georgia .',
 'let freedom ring lookout mountain tennesse .',
 'let freedom ring everi hill molehil mississippi .',
 'everi mountainsid , let freedom ring .',
 "happen , allow freedom ring , let ring everi vill

In [33]:
sentence = """Freedom is never dear at any price. It is the breath of life. What would a man not pay for living?"""

In [34]:
# named entity recognition
words=nltk.word_tokenize(sentence)

In [35]:
tag_elements = nltk.pos_tag(words)

In [36]:
nltk.download('maxent_ne_chunker')
nltk.download('words')

[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     C:\Users\nikhi\AppData\Roaming\nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\nikhi\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


True

In [37]:
nltk.ne_chunk(tag_elements).draw()

In [38]:
# Text Vectorization
# Bag of Words 
# Tf-Idf
# N-grams
# Bi-grams
import re
import numpy as np

In [39]:
# Bag of Words 
# step 1: Filter the stopwords and lowercase the words 
# step 2: Build a vocabulary with all the unique words
# step 3: Update count based on the frequency within the document 
dataset = nltk.sent_tokenize(paragraph)
for i in range(len(dataset)):
    dataset[i] = dataset[i].lower()
    dataset[i] = re.sub(r'\W', ' ', dataset[i])
    dataset[i] = re.sub(r'\s+', ' ', dataset[i])

In [40]:
dataset

['this is our hope ',
 'this is the faith that i go back to the south with ',
 'with this faith we will be able to hew out of the mountain of despair a stone of hope ',
 'with this faith we will be able to transform the jangling discords of our nation into a beautiful symphony of brotherhood ',
 'with this faith we will be able to work together to pray together to struggle together to go to jail together to stand up for freedom together knowing that we will be free one day ',
 'this will be the day when all of god s children will be able to sing with new meaning my country tis of thee sweet land of liberty of thee i sing ',
 'land where my fathers died land of the pilgrims pride from every mountainside let freedom ring ',
 'and if america is to be a great nation this must become true ',
 'and so let freedom ring from the prodigious hilltops of new hampshire ',
 'let freedom ring from the mighty mountains of new york ',
 'let freedom ring from the heightening alleghenies of pennsylvania

In [41]:
word2count = {}
for data in dataset:
    words = nltk.word_tokenize(data)
    for word in words:
        
        if word not in word2count.keys():
            word2count[word] = 1
        else:
            word2count[word] += 1             

In [42]:
len(word2count)

129

In [43]:
import heapq 
freq_words = heapq.nlargest(100, word2count, key=word2count.get)

In [44]:
X = [] 
for data in dataset: 
    vector = [] 
    for word in freq_words: 
        if word in nltk.word_tokenize(data): 
            vector.append(1) 
        else: 
            vector.append(0) 
    X.append(vector) 
X = np.asarray(X)

In [45]:
X

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 1, 1, ..., 0, 0, 0],
       [1, 1, 1, ..., 0, 0, 0],
       ...,
       [1, 1, 1, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [46]:
# N-Grams - are continuous sequences of words or symbols, or tokens in a document. In technical terms, they can be defined as the neighboring sequences of items in a document. They come into play when we deal with text data in NLP (Natural Language Processing) tasks. They have a wide range of applications, like language models, semantic features, spelling correction, machine translation, text mining, etc.
from nltk import ngrams


In [47]:
dataset = nltk.sent_tokenize(paragraph)
for i in range(len(dataset)):
    dataset[i] = dataset[i].lower()
    dataset[i] = re.sub(r'\W', ' ', dataset[i])
    dataset[i] = re.sub(r'\s+', ' ', dataset[i])

In [48]:
def ngram(text,grams):  
    model=[]
    # model will contain n-gram strings
    count=0
    for token in text[:len(text)-grams+1]:  
       model.append(text[count:count+grams])  
       count=count+1  
    return model

In [49]:
ngram(dataset[1].split(), grams=5)

    

[['this', 'is', 'the', 'faith', 'that'],
 ['is', 'the', 'faith', 'that', 'i'],
 ['the', 'faith', 'that', 'i', 'go'],
 ['faith', 'that', 'i', 'go', 'back'],
 ['that', 'i', 'go', 'back', 'to'],
 ['i', 'go', 'back', 'to', 'the'],
 ['go', 'back', 'to', 'the', 'south'],
 ['back', 'to', 'the', 'south', 'with']]

## Count Vectorizer

In [1]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()

In [51]:
bow = cv.fit_transform(dataset)

<20x126 sparse matrix of type '<class 'numpy.int64'>'
	with 265 stored elements in Compressed Sparse Row format>

In [52]:
print(cv.vocabulary_)

{'this': 109, 'is': 51, 'our': 81, 'hope': 47, 'the': 107, 'faith': 28, 'that': 106, 'go': 36, 'back': 9, 'to': 111, 'south': 95, 'with': 122, 'we': 117, 'will': 121, 'be': 10, 'able': 0, 'hew': 44, 'out': 82, 'of': 77, 'mountain': 68, 'despair': 24, 'stone': 100, 'transform': 113, 'jangling': 54, 'discords': 26, 'nation': 73, 'into': 50, 'beautiful': 11, 'symphony': 103, 'brotherhood': 14, 'work': 124, 'together': 112, 'pray': 85, 'struggle': 101, 'jail': 53, 'stand': 98, 'up': 115, 'for': 30, 'freedom': 32, 'knowing': 57, 'free': 31, 'one': 79, 'day': 23, 'when': 118, 'all': 1, 'god': 37, 'children': 18, 'sing': 91, 'new': 75, 'meaning': 63, 'my': 72, 'country': 21, 'tis': 110, 'thee': 108, 'sweet': 102, 'land': 58, 'liberty': 61, 'where': 119, 'fathers': 29, 'died': 25, 'pilgrims': 84, 'pride': 86, 'from': 33, 'every': 27, 'mountainside': 70, 'let': 60, 'ring': 89, 'and': 6, 'if': 48, 'america': 5, 'great': 38, 'must': 71, 'become': 12, 'true': 114, 'so': 94, 'prodigious': 87, 'hill

In [53]:
print(bow[0].toarray())

[[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
  0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]]
