## Natural Language Processing 

In [1]:
pip install nltk

Note: you may need to restart the kernel to use updated packages.


In [2]:
import nltk
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize

# sent_tokenize / word_tokenize load the Punkt sentence model when called.
# Fetch it up front so the notebook also runs on a machine that has no NLTK
# data yet (the call is a no-op when the package is already installed).
# NOTE(review): recent NLTK releases may look for 'punkt_tab' instead - confirm
# against the installed version.
nltk.download('punkt')

###  Tokenization

In [3]:
# Two-line sample text used by the tokenizer demos that follow.
corpus = (
    "Hello, I am Nikhil, This is NLP practical. \n"
    "There are many different techniques ! involved in this concept."
)

In [4]:
# Split the corpus into sentences with NLTK's sent_tokenize.
documents = sent_tokenize(corpus)

In [5]:
# Show every detected sentence on its own line.
for doc in documents:
    print(doc)

Hello, I am Nikhil, This is NLP practical.
There are many different techniques !
involved in this concept.


In [6]:
# Split the whole corpus into word and punctuation tokens.
word_tokenize(corpus)

['Hello',
 ',',
 'I',
 'am',
 'Nikhil',
 ',',
 'This',
 'is',
 'NLP',
 'practical',
 '.',
 'There',
 'are',
 'many',
 'different',
 'techniques',
 '!',
 'involved',
 'in',
 'this',
 'concept',
 '.']

In [7]:
from nltk.tokenize import wordpunct_tokenize

In [8]:
# Same corpus through wordpunct_tokenize; for this text the token list is the
# same as the word_tokenize result above.
wordpunct_tokenize(corpus)

['Hello',
 ',',
 'I',
 'am',
 'Nikhil',
 ',',
 'This',
 'is',
 'NLP',
 'practical',
 '.',
 'There',
 'are',
 'many',
 'different',
 'techniques',
 '!',
 'involved',
 'in',
 'this',
 'concept',
 '.']

In [9]:
from nltk.tokenize import TreebankWordTokenizer

In [10]:
# Treebank tokenizer applied to the whole multi-sentence corpus in one call.
# In the output the first sentence's period stays attached ('practical.');
# only the final period is split off as its own token.
tokenizer = TreebankWordTokenizer()
tokenizer.tokenize(corpus)

['Hello',
 ',',
 'I',
 'am',
 'Nikhil',
 ',',
 'This',
 'is',
 'NLP',
 'practical.',
 'There',
 'are',
 'many',
 'different',
 'techniques',
 '!',
 'involved',
 'in',
 'this',
 'concept',
 '.']

In [15]:
## Stemming and Its Types
# Stemming strips affixes (prefixes and suffixes) from a word to leave its stem.
# The stem need not be a real word; a dictionary root form is called a lemma
# and is produced by lemmatization instead (see the Lemmatization section).
words = ['eats', "eaten", "eating", "wrote", "written", "writing", "dancer", "dancing", "woken","awake", "waking","programming","programs","finally","finalize"]

In [16]:
# PorterStemmer: NLTK's implementation of the Porter stemming algorithm.
from nltk.stem import PorterStemmer
ps = PorterStemmer()  # used by the stemming loop in the next cell

In [17]:
# Print every sample word next to its Porter stem.
# Major drawback: stemming can change the meaning of a word, and the stem is
# not always a real word (see 'danc' and 'awak' in the output).
for word in words:
    stem = ps.stem(word)
    print(word, stem, sep='---------->')

eats---------->eat
eaten---------->eaten
eating---------->eat
wrote---------->wrote
written---------->written
writing---------->write
dancer---------->dancer
dancing---------->danc
woken---------->woken
awake---------->awak
waking---------->wake
programming---------->program
programs---------->program
finally---------->final
finalize---------->final


In [22]:
# RegexpStemmer: stems by removing whatever the given regular expression matches.
# This pattern targets the endings "ing", "s", "e", "able" and "en"
# ($ anchors each alternative to the end of the word).
# NOTE(review): min=4 is presumably the minimum word length that gets stemmed -
# confirm against the NLTK documentation.
from nltk.stem import RegexpStemmer
reg_stemmer = RegexpStemmer('ing$|s$|e$|able$|en$', min=4)

In [23]:
# Print every sample word next to the result of the regex-based stemmer.
for word in words:
    regex_stem = reg_stemmer.stem(word)
    print(word, regex_stem, sep='---------->')

eats---------->eat
eaten---------->eat
eating---------->eat
wrote---------->wrot
written---------->writt
writing---------->writ
dancer---------->dancer
dancing---------->danc
woken---------->wok
awake---------->awak
waking---------->wak
programming---------->programm
programs---------->program
finally---------->finally
finalize---------->finaliz


In [25]:
## Snowball Stemmer
# Generally described as performing better than the Porter stemmer; for the
# sample `words` list both produce the same stems (compare the two outputs).
from nltk.stem import SnowballStemmer
snowball_stemmer = SnowballStemmer('english')

In [27]:
# Print every sample word next to its Snowball stem.
for word in words:
    snowball_stem = snowball_stemmer.stem(word)
    print(word, snowball_stem, sep='------>')

eats------>eat
eaten------>eaten
eating------>eat
wrote------>wrote
written------>written
writing------>write
dancer------>dancer
dancing------>danc
woken------>woken
awake------>awak
waking------>wake
programming------>program
programs------>program
finally------>final
finalize------>final


In [None]:
## Lemmatization
# Addresses the main weakness of stemmers: a stem is often not a real word.
# WordNet Lemmatizer: its output is a lemma, i.e. a valid root word rather than a truncated stem.

In [28]:
import nltk
from nltk.stem import WordNetLemmatizer

# WordNetLemmatizer looks words up in the WordNet corpus, which no other cell
# in this notebook downloads. Fetch it here so lemmatize() does not fail with a
# LookupError on a machine without NLTK data (no-op when already installed).
# NOTE(review): some NLTK releases also require the 'omw-1.4' package - confirm
# against the installed version.
nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()

In [30]:
# The string literal below is only a reference note: it lists the
# part-of-speech codes that can be passed as `pos`.
"""
POS- Noun -n,
Verb - v,
adjective - a,
adverb - r
"""
# Lemmatize "writing" as a verb.
lemmatizer.lemmatize("writing", pos='v')

'write'

In [32]:
# Print every sample word next to its lemma, treating each word as a verb.
for word in words:
    lemma = lemmatizer.lemmatize(word, pos='v')
    print(word, lemma, sep='--->')

eats--->eat
eaten--->eat
eating--->eat
wrote--->write
written--->write
writing--->write
dancer--->dancer
dancing--->dance
woken--->wake
awake--->awake
waking--->wake
programming--->program
programs--->program
finally--->finally
finalize--->finalize


In [48]:
# Stopwords, part-of-speech tagging, named entity recognition
paragraph = """This is our hope. This is the faith that I go back to the South with. With this faith, we will be able to hew out of the mountain of despair a stone of hope. With this faith we will be able to transform the jangling discords of our nation into a beautiful symphony of brotherhood. With this faith we will be able to work together, to pray together, to struggle together, to go to jail together, to stand up for freedom together, knowing that we will be free one day.

This will be the day when all of God's children will be able to sing with new meaning: My country, 'tis of thee, sweet land of liberty, of thee I sing. Land where my fathers died, land of the pilgrims' pride, from every mountainside, let freedom ring.

And if America is to be a great nation, this must become true. And so let freedom ring from the prodigious hilltops of New Hampshire. Let freedom ring from the mighty mountains of New York. Let freedom ring from the heightening Alleghenies of Pennsylvania. Let freedom ring from the snowcapped Rockies of Colorado. Let freedom ring from the curvaceous slopes of California. But not only that, let freedom ring from Stone Mountain of Georgia. Let freedom ring from Lookout Mountain of Tennessee. Let freedom ring from every hill and molehill of Mississippi. From every mountainside, let freedom ring.

And when this happens, and when we allow freedom ring, when we let it ring from every village and every hamlet, from every state and every city, we will be able to speed up that day when all of God's children, Black men and white men, Jews and Gentiles, Protestants and Catholics, will be able to join hands and sing in the words of the old Negro spiritual: Free at last. Free at last. Thank God almighty, we are free at last."""

In [36]:
# Display the raw paragraph string (line breaks appear as \n in the repr).
paragraph

"This is our hope. This is the faith that I go back to the South with. With this faith, we will be able to hew out of the mountain of despair a stone of hope. With this faith we will be able to transform the jangling discords of our nation into a beautiful symphony of brotherhood. With this faith we will be able to work together, to pray together, to struggle together, to go to jail together, to stand up for freedom together, knowing that we will be free one day.\n\nThis will be the day when all of God's children will be able to sing with new meaning: My country, 'tis of thee, sweet land of liberty, of thee I sing. Land where my fathers died, land of the pilgrims' pride, from every mountainside, let freedom ring.\n\nAnd if America is to be a great nation, this must become true. And so let freedom ring from the prodigious hilltops of New Hampshire. Let freedom ring from the mighty mountains of New York. Let freedom ring from the heightening Alleghenies of Pennsylvania. Let freedom ring 

In [38]:
# Stopword support: bring in nltk itself, the stopwords corpus reader and the
# Porter stemmer, then make sure the stopword lists are available locally.
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\nikhi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [39]:
# List NLTK's English stopwords (the entries shown in the output are lower-case).
stopwords.words('english')

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [54]:
# NOTE(review): `stemmer` is not referenced by any later cell shown here; the
# stemming loop below uses `snowball_stemmer` instead.
stemmer = PorterStemmer()
# Split the speech excerpt into sentences for per-sentence processing.
sentences = nltk.sent_tokenize(paragraph)

In [None]:
# Re-create the Snowball stemmer (same construction as in the Snowball Stemmer
# cell earlier; relies on the SnowballStemmer import made there).
snowball_stemmer = SnowballStemmer('english')

In [44]:
# Remove stopwords from every sentence, then stem what is left.
#
# Changes from the previous version of this cell:
# - The stopword set is built once instead of once per token.
# - Tokens are lower-cased for the stopword check. NLTK's list is lower-case,
#   so capitalised stopwords such as "This" or "With" used to slip through.
# - Results go into `stemmed_sentences` instead of overwriting `sentences`
#   (and the sample `words` list), so the cell can be re-run safely and the
#   lemmatization cell below still receives the original sentences.
english_stopwords = set(stopwords.words('english'))
stemmed_sentences = []
for raw_sentence in sentences:
    tokens = nltk.word_tokenize(raw_sentence)
    stems = [
        snowball_stemmer.stem(token)
        for token in tokens
        if token.lower() not in english_stopwords
    ]
    stemmed_sentences.append(" ".join(stems))

In [62]:
# Fetch the averaged perceptron tagger data (needed by nltk.pos_tag, which the
# next cell calls).
nltk.download('averaged_perceptron_tagger')


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\nikhi\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [68]:
# Remove stopwords, lemmatize the remaining tokens as verbs, then POS-tag them.
#
# Changes from the previous version of this cell:
# - The stopword set is built once instead of once per token.
# - Tokens are lower-cased *before* the stopword check. NLTK's list is
#   lower-case, so "This", "With", "And", ... used to survive the first run
#   and were only removed when the cell was executed a second time.
english_stopwords = set(stopwords.words('english'))
for i in range(len(sentences)):
    tokens = [token.lower() for token in nltk.word_tokenize(sentences[i])]
    words = [
        lemmatizer.lemmatize(token, pos='v')
        for token in tokens
        if token not in english_stopwords
    ]
    # Keep the cleaned sentence; the next cell displays `sentences`.
    sentences[i] = " ".join(words)
    # Pos_Tag
    pos_tag = nltk.pos_tag(words)
    print(pos_tag)

[('hope', 'NN'), ('.', '.')]
[('faith', 'NN'), ('go', 'VB'), ('back', 'RB'), ('south', 'RB'), ('.', '.')]
[('faith', 'NN'), (',', ','), ('able', 'JJ'), ('hew', 'NN'), ('mountain', 'NN'), ('despair', 'NN'), ('stone', 'NN'), ('hope', 'NN'), ('.', '.')]
[('faith', 'NN'), ('able', 'JJ'), ('transform', 'NN'), ('jangle', 'NN'), ('discord', 'NN'), ('nation', 'NN'), ('beautiful', 'JJ'), ('symphony', 'NN'), ('brotherhood', 'NN'), ('.', '.')]
[('faith', 'NN'), ('able', 'JJ'), ('work', 'NN'), ('together', 'RB'), (',', ','), ('pray', 'NN'), ('together', 'RB'), (',', ','), ('struggle', 'NN'), ('together', 'RB'), (',', ','), ('go', 'VB'), ('jail', 'NN'), ('together', 'RB'), (',', ','), ('stand', 'VBP'), ('freedom', 'NN'), ('together', 'RB'), (',', ','), ('know', 'VBP'), ('free', 'JJ'), ('one', 'CD'), ('day', 'NN'), ('.', '.')]
[('day', 'NN'), ('god', 'NN'), ("'s", 'POS'), ('children', 'NNS'), ('able', 'JJ'), ('sing', 'VBG'), ('new', 'JJ'), ('mean', 'NN'), (':', ':'), ('country', 'NN'), (',', ','), (

In [69]:
# Show the sentences after stopword removal and lemmatization.
sentences

['hope .',
 'faith go back south .',
 'faith , able hew mountain despair stone hope .',
 'faith able transform jangle discord nation beautiful symphony brotherhood .',
 'faith able work together , pray together , struggle together , go jail together , stand freedom together , know free one day .',
 "day god 's children able sing new mean : country , 't thee , sweet land liberty , thee sing .",
 "land father die , land pilgrims ' pride , every mountainside , let freedom ring .",
 'america great nation , must become true .',
 'let freedom ring prodigious hilltops new hampshire .',
 'let freedom ring mighty mountains new york .',
 'let freedom ring heighten alleghenies pennsylvania .',
 'let freedom ring snowcapped rockies colorado .',
 'let freedom ring curvaceous slop california .',
 ', let freedom ring stone mountain georgia .',
 'let freedom ring lookout mountain tennessee .',
 'let freedom ring every hill molehill mississippi .',
 'every mountainside , let freedom ring .',
 "happen ,

In [74]:
# Short sample text used by the named-entity-recognition cells below.
sentence = """Freedom is never dear at any price. It is the breath of life. What would a man not pay for living?"""

In [75]:
# Named entity recognition, step 1: split the sample text into word tokens.
words=nltk.word_tokenize(sentence)

In [77]:
# Step 2: POS-tag the tokens; the tagged tokens are passed to ne_chunk below.
tag_elements = nltk.pos_tag(words)

In [78]:
# Download the NE chunker model and the `words` corpus.
# NOTE(review): presumably both are required by nltk.ne_chunk in the next cell - confirm.
nltk.download('maxent_ne_chunker')
nltk.download('words')

[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     C:\Users\nikhi\AppData\Roaming\nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\nikhi\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


True

In [None]:
# Step 3: chunk the tagged tokens into a named-entity tree and render it.
# NOTE(review): draw() presumably opens a separate GUI window, which would need
# a display - confirm before running on a headless/remote kernel.
nltk.ne_chunk(tag_elements).draw()