## 1. Tokenization

In [1]:
! pip install -q nltk


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.1[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [2]:
import nltk

In [3]:
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize

In [4]:
# Sample paragraph used by the sentence- and word-tokenization cells below.
dataset = (
    "Hello! welcome to this visual meeting now. "
    "I hope you are glad. "
    "Today is another day to learn about Natural Language Processing. "
    "I hope you are excite as I am."
)

In [5]:
# Optional: uncomment the next line to tokenize text typed in by the user
# instead of the hard-coded sample paragraph.
# dataset = input("Enter your review: ")

In [6]:
# Sentence tokenization: split the sample paragraph into sentences and print
# the resulting list.
print(sent_tokenize(dataset, language="english"))

['Hello!', 'welcome to this visual meeting now.', 'I hope you are glad.', 'Today is another day to learn about Natural Language Processing.', 'I hope you are excite as I am.']


In [7]:
# Keep the sentence list in a variable so the next cell can iterate over it;
# the bare name on the last line makes the notebook display it.
sent_token = sent_tokenize(dataset, language="english")
sent_token

['Hello!',
 'welcome to this visual meeting now.',
 'I hope you are glad.',
 'Today is another day to learn about Natural Language Processing.',
 'I hope you are excite as I am.']

In [8]:
# Print each sentence on its own line.
for sentence in sent_token:
    print(sentence)

Hello!
welcome to this visual meeting now.
I hope you are glad.
Today is another day to learn about Natural Language Processing.
I hope you are excite as I am.


In [9]:
# Word tokenization: split the paragraph into word and punctuation tokens.
word_token = word_tokenize(dataset, language="english")

In [10]:
# Display the token list produced by the previous cell.
word_token

['Hello',
 '!',
 'welcome',
 'to',
 'this',
 'visual',
 'meeting',
 'now',
 '.',
 'I',
 'hope',
 'you',
 'are',
 'glad',
 '.',
 'Today',
 'is',
 'another',
 'day',
 'to',
 'learn',
 'about',
 'Natural',
 'Language',
 'Processing',
 '.',
 'I',
 'hope',
 'you',
 'are',
 'excite',
 'as',
 'I',
 'am',
 '.']

In [11]:
# Print all tokens on a single line, each followed by a space.
for token in word_token:
    print(token, end=" ")

Hello ! welcome to this visual meeting now . I hope you are glad . Today is another day to learn about Natural Language Processing . I hope you are excite as I am . 

## 2. Stemming

In [12]:
import nltk
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

In [13]:
# Inflected forms of "love" used to demonstrate the stemmer.
dataset = [
    "love",
    "loving",
    "lover",
    "lovingly",
    "loved",
]

In [14]:
# Apply Stemming: create the Porter stemmer instance reused by the cells below.
ps = PorterStemmer()

In [15]:
# Print the stem of every word in the list, one per line.
for word in dataset:
    print(ps.stem(word))

love
love
lover
lovingli
love


In [16]:
# Short multi-line passage used as input for the stemming demo. The line
# breaks and trailing spaces are spelled out explicitly.
new_data = (
    "It feels special to learn how to program in python.\n"
    "Programming is the most relevant and best \n"
    "career choice anyone can make from this century onwards. \n"
    "Go Be Happy!\n"
)

In [17]:
# Tokenize the passage and print the token list; print is told to end the
# output with a space instead of a newline.
words = word_tokenize(new_data)
print(words, end=" ")

['It', 'feels', 'special', 'to', 'learn', 'how', 'to', 'program', 'in', 'python', '.', 'Programming', 'is', 'the', 'most', 'relevant', 'and', 'best', 'career', 'choice', 'anyone', 'can', 'make', 'from', 'this', 'century', 'onwards', '.', 'Go', 'Be', 'Happy', '!'] 

In [18]:
# Stem every token of the passage and print one stem per line.
for token in words:
    print(ps.stem(token))

it
feel
special
to
learn
how
to
program
in
python
.
program
is
the
most
relev
and
best
career
choic
anyon
can
make
from
thi
centuri
onward
.
go
be
happi
!


## 3. Lemmatization

In [19]:
import nltk
from nltk.stem import WordNetLemmatizer

In [20]:
# Lemmatizer instance shared by the cells below.
wnl = WordNetLemmatizer()

In [21]:
# No part of speech is passed, so the lemmatizer's default is used.
# The recorded result for this "-es" plural is 'church'.
wnl.lemmatize("churches")

'church'

In [22]:
# Regular "-s" plural; the recorded result is 'dog'.
wnl.lemmatize("dogs")

'dog'

In [23]:
# Irregular plural; the recorded result is 'foot'.
wnl.lemmatize("feet")

'foot'

In [24]:
# pos="a" asks for the adjective reading of the word; the recorded result is
# 'good'.
wnl.lemmatize("better", pos="a")

'good'

In [25]:
# Same sample passage as in the stemming section. No other visible cell in
# the lemmatization section reads it.
new_data = (
    "It feels special to learn how to program in python.\n"
    "Programming is the most relevant and best \n"
    "career choice anyone can make from this century onwards. \n"
    "Go Be Happy!\n"
)

In [26]:
# Lemmatize "teeth" without passing a part of speech and print the result.
# NOTE(review): the recorded output is "teeth", not "tooth" (compare
# "feet" -> "foot" above). Presumably WordNet lists "teeth" as an entry of
# its own, so it is returned unchanged — confirm against the WordNet data.
noun_lemma = wnl.lemmatize("teeth")
print(noun_lemma)

teeth


## 4. Stop Words

In [27]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [28]:
# Load NLTK's English stop-word list into a set (used for membership tests in
# the filtering cell below) and print it. The entries visible in the recorded
# output are all lower-case.
stop_words = set(stopwords.words("english"))
print(stop_words)

{'once', 'during', 'after', 'before', "mightn't", 'that', 'was', 'y', 'doing', 'himself', 'ma', 'any', 'have', 'who', 'by', 'not', 'she', "don't", 'your', 'my', 'too', 'you', 'they', 'how', 'his', 'shouldn', 'we', 'the', 'mustn', 'both', 'which', 'are', 'd', "shouldn't", 'being', 'just', "you're", 'again', 'our', "didn't", "won't", 're', "doesn't", 'won', 'below', 'been', 'theirs', 'them', 'had', 'couldn', 'm', 'did', 'me', 'these', 'out', 's', 'he', 'him', 'up', "wasn't", 'be', 'so', 'but', 'were', 'on', "mustn't", 'such', "she's", 'those', 'ourselves', 'why', 'down', 'further', 'when', 'having', 'wasn', 'weren', 'yourselves', 'than', 've', 'does', 'what', "you'd", 'ours', 'above', 'with', 'yourself', 'no', 'itself', 'where', 'all', 'a', 'off', 'don', "weren't", 'their', 'should', 'here', 'other', 'didn', "shan't", 'into', 'while', 'over', 'of', 'to', 'hers', 'only', "couldn't", 'because', 'isn', 'few', 'is', 'own', 'aren', 'more', "haven't", 'it', 'i', "hasn't", "aren't", 'hadn', 'th

In [29]:
# Sample passage from which the stop words are removed below. The line breaks
# and trailing spaces are spelled out explicitly.
new_data = (
    "It feels special to learn how to program in python.\n"
    "Programming is the most relevant and best \n"
    "career choice anyone can make from this century onwards. \n"
    "Go Be Happy!\n"
)

In [30]:
# Tokenize the passage. The result gets its own name: assigning it to
# `word_tokenize` would replace the imported function with a list, so
# re-running this cell would fail with "'list' object is not callable".
tokens = word_tokenize(new_data)
print(tokens)

# Remove stop words. The stop-word list is lower-case, so each token is
# lower-cased for the comparison only; otherwise capitalised stop words such
# as "It" and "Be" slip through. Kept tokens retain their original casing.
filtered_sentences = [token for token in tokens if token.lower() not in stop_words]

print(filtered_sentences)


## 5. POS Tagging

In [32]:
# importing the libraries
import nltk

from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag

In [33]:
# Two-line sample text for the POS-tagging demo (note the leading space).
dataset = (
    " Taj Mahal is one of the world's most iconic and celebrated structures.\n"
    "It is a stunning symbol of rich Indian history\n"
)

In [34]:
# Split the sample text into tokens and print them; the list is the input for
# pos_tag in the next cell.
data = word_tokenize(dataset)
print(data)

['Taj', 'Mahal', 'is', 'one', 'of', 'the', 'world', "'s", 'most', 'iconic', 'and', 'celebrated', 'structures', '.', 'It', 'is', 'a', 'stunning', 'symbol', 'of', 'rich', 'Indian', 'history']


In [35]:
# Tag every token with its part of speech; the notebook displays the
# resulting list of (token, tag) pairs.
pos_tag(data)

[('Taj', 'NNP'),
 ('Mahal', 'NNP'),
 ('is', 'VBZ'),
 ('one', 'CD'),
 ('of', 'IN'),
 ('the', 'DT'),
 ('world', 'NN'),
 ("'s", 'POS'),
 ('most', 'RBS'),
 ('iconic', 'JJ'),
 ('and', 'CC'),
 ('celebrated', 'JJ'),
 ('structures', 'NNS'),
 ('.', '.'),
 ('It', 'PRP'),
 ('is', 'VBZ'),
 ('a', 'DT'),
 ('stunning', 'JJ'),
 ('symbol', 'NN'),
 ('of', 'IN'),
 ('rich', 'JJ'),
 ('Indian', 'JJ'),
 ('history', 'NN')]

In [36]:
# Print the reference that explains the tag codes seen in the pos_tag output
# above (NNP, VBZ, JJ, ...).
# NOTE(review): called without an argument this appears to list every tag,
# which produces a very long output — consider passing a tag pattern to limit
# it; confirm the parameter against the NLTK documentation.
nltk.help.upenn_tagset()

$: dollar
    $ -$ --$ A$ C$ HK$ M$ NZ$ S$ U.S.$ US$
'': closing quotation mark
    ' ''
(: opening parenthesis
    ( [ {
): closing parenthesis
    ) ] }
,: comma
    ,
--: dash
    --
.: sentence terminator
    . ! ?
:: colon or ellipsis
    : ; ...
CC: conjunction, coordinating
    & 'n and both but either et for less minus neither nor or plus so
    therefore times v. versus vs. whether yet
CD: numeral, cardinal
    mid-1890 nine-thirty forty-two one-tenth ten million 0.5 one forty-
    seven 1987 twenty '79 zero two 78-degrees eighty-four IX '60s .025
    fifteen 271,124 dozen quintillion DM2,000 ...
DT: determiner
    all an another any both del each either every half la many much nary
    neither no some such that the them these this those
EX: existential there
    there
FW: foreign word
    gemeinschaft hund ich jeux habeas Haementeria Herr K'ang-si vous
    lutihaw alai je jour objets salutaris fille quibusdam pas trop Monte
    terram fiche oui corporis ...
IN: preposition or

## 6. Chunking

In [37]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from nltk.chunk import RegexpParser

In [38]:
# Sample text for the chunking demo (identical to the POS-tagging example,
# including the leading space).
dataset = (
    " Taj Mahal is one of the world's most iconic and celebrated structures.\n"
    "It is a stunning symbol of rich Indian history\n"
)

In [43]:
# Tokenize the sample text; the token list feeds pos_tag in the next cell.
data = word_tokenize(dataset)

# Print the tokens, ending the output with a space instead of a newline.
print(data, end=" ")

['Taj', 'Mahal', 'is', 'one', 'of', 'the', 'world', "'s", 'most', 'iconic', 'and', 'celebrated', 'structures', '.', 'It', 'is', 'a', 'stunning', 'symbol', 'of', 'rich', 'Indian', 'history'] 

In [40]:
# Tag the tokens and keep the (token, tag) pairs for the chunk parser below;
# the bare name on the last line makes the notebook display them.
pos_tagging = pos_tag(data)
pos_tagging

[('Taj', 'NNP'),
 ('Mahal', 'NNP'),
 ('is', 'VBZ'),
 ('one', 'CD'),
 ('of', 'IN'),
 ('the', 'DT'),
 ('world', 'NN'),
 ("'s", 'POS'),
 ('most', 'RBS'),
 ('iconic', 'JJ'),
 ('and', 'CC'),
 ('celebrated', 'JJ'),
 ('structures', 'NNS'),
 ('.', '.'),
 ('It', 'PRP'),
 ('is', 'VBZ'),
 ('a', 'DT'),
 ('stunning', 'JJ'),
 ('symbol', 'NN'),
 ('of', 'IN'),
 ('rich', 'JJ'),
 ('Indian', 'JJ'),
 ('history', 'NN')]

In [48]:
# Chunk grammar passed to RegexpParser: one rule set labelled "chunk" with
# four tag patterns, each matching one or more consecutive tokens that carry
# the same tag (NNPS, NNP, NN or JJ).
sequence_chunk = """
chunk:
    {<NNPS>+}
    {<NNP>+}
    {<NN>+}
    {<JJ>+}

"""

In [49]:
# Build the chunk parser from the grammar string defined above.
chunk = RegexpParser(sequence_chunk)

In [52]:
# Run the parser over the POS-tagged tokens and print the resulting tree.
chunk_results = chunk.parse(pos_tagging)
print(chunk_results)

(S
  (chunk Taj/NNP Mahal/NNP)
  is/VBZ
  one/CD
  of/IN
  the/DT
  (chunk world/NN)
  's/POS
  most/RBS
  (chunk iconic/JJ)
  and/CC
  (chunk celebrated/JJ)
  structures/NNS
  ./.
  It/PRP
  is/VBZ
  a/DT
  (chunk stunning/JJ)
  (chunk symbol/NN)
  of/IN
  (chunk rich/JJ Indian/JJ)
  (chunk history/NN)) 

## 7. Named Entity Recognition - NER

In [53]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.tag import pos_tag
from nltk.chunk import ne_chunk

In [54]:
# Sample texts for the named-entity recognition demo.
# `dataset` is the text processed by the cells below.

dataset = """Abraham Lincoln was an American statesman and a lawyer.
He served as the 16th president of the United States. 
"""

# NOTE(review): `dataset2` is defined here but not used by any cell visible in
# this notebook.
dataset2 = """Chief Bola Ahmed Adekunle Tinubu GCFR 
(born 29 March 1952) is a Nigerian politician who is the 16th 
and current president of Nigeria. He was the governor of 
Lagos State from 1999 to 2007, and senator for Lagos West 
in the Third Republic.  
"""

In [55]:
# Tokenize the Lincoln text and tag each token with its part of speech; the
# tagged list is the input for ne_chunk below. The bare name displays it.
tag = pos_tag(word_tokenize(dataset))
tag

[('Abraham', 'NNP'),
 ('Lincoln', 'NNP'),
 ('was', 'VBD'),
 ('an', 'DT'),
 ('American', 'JJ'),
 ('statesman', 'NN'),
 ('and', 'CC'),
 ('a', 'DT'),
 ('lawyer', 'NN'),
 ('.', '.'),
 ('He', 'PRP'),
 ('served', 'VBD'),
 ('as', 'IN'),
 ('the', 'DT'),
 ('16th', 'CD'),
 ('president', 'NN'),
 ('of', 'IN'),
 ('the', 'DT'),
 ('United', 'NNP'),
 ('States', 'NNPS'),
 ('.', '.')]

In [56]:
# Apply ne_chunk to the POS-tagged tokens to build the named-entity tree.

dataset_ner = ne_chunk(tag)

In [58]:
# Print the tree in bracketed text form; recognised entities appear as
# labelled subtrees such as PERSON and GPE in the recorded output.
print(dataset_ner)

(S
  (PERSON Abraham/NNP)
  (PERSON Lincoln/NNP)
  was/VBD
  an/DT
  (GPE American/JJ)
  statesman/NN
  and/CC
  a/DT
  lawyer/NN
  ./.
  He/PRP
  served/VBD
  as/IN
  the/DT
  16th/CD
  president/NN
  of/IN
  the/DT
  (GPE United/NNP States/NNPS)
  ./.)


In [59]:
# Render the named-entity tree graphically.
# NOTE(review): draw() presumably opens a separate GUI window and blocks until
# it is closed, so it may fail on a headless kernel — confirm before running
# this notebook non-interactively.
dataset_ner.draw()