# **Tokenization**

In [2]:
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize,sent_tokenize
sentence="Hello! How are you doing today? I'm learning NLTK"

word_tokens=word_tokenize(sentence)
print("Word tokens:",word_tokens)

sent_tokens=sent_tokenize(sentence)
print("Sentence tokens:",sent_tokens)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Word tokens: ['Hello', '!', 'How', 'are', 'you', 'doing', 'today', '?', 'I', "'m", 'learning', 'NLTK']
Sentence tokens: ['Hello!', 'How are you doing today?', "I'm learning NLTK"]


#Wordnet


In [3]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [8]:
from nltk.corpus import wordnet
synonyms=wordnet.synsets('good')
for syn in synonyms:
  print(syn.name(),syn.definition())

print("Examples:",synonyms[1].examples())

good.n.01 benefit
good.n.02 moral excellence or admirableness
good.n.03 that which is pleasing or valuable or useful
commodity.n.01 articles of commerce
good.a.01 having desirable or positive qualities especially those suitable for a thing specified
full.s.06 having the normally expected amount
good.a.03 morally admirable
estimable.s.02 deserving of esteem and respect
beneficial.s.01 promoting or enhancing well-being
good.s.06 agreeable or pleasing
good.s.07 of moral excellence
adept.s.01 having or showing knowledge and skill and aptitude
good.s.09 thorough
dear.s.02 with or in a close or intimate relationship
dependable.s.04 financially sound
good.s.12 most suitable or right for a particular purpose
good.s.13 resulting favorably
effective.s.04 exerting force or influence
good.s.15 capable of pleasing
good.s.16 appealing to the mind
good.s.17 in excellent physical condition
good.s.18 tending to promote physical well-being; beneficial to health
good.s.19 not forged
good.s.20 not left to

# Stopwords

In [10]:
nltk.download('stopwords')
nltk.download('words')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.


True

In [11]:
from nltk.corpus import stopwords
stop_words=set(stopwords.words('english'))
text = "This is a sample sentence, showing off the stop words filtration."
word_tokens=word_tokenize(text)
filtered_words=[word for word in word_tokens if word not in stop_words]
print(filtered_words)

['This', 'sample', 'sentence', ',', 'showing', 'stop', 'words', 'filtration', '.']


# Stemming

In [12]:
from nltk.stem import PorterStemmer
ps=PorterStemmer()
words=["running","ran","runner","easily","fairly"]
stemmed_words=[ps.stem(word) for word in words]
print(stemmed_words)

['run', 'ran', 'runner', 'easili', 'fairli']


#Lemmatization

In [13]:
from nltk.stem import WordNetLemmatizer
lemmatizer=WordNetLemmatizer()
print(lemmatizer.lemmatize('running',pos='v'))
print(lemmatizer.lemmatize('better',pos='a'))

run
good


#Named Entity Recognition


In [14]:
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping chunkers/maxent_ne_chunker.zip.


True

In [16]:
from nltk import ne_chunk,pos_tag,word_tokenize
text="Barack Obama was the 44th President of the United States."
tokens=word_tokenize(text)
pos_tags=pos_tag(tokens)
entities=ne_chunk(pos_tags)
print(entities)

(S
  (PERSON Barack/NNP)
  (PERSON Obama/NNP)
  was/VBD
  the/DT
  44th/JJ
  President/NNP
  of/IN
  the/DT
  (GPE United/NNP States/NNPS)
  ./.)


#Parse Tree

In [20]:
from nltk import CFG
grammar=CFG.fromstring("""
  S -> NP VP
  NP -> Det N
  VP -> V PP
  PP -> P NP
  Det -> 'the'
  N -> 'cat'|'mat'
  V -> 'sat'
  P -> 'on'
  """)
parser=nltk.ChartParser(grammar)
sentence="the cat sat on the mat".split()
for tree in parser.parse(sentence):
  tree.pretty_print()

             S                     
      _______|_______               
     |               VP            
     |        _______|___           
     |       |           PP        
     |       |    _______|___       
     NP      |   |           NP    
  ___|___    |   |        ___|___   
Det      N   V   P      Det      N 
 |       |   |   |       |       |  
the     cat sat  on     the     mat

