In [1]:
# Fetch the NLTK resources used by the tokenization and lemmatization cells below.
import nltk
nltk.download('punkt_tab') # tokenizer data for sentence/word tokenization
nltk.download('wordnet') # WordNet data for lemmatization


[nltk_data] Downloading package punkt_tab to C:\Users\Vanshika
[nltk_data]     Garg\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Vanshika
[nltk_data]     Garg\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
#TOKENIZATION : breaking text into smaller units (sentences or words)
#SENTENCE TOKENIZATION
#paragraph-->sentences

In [3]:
from nltk.tokenize import sent_tokenize

# Small demo text; `corpus` is reused by the word-tokenizer cells that follow.
corpus = "I love animals. Nature is beautiful! Let's protect it."
# Split the text into a list of sentence strings.
sentences = sent_tokenize(corpus)
print(sentences)


['I love animals.', 'Nature is beautiful!', "Let's protect it."]


In [4]:
#WORD TOKENIZATION
#paragraph/sentence-->words

In [5]:
from nltk.tokenize import word_tokenize

# Word-level tokens; punctuation is split off and "Let's" becomes "Let" + "'s".
word_tokenize(corpus)

['I',
 'love',
 'animals',
 '.',
 'Nature',
 'is',
 'beautiful',
 '!',
 'Let',
 "'s",
 'protect',
 'it',
 '.']

In [6]:
from nltk.tokenize import wordpunct_tokenize

# Punctuation always becomes its own token, so "Let's" is split into "Let", "'" and "s".
wordpunct_tokenize(corpus)

['I',
 'love',
 'animals',
 '.',
 'Nature',
 'is',
 'beautiful',
 '!',
 'Let',
 "'",
 's',
 'protect',
 'it',
 '.']

In [7]:
from nltk.tokenize import TreebankWordTokenizer
tokenizer=TreebankWordTokenizer()
tokenizer.tokenize(corpus)

# Only the final "." is split off as its own token; a "." in the middle of the
# text stays attached to the preceding word (see 'animals.' in the output).

['I',
 'love',
 'animals.',
 'Nature',
 'is',
 'beautiful',
 '!',
 'Let',
 "'s",
 'protect',
 'it',
 '.']

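**STEMMING**
Reducing words to their root/base form by stripping suffixes. The resulting stem need not be a real word (e.g. "history" becomes "histori").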


**PORTER STEMMER**

In [8]:
from nltk.stem import PorterStemmer

# Sample words: several inflected forms per root, reused by the later stemmer/lemmatizer cells.
words=["playing","played","player","plays","history","historical","eat","eaten","eating"]

stemming=PorterStemmer()

# Print every word next to its Porter stem.
for word in words:
    print(f"{word}---->{stemming.stem(word)}")

# Disadvantage: many words are not reduced to a proper root
# (e.g. "history" -> "histori", and "eaten" is left unchanged).

playing---->play
played---->play
player---->player
plays---->play
history---->histori
historical---->histor
eat---->eat
eaten---->eaten
eating---->eat


**REGEXP STEMMER**: The RegexpStemmer lets you define your own regex rules to chop off word endings (suffixes).


In [9]:
from nltk.stem import RegexpStemmer

# Strip a trailing "ing", "s" or "en"; "$" anchors each alternative at the end of the word.
# The pattern must not end with "|": that would add an empty alternative that strips nothing.
# min=4: words shorter than 4 characters are returned unchanged.
reg_stemmer=RegexpStemmer('ing$|s$|en$',min=4)
reg_stemmer.stem('eating')

'eat'

**SNOWBALL STEMMER** 

In [10]:
from nltk.stem import SnowballStemmer

snow_stemmer=SnowballStemmer('english')

# Run the Snowball stemmer over the same sample words used for the Porter stemmer.
for word in words:
    print(f"{word}---->{snow_stemmer.stem(word)}")

# Last expression of the cell, so its value is shown as the cell output.
snow_stemmer.stem('fairly')

playing---->play
played---->play
player---->player
plays---->play
history---->histori
historical---->histor
eat---->eat
eaten---->eaten
eating---->eat


'fair'

**LEMMATIZATION**: reducing a word to its base (dictionary) form, which is always a real word


In [11]:
from nltk.stem import WordNetLemmatizer

lemmatizer=WordNetLemmatizer()

# POS codes accepted by lemmatize():
#   n - noun
#   v - verb
#   a - adjective
#   r - adverb

# Single-word example; its value is not displayed because it is not the cell's last expression.
lemmatizer.lemmatize('going',pos='v')

# Lemmatize every sample word as a verb.
for word in words:
    print(f"{word}---->{lemmatizer.lemmatize(word,pos='v')}")
# Use cases noted by the author: Q&A systems, chatbots


playing---->play
played---->play
player---->player
plays---->play
history---->history
historical---->historical
eat---->eat
eaten---->eat
eating---->eat


**STOPWORDS**: words that carry little meaning in a text and are generally filtered out before text processing


In [12]:
from nltk.corpus import stopwords

In [13]:
import nltk
# Fetch the stop-word lists that `stopwords.words(...)` reads in the next cell.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to C:\Users\Vanshika
[nltk_data]     Garg\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [14]:
# Display NLTK's English stop-word list (the entries shown are all lowercase).
stopwords.words('english')

['a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 "he'd",
 "he'll",
 'her',
 'here',
 'hers',
 'herself',
 "he's",
 'him',
 'himself',
 'his',
 'how',
 'i',
 "i'd",
 'if',
 "i'll",
 "i'm",
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it'd",
 "it'll",
 "it's",
 'its',
 'itself',
 "i've",
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'on

In [15]:
# Multi-sentence sample text used by the preprocessing and POS-tagging cells below.
paragraph="""Good morning everyone. Today, I want to talk about the importance of innovation in education. 
We believe that by embracing new technologies and creative thinking, we can transform the way students learn and grow. 
It's not just about books anymore; it's about experiences, interactions, and real-world problem-solving."""


In [16]:
# Re-bind `sentences` to the list of sentences in `paragraph` (replaces the earlier demo value).
sentences=nltk.sent_tokenize(paragraph)

In [17]:
# Sanity check: the raw text is a single string.
type(paragraph)

str

In [18]:
# Sanity check: sentence tokenization produced a list.
type(sentences)

list

In [19]:
# Show the sentences before any stop-word removal or stemming.
print(sentences)

['Good morning everyone.', 'Today, I want to talk about the importance of innovation in education.', 'We believe that by embracing new technologies and creative thinking, we can transform the way students learn and grow.', "It's not just about books anymore; it's about experiences, interactions, and real-world problem-solving."]


In [20]:
from nltk.stem import SnowballStemmer
# English Snowball stemmer used by the stop-word-removal + stemming loop in the next cell.
stemmer=SnowballStemmer('english')

In [21]:
# Build the stop-word set once up front rather than once per token.
english_stopwords=set(stopwords.words('english'))

# For every sentence: tokenize, drop stop words, stem what is left, and re-join with spaces.
# NOTE: the stop-word test is case-sensitive and the stop-word list is lowercase,
#       so capitalised tokens such as "I", "We" and "It" are kept (and then lowercased by the stemmer).
# NOTE: `sentences` is overwritten in place, so re-running this cell stems already-stemmed text.
for i,sentence in enumerate(sentences):
    words=nltk.word_tokenize(sentence)
    words=[stemmer.stem(word) for word in words if word not in english_stopwords]
    sentences[i]=' '.join(words)

In [22]:
# Inspect the sentences after stop-word removal and stemming.
sentences

['good morn everyon .',
 'today , i want talk import innov educ .',
 'we believ embrac new technolog creativ think , transform way student learn grow .',
 "it 's book anymor ; 's experi , interact , real-world problem-solv ."]

In [23]:
from nltk.stem import WordNetLemmatizer
lemmitizer=WordNetLemmatizer()

# Build the stop-word set once up front rather than once per token.
english_stopwords=set(stopwords.words('english'))

# Second pass over `sentences` as they currently stand: tokenize, drop stop words,
# lemmatize each remaining token as a verb, and re-join with spaces.
# NOTE(review): if the stemming cell ran first, the input here is already stemmed
#               (e.g. "morn", "everyon"), so lemmatization cannot restore real words — confirm this is intended.
for i,sentence in enumerate(sentences):
    words=nltk.word_tokenize(sentence)
    words=[lemmitizer.lemmatize(word,pos='v') for word in words if word not in english_stopwords]
    sentences[i]=' '.join(words)


In [24]:
# Inspect the sentences after the lemmatization pass.
sentences

['good morn everyon .',
 'today , want talk import innov educ .',
 'believ embrac new technolog creativ think , transform way student learn grow .',
 "'s book anymor ; 's experi , interact , real-world problem-solv ."]

**PARTS OF SPEECH TAGS**

In [25]:
from nltk.corpus import stopwords
import nltk
nltk.download('averaged_perceptron_tagger_eng')

# Build the stop-word set once up front rather than once per token.
english_stopwords=set(stopwords.words('english'))

# Start again from the raw paragraph so tagging sees unstemmed words.
sentences=nltk.sent_tokenize(paragraph)

# For every sentence: tokenize, drop stop words, lemmatize as verbs, then POS-tag and print.
# `lemmitizer` is the WordNetLemmatizer created in the earlier lemmatization cell.
# NOTE(review): the tagger receives the filtered/lemmatized tokens, not the original sentence;
#               this likely hurts tag quality (e.g. "embrace" is tagged JJ in the recorded output).
for sentence in sentences:
    words=nltk.word_tokenize(sentence)
    words=[lemmitizer.lemmatize(word,pos='v') for word in words if word not in english_stopwords]
    pos_tag=nltk.pos_tag(words)
    print(pos_tag)

[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\Vanshika Garg\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


[('Good', 'JJ'), ('morning', 'NN'), ('everyone', 'NN'), ('.', '.')]
[('Today', 'NN'), (',', ','), ('I', 'PRP'), ('want', 'VBP'), ('talk', 'NN'), ('importance', 'NN'), ('innovation', 'NN'), ('education', 'NN'), ('.', '.')]
[('We', 'PRP'), ('believe', 'VBP'), ('embrace', 'JJ'), ('new', 'JJ'), ('technologies', 'NNS'), ('creative', 'JJ'), ('think', 'NN'), (',', ','), ('transform', 'VB'), ('way', 'NN'), ('students', 'NNS'), ('learn', 'VBP'), ('grow', 'NN'), ('.', '.')]
[('It', 'PRP'), ("'s", 'VBZ'), ('book', 'NN'), ('anymore', 'RB'), (';', ':'), ("'s", 'POS'), ('experience', 'NN'), (',', ','), ('interactions', 'NNS'), (',', ','), ('real-world', 'JJ'), ('problem-solving', 'NN'), ('.', '.')]


In [26]:
# `sentences` holds the raw sentences again: the tagging cell only printed its results and did not write back.
sentences

['Good morning everyone.',
 'Today, I want to talk about the importance of innovation in education.',
 'We believe that by embracing new technologies and creative thinking, we can transform the way students learn and grow.',
 "It's not just about books anymore; it's about experiences, interactions, and real-world problem-solving."]

In [27]:
# Plain whitespace split: a quick way to get tokens when the text has no punctuation.
"Taj Mahal is a beautiful monument".split()

['Taj', 'Mahal', 'is', 'a', 'beautiful', 'monument']

In [28]:
# POS-tag the whitespace-split tokens; the result is a list of (token, tag) pairs.
print(nltk.pos_tag("Taj Mahal is a beautiful monument".split()))

[('Taj', 'NNP'), ('Mahal', 'NNP'), ('is', 'VBZ'), ('a', 'DT'), ('beautiful', 'JJ'), ('monument', 'NN')]


**NAMED ENTITY RECOGNITION**

In [29]:
 # Named-entity chunking demo: tokenize -> POS-tag -> ne_chunk.
import nltk
nltk.download('maxent_ne_chunker_tab')
nltk.download('words')

# NOTE: this rebinds `sentences` from a list of sentences to a single string.
sentences="Taj Mahal is a beautiful monument built in 1913 by shehnshah"
words=nltk.word_tokenize(sentences)
tag_elements=nltk.pos_tag(words)

# ne_chunk() takes the POS-tagged tokens; draw() renders the resulting tree.
# NOTE(review): draw() presumably opens a separate GUI window rather than inline output,
#               and may block the kernel until it is closed — verify before running headless.
nltk.ne_chunk(tag_elements).draw()

[nltk_data] Downloading package maxent_ne_chunker_tab to
[nltk_data]     C:\Users\Vanshika Garg\AppData\Roaming\nltk_data...
[nltk_data]   Package maxent_ne_chunker_tab is already up-to-date!
[nltk_data] Downloading package words to C:\Users\Vanshika
[nltk_data]     Garg\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!
