In [1]:
!pip install nltk



## Tokenization

In [3]:
# Three-line sample text that the tokenizer cells operate on.
corpus = (
    "Hello i am currently learning generative ai,\n"
    "Now i am learning nlp.\n"
    "After that i will learn generative ai."
)
print(corpus)

Hello i am currently learning generative ai,
Now i am learning nlp.
After that i will learn generative ai.


In [9]:
import nltk
from nltk.tokenize import sent_tokenize

# sent_tokenize needs the Punkt sentence-tokenizer data. Older NLTK releases
# ship it as 'punkt', newer ones as 'punkt_tab'; request both quietly so the
# cell also works on a machine where the data was never downloaded. A name
# that the installed release does not know is skipped without raising.
for resource in ("punkt", "punkt_tab"):
    nltk.download(resource, quiet=True)

# Split the corpus into sentences and print one per iteration.
documents = sent_tokenize(corpus)
for sentence in documents:
    print(sentence)

Hello i am currently learning generative ai,
Now i am learning nlp.
After that i will learn generative ai.


In [11]:
from nltk.tokenize import word_tokenize
# Tokenize the whole corpus at once; in the recorded output ',' and '.' come back as separate tokens.
word_tokenize(corpus)

['Hello',
 'i',
 'am',
 'currently',
 'learning',
 'generative',
 'ai',
 ',',
 'Now',
 'i',
 'am',
 'learning',
 'nlp',
 '.',
 'After',
 'that',
 'i',
 'will',
 'learn',
 'generative',
 'ai',
 '.']

In [13]:
# Word-tokenize sentence by sentence and print one token list per sentence.
for tokens in map(word_tokenize, documents):
    print(tokens)

['Hello', 'i', 'am', 'currently', 'learning', 'generative', 'ai', ',', 'Now', 'i', 'am', 'learning', 'nlp', '.']
['After', 'that', 'i', 'will', 'learn', 'generative', 'ai', '.']


In [17]:
from nltk.tokenize import wordpunct_tokenize
# Alternative tokenizer; for this corpus the recorded output matches word_tokenize token for token.
wordpunct_tokenize(corpus)

['Hello',
 'i',
 'am',
 'currently',
 'learning',
 'generative',
 'ai',
 ',',
 'Now',
 'i',
 'am',
 'learning',
 'nlp',
 '.',
 'After',
 'that',
 'i',
 'will',
 'learn',
 'generative',
 'ai',
 '.']

In [23]:
from nltk.tokenize import TreebankWordTokenizer
tokenizer=TreebankWordTokenizer()
# In the recorded output only the final '.' is split off; the mid-text 'nlp.' stays a single token.
tokenizer.tokenize(corpus)

['Hello',
 'i',
 'am',
 'currently',
 'learning',
 'generative',
 'ai',
 ',',
 'Now',
 'i',
 'am',
 'learning',
 'nlp.',
 'After',
 'that',
 'i',
 'will',
 'learn',
 'generative',
 'ai',
 '.']

## Stemming

In [121]:
# Inflected word forms fed to the stemmer and lemmatizer loops.
words = [
    "eating", "eats", "eaten",
    "writing", "writes",
    "finally", "history", "finalized",
    "programs", "programming",
]

### PorterStemmer 

In [31]:
from nltk.stem import PorterStemmer
# Porter stemmer instance shared by the Porter example cells.
stemming=PorterStemmer()

In [33]:
# Print every sample word next to its Porter stem.
for word in words:
    print(word, stemming.stem(word), sep="---->")

eating---->eat
eats---->eat
eaten---->eaten
writing---->write
writes---->write
finally---->final
history---->histori
finalized---->final


In [35]:
# Single-word check; the recorded result is 'sit'.
stemming.stem("sitting")

'sit'

In [76]:
# Porter stems of two adverbs, displayed together as a tuple.
tuple(stemming.stem(adverb) for adverb in ('fairly', 'sportingly'))

('fairli', 'sportingli')

### RegexpStemmer class

In [39]:
from nltk.stem import RegexpStemmer

In [63]:
# Rule-based stemmer: the pattern targets a trailing 'ing', 's', 'able' or 'en'.
# NOTE(review): min=4 presumably exempts strings shorter than 4 characters from stemming — confirm against the NLTK docs.
reg_stemmer=RegexpStemmer('ing$|s$|able$|en$',min=4)

In [65]:
# Matches the 'ing$' alternative; the recorded result is 'eat'.
reg_stemmer.stem('eating')

'eat'

In [67]:
# Matches the 'en$' alternative; the recorded result is 'eat', whereas the Porter and Snowball outputs keep 'eaten'.
reg_stemmer.stem('eaten')

'eat'

### Snowball Stemmer

In [70]:
from nltk.stem import SnowballStemmer

In [72]:
# English Snowball stemmer instance shared by the Snowball example cells.
snowballstemmer=SnowballStemmer('english')

In [74]:
# Print every sample word next to its Snowball stem.
for word in words:
    print("---->".join((word, snowballstemmer.stem(word))))

eating---->eat
eats---->eat
eaten---->eaten
writing---->write
writes---->write
finally---->final
history---->histori
finalized---->final


In [78]:
# Snowball stems of the same two adverbs, displayed together as a tuple.
tuple(map(snowballstemmer.stem, ('fairly', 'sportingly')))

('fair', 'sport')

## Lemmatization

In [111]:
import nltk
# Fetch the WordNet corpus into the local nltk_data directory.
# NOTE(review): presumably required by WordNetLemmatizer — confirm against the NLTK docs.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to C:\Users\USER/nltk_data...


True

In [113]:
from nltk.stem import WordNetLemmatizer

In [115]:
# Lemmatizer instance shared by the lemmatization example cells.
lemmatizer=WordNetLemmatizer()

In [119]:
# POS tags passed to lemmatize() via `pos`:
#   noun      -> 'n'
#   verb      -> 'v'
#   adjective -> 'a'
#   adverb    -> 'r'
# (Written as real comments: a bare string literal is an expression that is
# evaluated and discarded, not a comment.)
lemmatizer.lemmatize("going", pos='v')

'go'

In [127]:
# Lemmatize every sample word as a verb and print it next to the original.
for word in words:
    lemma = lemmatizer.lemmatize(word, pos='v')
    print(word + "---->" + lemma)

eating---->eat
eats---->eat
eaten---->eat
writing---->write
writes---->write
finally---->finally
history---->history
finalized---->finalize
programs---->program
programming---->program


In [131]:
# 'fairly' and 'sportingly' are adverbs, so look them up with the adverb tag 'r'
# instead of the noun tag.
lemmatizer.lemmatize('fairly', pos='r'), lemmatizer.lemmatize('sportingly', pos='r')

('fairly', 'sportingly')

## Stopwords

In [134]:
import nltk
# Fetch the stopword lists that are read through nltk.corpus.stopwords.
# NOTE: nltk is already imported by an earlier cell; the repeated import is harmless.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\USER/nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [138]:
from nltk.corpus import stopwords

In [140]:
# Display the English stopword list; every entry in the recorded output is lower-case.
stopwords.words('english')

['a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 "he'd",
 "he'll",
 'her',
 'here',
 'hers',
 'herself',
 "he's",
 'him',
 'himself',
 'his',
 'how',
 'i',
 "i'd",
 'if',
 "i'll",
 "i'm",
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it'd",
 "it'll",
 "it's",
 'its',
 'itself',
 "i've",
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'on

In [142]:
# Sample paragraph used as input for the stopword-removal, stemming and lemmatization cells.
# NOTE(review): the text contains misspellings (e.g. 'vocablury', 'traning') that show up in the
# recorded outputs — confirm whether they are intentional before correcting them.
paragraph="""Text preprocessing is set of techniques used to clean and prepare raw text so that it can be understood by machine learning or 
deep learning models. Raw text is nothing but spelling mistakes,symbols,stopwords. Machine learning doesn't understand text directly they understand
numbers or vectors. Preprocessing converts raw text to structured , clean,meaningful vectors. Text preprocessing is important because it remove
noise(symbols,punctuations), it reduce vocablury size ,improve accuracy and speeds up the traning.
"""

In [155]:
from nltk.tokenize import word_tokenize

In [159]:
from nltk.corpus import stopwords

In [161]:
# Tokenize the whole paragraph. NOTE: this rebinds `words`, which earlier held the stemming sample list.
words=word_tokenize(paragraph)

In [163]:
# Get the English stopwords (returned as a plain list, as the earlier display cell shows).
stop_words=stopwords.words('english')

In [167]:
# Remove stopwords, comparing in lower case so capitalised stopwords are dropped too.
# Membership is tested against a set built once, instead of scanning the
# stopword list again for every token.
stop_word_set = set(stop_words)
filtered_words = [word for word in words if word.lower() not in stop_word_set]

In [171]:
# Print the token list before and after stopword removal for side-by-side comparison.
print("ORIGINAL WORDS:",words)
print("FILTERED WORDS:",filtered_words)

ORIGINAL WORDS: ['Text', 'preprocessing', 'is', 'set', 'of', 'techniques', 'used', 'to', 'clean', 'and', 'prepare', 'raw', 'text', 'so', 'that', 'it', 'can', 'be', 'understood', 'by', 'machine', 'learning', 'or', 'deep', 'learning', 'models', '.', 'Raw', 'text', 'is', 'nothing', 'but', 'spelling', 'mistakes', ',', 'symbols', ',', 'stopwords', '.', 'Machine', 'learning', 'does', "n't", 'understand', 'text', 'directly', 'they', 'understand', 'numbers', 'or', 'vectors', '.', 'Preprocessing', 'converts', 'raw', 'text', 'to', 'structured', ',', 'clean', ',', 'meaningful', 'vectors', '.', 'Text', 'preprocessing', 'is', 'important', 'because', 'it', 'remove', 'noise', '(', 'symbols', ',', 'punctuations', ')', ',', 'it', 'reduce', 'vocablury', 'size', ',', 'improve', 'accuracy', 'and', 'speeds', 'up', 'the', 'traning', '.']
FILTERED WORDS: ['Text', 'preprocessing', 'set', 'techniques', 'used', 'clean', 'prepare', 'raw', 'text', 'understood', 'machine', 'learning', 'deep', 'learning', 'models',

In [173]:
from nltk.stem import PorterStemmer

In [175]:
# Porter stemmer for the paragraph pipeline (the earlier example cells bound theirs to `stemming`).
stemmer=PorterStemmer()

In [177]:
# Split the paragraph into sentences; relies on `nltk` having been imported by an earlier cell.
sentences=nltk.sent_tokenize(paragraph)

In [179]:
# Quick check of the container type returned by sent_tokenize (recorded output: list).
type(sentences)

list

In [181]:
## Remove stopwords, then apply Porter stemming to every remaining token
# Load the stopword list once and keep it as a set, so it is not re-read and
# rebuilt for every single token.
english_stopwords = set(stopwords.words('english'))
# Re-split the raw paragraph so that re-running this cell always starts from
# the original text rather than from already-stemmed sentences.
sentences = nltk.sent_tokenize(paragraph)
for i, raw_sentence in enumerate(sentences):
    # Compare in lower case (the stopword list is lower-case) so a capitalised,
    # sentence-initial stopword is filtered out as well.
    words = [
        stemmer.stem(word)
        for word in nltk.word_tokenize(raw_sentence)
        if word.lower() not in english_stopwords
    ]
    sentences[i] = ' '.join(words)  # rebuild the sentence from its stemmed tokens

In [183]:
# Inspect the stemmed sentences; tokens were re-joined with single spaces, so punctuation appears as separate words.
print(sentences)

['text preprocess set techniqu use clean prepar raw text understood machin learn deep learn model .', 'raw text noth spell mistak , symbol , stopword .', "machin learn n't understand text directli understand number vector .", 'preprocess convert raw text structur , clean , meaning vector .', 'text preprocess import remov nois ( symbol , punctuat ) , reduc vocabluri size , improv accuraci speed trane .']


In [185]:
from nltk.stem import SnowballStemmer

In [187]:
# Snowball stemmer for the paragraph pipeline (rebinds the name used by the earlier example cells).
snowballstemmer=SnowballStemmer('english')

In [195]:
# Start again from the raw paragraph before running the Snowball pipeline on `sentences`.
sentences=nltk.sent_tokenize(paragraph)

In [199]:
# Remove stopwords, then apply Snowball stemming to every remaining token
# Load the stopword list once and keep it as a set, so it is not re-read and
# rebuilt for every single token.
english_stopwords = set(stopwords.words('english'))
# Re-split the raw paragraph so that re-running this cell always starts from
# the original text rather than from already-stemmed sentences.
sentences = nltk.sent_tokenize(paragraph)
for i, raw_sentence in enumerate(sentences):
    # Compare in lower case (the stopword list is lower-case) so a capitalised,
    # sentence-initial stopword is filtered out as well.
    words = [
        snowballstemmer.stem(word)
        for word in nltk.word_tokenize(raw_sentence)
        if word.lower() not in english_stopwords
    ]
    sentences[i] = ' '.join(words)  # rebuild the sentence from its stemmed tokens

In [201]:
# Inspect the Snowball result; in the recorded outputs 'directly' becomes 'direct' here versus 'directli' with Porter.
print(sentences)

['text preprocess set techniqu use clean prepar raw text understood machin learn deep learn model .', 'raw text noth spell mistak , symbol , stopword .', "machin learn n't understand text direct understand number vector .", 'preprocess convert raw text structur , clean , meaning vector .', 'text preprocess import remov nois ( symbol , punctuat ) , reduc vocabluri size , improv accuraci speed trane .']


In [203]:
from nltk.stem import WordNetLemmatizer

In [206]:
# Lemmatizer for the paragraph pipeline (rebinds the name used by the earlier example cells).
lemmatizer=WordNetLemmatizer()


In [208]:
# Start again from the raw paragraph before running the lemmatization pipeline on `sentences`.
sentences=nltk.sent_tokenize(paragraph)

In [236]:
# Remove stopwords, then lemmatize every remaining token as a verb
# Load the stopword list once and keep it as a set, so it is not re-read and
# rebuilt for every single token.
english_stopwords = set(stopwords.words('english'))
# Re-split the raw paragraph so that re-running this cell always starts from
# the original text rather than from already-lemmatized sentences.
sentences = nltk.sent_tokenize(paragraph)
for i, raw_sentence in enumerate(sentences):
    # Lower-case each token once and use that form both for the stopword check
    # (the stopword list is lower-case) and as the lemmatizer input.
    words = [
        lemmatizer.lemmatize(token, pos='v')
        for token in map(str.lower, nltk.word_tokenize(raw_sentence))
        if token not in english_stopwords
    ]
    sentences[i] = ' '.join(words)  # rebuild the sentence from its lemmas

In [238]:
# Inspect the lemmatized sentences; the recorded output keeps dictionary forms such as 'technique' where the stemmers gave 'techniqu'.
print(sentences)

['text preprocessing set technique use clean prepare raw text understand machine learn deep learn model .', 'raw text nothing spell mistake , symbol , stopwords .', "machine learn n't understand text directly understand number vector .", 'preprocessing convert raw text structure , clean , meaningful vector .', 'text preprocessing important remove noise ( symbol , punctuation ) , reduce vocablury size , improve accuracy speed traning .']


## Part-of-speech (POS) tagging

In [241]:
import nltk

In [243]:
# Fetch the perceptron tagger model into the local nltk_data directory.
# NOTE(review): presumably the model nltk.pos_tag loads — confirm the resource name for the installed NLTK version.
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\USER/nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.


True

In [245]:
# Example sentence for the POS-tagging demo.
sentence='NLP is field of artificial intelligence'

In [247]:
from nltk.tokenize import word_tokenize

In [249]:
# Tokenize first: pos_tag is called on this token list. NOTE: rebinds `words` once more.
words=word_tokenize(sentence)

In [251]:
# POS tagging: the result is a list of (token, tag) pairs, as the recorded output shows.
pos_tags=nltk.pos_tag(words)

In [253]:
# Show the tagged tokens.
print(pos_tags)

[('NLP', 'NNP'), ('is', 'VBZ'), ('field', 'NN'), ('of', 'IN'), ('artificial', 'JJ'), ('intelligence', 'NN')]
