In [1]:
import numpy as np
import pandas as pd

In [2]:
import nltk

In [3]:
# nltk.download()

# Tokenize

## Word Tokenize

In [4]:
from nltk.tokenize import word_tokenize

In [5]:
text = "The quick brown fox jumps over the lazy dog"
print(word_tokenize(text))

['The', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog']


## Sentence Tokenize

In [6]:
from nltk.tokenize import sent_tokenize

In [7]:
text = "Hello Everyone. Welcome to the Python Programming"

print(sent_tokenize(text))

['Hello Everyone.', 'Welcome to the Python Programming']


# Stemming

In [8]:
from nltk.stem import PorterStemmer

In [9]:
words = ['clean','cleaning','cleans','cleaned']
ps = PorterStemmer()
for w in words:
    print(ps.stem(w))

clean
clean
clean
clean


In [10]:
words = ['give', 'giving', 'given', 'gave']
ps = PorterStemmer()
for w in words:
    print(ps.stem(w))

give
give
given
gave


# Lemmanization

In [11]:
from nltk.stem import WordNetLemmatizer

In [12]:
words = ['studies', 'study', 'floors', 'cry']
wnl = WordNetLemmatizer()
for w in words:
    print(wnl.lemmatize(w))

study
study
floor
cry


# Stop Words

In [13]:
from nltk.corpus import stopwords

In [14]:
stopwords.words('english')

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [15]:
from nltk.tokenize import word_tokenize

In [16]:
text = 'AI was introduced in the year 1956 but it gained popularity recently.'
stopw = set(stopwords.words('english'))

text = word_tokenize(text)


filtered = []

for ch in text:
    if ch not in stopw:
        filtered.append(ch)
filtered

['AI', 'introduced', 'year', '1956', 'gained', 'popularity', 'recently', '.']

# POS Tagging

In [17]:
from nltk.tokenize import word_tokenize

In [23]:
text = 'AI was introduced in the year 1956 but it gained popularity recently.'

text = word_tokenize(text)

for w in text:
    print(nltk.pos_tag([w]))

[('AI', 'NN')]
[('was', 'VBD')]
[('introduced', 'VBN')]
[('in', 'IN')]
[('the', 'DT')]
[('year', 'NN')]
[('1956', 'CD')]
[('but', 'CC')]
[('it', 'PRP')]
[('gained', 'VBN')]
[('popularity', 'NN')]
[('recently', 'RB')]
[('.', '.')]


# TF IDF

In [27]:
from collections import Counter

In [28]:
data = [
    'The quick brown fox jumps over the lazy dog',
    'The brown fox is quick',
    'The lazy dog is sleeping'
]

In [29]:
# Tokenize

tokenized = [word_tokenize(d) for d in data]
tokenized

[['The', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog'],
 ['The', 'brown', 'fox', 'is', 'quick'],
 ['The', 'lazy', 'dog', 'is', 'sleeping']]

In [30]:
# TF - Term Frequency

TF = [Counter(token) for token in tokenized]
TF

[Counter({'The': 1,
          'quick': 1,
          'brown': 1,
          'fox': 1,
          'jumps': 1,
          'over': 1,
          'the': 1,
          'lazy': 1,
          'dog': 1}),
 Counter({'The': 1, 'brown': 1, 'fox': 1, 'is': 1, 'quick': 1}),
 Counter({'The': 1, 'lazy': 1, 'dog': 1, 'is': 1, 'sleeping': 1})]

In [35]:
# IDF - Inverse Document Frequency

from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer()

result = tfidf.fit_transform(data)

In [36]:
print('\nWord indexes:')
print(tfidf.vocabulary_)
 
# display tf-idf values
print('\ntf-idf values:')
print(result)


Word indexes:
{'the': 9, 'quick': 7, 'brown': 0, 'fox': 2, 'jumps': 4, 'over': 6, 'lazy': 5, 'dog': 1, 'is': 3, 'sleeping': 8}

tf-idf values:
  (0, 1)	0.30330642493908333
  (0, 5)	0.30330642493908333
  (0, 6)	0.3988114995291713
  (0, 4)	0.3988114995291713
  (0, 2)	0.30330642493908333
  (0, 0)	0.30330642493908333
  (0, 7)	0.30330642493908333
  (0, 9)	0.4710889922721062
  (1, 3)	0.46609584262774545
  (1, 2)	0.46609584262774545
  (1, 0)	0.46609584262774545
  (1, 7)	0.46609584262774545
  (1, 9)	0.3619650009883935
  (2, 8)	0.5694308628404254
  (2, 3)	0.43306684852870914
  (2, 1)	0.43306684852870914
  (2, 5)	0.43306684852870914
  (2, 9)	0.33631504064053513
