In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to /home/student/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/student/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/student/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/student/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [3]:
text = "Tokenization is the first step in text analytics. The process of breaking down a text paragraph into smaller chunks such as words or sentences is called Tokenization."

In [4]:
from nltk.tokenize import sent_tokenize
tokenized_text= sent_tokenize(text)
print(tokenized_text)

['Tokenization is the first step in text analytics.', 'The process of breaking down a text paragraph into smaller chunks such as words or sentences is called Tokenization.']


In [5]:
from nltk.tokenize import word_tokenize
tokenized_word=word_tokenize(text)
print(tokenized_word)

['Tokenization', 'is', 'the', 'first', 'step', 'in', 'text', 'analytics', '.', 'The', 'process', 'of', 'breaking', 'down', 'a', 'text', 'paragraph', 'into', 'smaller', 'chunks', 'such', 'as', 'words', 'or', 'sentences', 'is', 'called', 'Tokenization', '.']


In [6]:
import re
from nltk.corpus import stopwords
stop_words=set(stopwords.words("english"))
print(stop_words)
text= "How to remove stop words with NLTK library in Python?"
text= re.sub('[^a-zA-Z]', ' ',text)
tokens = word_tokenize(text.lower())
filtered_text=[]
for w in tokens:
    if w not in stop_words:
        filtered_text.append(w)
print("Tokenized Sentence:",tokens)
print("Filtered Sentence:",filtered_text)

{'needn', "won't", 'up', 'other', 'nor', 'o', 'if', 'down', 'under', 'he', "you're", 'myself', 'be', 'while', 'more', 'not', 'an', 'the', 'whom', 'hasn', 'where', 'ours', 'are', 'how', 'very', 'ma', 'didn', "don't", 'haven', "hasn't", 've', 'himself', "that'll", 'above', 'doing', 'her', 'she', 't', 'off', 'what', 'has', 'a', 'with', 'yours', 'have', 'for', 're', 'him', 'having', 'same', 'over', 'now', 'hadn', "she's", 'yourself', 'all', "isn't", 'until', 'should', 'then', "couldn't", 'wasn', 'this', "hadn't", 'these', 'our', 'about', 'own', 'again', 'so', "should've", 'am', 'do', "doesn't", 'before', 'couldn', 'few', 'shouldn', 'once', 'each', 'we', "shouldn't", 'd', 'most', 'ain', 'wouldn', 'them', 'it', 'does', "mightn't", 'here', 'were', 'll', "wasn't", 'too', 'through', 'your', 'they', "wouldn't", 'you', 'but', 'or', 'because', 'mightn', 'after', 'than', 'me', 'their', 'which', 'weren', 'such', "shan't", 'why', 'below', 'y', 'just', 'in', 'was', 'there', 'when', 'won', 'at', 'out',

In [7]:
from nltk.stem import PorterStemmer
e_words= ["wait", "waiting", "waited", "waits"]
ps =PorterStemmer()
for w in e_words:
    rootWord=ps.stem(w)
print(rootWord)

wait


In [8]:
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()
text = "studies studying cries cry"
tokenization = nltk.word_tokenize(text)
for w in tokenization:
    print("Lemma for {} is {}".format(w,wordnet_lemmatizer.lemmatize(w)))

Lemma for studies is study
Lemma for studying is studying
Lemma for cries is cry
Lemma for cry is cry


In [9]:
import nltk
from nltk.tokenize import word_tokenize
data="The pink sweater fit her perfectly"
words=word_tokenize(data)
for word in words:
    print(nltk.pos_tag([word]))

[('The', 'DT')]
[('pink', 'NN')]
[('sweater', 'NN')]
[('fit', 'NN')]
[('her', 'PRP$')]
[('perfectly', 'RB')]


In [10]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

In [11]:
documentA = 'Jupiter is the largest Planet'
documentB = 'Mars is the fourth planet from the Sun'

In [12]:
bagOfWordsA = documentA.split(' ')
bagOfWordsB = documentB.split(' ')

In [13]:
uniqueWords = set(bagOfWordsA).union(set(bagOfWordsB))

In [14]:
numOfWordsA = dict.fromkeys(uniqueWords, 0)
for word in bagOfWordsA:
    numOfWordsA[word] += 1
numOfWordsB = dict.fromkeys(uniqueWords, 0)
for word in bagOfWordsB:
    numOfWordsB[word] += 1

In [15]:
def computeTF(wordDict, bagOfWords):
    tfDict = {}
    bagOfWordsCount = len(bagOfWords)
    for word, count in wordDict.items():
        tfDict[word] = count / float(bagOfWordsCount)
    return tfDict
tfA = computeTF(numOfWordsA, bagOfWordsA)
tfB = computeTF(numOfWordsB, bagOfWordsB)

In [16]:
def computeIDF(documents):
    import math
    N = len(documents)
    idfDict = dict.fromkeys(documents[0].keys(), 0)
    for document in documents:
        for word, val in document.items():
            if val > 0:
                idfDict[word] += 1
    for word, val in idfDict.items():
        idfDict[word] = math.log(N / float(val))
    return idfDict
idfs = computeIDF([numOfWordsA, numOfWordsB])
idfs

{'Sun': 0.6931471805599453,
 'Jupiter': 0.6931471805599453,
 'planet': 0.6931471805599453,
 'fourth': 0.6931471805599453,
 'Mars': 0.6931471805599453,
 'the': 0.0,
 'Planet': 0.6931471805599453,
 'from': 0.6931471805599453,
 'largest': 0.6931471805599453,
 'is': 0.0}

In [17]:
def computeTFIDF(tfBagOfWords, idfs):
    tfidf = {}
    for word, val in tfBagOfWords.items():
        tfidf[word] = val * idfs[word]
    return tfidf
tfidfA = computeTFIDF(tfA, idfs)
tfidfB = computeTFIDF(tfB, idfs)
df = pd.DataFrame([tfidfA, tfidfB])
df

Unnamed: 0,Sun,Jupiter,planet,fourth,Mars,the,Planet,from,largest,is
0,0.0,0.138629,0.0,0.0,0.0,0.0,0.138629,0.0,0.138629,0.0
1,0.086643,0.0,0.086643,0.086643,0.086643,0.0,0.0,0.086643,0.0,0.0
