**Text Analytics**
1. Extract a sample document and apply the following document preprocessing methods:
tokenization, POS tagging, stop-word removal, stemming, and lemmatization.
2. Create a representation of the document by calculating Term Frequency (TF) and Inverse Document
Frequency (IDF).

Import Required Libraries

In [None]:
pip install nltk

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import nltk
import re

 Download the required packages

In [None]:
# Fetch the NLTK data packages this notebook needs (a no-op once present).
# Tokenizer models -- presumably what sent_tokenize/word_tokenize load.
nltk.download('punkt')
# Stop-word corpus read through nltk.corpus.stopwords.
nltk.download('stopwords')
# WordNet corpus -- presumably the dictionary behind WordNetLemmatizer.
nltk.download('wordnet')
# Tagger model -- presumably the one nltk.pos_tag loads by default.
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

Initialize the text

In [None]:
# Two-sentence sample paragraph fed to the sentence and word tokenizers.
text= "Tokenization is the first step in text analytics. The process of breaking down a text paragraph into smaller chunks such as words or sentences is called Tokenization."

Perform Tokenization

In [None]:
# Sentence tokenization: split the paragraph into a list of sentence strings.
from nltk.tokenize import sent_tokenize
tokenized_text= sent_tokenize(text)
print(tokenized_text)

['Tokenization is the first step in text analytics.', 'The process of breaking down a text paragraph into smaller chunks such as words or sentences is called Tokenization.']


In [None]:
# Word tokenization: split the paragraph into word and punctuation tokens
# (the full stops come back as separate '.' tokens).
from nltk.tokenize import word_tokenize
tokenized_word=word_tokenize(text)
print(tokenized_word)

['Tokenization', 'is', 'the', 'first', 'step', 'in', 'text', 'analytics', '.', 'The', 'process', 'of', 'breaking', 'down', 'a', 'text', 'paragraph', 'into', 'smaller', 'chunks', 'such', 'as', 'words', 'or', 'sentences', 'is', 'called', 'Tokenization', '.']


Removing Punctuation and Stop Words

In [None]:
from nltk.corpus import stopwords
# Keep the English stop words in a set so the `in` checks used when
# filtering tokens are cheap.
stop_words=set(stopwords.words("english"))
print(stop_words)

{"that'll", 'doing', 'so', 'once', 'most', 'having', 'up', 'and', 'on', 'will', 'same', 'some', 'ma', 'an', "you'd", 'each', 'in', 'll', 'is', 'between', 'herself', "mightn't", 'before', 'have', "you'll", 'above', 'mustn', 'where', 'she', 'these', 'more', 'hers', 'then', 'now', 'after', 'his', 'by', 'whom', 'yours', 't', 'wouldn', 'off', 'when', 'be', 'shouldn', 'themselves', 'had', 'until', 'other', 'how', 'few', 'myself', 'should', 'himself', 'below', "hasn't", 'does', 'ain', 'has', 'am', "you've", 'over', 'it', 'aren', "needn't", 'down', 'only', 'were', 'are', 'don', 'couldn', 'him', "you're", 've', 'doesn', 'theirs', "she's", 'this', 'they', 'again', "shouldn't", 'that', 'about', 'as', "mustn't", 'their', 'my', 'ourselves', "couldn't", 'to', 'didn', 'its', 'there', 'do', 'why', 'not', "doesn't", 'was', 'needn', 'while', 'me', 'can', 'during', 'very', "didn't", 'a', 'isn', 'the', "aren't", 'i', 'if', 'through', "shan't", 'just', 'who', 'won', 'ours', 'what', 'here', 'm', 'no', 're',

In [None]:
# Replace every character that is not an ASCII letter with a space,
# lowercase the result, tokenize it, and keep only the tokens that are
# not English stop words.
text = "How to remove stop words with NLTK library in Python?"
text = re.sub('[^a-zA-Z]', ' ', text)
tokens = word_tokenize(text.lower())
filtered_text = [token for token in tokens if token not in stop_words]
print("Tokenized Sentence:", tokens)
print("Filterd Sentence:", filtered_text)


Tokenized Sentence: ['how', 'to', 'remove', 'stop', 'words', 'with', 'nltk', 'library', 'in', 'python']
Filterd Sentence: ['remove', 'stop', 'words', 'nltk', 'library', 'python']


Perform Stemming

In [None]:
from nltk.stem import PorterStemmer
e_words= ["wait", "waiting", "waited", "waits"]
ps =PorterStemmer()
# Print the stem of every word. The print belongs inside the loop;
# outside it, only the stem of the last word would be shown.
for w in e_words:
  rootWord=ps.stem(w)
  print(rootWord)


wait


Perform Lemmatization

In [None]:
from nltk.stem import WordNetLemmatizer
wordnet_lemmatizer = WordNetLemmatizer()
text = "studies studying cries cry"
tokenization = nltk.word_tokenize(text)
# lemmatize() is called with the word only, i.e. without a part-of-speech hint.
for w in tokenization:
  print(f"Lemma for {w} is {wordnet_lemmatizer.lemmatize(w)}")


Lemma for studies is study
Lemma for studying is studying
Lemma for cries is cry
Lemma for cry is cry


Apply POS Tagging to text

In [None]:
import nltk
from nltk.tokenize import word_tokenize
data="The pink sweater fit her perfectly"
words=word_tokenize(data)
# Tag the whole token list in a single call so the tagger can use the
# neighbouring words as context. Tagging each word in its own one-element
# list gives it no context to disambiguate words such as "pink" or "fit".
# Each (word, tag) pair is still printed on its own line, wrapped in a list.
for tagged in nltk.pos_tag(words):
  print([tagged])

[('The', 'DT')]
[('pink', 'NN')]
[('sweater', 'NN')]
[('fit', 'NN')]
[('her', 'PRP$')]
[('perfectly', 'RB')]


**Part II**

Import the necessary libraries

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

Initialize the Documents

In [None]:
# The two-document corpus used for the hand-rolled TF-IDF computation.
documentA = 'Jupiter is the largest planet'
documentB = 'Mars is the fourth planet from the Sun'

Create a Bag of Words (BoW) for Documents A and B

In [None]:
# Split on single spaces; case is preserved and duplicates are kept,
# so each list has one entry per word occurrence.
bagOfWordsA = documentA.split(' ')
bagOfWordsB = documentB.split(' ')

Create a collection of the unique words from Documents A and B

In [None]:
# Vocabulary of the corpus: every distinct word seen in either document.
uniqueWords = set(bagOfWordsA) | set(bagOfWordsB)

Create a dictionary of words and their occurrence for each document in the corpus

In [None]:
# Start both count tables with every vocabulary word at zero, so a word
# that is absent from a document still has an explicit 0 entry.
# Both tables are created up front: building numOfWordsB inside the loop
# over document A would recreate it on every iteration and leave it
# undefined if that loop never ran.
numOfWordsA = dict.fromkeys(uniqueWords, 0)
numOfWordsB = dict.fromkeys(uniqueWords, 0)

for word in bagOfWordsA:
  numOfWordsA[word] += 1

for word in bagOfWordsB:
  numOfWordsB[word] += 1

Compute the term frequency for each of our documents

In [None]:
def computeTF(wordDict, bagOfWords):
  """Return the term frequency of every word in ``wordDict``.

  Args:
    wordDict: mapping of word -> number of occurrences in the document.
    bagOfWords: the document's tokens; only its length is used, as the
      normalising denominator.

  Returns:
    A new dict mapping each word of ``wordDict`` to
    ``count / len(bagOfWords)`` as a float. When ``bagOfWords`` is empty
    every frequency is ``0.0`` instead of raising ``ZeroDivisionError``.
  """
  bagOfWordsCount = len(bagOfWords)
  if bagOfWordsCount == 0:
    # An empty document contains no terms, so every frequency is zero.
    return {word: 0.0 for word in wordDict}
  return {word: count / bagOfWordsCount for word, count in wordDict.items()}
# Term frequencies of each document over the shared vocabulary.
tfA = computeTF(numOfWordsA, bagOfWordsA)
tfB = computeTF(numOfWordsB, bagOfWordsB)

In [None]:
# Display the term-frequency table of document A.
tfA

{'planet': 0.2,
 'Jupiter': 0.2,
 'is': 0.2,
 'from': 0.0,
 'Mars': 0.0,
 'Sun': 0.0,
 'the': 0.2,
 'fourth': 0.0,
 'largest': 0.2}

In [None]:
# Display the term-frequency table of document B.
tfB

{'planet': 0.125,
 'Jupiter': 0.0,
 'is': 0.125,
 'from': 0.125,
 'Mars': 0.125,
 'Sun': 0.125,
 'the': 0.25,
 'fourth': 0.125,
 'largest': 0.0}

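Compute the Inverse Document Frequency (IDF) of each term: $\mathrm{idf}(t) = \log\left(N / \mathrm{df}(t)\right)$, where $N$ is the number of documents and $\mathrm{df}(t)$ is the number of documents that contain the term $t$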

In [None]:
def computeIDF(documents):
  """Return the inverse document frequency of every word in the corpus.

  Args:
    documents: list of dicts, one per document, each mapping
      word -> occurrence count in that document.

  Returns:
    A dict mapping each word that appears as a key in any document to
    ``log(N / df)``, where ``N`` is ``len(documents)`` and ``df`` is the
    number of documents whose count for that word is greater than zero.
    A word whose count is zero in every document gets ``0.0`` rather than
    triggering a division by zero. An empty ``documents`` list yields ``{}``.
  """
  import math
  N = len(documents)
  # Document frequency per word. Keys are collected from every document,
  # not only the first, so documents with differing vocabularies work.
  docFreq = {}
  for document in documents:
    for word, val in document.items():
      docFreq.setdefault(word, 0)
      if val > 0:
        docFreq[word] += 1
  return {
    word: (math.log(N / df) if df else 0.0)
    for word, df in docFreq.items()
  }
# IDF is derived from the raw count tables of both documents; the bare
# name on the last line makes the notebook display the result.
idfs = computeIDF([numOfWordsA, numOfWordsB])
idfs

{'planet': 0.0,
 'Jupiter': 0.6931471805599453,
 'is': 0.0,
 'from': 0.6931471805599453,
 'Mars': 0.6931471805599453,
 'Sun': 0.6931471805599453,
 'the': 0.0,
 'fourth': 0.6931471805599453,
 'largest': 0.6931471805599453}

Compute the TF-IDF score of every word in each document

In [None]:
def computeTFIDF(tfBagOfWords, idfs):
  """Scale each term frequency by that term's inverse document frequency.

  Args:
    tfBagOfWords: mapping of word -> term frequency for one document.
    idfs: mapping of word -> inverse document frequency.

  Returns:
    A new dict with the keys of ``tfBagOfWords``, each mapped to
    ``tf * idf``.

  Raises:
    KeyError: if a word of ``tfBagOfWords`` has no entry in ``idfs``.
  """
  return {term: freq * idfs[term] for term, freq in tfBagOfWords.items()}

tfidfA = computeTFIDF(tfA, idfs)
tfidfB = computeTFIDF(tfB, idfs)
# One row per document, one column per vocabulary word.
df=pd.DataFrame([tfidfA, tfidfB])
df

Unnamed: 0,planet,Jupiter,is,from,Mars,Sun,the,fourth,largest
0,0.0,0.138629,0.0,0.0,0.0,0.0,0.0,0.0,0.138629
1,0.0,0.0,0.0,0.086643,0.086643,0.086643,0.0,0.086643,0.0
