# 7. Text Analytics
1. Extract Sample document and apply following document preprocessing
methods: Tokenization, POS Tagging, stop words removal, Stemming and
Lemmatization.
2. Create representation of document by calculating Term Frequency and Inverse
Document Frequency.

In [1]:
import nltk
import string
import math

In [2]:
# Read the docs
# Remove all non-ASCII characters

with open( "doc_01" , "r" ) as file:
    doc_1 = file.read()

with open( "doc_02" , "r" ) as file:
    doc_2 = file.read()

alphabet = string.printable
def clean_doc( doc: str ) -> str:
    return ''.join( [ c for c in doc.lower() if c in alphabet ] )

doc_1 = clean_doc( doc_1 )
doc_2 = clean_doc( doc_2 )

In [3]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Sumeet\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Sumeet\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Sumeet\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Sumeet\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

## 7.1. Document Preprocessing

### 7.1.1. Tokenization

In [4]:
"""
word tokenization
default tokenizer: Penn Treebank Tokenizer
Ref: https://docs.ropensci.org/tokenizers/reference/ptb-tokenizer.html
This tokenizer uses regular expressions to tokenize text similar to the tokenization used in the Penn Treebank. It assumes that text has already been split into sentences. 
The tokenizer does the following:
- splits common English contractions, e.g. don't is tokenized into do n't and they'll is tokenized into -> they 'll,
- handles punctuation characters as separate tokens,
- splits commas and single quotes off from words, when they are followed by whitespace,
- splits off periods that occur at the end of the sentence.
"""
tokens_1 = nltk.word_tokenize( doc_1 )
tokens_2 = nltk.word_tokenize( doc_2 )
print( tokens_1 )

['between', '2016', 'and', '2019', ',', 'the', 'state', 'forest', 'department', 'under', 'thebjpgovernment', 'had', 'launched', 'green', 'maharashtra', 'drive', 'with', 'an', 'aim', 'to', 'plant', '50', 'crore', 'trees', 'across', 'the', 'state', 'in', 'the', 'four-year', 'period', '.', 'in', 'october', '2019', ',', 'the', 'government', 'had', 'claimed', 'it', 'had', 'surpassed', 'the', 'target', 'by', 'planting', '33', 'crore', 'trees', 'in', 'july-september', '2019.the', 'indian', 'expresshad', 'found', 'that', 'non-forest', 'agencies', 'such', 'as', 'gram', 'panchayats', 'which', 'were', 'tasked', 'with', 'planting', 'trees', 'had', 'not', 'uploaded', 'the', 'mandatory', 'audio-visual', 'proof', 'of', 'the', 'tree', 'plantation', 'drives', 'on', 'the', 'specially', 'created', 'portal', '.', 'in', 'pune', 'revenue', 'division', ',', 'it', 'was', 'claimed', 'the', 'gram', 'panchayats', 'planted', '1.7', 'crore', 'saplings', ';', 'however', ',', 'no', 'evidence', 'was', 'uploaded', 'fo

### 7.1.2. POS Tagging

In [5]:
# default pos tagger: Perceptron Tagger
# Ref: https://explosion.ai/blog/part-of-speech-pos-tagger-in-python/
pos_tags = dict( zip( tokens_1 , nltk.pos_tag(tokens_1) ) )
print( pos_tags )

{'between': ('between', 'IN'), '2016': ('2016', 'CD'), 'and': ('and', 'CC'), '2019': ('2019', 'CD'), ',': (',', ','), 'the': ('the', 'DT'), 'state': ('state', 'NN'), 'forest': ('forest', 'NN'), 'department': ('department', 'NN'), 'under': ('under', 'IN'), 'thebjpgovernment': ('thebjpgovernment', 'NN'), 'had': ('had', 'VBD'), 'launched': ('launched', 'VBN'), 'green': ('green', 'JJ'), 'maharashtra': ('maharashtra', 'JJ'), 'drive': ('drive', 'NN'), 'with': ('with', 'IN'), 'an': ('an', 'DT'), 'aim': ('aim', 'NN'), 'to': ('to', 'TO'), 'plant': ('plant', 'NN'), '50': ('50', 'CD'), 'crore': ('crore', 'NN'), 'trees': ('trees', 'NNS'), 'across': ('across', 'IN'), 'in': ('in', 'IN'), 'four-year': ('four-year', 'JJ'), 'period': ('period', 'NN'), '.': ('.', '.'), 'october': ('october', 'JJ'), 'government': ('government', 'NN'), 'claimed': ('claimed', 'VBN'), 'it': ('it', 'PRP'), 'surpassed': ('surpassed', 'VBN'), 'target': ('target', 'NN'), 'by': ('by', 'IN'), 'planting': ('planting', 'VBG'), '33'

### 7.1.3. Sentence Tokenization


In [6]:
"""
sentence tokenization
default tokenizer: Punkt tokenizer
Ref: Unsupervised Multilingual Sentence Boundary Detection (Kiss and Strunk (2005)
"""
print( nltk.sent_tokenize( doc_1 ) )

['between 2016 and 2019, the state forest department under thebjpgovernment had launched green maharashtra drive with an aim to plant 50 crore trees across the state in the four-year period.', 'in october 2019, the government had claimed it had surpassed the target by planting 33 crore trees in july-september 2019.the indian expresshad found that non-forest agencies  such as gram panchayats  which were tasked with planting trees had not uploaded the mandatory audio-visual proof of the tree plantation drives on the specially created portal.', 'in pune revenue division, it was claimed the gram panchayats planted 1.7 crore saplings; however, no evidence was uploaded for 87 per cent (1.49 crore) saplings.', 'also, out of the 59 government agencies involved in the drive as many as 38 had not submitted survival reports about the saplings.', 'this year, the targets set by the forest department were comparatively modest.', 'for example, pune circle  which comprises three divisions in pune and 

### 7.1.4. Lemmatization

In [7]:
# lemmatization
lemmatizer = nltk.stem.WordNetLemmatizer()
tokens_1 = [ lemmatizer.lemmatize(token) for token in tokens_1 ]
tokens_2 = [ lemmatizer.lemmatize(token) for token in tokens_2 ]
print( tokens_1 )

['between', '2016', 'and', '2019', ',', 'the', 'state', 'forest', 'department', 'under', 'thebjpgovernment', 'had', 'launched', 'green', 'maharashtra', 'drive', 'with', 'an', 'aim', 'to', 'plant', '50', 'crore', 'tree', 'across', 'the', 'state', 'in', 'the', 'four-year', 'period', '.', 'in', 'october', '2019', ',', 'the', 'government', 'had', 'claimed', 'it', 'had', 'surpassed', 'the', 'target', 'by', 'planting', '33', 'crore', 'tree', 'in', 'july-september', '2019.the', 'indian', 'expresshad', 'found', 'that', 'non-forest', 'agency', 'such', 'a', 'gram', 'panchayat', 'which', 'were', 'tasked', 'with', 'planting', 'tree', 'had', 'not', 'uploaded', 'the', 'mandatory', 'audio-visual', 'proof', 'of', 'the', 'tree', 'plantation', 'drive', 'on', 'the', 'specially', 'created', 'portal', '.', 'in', 'pune', 'revenue', 'division', ',', 'it', 'wa', 'claimed', 'the', 'gram', 'panchayat', 'planted', '1.7', 'crore', 'sapling', ';', 'however', ',', 'no', 'evidence', 'wa', 'uploaded', 'for', '87', 'p

### 7.1.5. Stemming

In [8]:
stemmer = nltk.stem.PorterStemmer()
tokens_1 = [ stemmer.stem( token ) for token in tokens_1 ]
tokens_2 = [ stemmer.stem( token ) for token in tokens_2 ]
print( tokens_1 )

['between', '2016', 'and', '2019', ',', 'the', 'state', 'forest', 'depart', 'under', 'thebjpgovern', 'had', 'launch', 'green', 'maharashtra', 'drive', 'with', 'an', 'aim', 'to', 'plant', '50', 'crore', 'tree', 'across', 'the', 'state', 'in', 'the', 'four-year', 'period', '.', 'in', 'octob', '2019', ',', 'the', 'govern', 'had', 'claim', 'it', 'had', 'surpass', 'the', 'target', 'by', 'plant', '33', 'crore', 'tree', 'in', 'july-septemb', '2019.the', 'indian', 'expresshad', 'found', 'that', 'non-forest', 'agenc', 'such', 'a', 'gram', 'panchayat', 'which', 'were', 'task', 'with', 'plant', 'tree', 'had', 'not', 'upload', 'the', 'mandatori', 'audio-visu', 'proof', 'of', 'the', 'tree', 'plantat', 'drive', 'on', 'the', 'special', 'creat', 'portal', '.', 'in', 'pune', 'revenu', 'divis', ',', 'it', 'wa', 'claim', 'the', 'gram', 'panchayat', 'plant', '1.7', 'crore', 'sapl', ';', 'howev', ',', 'no', 'evid', 'wa', 'upload', 'for', '87', 'per', 'cent', '(', '1.49', 'crore', ')', 'sapl', '.', 'also', 

### 7.1.6. Stop Word Removal

In [9]:
# stop word removal
def remove_stop_words( tokens ):
    return [ token for token in tokens if token not in nltk.corpus.stopwords.words('english') ]

tokens_1 = remove_stop_words( tokens_1 )
tokens_2 = remove_stop_words( tokens_2 )
print( tokens_1 )

['2016', '2019', ',', 'state', 'forest', 'depart', 'thebjpgovern', 'launch', 'green', 'maharashtra', 'drive', 'aim', 'plant', '50', 'crore', 'tree', 'across', 'state', 'four-year', 'period', '.', 'octob', '2019', ',', 'govern', 'claim', 'surpass', 'target', 'plant', '33', 'crore', 'tree', 'july-septemb', '2019.the', 'indian', 'expresshad', 'found', 'non-forest', 'agenc', 'gram', 'panchayat', 'task', 'plant', 'tree', 'upload', 'mandatori', 'audio-visu', 'proof', 'tree', 'plantat', 'drive', 'special', 'creat', 'portal', '.', 'pune', 'revenu', 'divis', ',', 'wa', 'claim', 'gram', 'panchayat', 'plant', '1.7', 'crore', 'sapl', ';', 'howev', ',', 'evid', 'wa', 'upload', '87', 'per', 'cent', '(', '1.49', 'crore', ')', 'sapl', '.', 'also', ',', '59', 'govern', 'agenc', 'involv', 'drive', 'mani', '38', 'submit', 'surviv', 'report', 'sapl', '.', 'thi', 'year', ',', 'target', 'set', 'forest', 'depart', 'compar', 'modest', '.', 'exampl', ',', 'pune', 'circl', 'compris', 'three', 'divis', 'pune', '

## 7.2. TF and IDF

In [10]:
# Returns a map containing term-frequencies of each token
# present in `doc`
# tf( token ) = freq( token ) / num_tokens_in_doc
def term_freq( doc_tokens ):
    N = len( doc_tokens )
    token_freq = dict( [ ( token , 0 ) for token in doc_tokens ] )
    for token in doc_tokens:
        token_freq[ token ] += 1
    tf = dict( [ ( token , count / N ) for token , count in token_freq.items() ] )
    return tf

tf_1 = term_freq( tokens_1 )
tf_2 = term_freq( tokens_2 )

In [11]:
# Calculate inverse-document frequency
# IDF( token ) = log( N / (num_docs_where_token_occurs) )
all_tokens = tokens_1 + tokens_2 # list concatenation
def inverse_doc_freq():
    N = 2
    idf = {}
    for token in all_tokens:
        token_doc_freq = 0
        if token in tokens_1:
            token_doc_freq += 1
        if token in tokens_2:
            token_doc_freq += 1
        idf[ token ] = math.log( N / token_doc_freq )
    return idf

idf = inverse_doc_freq()

In [12]:
# TFIDF( token ) = TF( token ) * IDF( token )
doc_1_repr = []
for token in tokens_1:
    doc_1_repr.append( tf_1[ token ] * idf[token] )
doc_2_repr = []
for token in tokens_2:
    doc_2_repr.append( tf_2[ token ] * idf[token] )

In [13]:
print( doc_1_repr )

[0.004824690351925373, 0.0, 0.0, 0.0, 0.0, 0.012865840938467661, 0.0016082301173084576, 0.0016082301173084576, 0.006432920469233831, 0.008041150586542289, 0.0, 0.006432920469233831, 0.0, 0.004824690351925373, 0.01929876140770149, 0.0, 0.0, 0.0, 0.0016082301173084576, 0.0016082301173084576, 0.0, 0.0016082301173084576, 0.0, 0.0, 0.0, 0.0032164602346169153, 0.0016082301173084576, 0.006432920469233831, 0.0, 0.006432920469233831, 0.01929876140770149, 0.0, 0.0016082301173084576, 0.0016082301173084576, 0.0, 0.0016082301173084576, 0.0016082301173084576, 0.0016082301173084576, 0.0032164602346169153, 0.0032164602346169153, 0.0032164602346169153, 0.0016082301173084576, 0.0, 0.0, 0.0032164602346169153, 0.0016082301173084576, 0.0016082301173084576, 0.0016082301173084576, 0.0, 0.0, 0.0, 0.0032164602346169153, 0.0032164602346169153, 0.0016082301173084576, 0.0, 0.008041150586542289, 0.0016082301173084576, 0.004824690351925373, 0.0, 0.0, 0.0032164602346169153, 0.0032164602346169153, 0.00321646023461691

In [14]:
print(doc_2_repr)

[0.007097070108122307, 0.002365690036040769, 0.009462760144163076, 0.002365690036040769, 0.0, 0.004731380072081538, 0.0, 0.0, 0.0, 0.002365690036040769, 0.0, 0.002365690036040769, 0.007097070108122307, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.004731380072081538, 0.002365690036040769, 0.0, 0.0, 0.002365690036040769, 0.0, 0.002365690036040769, 0.009462760144163076, 0.009462760144163076, 0.0, 0.0, 0.011828450180203846, 0.0, 0.002365690036040769, 0.0, 0.002365690036040769, 0.0, 0.002365690036040769, 0.002365690036040769, 0.002365690036040769, 0.002365690036040769, 0.002365690036040769, 0.002365690036040769, 0.002365690036040769, 0.0, 0.0, 0.0, 0.007097070108122307, 0.0, 0.0, 0.0, 0.0, 0.0, 0.002365690036040769, 0.0, 0.0, 0.0, 0.0, 0.002365690036040769, 0.0, 0.0, 0.011828450180203846, 0.002365690036040769, 0.011828450180203846, 0.0, 0.0, 0.009462760144163076, 0.009462760144163076, 0.002365690036040769, 0.0, 0.0, 0.0, 0.0, 0.002365690036040769, 0.002365690036040769, 0.0, 0.0, 0.011828450180203846, 0.