# Text Analytics
1. Extract a sample document and apply the following document preprocessing
methods: Tokenization, POS Tagging, Stop Words Removal, Stemming, and Lemmatization.
2. Create a representation of the document by calculating Term Frequency and Inverse Document Frequency.

In [52]:
import nltk

# Sample document shared by the preprocessing cells: three short sentences,
# the first of which begins with the abbreviation "Dr.".
text = (
    "Dr. Devin is learning NLP. "
    "It is very interesting and exciting. "
    "It is an important area of AI."
)

# Tokenization
Tokenization is the process of breaking a stream of text up into words, phrases, symbols, or other meaningful elements. The tokens become the input for another process like parsing and text mining. Tokenization is useful because it breaks the text into smaller, more manageable parts.

In [53]:
# Fetch the Punkt resource that this cell downloads before tokenizing.
nltk.download('punkt')

# Sentence tokenization: split the document into a list of sentences.
sentences = nltk.sent_tokenize(text)
print(sentences)

# Word tokenization: split each sentence into tokens and print one token per line.
for sentence in sentences:
    words = nltk.word_tokenize(sentence)
    print(*words, sep="\n")


['Dr. Devin is learning NLP.', 'It is very interesting and exciting.', 'It is an important area of AI.']
Dr.
Devin
is
learning
NLP
.
It
is
very
interesting
and
exciting
.
It
is
an
important
area
of
AI
.


[nltk_data] Downloading package punkt to /home/vscode/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# POS Tagging
Part-of-speech tagging is the process of marking up each word in a text as corresponding to a particular part of speech, based on both its definition and its context. Parts of speech are also known as word classes or lexical categories. The process of classifying words into their parts of speech and labeling them accordingly is known as part-of-speech tagging, POS tagging, or simply tagging.

In [54]:
# POS tagging: download the tagger resource, tokenize, then tag every token.
nltk.download('averaged_perceptron_tagger')

# Tokenize the whole document in one pass and attach a tag to each token.
words = nltk.word_tokenize(text)
tagged_words = nltk.pos_tag(words)

# Show the raw tokens first, followed by the (token, tag) pairs.
print(words)
print(tagged_words)

['Dr.', 'Devin', 'is', 'learning', 'NLP', '.', 'It', 'is', 'very', 'interesting', 'and', 'exciting', '.', 'It', 'is', 'an', 'important', 'area', 'of', 'AI', '.']
[('Dr.', 'NNP'), ('Devin', 'NNP'), ('is', 'VBZ'), ('learning', 'VBG'), ('NLP', 'NNP'), ('.', '.'), ('It', 'PRP'), ('is', 'VBZ'), ('very', 'RB'), ('interesting', 'JJ'), ('and', 'CC'), ('exciting', 'VBG'), ('.', '.'), ('It', 'PRP'), ('is', 'VBZ'), ('an', 'DT'), ('important', 'JJ'), ('area', 'NN'), ('of', 'IN'), ('AI', 'NNP'), ('.', '.')]


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/vscode/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


# Stop Words Removal
Stop words are the most common words in a language, like “the”, “is”, “in”, “for”, “where”, “when”, “to”, “at”, etc. They are removed to reduce the dimensionality of the data and to filter out noise, which in turn can improve the performance of the model.

In [55]:
# Stop words removal: drop very common English words from the token list.

nltk.download('stopwords')  # the stop-word lists are a separate NLTK download
from nltk.corpus import stopwords

# Keep the English stop words in a set for fast membership checks below.
stop_words = set(stopwords.words('english'))
print(stop_words)

words = nltk.word_tokenize(text)
print(words)

# Compare on the lowercased token so that capitalised words such as "It" are
# matched too; tokens that survive the filter keep their original casing.
filtered_words = [token for token in words if token.lower() not in stop_words]
print(filtered_words)

{'most', "weren't", 'should', 'them', 'when', 'no', "you're", 'haven', 'do', 'did', 'my', 'too', 'in', 'hasn', "didn't", 'up', 'for', 'hadn', 'won', 'yourself', 'they', 'were', 'a', 'your', 'll', 'yourselves', 'had', "it's", 'having', 'few', 's', 're', "that'll", 'both', 'have', 'just', "shouldn't", 'until', 'm', 'further', 'himself', 'where', 'o', 'aren', 'so', 't', 'because', 'theirs', 'from', 'weren', "mustn't", 'wouldn', 'these', 'are', "you've", 'during', 'him', 'mustn', 'very', 'will', 'whom', 'before', 'once', "hasn't", 'through', 'all', "doesn't", "haven't", 'he', 'at', 'wasn', 'of', 'doing', 'down', 'other', 'out', 'ain', "isn't", 'needn', 'on', 'being', 'to', 'has', 'i', 'doesn', "you'll", 'some', 'an', 'above', 'which', 'you', 'be', 'itself', 'about', "won't", 'who', 'with', "couldn't", 'not', 'hers', 'yours', 'is', 'more', 'than', 'herself', 'y', 'our', 'didn', "wasn't", 'as', "shan't", 'over', 'and', 'd', 'themselves', 'below', "needn't", 'their', 'why', 'only', 'there', '

[nltk_data] Downloading package stopwords to /home/vscode/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Stemming

Example: working => work, worked => work, works => work.

Stemming just strips suffixes from a word to reduce it to a root form (its stem), so the result is not always a real word — for example, "very" becomes "veri" in the output below.

In [56]:
# Stemming with NLTK's Porter stemmer
from nltk.stem import PorterStemmer

ps = PorterStemmer()

words = nltk.word_tokenize(text)
print(words)

# Pair each token with its stem (computed lazily, one token at a time) and
# print them as "<token> : <stem>".
for token, stem in zip(words, map(ps.stem, words)):
    print(f"{token} : {stem}")
    

['Dr.', 'Devin', 'is', 'learning', 'NLP', '.', 'It', 'is', 'very', 'interesting', 'and', 'exciting', '.', 'It', 'is', 'an', 'important', 'area', 'of', 'AI', '.']
Dr. : dr.
Devin : devin
is : is
learning : learn
NLP : nlp
. : .
It : it
is : is
very : veri
interesting : interest
and : and
exciting : excit
. : .
It : it
is : is
an : an
important : import
area : area
of : of
AI : ai
. : .


# Lemmatization

Lemmatization is the process of converting a word to its base form (its lemma). The difference between stemming and lemmatization is that lemmatization considers the context (for example, the word's part of speech) and converts the word to its meaningful base form, whereas stemming just removes the last few characters, often leading to incorrect meanings and spelling errors.

Example: ate => eat, gone => go, are => be, etc.

In [57]:
# Lemmatization with NLTK's WordNet lemmatizer
nltk.download('wordnet')  # WordNet resource downloaded for the lemmatizer
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()

words = nltk.word_tokenize(text)
print(words)

# Every token is lemmatized as a verb (pos='v'): "is" becomes "be", and the
# adjective "interesting" becomes "interest" because it is treated as a verb too.
# If pos is not specified, the lemmatizer treats the word as a noun by default.
for token in words:
    lemma = lemmatizer.lemmatize(token, pos='v')
    print(f"{token} : {lemma}")

[nltk_data] Downloading package wordnet to /home/vscode/nltk_data...


['Dr.', 'Devin', 'is', 'learning', 'NLP', '.', 'It', 'is', 'very', 'interesting', 'and', 'exciting', '.', 'It', 'is', 'an', 'important', 'area', 'of', 'AI', '.']
Dr. : Dr.
Devin : Devin
is : be
learning : learn
NLP : NLP
. : .
It : It
is : be
very : very
interesting : interest
and : and
exciting : excite
. : .
It : It
is : be
an : an
important : important
area : area
of : of
AI : AI
. : .


[nltk_data]   Package wordnet is already up-to-date!


# TF-IDF

- Term Frequency (TF) is a measure of how frequently a term occurs in a document. It is calculated by dividing the number of times a word appears in a document by the total number of words in the document.

- Inverse Document Frequency (IDF) is a measure of how important a term is. It is calculated by dividing the total number of documents by the number of documents containing the term, and then taking the logarithm of that quotient.

- TF-IDF is the product of TF and IDF. It is used to measure the importance of a term in a document relative to a collection of documents.

TF = (Number of times term t appears in a document) / (Total number of terms in the document)

IDF = log_e(Total number of documents / Number of documents with term t in it)

TF-IDF = TF * IDF

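The higher the TF-IDF value, the more important the word is in the document.

Note: scikit-learn's `TfidfVectorizer`, used below, applies a smoothed variant by default — TF is the raw term count, IDF = log_e((1 + N) / (1 + df)) + 1 (N = number of documents, df = number of documents containing the term), and each document vector is then L2-normalized — so the printed values differ from the plain formula above.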

In [58]:
# TF-IDF

from sklearn.feature_extraction.text import TfidfVectorizer

# Toy corpus: four one-sentence documents.
corpus = [
    'This is the first document.',
    'This document is the second document.',
    'And this is the third one.',
    'Is this the first document?',
]

# Learn the vocabulary and IDF weights from the corpus and build the
# document-term matrix in one step. With default settings the vectorizer
# lowercases the text, smooths the IDF and L2-normalizes each row, so the
# numbers are not the plain TF * IDF product.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(corpus)

# Column j of X corresponds to unique_words[j].
unique_words = vectorizer.get_feature_names_out()

# Print the TF-IDF weight of every vocabulary term, document by document.
for doc_idx, _ in enumerate(corpus):
    print(f"Document {doc_idx + 1}")
    for term_idx, term in enumerate(unique_words):
        print(f"{term} : {X[doc_idx, term_idx]}")
    print("\n")

Document 1
and : 0.0
document : 0.46979138557992045
first : 0.5802858236844359
is : 0.38408524091481483
one : 0.0
second : 0.0
the : 0.38408524091481483
third : 0.0
this : 0.38408524091481483


Document 2
and : 0.0
document : 0.6876235979836938
first : 0.0
is : 0.281088674033753
one : 0.0
second : 0.5386476208856763
the : 0.281088674033753
third : 0.0
this : 0.281088674033753


Document 3
and : 0.511848512707169
document : 0.0
first : 0.0
is : 0.267103787642168
one : 0.511848512707169
second : 0.0
the : 0.267103787642168
third : 0.511848512707169
this : 0.267103787642168


Document 4
and : 0.0
document : 0.46979138557992045
first : 0.5802858236844359
is : 0.38408524091481483
one : 0.0
second : 0.0
the : 0.38408524091481483
third : 0.0
this : 0.38408524091481483


