# Assignment 7 - Text Analytics

In [1]:
import nltk
# nltk.download()

In [2]:
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk import pos_tag
import math

In [3]:
# Sample corpus used by the cells below.  The literal is written as adjacent
# string pieces inside parentheses (joined at compile time) because a plain
# "..." string cannot span several physical lines.
# NOTE(review): three sentence boundaries have no space after the full stop
# ("understanding.At", "innovation.In", "strategies.As"), so the tokenizer
# keeps e.g. "understanding.At" as a single token.  They are preserved as-is
# so the recorded outputs stay reproducible; add the spaces if unintended.
text = (
    "In the vast expanse of the digital realm, where information flows ceaselessly and boundaries blur between reality and virtuality, lies an intricate tapestry of interconnected nodes and networks. "
    "Within this labyrinthine landscape, individuals navigate the ever-shifting currents of data, seeking meaning amidst the chaos. "
    "From the bustling streets of social media platforms to the serene corridors of scholarly archives, voices echo and ideas converge in a symphony of diversity. "
    "Algorithms hum in the background, orchestrating the flow of content and shaping the digital discourse. "
    "Yet, amidst the noise, patterns emerge, revealing glimpses of insight and understanding."
    "At the heart of this digital ecosystem lies the art and science of text analytics, a discipline dedicated to unraveling the mysteries hidden within the written word. "
    "Armed with computational tools and linguistic expertise, analysts embark on a journey of discovery, sifting through vast troves of textual data in search of nuggets of wisdom. "
    "Natural language processing algorithms parse sentences, extract entities, and discern sentiments, transforming raw text into structured knowledge. "
    "From sentiment analysis to topic modeling, text analytics unlocks a wealth of possibilities for understanding human behavior, informing decision-making, and driving innovation."
    "In the realm of commerce, businesses harness the power of text analytics to glean insights from customer feedback, predict market trends, and personalize user experiences. "
    "Social scientists explore digital archives to study cultural shifts, linguistic evolution, and societal dynamics. "
    "Healthcare professionals employ text mining techniques to analyze medical records, detect patterns in patient symptoms, and enhance diagnostic accuracy. "
    "Meanwhile, policymakers turn to text analytics to monitor public opinion, track emerging issues, and inform governance strategies."
    "As the digital landscape continues to evolve and expand, so too does the field of text analytics, pushing the boundaries of what is possible in the realm of language understanding. "
    "With each passing day, new technologies emerge, new methodologies evolve, and new frontiers beckon, inviting explorers to venture forth into the uncharted territories of the digital wilderness. "
    "And as they do, they carry with them the torch of knowledge, illuminating the path ahead and shedding light on the mysteries that lie beyond."
)

In [4]:
# Sanity check: as the last expression of the cell, the type of the corpus
# is echoed by the notebook (a plain str is expected).
type(text)

str

In [5]:
# Tokenization splits a text or document into smaller units called tokens.
# Depending on the tokenizer used, those units are words, punctuation symbols
# or whole sentences.

# Run both tokenizers up front, then report each result under its heading.
word_tokens = word_tokenize(text)        # words and punctuation marks
sentence_tokens = sent_tokenize(text)    # one string per detected sentence

# Word Tokenization
print("Word Tokenization")
print(word_tokens)

# Sentence Tokenization
print("Sentence Tokenization")
print(sentence_tokens)

Word Tokenization
['In', 'the', 'vast', 'expanse', 'of', 'the', 'digital', 'realm', ',', 'where', 'information', 'flows', 'ceaselessly', 'and', 'boundaries', 'blur', 'between', 'reality', 'and', 'virtuality', ',', 'lies', 'an', 'intricate', 'tapestry', 'of', 'interconnected', 'nodes', 'and', 'networks', '.', 'Within', 'this', 'labyrinthine', 'landscape', ',', 'individuals', 'navigate', 'the', 'ever-shifting', 'currents', 'of', 'data', ',', 'seeking', 'meaning', 'amidst', 'the', 'chaos', '.', 'From', 'the', 'bustling', 'streets', 'of', 'social', 'media', 'platforms', 'to', 'the', 'serene', 'corridors', 'of', 'scholarly', 'archives', ',', 'voices', 'echo', 'and', 'ideas', 'converge', 'in', 'a', 'symphony', 'of', 'diversity', '.', 'Algorithms', 'hum', 'in', 'the', 'background', ',', 'orchestrating', 'the', 'flow', 'of', 'content', 'and', 'shaping', 'the', 'digital', 'discourse', '.', 'Yet', ',', 'amidst', 'the', 'noise', ',', 'patterns', 'emerge', ',', 'revealing', 'glimpses', 'of', 'insi

In [6]:
# POS Tagging - POS tagging is the process of assigning a part-of-speech tag
# (e.g., noun, verb, adjective) to each token in a sentence. It helps in
# understanding the grammatical structure of a sentence.

# The complete token list of the text is tagged in one call; the result is a
# list of (token, tag) pairs.
tags = pos_tag(word_tokens)
print(tags)

[('In', 'IN'), ('the', 'DT'), ('vast', 'JJ'), ('expanse', 'NN'), ('of', 'IN'), ('the', 'DT'), ('digital', 'JJ'), ('realm', 'NN'), (',', ','), ('where', 'WRB'), ('information', 'NN'), ('flows', 'VBZ'), ('ceaselessly', 'RB'), ('and', 'CC'), ('boundaries', 'NNS'), ('blur', 'VBP'), ('between', 'IN'), ('reality', 'NN'), ('and', 'CC'), ('virtuality', 'NN'), (',', ','), ('lies', 'VBZ'), ('an', 'DT'), ('intricate', 'JJ'), ('tapestry', 'NN'), ('of', 'IN'), ('interconnected', 'JJ'), ('nodes', 'NNS'), ('and', 'CC'), ('networks', 'NNS'), ('.', '.'), ('Within', 'IN'), ('this', 'DT'), ('labyrinthine', 'JJ'), ('landscape', 'NN'), (',', ','), ('individuals', 'NNS'), ('navigate', 'VBP'), ('the', 'DT'), ('ever-shifting', 'JJ'), ('currents', 'NNS'), ('of', 'IN'), ('data', 'NNS'), (',', ','), ('seeking', 'VBG'), ('meaning', 'NN'), ('amidst', 'IN'), ('the', 'DT'), ('chaos', 'NN'), ('.', '.'), ('From', 'IN'), ('the', 'DT'), ('bustling', 'VBG'), ('streets', 'NNS'), ('of', 'IN'), ('social', 'JJ'), ('media', '

In [7]:
# Stop-word removal: stop words are very common words (articles,
# prepositions, conjunctions, ...) that contribute little to the meaning of a
# text and are therefore usually filtered out before analysis.

# NLTK's English stop-word list, extended with the two punctuation tokens
# that are dropped together with the stop words.
stop_words = set(stopwords.words("english"))
stop_words.update([".", ","])
print(stop_words)

print("Filtered Words after removal of stop words.")
# Lower-case every token once, then keep it unless it is a stop word.
lowered_tokens = (token.lower() for token in word_tokens)
filtered_words = [token for token in lowered_tokens if token not in stop_words]
print(filtered_words)

{'its', 'am', 's', 'couldn', "that'll", 'were', 'did', 'do', 'under', 'more', 'with', "needn't", 'over', 'whom', 'for', 'those', 'needn', 'about', 'as', 'now', 'who', 'don', 'all', 'isn', 'other', 'haven', 'their', 'himself', "wouldn't", 'where', 'nor', 'will', 'from', 'these', 'until', 'and', 'after', 'how', "you've", 're', "it's", 'we', 'ain', 'during', 'yourself', 'at', 'through', 'no', 'been', 'll', 'that', 'only', 'doesn', 'didn', 'on', "wasn't", 'too', 'is', 'have', 'than', 'it', 'aren', 'mightn', ',', 'why', 'herself', 'against', 'myself', 'she', 'was', "isn't", 'does', 'themselves', 'be', 'd', 'into', 'below', 'my', 'here', 'they', 'again', 'few', '.', "you're", "didn't", 'most', 'down', 'when', "aren't", 'own', 'off', 'should', 'but', "you'd", 'very', 'theirs', 'a', 'up', "weren't", 'i', 'weren', 'to', 'doing', 'further', 'can', 'won', 'wouldn', 'itself', 'in', "she's", "hadn't", 'because', 'are', 'him', 'or', 'such', 'had', 'her', 'being', 'while', 'hers', 'before', 'yours', 

In [8]:
# Stemming reduces words to a common root form by stripping affixes
# (prefixes, suffixes), which helps information retrieval and text analysis.
# Because it simply cuts characters off the word, the resulting stem is often
# not a correctly spelled word (e.g. "expanse" -> "expans").

print("Stemming")
porter = PorterStemmer()
# Stem every token that survived stop-word removal.
stemmed_tokens = [porter.stem(token) for token in filtered_words]
print(stemmed_tokens)

Stemming
['vast', 'expans', 'digit', 'realm', 'inform', 'flow', 'ceaselessli', 'boundari', 'blur', 'realiti', 'virtual', 'lie', 'intric', 'tapestri', 'interconnect', 'node', 'network', 'within', 'labyrinthin', 'landscap', 'individu', 'navig', 'ever-shift', 'current', 'data', 'seek', 'mean', 'amidst', 'chao', 'bustl', 'street', 'social', 'media', 'platform', 'seren', 'corridor', 'scholarli', 'archiv', 'voic', 'echo', 'idea', 'converg', 'symphoni', 'divers', 'algorithm', 'hum', 'background', 'orchestr', 'flow', 'content', 'shape', 'digit', 'discours', 'yet', 'amidst', 'nois', 'pattern', 'emerg', 'reveal', 'glimps', 'insight', 'understanding.at', 'heart', 'digit', 'ecosystem', 'lie', 'art', 'scienc', 'text', 'analyt', 'disciplin', 'dedic', 'unravel', 'mysteri', 'hidden', 'within', 'written', 'word', 'arm', 'comput', 'tool', 'linguist', 'expertis', 'analyst', 'embark', 'journey', 'discoveri', 'sift', 'vast', 'trove', 'textual', 'data', 'search', 'nugget', 'wisdom', 'natur', 'languag', 'pro

In [9]:
# Lemmatization is similar to stemming, but it reduces a word to its
# dictionary form (the lemma) using a vocabulary and morphological analysis,
# so the result is always a valid word.

print("Lemmatization")
lemmatizer = WordNetLemmatizer()
# NOTE: lemmatize() is called with the word only, so NLTK's default part of
# speech applies (noun, per the NLTK documentation). That is why verb forms
# such as "seeking" or "orchestrating" come out unchanged.
lemmetized_tokens = list(map(lemmatizer.lemmatize, filtered_words))
print(lemmetized_tokens)

Lemmatization
['vast', 'expanse', 'digital', 'realm', 'information', 'flow', 'ceaselessly', 'boundary', 'blur', 'reality', 'virtuality', 'lie', 'intricate', 'tapestry', 'interconnected', 'node', 'network', 'within', 'labyrinthine', 'landscape', 'individual', 'navigate', 'ever-shifting', 'current', 'data', 'seeking', 'meaning', 'amidst', 'chaos', 'bustling', 'street', 'social', 'medium', 'platform', 'serene', 'corridor', 'scholarly', 'archive', 'voice', 'echo', 'idea', 'converge', 'symphony', 'diversity', 'algorithm', 'hum', 'background', 'orchestrating', 'flow', 'content', 'shaping', 'digital', 'discourse', 'yet', 'amidst', 'noise', 'pattern', 'emerge', 'revealing', 'glimpse', 'insight', 'understanding.at', 'heart', 'digital', 'ecosystem', 'lie', 'art', 'science', 'text', 'analytics', 'discipline', 'dedicated', 'unraveling', 'mystery', 'hidden', 'within', 'written', 'word', 'armed', 'computational', 'tool', 'linguistic', 'expertise', 'analyst', 'embark', 'journey', 'discovery', 'siftin

In [10]:
# Term Frequency - Measures how frequently a term appears in a document. TF measures the frequency of a term (word) in a document relative to the total number of words in that document.
# TF = (freq of term in a doc / total number of terms in doc)

# Inverse Document Frequency - IDF measures how rare a term is across all documents in the corpus.
# IDF = log(total no. of docs / (no. of docs containing the term + 1))

In [11]:
def get_tf(docs):
    """Compute the term frequency of every token in every document.

    TF = occurrences of the token in the document / number of tokens in the
    document.

    Args:
        docs: Iterable of document strings; each one is split into tokens
            with ``word_tokenize``.

    Returns:
        Dict mapping ``(token, doc)`` to the relative frequency of ``token``
        in ``doc``, where ``doc`` is the full document string. Entries are
        inserted in order of first occurrence of each token.
    """
    # Local import so the function does not depend on a file-level change.
    from collections import Counter

    tf = {}
    for doc in docs:
        tokens = word_tokenize(doc)
        total_terms = len(tokens)
        # Count all tokens in a single pass instead of calling list.count()
        # once per distinct token (which is quadratic in the document size).
        # An empty document yields an empty Counter, so no division occurs.
        for token, frequency in Counter(tokens).items():
            tf[(token, doc)] = frequency / total_terms
    return tf

def get_idf(docs):
    """Compute the inverse document frequency of every token in the corpus.

    IDF = log(N / (df + 1)), where N is the number of documents and df is the
    number of documents that contain the token.

    Note:
        Because of the ``+ 1`` in the denominator, a token that occurs in
        every document gets a negative value, log(N / (N + 1)).

    Args:
        docs: Sequence of document strings (``len(docs)`` is used); each one
            is split into tokens with ``word_tokenize``.

    Returns:
        Dict mapping each distinct token to its IDF value.
    """
    # Tokenize each document exactly once and keep its distinct tokens, so
    # the membership tests below are set lookups instead of re-tokenizing
    # every document for every token.
    doc_vocabularies = [set(word_tokenize(doc)) for doc in docs]
    vocabulary = set().union(*doc_vocabularies)

    total_docs = len(docs)
    idf = {}
    for token in vocabulary:
        # Start at 1: this is the "+ 1" of the formula's denominator.
        count = 1 + sum(token in doc_vocab for doc_vocab in doc_vocabularies)
        idf[token] = math.log(total_docs / count)
    return idf

def get_tfidf(docs):
    """Compute the TF-IDF weight of every token in every document.

    The weight is the product of the token's term frequency in the document
    (from ``get_tf``) and its inverse document frequency (from ``get_idf``).

    Args:
        docs: Sequence of document strings, passed unchanged to ``get_tf``
            and ``get_idf``.

    Returns:
        Dict mapping ``(token, doc)`` to the TF-IDF weight, with the same
        keys as the result of ``get_tf``.
    """
    term_freqs = get_tf(docs)
    inverse_doc_freqs = get_idf(docs)
    # key[0] is the token part of the (token, doc) key.
    return {
        key: freq * inverse_doc_freqs[key[0]]
        for key, freq in term_freqs.items()
    }

# Three short sample documents that form the corpus for the TF / IDF /
# TF-IDF demonstration.
doc1 = "Natural language processing (NLP) is a field of artificial intelligence concerned with the interaction between computers and humans in natural language. It aims to enable computers to understand, interpret, and generate human language in a way that is both meaningful and useful. NLP techniques are used in a wide range of applications, including machine translation, sentiment analysis, information extraction, and text summarization. One of the key challenges in NLP is dealing with the ambiguity and variability of natural language, which can make it difficult for computers to accurately process and understand text. However, recent advances in machine learning and deep learning have led to significant improvements in NLP performance, making it an increasingly important area of research and development."
doc2 = "Machine learning (ML) is a subset of artificial intelligence that focuses on the development of algorithms that can learn from and make predictions or decisions based on data. ML algorithms can be categorized into supervised learning, unsupervised learning, and reinforcement learning, depending on the type of training data and the learning task. Supervised learning involves training a model on labeled data, while unsupervised learning involves training on unlabeled data. Reinforcement learning involves training a model to interact with an environment and learn from feedback. ML techniques have applications in various domains, including image recognition, speech recognition, medical diagnosis, and autonomous vehicles."
doc3 = "Data science is an interdisciplinary field that combines techniques from statistics, computer science, and domain-specific knowledge to extract insights and knowledge from data. It involves various stages of the data lifecycle, including data collection, data cleaning, data analysis, and data visualization. Data scientists use a variety of tools and techniques, such as machine learning, statistical modeling, and data mining, to uncover patterns and trends in data and make data-driven decisions. Data science has applications in numerous industries, including healthcare, finance, marketing, and e-commerce."

# The same three-document corpus feeds all three computations.
corpus = [doc1, doc2, doc3]
tf = get_tf(corpus)
idf = get_idf(corpus)
tfidf = get_tfidf(corpus)

# Term frequencies: only the token part of each (token, doc) key is printed.
for (token, doc), value in tf.items():
    print(token, ":", value)

# Inverse document frequencies, one line per distinct token.
for token, value in idf.items():
    print(token, ":", value)

# TF-IDF weights, again printed per (token, doc) entry without the document.
for (token, doc), value in tfidf.items():
    print(token, ":", value)

understand : 0.014814814814814815
analysis : 0.007407407407407408
ambiguity : 0.007407407407407408
wide : 0.007407407407407408
concerned : 0.007407407407407408
) : 0.007407407407407408
with : 0.014814814814814815
techniques : 0.007407407407407408
for : 0.007407407407407408
language : 0.02962962962962963
summarization : 0.007407407407407408
artificial : 0.007407407407407408
intelligence : 0.007407407407407408
research : 0.007407407407407408
make : 0.007407407407407408
dealing : 0.007407407407407408
field : 0.007407407407407408
and : 0.05925925925925926
advances : 0.007407407407407408
deep : 0.007407407407407408
generate : 0.007407407407407408
computers : 0.022222222222222223
Natural : 0.007407407407407408
that : 0.007407407407407408
translation : 0.007407407407407408
accurately : 0.007407407407407408
increasingly : 0.007407407407407408
One : 0.007407407407407408
applications : 0.007407407407407408
is : 0.022222222222222223
have : 0.007407407407407408
it : 0.014814814814814815
important 