# Implementing basic text-processing operations such as tokenization, normalization, stemming, lemmatization, stop-word removal, sentence segmentation, etc. on a text document

In [10]:
import nltk
import spacy
from nltk.stem import PorterStemmer
from rich import print

# Porter stemmer instance, reused by the stemming cell.
stemmer = PorterStemmer()
# spaCy pipeline that produces every `doc` in this notebook (tokens, lemmas,
# POS tags, dependencies, noun chunks and entities all come from it).
nlp = spacy.load("en_core_web_sm")
# Make sure NLTK's "punkt" data package is available locally.
nltk.download("punkt")

[nltk_data] Downloading package punkt to /Users/tushya/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Tokenization
Tokenization is the process of breaking a large input down into smaller chunks called tokens. These tokens usually represent the words in a sentence. In tokenization we first break the paragraph into sentences and then the sentences into words, where each word represents a token.

In [11]:
# Sample paragraph used by every cell below. The literal is split one sentence
# per line; adjacent string literals are concatenated, so the value is a
# single string with one space between sentences.
text = (
    "The rapid advancement of technology has transformed the way people communicate, work, and learn. "
    "Social media platforms allow individuals to share ideas instantly across the globe, but they also raise concerns about privacy and misinformation. "
    "At the same time, businesses rely on data-driven decision-making to improve efficiency and customer satisfaction. "
    "Education has also shifted significantly, with online learning tools providing access to knowledge for millions of students worldwide. "
    "Despite these benefits, challenges such as digital inequality and cybersecurity threats continue to grow, making it important to balance innovation with responsibility."
)

# Run the spaCy pipeline on the original-case text and keep each token's
# surface string.
doc = nlp(text)
tokens = [tok.text for tok in doc]

# Preview only the first ten tokens.
print(tokens[:10])

## Case Normalization
Converting the text of the document to lowercase, for easier processing and reduced complexity.

In [12]:
# Lower-case the raw text, then re-run the pipeline on it. This rebinds `doc`,
# so every later cell that reads `doc` works on the lower-cased document.
doc = nlp(str.lower(text))

## Stemming
Stemming is the process of obtaining the stem/root form of a given token by removing all of its affixes. The stem produced may or may not be a word in the dictionary.

In [13]:
# Pair every token string from the tokenization cell with its Porter stem,
# giving a list of (token, stem) tuples.
stems = list(zip(tokens, map(stemmer.stem, tokens)))

# Preview only the first ten pairs.
print(stems[:10])

## Lemmatization
Lemmatization is the process of reducing a word to its lemma. Unlike a stem, which may not have a meaning in the dictionary, a lemma is an actual root word that holds a meaning in the dictionary.

In [14]:
# Build (token, lemma) pairs from the current `doc`. The first element is the
# spaCy token object itself, the second is its lemma string.
lemmas = [
    (tok, tok.lemma_)
    for tok in doc
]

# Preview only the first ten pairs.
print(lemmas[:10])

## Stop word removal
A lot of our input consists of stop words (words that don't hold much value, such as articles) and punctuation. To increase our processing speed, we remove these low-value tokens and so reduce the amount of information our model has to process.

In [15]:
def _is_content_token(tok):
    """Return True for alphabetic tokens that are neither stop words nor punctuation."""
    return tok.is_alpha and not (tok.is_stop or tok.is_punct)


# Keep the spaCy token objects (not just their text) so that later cells can
# still read attributes such as POS tags and dependency labels from them.
filtered_tokens = [tok for tok in doc if _is_content_token(tok)]

# Preview only the first ten surviving tokens.
print(filtered_tokens[:10])

## POS Tagging
POS (part-of-speech) tagging labels each token according to its grammatical type (for example noun, adverb, adjective, etc.), following the rules of grammar.

In [16]:
# For each token that survived stop-word removal, record its text together
# with spaCy's `pos_` and `tag_` attributes.
pos_tags = [
    (tok.text, tok.pos_, tok.tag_)
    for tok in filtered_tokens
]

# Preview only the first ten triples.
print(pos_tags[:10])

## Noun Chunk Phrases
Noun chunks are sequences of words that together make up a noun phrase, extracted from the text, e.g. 'social media platforms'.

In [17]:
# Materialize the noun-chunk iterator of the current `doc` into a list so it
# can be sliced.
noun_chunks = list(doc.noun_chunks)

# Preview only the first ten chunks.
print(noun_chunks[:10])

## Dependency parsing
This is the process of analyzing the grammatical structure of the text, i.e. how the words are related to each other in the text

In [18]:
# For each token that survived stop-word removal, record its text, its
# dependency label, and the text of its syntactic head. The head is read from
# the full parse, so it may be a token that the filter removed.
deps = [
    (tok.text, tok.dep_, tok.head.text)
    for tok in filtered_tokens
]

# Preview only the first ten triples.
print(deps[:10])

## Named Entities Recognition
Named entities are real-world entities that fall into a predefined category, such as a person or an organization.

In [19]:
# Run entity recognition on the ORIGINAL-case text. By this point `doc` has
# been rebound to the pipeline output for `text.lower()`, and lower-casing
# removes the capitalization that distinguishes names from common words, so
# reading `doc.ents` here would likely miss entities.
# NOTE(review): this re-runs the pipeline once more on `text`; `doc` itself is
# left untouched so the other cells keep their current behavior.
cased_doc = nlp(text)

# Collect (entity text, entity label) pairs.
entities = [(ent.text, ent.label_) for ent in cased_doc.ents]

# Preview only the first ten entities.
print(entities[:10])