NLTK</br>
Natural Language Tool Kit

In [None]:
import nltk

nltk.download('punkt') 
nltk.download('stopwords')
nltk.download('wordnet') 

from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

text = "The quick brown foxes are jumping over the lazy dogs"
tokens = word_tokenize(text.lower())


stop_words = set(stopwords.words('english'))
filtered_tokens = [word for word in tokens if word.isalnum() and word not in stop_words]


lemmatizer = WordNetLemmatizer()
lemmatized_tokens = [lemmatizer.lemmatize(token) for token in filtered_tokens]

print(f"Original Tokens: {tokens}")
print(f"Cleaned Tokens: {filtered_tokens}")
print(f"Lemmatized Tokens: {lemmatized_tokens}")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ujwal\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ujwal\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ujwal\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Original Tokens: ['the', 'quick', 'brown', 'foxes', 'are', 'jumping', 'over', 'the', 'lazy', 'dogs']
Cleaned Tokens: ['quick', 'brown', 'foxes', 'jumping', 'lazy', 'dogs']
Lemmatized Tokens: ['quick', 'brown', 'fox', 'jumping', 'lazy', 'dog']


In [1]:
import nltk
nltk.download('averaged_perceptron_tagger')
nltk.download('tagsets') 

from nltk.tokenize import word_tokenize

text = """Alexandor Albon was the rookie dirver in Formula one who was very successful at the start then ruined when
Hamilton struck his mercedes when Albon was about to win in Austria"""
tokens = word_tokenize(text)
pos_tags = nltk.pos_tag(tokens)

print(pos_tags)


[('Alexandor', 'NNP'), ('Albon', 'NNP'), ('was', 'VBD'), ('the', 'DT'), ('rookie', 'NN'), ('dirver', 'NN'), ('in', 'IN'), ('Formula', 'NNP'), ('one', 'NN'), ('who', 'WP'), ('was', 'VBD'), ('very', 'RB'), ('successful', 'JJ'), ('at', 'IN'), ('the', 'DT'), ('start', 'NN'), ('then', 'RB'), ('ruined', 'VBD'), ('when', 'WRB'), ('Hamilton', 'NNP'), ('struck', 'VBD'), ('his', 'PRP$'), ('mercedes', 'NNS'), ('when', 'WRB'), ('Albon', 'NNP'), ('was', 'VBD'), ('about', 'IN'), ('to', 'TO'), ('win', 'VB'), ('in', 'IN'), ('Austria', 'NNP')]


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\ujwal\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package tagsets to
[nltk_data]     C:\Users\ujwal\AppData\Roaming\nltk_data...
[nltk_data]   Package tagsets is already up-to-date!


## Spacy
What it is: A pipeline of functions where the output of one component is the input to the next. Each component performs a specific task and adds data to the Doc object.

A typical default pipeline might look like this:
tok2vec -> tagger -> parser -> ner -> attribute_ruler -> lemmatizer

tok2vec: Creates vector representations of words.

tagger: Performs Part-of-Speech (POS) tagging (doc[0].pos_).

parser: Performs dependency parsing (analyzing grammatical relationships).

ner: Performs Named Entity Recognition (doc.ents).

lemmatizer: Finds the base form of words (doc[0].lemma_).

In [5]:
import spacy

# Load the small English model
nlp = spacy.load("en_core_web_sm")

text = """Alexandor Albon was the rookie dirver in Formula one who was very successful at the start then ruined when
Hamilton struck his mercedes when Albon was about to win in Austria"""

doc = nlp(text)

tokens=[token for token in doc]
print("Tokens",tokens)
print(" Lemmatization and Stop Word Removal")

for token in doc:
    # .lemma_ gives the base form, .is_stop checks if it's a stop word
    if not token.is_stop and not token.is_punct:
        print(f"Token: {token.text}, Lemma: {token.lemma_}")

print("\n--- Named Entity Recognition (NER) ---")
# The 'doc.ents' attribute directly gives you the named entities
for ent in doc.ents:
    print(f"Entity: {ent.text}, Label: {ent.label_}")

Tokens [Alexandor, Albon, was, the, rookie, dirver, in, Formula, one, who, was, very, successful, at, the, start, then, ruined, when, 
, Hamilton, struck, his, mercedes, when, Albon, was, about, to, win, in, Austria]
 Lemmatization and Stop Word Removal
Token: Alexandor, Lemma: Alexandor
Token: Albon, Lemma: Albon
Token: rookie, Lemma: rookie
Token: dirver, Lemma: dirver
Token: Formula, Lemma: Formula
Token: successful, Lemma: successful
Token: start, Lemma: start
Token: ruined, Lemma: ruin
Token: 
, Lemma: 

Token: Hamilton, Lemma: Hamilton
Token: struck, Lemma: strike
Token: mercedes, Lemma: mercede
Token: Albon, Lemma: Albon
Token: win, Lemma: win
Token: Austria, Lemma: Austria

--- Named Entity Recognition (NER) ---
Entity: Alexandor Albon, Label: PERSON
Entity: Formula, Label: PERSON
Entity: Hamilton, Label: PERSON
Entity: Albon, Label: PERSON
Entity: Austria, Label: GPE


Spacy is a preDefined processing pipline</br>
we need to add to pipeline

In [12]:
from autocorrect import Speller # type: ignore
import wordninja #type: ignore
from spacy.language import Language #type: ignore
from spacy.tokens import Doc


In [None]:
Doc.set_extension("was_corrected", default=False)
Doc.set_extension("corrected_text", default="")

# Register custom extensions for the Doc object
@Language.component("text_correction_component")
def text_correction_component(doc):

    spell = Speller(lang='en')
    
    corrected_tokens = []
    for token in doc:

        corrected_word = spell(token.text)

        split_words = wordninja.split(corrected_word)

        corrected_tokens.extend(split_words)

    final_text = " ".join(corrected_tokens)
    

    if final_text != doc.text:
        doc._.was_corrected = True
        doc._.corrected_text = final_text
    else:
        doc._.was_corrected = False
        doc._.corrected_text = doc.text
        
    return doc

In [14]:

# Load a blank English model
nlp = spacy.blank("en")

# Add our custom component to the pipeline
nlp.add_pipe("text_correction_component", first=True)

print(f"Pipeline: {nlp.pipe_names}")

messy_text = "Thiss is a testfor mynewproject. I hope it workd."

# Process the text
doc = nlp(messy_text)


# Check the results!
print(f"\nOriginal Text: '{doc.text}'")
print(f"Was Corrected?: {doc._.was_corrected}")
print(f"Corrected Text: '{doc._.corrected_text}'")

Pipeline: ['text_correction_component']

Original Text: 'Thiss is a testfor mynewproject. I hope it workd.'
Was Corrected?: True
Corrected Text: 'This s is a ten for my new project I hope it work'
