In [1]:
import re
import contractions
import nltk
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [2]:
nltk.download("punkt")
nltk.download("wordnet")
nltk.download("stopwords")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\sunny\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\sunny\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\sunny\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

#### Text Cleaning and Normalization

In [3]:
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words("english"))

def clean_text(text):
    text = BeautifulSoup(text, "html.parser").get_text()
    text = contractions.fix(text)
    text = re.sub(r"[^a-zA-Z\s]", "", text)
    words = word_tokenize(text)
    words = [lemmatizer.lemmatize(word) for word in words if word.lower() not in stop_words]
    return " ".join(words)

In [4]:
# Example Usage
sample_text = "The <b>price</b> is $500!! Don't miss out on this great deal."
cleaned_text = clean_text(sample_text)
print(cleaned_text)  # Output: price miss great deal

price miss great deal


#### Subword Tokenization Using SentencePiece

In [5]:
import sentencepiece as spm

# Train SentencePiece model
spm.SentencePieceTrainer.train(input="text_corpus.txt", model_prefix="spm_model", vocab_size=41)

# Load trained model
sp = spm.SentencePieceProcessor()
sp.load("spm_model.model")

# Tokenize text
text = "Natural Language Processing is amazing!"
tokens = sp.encode_as_pieces(text)
print(tokens) 

['▁', 'N', 'at', 'u', 'r', 'al', '▁', 'L', 'a', 'ng', 'u', 'a', 'ge', '▁', 'P', 'r', 'o', 'ce', 's', 's', 'i', 'ng', '▁', 'i', 's', '▁a', 'm', 'a', 'z', 'i', 'ng', '!']


#### Named Entity Recognition (NER) Using NLTK

In [6]:
from nltk.tag import pos_tag
from nltk.chunk import ne_chunk
import nltk
nltk.download("averaged_perceptron_tagger_eng")
nltk.download("maxent_ne_chunker_tab")
nltk.download("words")


[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\sunny\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker_tab to
[nltk_data]     C:\Users\sunny\AppData\Roaming\nltk_data...
[nltk_data]   Package maxent_ne_chunker_tab is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\sunny\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


True

In [7]:
def extract_named_entities(text):
    """Extract named entities using NLTK"""
    words = word_tokenize(text)  # Tokenize words
    pos_tags = pos_tag(words)  # Assign POS tags
    named_entities = ne_chunk(pos_tags)  # Extract named entities

    entities = []
    for chunk in named_entities:
        if hasattr(chunk, "label"):
            entities.append((" ".join(c[0] for c in chunk), chunk.label()))  # Extract entity text and label
    return entities

text = "Apple Inc. was founded by Steve Jobs in Cupertino, California."
entities = extract_named_entities(text)
print(entities)  

[('Apple', 'PERSON'), ('Inc.', 'ORGANIZATION'), ('Steve Jobs', 'PERSON'), ('Cupertino', 'GPE'), ('California', 'GPE')]


#### Phrase Detection using Bigram & Trigram Models

In [8]:
from gensim.models import Phrases
from gensim.models.phrases import Phraser

# Sample dataset
sentences = [
    ["natural", "language", "processing", "is", "amazing"],
    ["deep", "learning", "is", "powerful"],
    ["new", "york", "city", "is", "beautiful"]
]

# Train Bigram model
bigram = Phrases(sentences, min_count=1, threshold=1)
bigram_phraser = Phraser(bigram)

# Apply bigram model
print([bigram_phraser[sent] for sent in sentences])

[['natural', 'language', 'processing', 'is', 'amazing'], ['deep', 'learning', 'is', 'powerful'], ['new', 'york', 'city', 'is', 'beautiful']]


####  Handling Out-of-Vocabulary (OOV) Words with FastText

In [11]:
from gensim.models import FastText

# Sample sentences
sentences = [
    ["deep", "learning", "is", "great"],
    ["natural", "language", "processing", "rocks"],
    ["neural", "networks", "are", "powerful"]
]

# Train FastText model
model = FastText(sentences, vector_size=50, window=3, min_count=1, workers=4)

# Handle OOV word
oov_word = "deeplearning"
vector = model.wv[oov_word]
print(vector)  # Prints vector representation of "deeplearning"


[-1.4704156e-03  3.6394957e-04 -1.1292394e-03 -2.0372432e-03
 -6.1567919e-04  5.3414697e-04 -4.3283504e-05 -1.2980187e-03
 -2.3745704e-03  2.2477210e-03 -2.8684249e-04  5.7080359e-04
 -2.1035390e-04 -1.5044674e-04  2.0678286e-03 -1.5373951e-03
 -1.0674075e-03 -2.1040845e-03  5.9577945e-04  3.1080056e-04
  2.5342440e-03 -1.9656378e-03 -3.2940309e-03  4.1215256e-04
 -4.6677390e-04 -6.6642975e-04 -2.6650240e-03 -3.2043477e-04
 -1.4796470e-03  1.0263290e-03  2.9163030e-03  1.4706829e-03
 -1.7472147e-03  3.3076527e-03 -2.0157434e-03 -2.1172846e-03
 -1.3795299e-03  2.7636072e-04  3.5848671e-03  3.2415316e-03
  1.5890662e-03  1.2637110e-03  2.7313128e-03 -2.4599303e-03
 -2.1126440e-03 -7.4765628e-04 -2.0636574e-03 -1.1671626e-03
 -2.1256353e-03  1.6825003e-04]
