You will learn:

* Preprocess text using Python
* Convert text into numeric format using BoW, TF-IDF, and embeddings
* Understand how CBoW and Skip-gram work through coding

## Part 1: Text Preprocessing with nltk

In [8]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk import pos_tag

nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)
nltk.download('averaged_perceptron_tagger', quiet=True)

text = "Deep learning models are powerful tools for natural language processing."
tokens = word_tokenize(text)
filtered = [w for w in tokens if w.lower() not in stopwords.words('english')]
stemmed = [PorterStemmer().stem(w) for w in filtered]
lemmatized = [WordNetLemmatizer().lemmatize(w) for w in filtered]
pos_tags = pos_tag(tokens)

print("Tokens:", tokens)
print("----------------------------------------------------------------------------------------------------------------------")
print("Without Stopwords:", filtered)
print("----------------------------------------------------------------------------------------------------------------------")
print("Stemmed:", stemmed)
print("----------------------------------------------------------------------------------------------------------------------")
print("Lemmatized:", lemmatized)
print("----------------------------------------------------------------------------------------------------------------------")
print("POS Tags:", pos_tags)

Tokens: ['Deep', 'learning', 'models', 'are', 'powerful', 'tools', 'for', 'natural', 'language', 'processing', '.']
----------------------------------------------------------------------------------------------------------------------
Without Stopwords: ['Deep', 'learning', 'models', 'powerful', 'tools', 'natural', 'language', 'processing', '.']
----------------------------------------------------------------------------------------------------------------------
Stemmed: ['deep', 'learn', 'model', 'power', 'tool', 'natur', 'languag', 'process', '.']
----------------------------------------------------------------------------------------------------------------------
Lemmatized: ['Deep', 'learning', 'model', 'powerful', 'tool', 'natural', 'language', 'processing', '.']
----------------------------------------------------------------------------------------------------------------------
POS Tags: [('Deep', 'NNP'), ('learning', 'NN'), ('models', 'NNS'), ('are', 'VBP'), ('powerful', 'JJ'),

## Part 2: Text Vectorization

In [9]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

corpus = [
    "Deep learning is powerful.",
    "Natural language processing is fun!",
    "Deep learning models help NLP."
]

# Bag of Words
bow = CountVectorizer()
print("BoW:\n", bow.fit_transform(corpus).toarray())

# TF-IDF
tfidf = TfidfVectorizer()
print("TF-IDF:\n", tfidf.fit_transform(corpus).toarray())


BoW:
 [[1 0 0 1 0 1 0 0 0 1 0]
 [0 1 0 1 1 0 0 1 0 0 1]
 [1 0 1 0 0 1 1 0 1 0 0]]
TF-IDF:
 [[0.45985353 0.         0.         0.45985353 0.         0.45985353
  0.         0.         0.         0.60465213 0.        ]
 [0.         0.46735098 0.         0.35543247 0.46735098 0.
  0.         0.46735098 0.         0.         0.46735098]
 [0.37302199 0.         0.49047908 0.         0.         0.37302199
  0.49047908 0.         0.49047908 0.         0.        ]]


## Part 3: Word Embeddings + Intro to CBoW & Skip-Gram

In [10]:
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize

sentences = [word_tokenize(doc.lower()) for doc in corpus]

# CBoW: sg=0
cbow_model = Word2Vec(sentences, vector_size=50, window=2, min_count=1, sg=0)
print("CBoW - Vector for 'deep':", cbow_model.wv['deep'])

# Skip-Gram: sg=1
skip_model = Word2Vec(sentences, vector_size=50, window=2, min_count=1, sg=1)
print("Skip-Gram - Vector for 'deep':", skip_model.wv['deep'])


CBoW - Vector for 'deep': [ 1.56351421e-02 -1.90203730e-02 -4.11062239e-04  6.93839323e-03
 -1.87794445e-03  1.67635437e-02  1.80215668e-02  1.30730132e-02
 -1.42324204e-03  1.54208085e-02 -1.70686692e-02  6.41421322e-03
 -9.27599426e-03 -1.01779103e-02  7.17923651e-03  1.07406788e-02
  1.55390287e-02 -1.15330126e-02  1.48667218e-02  1.32509926e-02
 -7.41960062e-03 -1.74912829e-02  1.08749345e-02  1.30195115e-02
 -1.57510047e-03 -1.34197120e-02 -1.41718509e-02 -4.99412045e-03
  1.02865072e-02 -7.33047491e-03 -1.87401194e-02  7.65347946e-03
  9.76895820e-03 -1.28571270e-02  2.41711619e-03 -4.14975407e-03
  4.88066689e-05 -1.97670180e-02  5.38400887e-03 -9.50021297e-03
  2.17529293e-03 -3.15244915e-03  4.39334614e-03 -1.57631524e-02
 -5.43436781e-03  5.32639725e-03  1.06933638e-02 -4.78302967e-03
 -1.90201886e-02  9.01175756e-03]
Skip-Gram - Vector for 'deep': [ 1.56351421e-02 -1.90203730e-02 -4.11062239e-04  6.93839323e-03
 -1.87794445e-03  1.67635437e-02  1.80215668e-02  1.30730132e-02

In [21]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report

# NLP tools
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from gensim.models import Word2Vec

# Deep Learning
import tensorflow as tf
from tensorflow.keras.datasets import imdb
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping


nltk.download('punkt', quiet=True)
nltk.download('stopwords', quiet=True)
nltk.download('wordnet', quiet=True)


np.random.seed(42)
tf.random.set_seed(42)

# ===============================================================
# PART 1: Load Dataset
# ===============================================================
print("Loading IMDB dataset...")

max_features = 10000
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features)
print(f"Training data: {len(x_train)} reviews")
print(f"Testing data: {len(x_test)} reviews")

# Get word dictionary to decode reviews
word_index = imdb.get_word_index()
reverse_word_index = {value: key for key, value in word_index.items()}

# Function to decode review
def decode_review(encoded_review):
    # Skip special tokens and unknown words instead of using '?'
    words = [reverse_word_index.get(i-3, '') for i in encoded_review]
    # Filter out empty strings
    return ' '.join([word for word in words if word])

print("\nSample review excerpt:")
sample_review = decode_review(x_train[0])
print(sample_review[:100] + "...")
print(f"Sentiment: {'Positive' if y_train[0] == 1 else 'Negative'}")

# ===============================================================
# PART 2: Text Preprocessing Demo
# ===============================================================
def preprocess_text(text):
    """Demonstrate NLP preprocessing steps"""
    # Tokenization
    tokens = word_tokenize(text.lower())
    

    stop_words = set(stopwords.words('english'))
    filtered = [w for w in tokens if w not in stop_words]
    

    stemmer = PorterStemmer()
    stemmed = [stemmer.stem(w) for w in filtered]
    

    lemmatizer = WordNetLemmatizer()
    lemmatized = [lemmatizer.lemmatize(w) for w in filtered]
    
    return {
        'tokens': tokens[:10], 
        'filtered': filtered[:10], 
        'stemmed': stemmed[:10], 
        'lemmatized': lemmatized[:10]
    }

# Process a sample review
print("\nDemonstrating text preprocessing:")
processed = preprocess_text(sample_review)
for key, value in processed.items():
    print(f"{key.capitalize()}: {value}")

# ===============================================================
# PART 3: Text Vectorization Demo
# ===============================================================
print("\nDemonstrating text vectorization:")

# Create a small corpus for demonstration
small_corpus = [decode_review(x) for x in x_train[:3]]

# Bag of Words
bow = CountVectorizer(max_features=1000)
bow_matrix = bow.fit_transform(small_corpus)
print(f"BoW shape: {bow_matrix.shape}")
print(f"First 5 features: {bow.get_feature_names_out()[:5]}")

# TF-IDF
tfidf = TfidfVectorizer(max_features=1000)
tfidf_matrix = tfidf.fit_transform(small_corpus)
print(f"TF-IDF shape: {tfidf_matrix.shape}")

# ===============================================================
# PART 4: Word Embeddings Demo
# ===============================================================
print("\nDemonstrating word embeddings:")

# Tokenize corpus for Word2Vec
tokenized_corpus = [word_tokenize(text.lower()) for text in small_corpus]

# Train Word2Vec models
cbow_model = Word2Vec(tokenized_corpus, vector_size=50, window=2, min_count=1, sg=0)
skipgram_model = Word2Vec(tokenized_corpus, vector_size=50, window=2, min_count=1, sg=1)

# Show embeddings for a sample word
vocab = list(cbow_model.wv.index_to_key)
if vocab:
    sample_word = vocab[0]
    print(f"Embeddings for word '{sample_word}':")
    print(f"CBoW (first 5 dims): {cbow_model.wv[sample_word][:5]}")
    print(f"Skip-gram (first 5 dims): {skipgram_model.wv[sample_word][:5]}")

# ===============================================================
# PART 5: LSTM Model for Sentiment Analysis
# ===============================================================
print("\nPreparing data for LSTM model:")

# Pad sequences to ensure uniform length
maxlen = 200
x_train_pad = pad_sequences(x_train, maxlen=maxlen)
x_test_pad = pad_sequences(x_test, maxlen=maxlen)
print(f"Training data shape after padding: {x_train_pad.shape}")

# Create LSTM model
print("Creating standard LSTM model")
model = Sequential([
    # Embedding layer converts integer indices to dense vectors
    # Add input_shape to ensure model is built immediately
    Embedding(max_features, 128, input_shape=(maxlen,)),
    
    # LSTM layer processes the sequence
    LSTM(128, dropout=0.2, recurrent_dropout=0.2),
    
    # Output layer with sigmoid activation for binary classification
    Dense(1, activation='sigmoid')
])

# Compile model
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy']
)

# Print summary with parameter counts
print(f"Model has {model.count_params()} total parameters")
# Skip full summary display
# model.summary()

# Train model
print("\nTraining model...")
early_stopping = EarlyStopping(monitor='val_loss', patience=2, restore_best_weights=True)

history = model.fit(
    x_train_pad, y_train,
    batch_size=128,
    epochs=3,  # Small number for demonstration
    validation_split=0.2,
    callbacks=[early_stopping],
    verbose=1
)

# Evaluate model
print("\nEvaluating model...")
loss, accuracy = model.evaluate(x_test_pad, y_test, verbose=0)
print(f"Test accuracy: {accuracy:.4f}")

# Generate predictions
y_pred = (model.predict(x_test_pad) > 0.5).astype(int).flatten()

# Show classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=['Negative', 'Positive']))



Loading IMDB dataset...
Training data: 25000 reviews
Testing data: 25000 reviews

Sample review excerpt:
this film was just brilliant casting location scenery story direction everyone's really suited the p...
Sentiment: Positive

Demonstrating text preprocessing:
Tokens: ['this', 'film', 'was', 'just', 'brilliant', 'casting', 'location', 'scenery', 'story', 'direction']
Filtered: ['film', 'brilliant', 'casting', 'location', 'scenery', 'story', 'direction', 'everyone', "'s", 'really']
Stemmed: ['film', 'brilliant', 'cast', 'locat', 'sceneri', 'stori', 'direct', 'everyon', "'s", 'realli']
Lemmatized: ['film', 'brilliant', 'casting', 'location', 'scenery', 'story', 'direction', 'everyone', "'s", 'really']

Demonstrating text vectorization:
BoW shape: (3, 260)
First 5 features: ['1990s' '80' 'abomination' 'across' 'acting']
TF-IDF shape: (3, 260)

Demonstrating word embeddings:
Embeddings for word 'the':
CBoW (first 5 dims): [-0.00099518  0.00052011  0.01024346  0.01807732 -0.01851844]
Ski

  super().__init__(**kwargs)


[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m50s[0m 312ms/step - accuracy: 0.6651 - loss: 0.6018 - val_accuracy: 0.7998 - val_loss: 0.4427
Epoch 2/3
[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m50s[0m 319ms/step - accuracy: 0.8513 - loss: 0.3592 - val_accuracy: 0.8452 - val_loss: 0.3678
Epoch 3/3
[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m48s[0m 303ms/step - accuracy: 0.8739 - loss: 0.3127 - val_accuracy: 0.8284 - val_loss: 0.3854

Evaluating model...
Test accuracy: 0.8474
[1m782/782[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 39ms/step

Classification Report:
              precision    recall  f1-score   support

    Negative       0.88      0.81      0.84     12500
    Positive       0.82      0.88      0.85     12500

    accuracy                           0.85     25000
   macro avg       0.85      0.85      0.85     25000
weighted avg       0.85      0.85      0.85     25000

