In [None]:
import tensorflow_datasets as tfds

In [None]:
import tensorflow_datasets as tfds

# Fetch the IMDb movie-review corpus through TensorFlow Datasets.
# The data is downloaded the first time this cell runs.
(train_data, test_data), info = tfds.load(
    'imdb_reviews',
    as_supervised=True,       # examples come back as (review, label) pairs
    with_info=True,           # also return the dataset metadata as `info`
    split=('train', 'test'),  # one dataset per requested split, in this order
)

In [None]:
# Peek at the first two training examples to confirm the (review, label) layout.
for raw_text, sentiment in train_data.take(2):
    decoded_review = raw_text.numpy().decode('utf-8')  # bytes -> str
    print("Review:", decoded_review)
    print("Label:", sentiment.numpy())  # 0 is negative, 1 is positive
    print("-" * 20)

In [None]:
import pandas as pd
import numpy as np

# The 'as_dataframe' function failed, so the DataFrames are built manually:
# every (review, label) pair is pulled out of the dataset one by one and
# collected into Python lists. This will take a moment.


def extract_reviews_and_labels(dataset):
    """Collect every (review, label) pair of a dataset into two parallel lists.

    Args:
        dataset: Iterable yielding ``(review, label)`` pairs whose elements
            expose ``.numpy()``; ``review.numpy()`` must be UTF-8 encoded bytes.

    Returns:
        A ``(reviews, labels)`` tuple: the decoded review strings and the
        label values, both in iteration order.
    """
    reviews = []
    labels = []
    for review, label in dataset:
        # .numpy() converts from a TensorFlow 'Tensor' object to a numpy value
        # .decode('utf-8') converts from bytes (b'...') to a normal string
        reviews.append(review.numpy().decode('utf-8'))
        labels.append(label.numpy())
    return reviews, labels


print("Manually extracting training data...")
train_reviews, train_labels = extract_reviews_and_labels(train_data)

print("Manually extracting testing data...")
test_reviews, test_labels = extract_reviews_and_labels(test_data)

# Build each DataFrame in one shot from the collected lists.
train_df = pd.DataFrame({
    'text': train_reviews,    # Column 1 is our list of review strings
    'label': train_labels     # Column 2 is our list of 0s and 1s
})

test_df = pd.DataFrame({
    'text': test_reviews,
    'label': test_labels
})

# Sanity check: show the first rows of the training frame.
print("\nTraining DataFrame Head (Manual Build):")
print(train_df.head())

In [None]:
# Install the spaCy library and its small English model
# The '-q' makes the output less noisy.
!pip install -U spacy -q
!python -m spacy download en_core_web_sm -q


In [None]:
import spacy
import re

# Build the spaCy pipeline that the text-cleaning step relies on.
# The dependency parser and the named-entity recognizer are switched off at
# load time because cleaning only reads stop-word flags, the alphabetic flag
# and lemmas; skipping the two unused components makes processing much faster.
print("Loading spaCy model...")
nlp = spacy.load(name='en_core_web_sm', disable=['parser', 'ner'])
print("Model loaded.")

def clean_text(text):
    """Normalize a raw review into a space-separated string of lemmas.

    HTML tags are replaced by a space, the text is lowercased and passed
    through the module-level spaCy pipeline ``nlp``. Stop words and tokens
    that are not purely alphabetic are discarded; the lemmas of the remaining
    tokens are joined with single spaces.

    Args:
        text: Raw review text, possibly containing HTML markup such as
            ``<br />``.

    Returns:
        The lemmas of the kept tokens joined by single spaces (an empty
        string when no token is kept).
    """
    # Replace each tag with a space, not the empty string: deleting "<br />"
    # outright fuses "good.<br />The" into "good.The", and such a fused token
    # can fail the is_alpha filter below, silently dropping both words.
    text = re.sub(r'<[^>]+>', ' ', text)

    text = text.lower()

    doc = nlp(text)

    # Keep the root form of every alphabetic, non-stop-word token.
    clean_tokens = [
        token.lemma_
        for token in doc
        if token.is_alpha and not token.is_stop
    ]

    return ' '.join(clean_tokens)

In [None]:
# WARNING: running the spaCy pipeline over every review takes several minutes.
# Each frame gets a new 'clean_text' column derived from its 'text' column.
for progress_message, frame in (
    ("Cleaning training data (this will take a while)...", train_df),
    ("Cleaning testing data (this will also take a while)...", test_df),
):
    print(progress_message)
    frame['clean_text'] = frame['text'].apply(clean_text)

print("Cleaning complete.")

# Sanity check: compare the first training review before and after cleaning.
first_training_row = train_df.iloc[0]
print("\n--- ORIGINAL REVIEW (Index 0) ---")
print(first_training_row['text'])
print("\n--- CLEANED REVIEW (Index 0) ---")
print(first_training_row['clean_text'])

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Turn the cleaned reviews into TF-IDF feature matrices.
# The vocabulary is capped at 10,000 terms so the feature space (and the
# model trained on it) stays a manageable size.
vectorizer = TfidfVectorizer(max_features=10000)

# The vocabulary is learned from the training text only (fit_transform) ...
print("Vectorizing training data...")
X_train_tfidf = vectorizer.fit_transform(train_df['clean_text'])

# ... and merely re-used on the test text (transform), so nothing about the
# test set leaks into the features.
print("Vectorizing testing data...")
X_test_tfidf = vectorizer.transform(test_df['clean_text'])

# The labels are already 0s and 1s, so they are used as they are.
y_train, y_test = train_df['label'], test_df['label']

# Sanity check: report the shapes of the matrices and label vectors.
print("\n--- SHAPE OF TF-IDF MATRICES ---")
for caption, matrix in (
    ("Train data shape:", X_train_tfidf),
    ("Test data shape:", X_test_tfidf),
):
    print(caption, matrix.shape)

print("\n--- SHAPE OF LABELS ---")
for caption, label_series in (
    ("Train labels shape:", y_train),
    ("Test labels shape:", y_test),
):
    print(caption, label_series.shape)

In [None]:
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, confusion_matrix
import time

# --- STEP 4: Build and Train the Model ---

# 1. Initialize the model. Everything is left at its default except the solver.
# scikit-learn's guidance is to prefer dual=False when n_samples > n_features
# (the opposite of what this cell used to claim), so the formulation is picked
# from the actual shape of the training matrix.
# random_state pins the shuffling used by the dual solver so repeated runs
# give the same model; it has no effect when the primal solver is selected.
n_samples, n_features = X_train_tfidf.shape
model = LinearSVC(dual=(n_samples < n_features), random_state=42)

print("Training the LinearSVC model...")

# 2. Train the model.
# perf_counter is a monotonic clock intended for measuring elapsed time.
start_time = time.perf_counter()
model.fit(X_train_tfidf, y_train)
end_time = time.perf_counter()

print(f"--- Training finished in {end_time - start_time:.2f} seconds ---")


# --- STEP 5: Evaluate the Model ---

print("\nMaking predictions on the test data...")
# 1. Make predictions on the unseen test data.
y_pred = model.predict(X_test_tfidf)

# 2. Generate the Confusion Matrix.
# Rows are the true classes, columns the predicted classes.
print("\n--- Confusion Matrix ---")
print("   (Predicted Neg) (Predicted Pos)")
print(confusion_matrix(y_test, y_pred))

# 3. Generate the Classification Report.
# This is the main result: precision, recall, and f1-score per class.
print("\n--- Classification Report ---")
print(classification_report(y_test, y_pred, target_names=['Negative (0)', 'Positive (1)']))

In [None]:
print("--- NER Demonstration on Sample Reviews ---")

# NER needs real, raw text, so the 'clean_text' function is not used here.
# The cleaning pipeline 'nlp' was loaded with the 'ner' component disabled,
# which leaves doc.ents empty for every input. A separate, complete pipeline
# is loaded for this demo; 'nlp' itself is left untouched so the cleaning
# step keeps its faster, trimmed-down configuration.
nlp_ner = spacy.load('en_core_web_sm')

sample_reviews = [
    "I bought a new Sony PlayStation 5 from Amazon, and it's amazing.",
    "My old Samsung TV is better than this new Vizio one.",
    "The new album by U2 is not as good as their work in the 90s."
]

for review in sample_reviews:
    print(f"\nREVIEW: '{review}'")
    doc = nlp_ner(review)

    if not doc.ents:
        print("  -> No entities found.")
    else:
        print("  -> ENTITIES FOUND:")
        for ent in doc.ents:
            # ent.text is the matched span, ent.label_ is its entity category
            print(f"    - {ent.text} ({ent.label_})")