### 1. Import & Load Data

In [None]:
# Run the preprocessing notebook before anything else. Later cells use `df`,
# `remove_punctuation`, `tokenize` and `remove_stopwords`, none of which are
# defined in this notebook -- presumably they are provided by this run (TODO confirm).
%run ./1-preprocessing.ipynb

### 2. Choosing Methods

In [6]:
# Bag-of-words counts using scikit-learn's CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer

# Two sample reviews: each stop-word-free token list is re-joined with single spaces
documents = [" ".join(tokens) for tokens in df["review_no_stop"].iloc[[0, 1]]]

# Learn the vocabulary from the two sample documents
vectorizer = CountVectorizer().fit(documents)

# vocabulary_ maps each term to its column index in the count matrix
print("Vocabulary:", vectorizer.vocabulary_)

# Encode the same documents as one row of term counts per document
counts = vectorizer.transform(documents)
print("Count vectors:\n", counts.toarray())

Vocabulary: {'ten': 157, 'years': 174, 'since': 143, 'wildside': 172, 'aired': 10, 'nothing': 114, 'really': 128, 'come': 30, 'close': 28, 'quality': 124, 'local': 96, 'production': 122, 'includes': 83, 'two': 162, 'series': 137, 'enjoyable': 52, 'overrated': 118, 'underbelly': 164, 'brought': 22, 'life': 94, 'events': 58, 'recent': 129, 'criminal': 36, 'history': 76, 'sydney': 155, 'melbourne': 103, 'miniseries': 105, 'blue': 19, 'murder': 109, 'also': 13, 'starred': 148, 'tony': 161, 'martin': 100, 'someone': 144, 'side': 141, 'law': 90, 'may': 102, 'exceptionbr': 61, 'br': 21, 'currently': 37, 'repeated': 133, 'late': 88, 'night': 113, 'abc': 3, 'watched': 168, 'show': 140, 'quite': 125, 'im': 81, 'still': 149, 'impressed': 82, 'uncompromising': 163, 'story': 150, 'lines': 95, 'human': 80, 'characters': 26, 'cast': 25, 'excellent': 60, 'detective': 43, 'haunted': 73, 'disappearance': 46, 'son': 146, 'rachael': 126, 'blake': 18, 'later': 89, 'hooked': 78, 'real': 127, 'community': 31

In [None]:
# TF-IDF weighted term frequencies using scikit-learn's TfidfVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# Learn both the vocabulary and the per-term IDF weights from the sample documents
tfidf = TfidfVectorizer().fit(documents)

print("Vocabulary:", tfidf.vocabulary_)
print("IDF values:", tfidf.idf_)

# Encode only the first sample document (transform expects an iterable of documents)
tfidf_vec = tfidf.transform(documents[:1])
print("TF-IDF vector for first doc:\n", tfidf_vec.toarray())

Vocabulary: {'ten': 157, 'years': 174, 'since': 143, 'wildside': 172, 'aired': 10, 'nothing': 114, 'really': 128, 'come': 30, 'close': 28, 'quality': 124, 'local': 96, 'production': 122, 'includes': 83, 'two': 162, 'series': 137, 'enjoyable': 52, 'overrated': 118, 'underbelly': 164, 'brought': 22, 'life': 94, 'events': 58, 'recent': 129, 'criminal': 36, 'history': 76, 'sydney': 155, 'melbourne': 103, 'miniseries': 105, 'blue': 19, 'murder': 109, 'also': 13, 'starred': 148, 'tony': 161, 'martin': 100, 'someone': 144, 'side': 141, 'law': 90, 'may': 102, 'exceptionbr': 61, 'br': 21, 'currently': 37, 'repeated': 133, 'late': 88, 'night': 113, 'abc': 3, 'watched': 168, 'show': 140, 'quite': 125, 'im': 81, 'still': 149, 'impressed': 82, 'uncompromising': 163, 'story': 150, 'lines': 95, 'human': 80, 'characters': 26, 'cast': 25, 'excellent': 60, 'detective': 43, 'haunted': 73, 'disappearance': 46, 'son': 146, 'rachael': 126, 'blake': 18, 'later': 89, 'hooked': 78, 'real': 127, 'community': 31

### 3. Building Model

In [None]:
from datasets import load_dataset
import pandas as pd


def _clean_reviews(frame):
    """Return a copy of `frame` with the three text-cleaning columns added.

    Columns added, each derived from the previous one:
      * ``no_punct``       -- ``text`` passed through ``remove_punctuation``
      * ``tokens``         -- ``no_punct`` passed through ``tokenize``
      * ``review_no_stop`` -- ``tokens`` passed through ``remove_stopwords``

    The three helpers are not defined in this notebook; presumably they come
    from the preprocessing notebook executed at the top (TODO confirm).
    """
    cleaned = frame.copy()
    cleaned["no_punct"] = cleaned["text"].apply(remove_punctuation)
    cleaned["tokens"] = cleaned["no_punct"].apply(tokenize)
    cleaned["review_no_stop"] = cleaned["tokens"].apply(remove_stopwords)
    return cleaned


# Grab IMDB from HF
ds = load_dataset("stanfordnlp/imdb")

# Apply the same remove-punctuation, tokenize, remove-stopwords steps to BOTH
# splits, so the test split carries the same `review_no_stop` column as train.
train_df = _clean_reviews(pd.DataFrame(ds["train"]))
test_df = _clean_reviews(pd.DataFrame(ds["test"]))

# Quick check:
train_df[["text", "no_punct", "tokens", "review_no_stop"]].head()

Unnamed: 0,text,no_punct,tokens,review_no_stop
0,I rented I AM CURIOUS-YELLOW from my video sto...,I rented I AM CURIOUSYELLOW from my video stor...,"[i, rented, i, am, curiousyellow, from, my, vi...","[rented, curiousyellow, video, store, controve..."
1,"""I Am Curious: Yellow"" is a risible and preten...",I Am Curious Yellow is a risible and pretentio...,"[i, am, curious, yellow, is, a, risible, and, ...","[curious, yellow, risible, pretentious, steami..."
2,If only to avoid making this type of film in t...,If only to avoid making this type of film in t...,"[if, only, to, avoid, making, this, type, of, ...","[avoid, making, type, film, future, film, inte..."
3,This film was probably inspired by Godard's Ma...,This film was probably inspired by Godards Mas...,"[this, film, was, probably, inspired, by, goda...","[film, probably, inspired, godards, masculin, ..."
4,"Oh, brother...after hearing about this ridicul...",Oh brotherafter hearing about this ridiculous ...,"[oh, brotherafter, hearing, about, this, ridic...","[oh, brotherafter, hearing, ridiculous, film, ..."


In [21]:
import ast

from sklearn.feature_extraction.text import TfidfVectorizer


def _tokens_to_text(tokens):
    """Join one review's tokens into a single space-separated string.

    Accepts either a real sequence of tokens or the string repr of a list
    (e.g. "['good', 'film']"); string input is parsed with ast.literal_eval
    first. Calling literal_eval on an actual list raises ValueError, hence
    the type check.
    """
    if isinstance(tokens, str):
        tokens = ast.literal_eval(tokens)
    return " ".join(tokens)


# NOTE(review): this cell reads `df_train` / `df_test`, while the frames built
# from the HF dataset earlier in this notebook are named `train_df` / `test_df`.
# Confirm which pair is intended and where `df_train` / `df_test` are defined.

# Re-join precomputed tokens into raw strings
train_corpus = [_tokens_to_text(toks) for toks in df_train["review_no_stop"]]
test_corpus = [_tokens_to_text(toks) for toks in df_test["review_no_stop"]]

# TF-IDF vectorization: unigrams + bigrams, vocabulary capped at 20,000 features.
# The vectorizer is fit on the training corpus only and then reused for the test
# corpus.
tfidf = TfidfVectorizer(max_features=20000, ngram_range=(1, 2))
X_train = tfidf.fit_transform(train_corpus)
X_test = tfidf.transform(test_corpus)

# Labels
y_train = df_train["label"]
y_test = df_test["label"]