In [1]:
!wget http://nlp.stanford.edu/data/glove.6B.zip

!unzip -q glove.6B.zip -d glove

--2026-02-20 11:50:53--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2026-02-20 11:50:54--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2026-02-20 11:50:54--  https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


202

# **Task 1 - Data Preparation**

In [4]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split

nltk.download('punkt')
nltk.download('punkt_tab')

df = pd.read_csv('movies.csv')
df.rename(columns={'genres': 'genre', 'vote_average': 'voting_average'}, inplace=True)

allowed_cols = ['genre', 'keywords', 'tagline', 'overview', 'voting_average']
df = df[allowed_cols].dropna()

# Text Preprocessing Function
def clean_text(text):
    text = str(text).lower()
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE) # Remove URLs
    text = re.sub(r'[^\w\s]', '', text) # Remove punctuation
    text = re.sub(r'\d+', '', text) # Remove numbers
    return word_tokenize(text)

# Clean the input text columns
for col in ['overview', 'tagline', 'keywords']:
    df[col + '_clean'] = df[col].apply(clean_text)

# Format genres into a list
df['genre_list'] = df['genre'].apply(lambda x: str(x).split())

# Train/Validation/Test Split (70/15/15)
train_df, temp_df = train_test_split(df, test_size=0.30, random_state=42)
val_df, test_df = train_test_split(temp_df, test_size=0.50, random_state=42)

print("Data splits:", len(train_df), len(val_df), len(test_df))

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Data splits: 2631 564 564


# **Task 2 - GloVe Embedding Pipeline**

Embedding used: 100d

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Load 100D GloVe vectors
embeddings_index = {}
with open('glove/glove.6B.100d.txt', 'r', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

# Report embedding coverage
vocab = set(word for tokens in train_df['overview_clean'] for word in tokens)
covered = len([w for w in vocab if w in embeddings_index])
print(f"Embedding Coverage: {covered / len(vocab) * 100:.2f}%")

# Construct TF-IDF weighted document embeddings
def get_doc_embeddings(df_split, text_col):
    tfidf = TfidfVectorizer()
    # Fit only on training data to prevent data leakage
    tfidf.fit([" ".join(tokens) for tokens in train_df[text_col]])
    word2tfidf = dict(zip(tfidf.get_feature_names_out(), tfidf.idf_))

    doc_embeddings = []
    for tokens in df_split[text_col]:
        vecs, weights = [], []
        for word in tokens:
            if word in embeddings_index and word in word2tfidf:
                vecs.append(embeddings_index[word])
                weights.append(word2tfidf[word])

        # Calculate weighted average or return zeroes if empty
        doc_emb = np.average(vecs, axis=0, weights=weights) if vecs else np.zeros(100)
        doc_embeddings.append(doc_emb)
    return np.array(doc_embeddings)

# Generate embeddings for the 'overview' column
X_train = get_doc_embeddings(train_df, 'overview_clean')
X_test = get_doc_embeddings(test_df, 'overview_clean')

y_train_reg = train_df['voting_average'].values
y_test_reg = test_df['voting_average'].values

Embedding Coverage: 91.25%


# **Task 3 - Model A: Rating Prediction (Regression)**

In [6]:
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
from sklearn.metrics import mean_squared_error
import numpy as np

class RegressionNet(nn.Module):
    def __init__(self):
        super(RegressionNet, self).__init__()
        # Input: 100D GloVe vector -> Hidden Layer: 64 neurons
        self.fc1 = nn.Linear(100, 64)
        self.relu = nn.ReLU()
        # Output: 1 single continuous number (the predicted rating)
        self.fc2 = nn.Linear(64, 1)

    def forward(self, x):
        return self.fc2(self.relu(self.fc1(x)))

# Reusable Training and Evaluation Function
def train_and_evaluate_regression(text_col):
    print(f"--- Training Model A (Regression) using: {text_col} ---")

    # Get embeddings for this specific text column using the function from Task 2
    X_train_col = get_doc_embeddings(train_df, text_col)
    X_test_col = get_doc_embeddings(test_df, text_col)

    # Create PyTorch DataLoaders
    train_dataset = TensorDataset(torch.tensor(X_train_col, dtype=torch.float32),
                                  torch.tensor(y_train_reg, dtype=torch.float32).unsqueeze(1))
    train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

    # Initialize Model, Loss Function (MSE), and Optimizer
    model = RegressionNet()
    criterion = nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.005)

    # Training Loop
    epochs = 15
    for epoch in range(epochs):
        epoch_loss = 0
        for inputs, targets in train_loader:
            optimizer.zero_grad()
            predictions = model(inputs)
            loss = criterion(predictions, targets)
            loss.backward()
            optimizer.step()
            epoch_loss += loss.item()

        if (epoch + 1) % 5 == 0: # Print loss every 5 epochs
            print(f"Epoch {epoch+1}/{epochs} | Training Loss: {epoch_loss/len(train_loader):.4f}")

    # Evaluation phase
    model.eval()
    with torch.no_grad():
        test_preds = model(torch.tensor(X_test_col, dtype=torch.float32)).numpy()

    # Calculate Metrics
    mse = mean_squared_error(y_test_reg, test_preds)
    rmse = np.sqrt(mse)

    # Baseline: Just predict the mean rating of the training set for every test sample
    global_mean = train_df['voting_average'].mean()
    baseline_preds = np.full_like(y_test_reg, global_mean)
    baseline_mse = mean_squared_error(y_test_reg, baseline_preds)

    print(f"-> Baseline MSE: {baseline_mse:.4f}")
    print(f"-> Model MSE:    {mse:.4f}")
    print(f"-> Model RMSE:   {rmse:.4f}\n")

# 3. Run for two different inputs to compare performance
train_and_evaluate_regression('overview_clean')
train_and_evaluate_regression('tagline_clean')

--- Training Model A (Regression) using: overview_clean ---
Epoch 5/15 | Training Loss: 1.1102
Epoch 10/15 | Training Loss: 0.9732
Epoch 15/15 | Training Loss: 0.8983
-> Baseline MSE: 0.8977
-> Model MSE:    0.9198
-> Model RMSE:   0.9591

--- Training Model A (Regression) using: tagline_clean ---
Epoch 5/15 | Training Loss: 1.0206
Epoch 10/15 | Training Loss: 0.8448
Epoch 15/15 | Training Loss: 0.7831
-> Baseline MSE: 0.8977
-> Model MSE:    1.0071
-> Model RMSE:   1.0035



# **Task 4 - Model B: Genre Prediction (Multi-Label Classification)**

In [7]:
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import f1_score, hamming_loss

#  Binarize the genres (Convert list of text genres into 1s and 0s)
mlb = MultiLabelBinarizer()
# Fit on train data to learn all possible genres, then transform both sets
y_train_cls = mlb.fit_transform(train_df['genre_list'])
y_test_cls = mlb.transform(test_df['genre_list'])

num_classes = len(mlb.classes_)
print(f"Total unique genres found: {num_classes}")

# Define the Multi-Label PyTorch Network
class MultiLabelNet(nn.Module):
    def __init__(self, num_classes):
        super(MultiLabelNet, self).__init__()
        self.fc1 = nn.Linear(100, 128) # 100D GloVe input
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(128, num_classes) # Output size equals number of genres
        # There's no need for Sigmoid here as BCEWithLogitsLoss handles it automatically.

    def forward(self, x):
        return self.fc2(self.relu(self.fc1(x)))

# Reusable Training and Evaluation Function
def train_and_evaluate_classification(text_col):
    print(f"\n--- Training Model B (Classification) using: {text_col} ---")

    # Get embeddings for this specific text column
    X_train_col = get_doc_embeddings(train_df, text_col)
    X_test_col = get_doc_embeddings(test_df, text_col)

    # Create PyTorch DataLoaders
    train_dataset = TensorDataset(torch.tensor(X_train_col, dtype=torch.float32),
                                  torch.tensor(y_train_cls, dtype=torch.float32))
    train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

    # Initialize Model, Loss Function (BCEWithLogitsLoss), and Optimizer
    model = MultiLabelNet(num_classes)
    criterion = nn.BCEWithLogitsLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.005)

    # Training Loop
    epochs = 15
    for epoch in range(epochs):
        model.train()
        epoch_loss = 0
        for inputs, targets in train_loader:
            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()
            epoch_loss += loss.item()

        if (epoch + 1) % 5 == 0:
            print(f"Epoch {epoch+1}/{epochs} | Training Loss: {epoch_loss/len(train_loader):.4f}")

    # Evaluation Phase
    model.eval()
    with torch.no_grad():
        # Get raw scores
        raw_logits = model(torch.tensor(X_test_col, dtype=torch.float32))
        # Turn raw scores into percentages (0.0 to 1.0) using sigmoid
        predicted_probs = torch.sigmoid(raw_logits)
        # If the model is more than 50% sure, call it a 1 (genre present)
        predictions = (predicted_probs > 0.5).int().numpy()

    # Calculate Required Metrics
    micro_f1 = f1_score(y_test_cls, predictions, average='micro', zero_division=0)
    macro_f1 = f1_score(y_test_cls, predictions, average='macro', zero_division=0)
    h_loss = hamming_loss(y_test_cls, predictions)

    print(f"-> Micro-F1:     {micro_f1:.4f}")
    print(f"-> Macro-F1:     {macro_f1:.4f}")
    print(f"-> Hamming Loss: {h_loss:.4f}")

# 4. Run for two different inputs to compare performance
train_and_evaluate_classification('overview_clean')
train_and_evaluate_classification('tagline_clean')

Total unique genres found: 22

--- Training Model B (Classification) using: overview_clean ---
Epoch 5/15 | Training Loss: 0.2256
Epoch 10/15 | Training Loss: 0.2096
Epoch 15/15 | Training Loss: 0.1975
-> Micro-F1:     0.5232
-> Macro-F1:     0.3695
-> Hamming Loss: 0.1018

--- Training Model B (Classification) using: tagline_clean ---
Epoch 5/15 | Training Loss: 0.2679
Epoch 10/15 | Training Loss: 0.2478
Epoch 15/15 | Training Loss: 0.2253
-> Micro-F1:     0.3270
-> Macro-F1:     0.1623
-> Hamming Loss: 0.1260


# **Task 5 - Frequent Words per Genre**

In [8]:
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

print("--- TASK 5: Frequent Words per Genre ---")
# 1. Set up an empty list for every single genre
genre_words = {genre: [] for genre in mlb.classes_}

# 2. Go through our movie dataset and throw every word into its matching genre bucket
for idx, row in train_df.iterrows():
    for genre in row['genre_list']:
        genre_words[genre].extend(row['overview_clean'])

# Note: We are only printing the first 3 genres here to keep your screen clean.
# To see all 22 genres for your assignment table, just remove the '[:3]' below!
for genre in list(mlb.classes_)[:3]:
    # Count the words, keeping only those that appear at least 3 times
    counts = {k: v for k, v in Counter(genre_words[genre]).items() if v >= 3}
    if counts:
        sorted_w = sorted(counts.items(), key=lambda x: x[1], reverse=True)
        print(f"\nGenre: {genre}")
        print(f"Top 10: {[w[0] for w in sorted_w[:10]]}")
        print(f"Bottom 10: {[w[0] for w in sorted_w[-10:]]}")

--- TASK 5: Frequent Words per Genre ---

Genre: Action
Top 10: ['the', 'a', 'to', 'of', 'and', 'his', 'in', 'is', 'with', 'on']
Bottom 10: ['wiley', 'jill', 'seattle', 'thor', 'wade', 'immigrants', 'averill', 'cavendich', 'tao', 'bazil']

Genre: Adventure
Top 10: ['the', 'a', 'to', 'and', 'of', 'in', 'his', 'is', 'with', 'on']
Bottom 10: ['drug', 'taylor', 'thor', 'wade', 'assistant', 'austin', 'cavendich', 'redferne', 'undead', 'alice']

Genre: Animation
Top 10: ['the', 'a', 'and', 'to', 'of', 'in', 'his', 'is', 'an', 'he']
Bottom 10: ['mike', 'penguins', 'planet', 'amadeo', 'table', 'five', 'season', 'turtle', 'kung', 'fu']


# **Task 6 - Genre-Indicative Words Using TF-IDF**

In [11]:
print("\n--- TASK 6: Genre-Indicative Words Using TF-IDF ---")
# 1. Set up TF-IDF to find words that are unique, not just common
tfidf = TfidfVectorizer(max_features=5000)
X_tfidf = tfidf.fit_transform([" ".join(tokens) for tokens in train_df['overview_clean']])
feature_names = tfidf.get_feature_names_out()

for i, genre in enumerate(list(mlb.classes_)[:10]):
    # 2. Train a simple linear model to look for strong connections
    lr = LogisticRegression(class_weight='balanced', max_iter=1000)
    lr.fit(X_tfidf, y_train_cls[:, i])

    # 3. Pull out the top 10 words with the highest positive weights
    top_indices = lr.coef_[0].argsort()[-10:][::-1]
    indicative_words = [feature_names[idx] for idx in top_indices]
    print(f"\n{genre} Indicative Words: {indicative_words}")


--- TASK 6: Genre-Indicative Words Using TF-IDF ---

Action Indicative Words: ['agent', 'the', 'forces', 'james', 'bond', 'cop', 'battle', 'must', 'against', 'avenge']

Adventure Indicative Words: ['the', 'bond', 'adventure', 'find', 'to', 'park', 'captain', 'and', 'must', 'earth']

Animation Indicative Words: ['land', 'when', 'adventure', 'human', 'shrek', 'dragon', 'animated', 'garfield', 'world', 'away']

Comedy Indicative Words: ['comedy', 'just', 'when', 'big', 'up', 'best', 'all', 'christmas', 'their', 'romance']

Crime Indicative Words: ['police', 'drug', 'murder', 'detective', 'cop', 'fbi', 'agent', 'mob', 'mafia', 'criminal']

Documentary Indicative Words: ['documentary', 'me', 'look', 'afghanistan', 'part', 'intimate', 'penguins', 'the', 'amanda', 'americans']

Drama Indicative Words: ['story', 'father', 'life', 'was', 'his', 'left', 'love', 'her', 'leads', 'he']

Family Indicative Words: ['dog', 'adventures', 'boy', 'land', 'the', 'save', 'when', 'christmas', 'world', 'anim