In [110]:
#| include: false

import pandas as pd
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import LabelEncoder
import numpy as np
import time
import plotly.express as px
import plotly.io as pio
pio.templates.default = "plotly_white"
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings("ignore", category = UserWarning)
from transformers import BertTokenizer
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from sklearn.utils.class_weight import compute_class_weight
import torch.nn.functional as F

In [194]:
df = pd.read_csv("clean_data/embeddings.csv")
# df["label"] = (df["vote_average"] // 0.5).astype(int)
# make label as bins of popularity
df["label"] = pd.qcut(df["revenue"], q=5, labels=False, duplicates='drop')


display(df.head())


Unnamed: 0,movie_id,vote_average,popularity,revenue,final_ids,label
0,862,7.7,21.946943,373554033.0,"[31, 12898, 7167, 12899, 12900, 7907, 8873, 11...",3
1,8844,6.9,17.015539,262797249.0,"[2157, 8537, 205, 145151, 5149, 10739, 58563, ...",3
2,11862,5.7,8.387519,76578911.0,"[67773, 3092, 519, 70696, 59222, 18793, 14592,...",2
3,949,7.7,17.924927,187436818.0,"[1158, 380, 5576, 10127, 3197, 6200, 15851, 15...",3
4,11860,6.2,6.677277,0.0,"[3, 15887, 17141, 4301, 12957, 8937, 16554, 34...",0


In [195]:
# calculate the baseline accuracy
baseline_accuracy = df["label"].value_counts(normalize=True).max()
print(f"Baseline accuracy: {baseline_accuracy:.2f}")

Baseline accuracy: 0.40


In [196]:
train_df, val_df = train_test_split(df, test_size=0.2, random_state=42)

def preprocess(df):
    X = df["final_ids"]
    # convert string representation of list to actual list
    X = [eval(ids) for ids in X]
    y = list(df["label"])
    # subtract 2 from y to make it zero-indexed
    # y = [label - 5 for label in y]
    return X, y

class ToDF(Dataset):
    def __init__(self, df):
        self.X, self.y = preprocess(df)

    def __getitem__(self, ix):
        return self.X[ix], self.y[ix]

    def __len__(self):
        return len(self.y)
    
train = ToDF(train_df)
val = ToDF(val_df)

def collate(data):
    X = torch.tensor([d[0] for d in data])
    y = torch.tensor([d[1] for d in data])
    return X, y 

train_loader = DataLoader(train, batch_size=32, shuffle=True, collate_fn=collate)
val_loader = DataLoader(val, batch_size=32, shuffle=False, collate_fn=collate)

In [186]:
vocab_size = max([item for sublist in df["final_ids"].apply(eval).tolist() for item in sublist])
max_len = max([len(item) for item in df["final_ids"].apply(eval).tolist()])
print(f"Vocab size: {vocab_size}")
print(f"Max length: {max_len}")

Vocab size: 1908262
Max length: 1017


In [174]:
y_train = train_df["label"].values - 5
classes = np.arange(0, max(y_train) + 1)
weights = compute_class_weight(class_weight='balanced', classes=classes, y=y_train)
weights = torch.tensor(weights, dtype=torch.float)

In [197]:
class Model(nn.Module):

    def __init__(self, vocab_size, embedding_dim, max_len, num_class):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size+1, embedding_dim)
        self.fc = nn.Sequential(
            nn.Dropout(0.3),
            nn.Linear(max_len * embedding_dim, 256),
            nn.Dropout(0.3),
            nn.Linear(256, 128),
            nn.Dropout(0.3),
            nn.Linear(128, num_class)
        )

    def forward(self, X):
        x = self.embedding(X)
        x = torch.flatten(x, 1)
        x = self.fc(x)
        return x

embedding_dim = 25
num_class = 90

In [198]:
def train(dataloader, model):
    epoch_start_time = time.time()
    total_acc, total_count = 0, 0
    
    for X, y in dataloader:
        optimizer.zero_grad()
        predicted_label = model(X)
        loss = loss_fn(predicted_label, y)
        loss.backward()
        optimizer.step()
                
        total_acc   += (predicted_label.argmax(1) == y).sum().item()
        total_count += y.size(0)

    print(f'| epoch {epoch:3d} | train accuracy {total_acc/total_count:8.3f} | time: {time.time() - epoch_start_time:5.2f}s')

def accuracy(dataloader, model):

    total_acc, total_count = 0, 0
    predictions = []
    labels = []

    with torch.no_grad():
        for X, y in dataloader:
            predicted_label = model(X)
            predictions.extend(predicted_label.argmax(1).tolist())
            labels.extend(y.tolist())
            total_acc += (predicted_label.argmax(1) == y).sum().item()
            total_count += y.size(0)
            
    # predictions = [x + 2 for x in predictions]
    # labels = [x + 2 for x in labels]
    return total_acc/total_count, predictions, labels

model = Model(vocab_size, embedding_dim, max_len, num_class).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=.001)
loss_fn = torch.nn.CrossEntropyLoss()

EPOCHS = 20
for epoch in range(1, EPOCHS + 1):
    train(train_loader, model)

val_acc, predictions, labels = accuracy(val_loader, model)
print(f'validation accuracy: {val_acc:.3f}')
print(predictions)
print(labels)

| epoch   1 | train accuracy    0.331 | time: 14.64s
| epoch   2 | train accuracy    0.362 | time: 14.98s
| epoch   3 | train accuracy    0.391 | time: 15.06s
| epoch   4 | train accuracy    0.431 | time: 14.76s
| epoch   5 | train accuracy    0.488 | time: 14.49s
| epoch   6 | train accuracy    0.542 | time: 14.36s
| epoch   7 | train accuracy    0.567 | time: 14.26s
| epoch   8 | train accuracy    0.608 | time: 14.30s
| epoch   9 | train accuracy    0.643 | time: 14.31s
| epoch  10 | train accuracy    0.699 | time: 14.33s
| epoch  11 | train accuracy    0.717 | time: 14.23s
| epoch  12 | train accuracy    0.772 | time: 15.27s
| epoch  13 | train accuracy    0.792 | time: 14.24s
| epoch  14 | train accuracy    0.837 | time: 14.21s
| epoch  15 | train accuracy    0.870 | time: 14.19s
| epoch  16 | train accuracy    0.898 | time: 14.21s
| epoch  17 | train accuracy    0.911 | time: 14.31s
| epoch  18 | train accuracy    0.918 | time: 14.32s
| epoch  19 | train accuracy    0.941 | time: 

In [156]:
def scores(predictions, labels):
    preds = np.array(predictions)
    labels = np.array(labels)

    average_distance = np.mean(np.abs(preds - labels))

    max_label_distance = labels.max() - labels.min()
    abs_diff = np.abs(preds - labels)
    proximity_scores = 1 - (abs_diff / max_label_distance)
    average_proximity_score = np.mean(proximity_scores)

    return average_distance, average_proximity_score


In [199]:
mae, proximity = scores(predictions, labels)
baseline_predictions = np.array([np.mean(labels)] * len(labels))
baseline_mae, baseline_proximity = scores(baseline_predictions, labels)

print(f"Average distance from correct label: {mae:.3f}")
print(f"Average proximity score: {proximity:.3f}")
print(f"Baseline average distance from correct label: {baseline_mae:.3f}")
print(f"Baseline average proximity score: {baseline_proximity:.3f}")

Average distance from correct label: 0.779
Average proximity score: 0.740
Baseline average distance from correct label: 1.042
Baseline average proximity score: 0.653


In [103]:
# Combine predictions and labels into one DataFrame
df = pd.DataFrame({
    "value": predictions + labels,
    "type": ["Prediction"] * len(predictions) + ["Actual"] * len(labels)
})

# Create grouped histogram
fig = px.histogram(df, x="value", color="type", barmode="group",
                   title="Predictions vs Actual")
fig.update_layout(xaxis_title="Class", yaxis_title="Count")
fig.show()


Notes of what I did:
Use of crew, cast, genres, and production companies as embeddings
Padding the embeddings to the same length
Weighting the training data to account for imbalance
Proximity score to account for multi-class context
Predicting vote average proved difficult. Wasn't able to beat the baseline using just the embeddings.
Predicting popularity marginally beat the baseline using just the embeddings.
Predicting revenue was the most successful, beating the baseline by a larger margin using just the embeddings (and had a better proximity score).