In [2]:
#| include: false

import pandas as pd
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import LabelEncoder
import numpy as np
import time
import plotly.express as px
import plotly.io as pio
pio.templates.default = "plotly_white"
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings("ignore", category = UserWarning)
from transformers import BertTokenizer
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
from sklearn.utils.class_weight import compute_class_weight
import torch.nn.functional as F

In [3]:
target_col = "revenue"
ids_col = "final_ids"
adjustment = 0
weight_bool = False

In [4]:
df = pd.read_csv("clean_data/embeddings.csv")
# drop columns with 0 revenue

if target_col == "revenue":
    # drop columns with 0 revenue
    df = df[df["revenue"] > 0]
    # make label as bins of revenue
    df["label"] = pd.qcut(df["revenue"], q=5, labels=False, duplicates='drop')
    adjustment = 0
    weight_bool = False
elif target_col == "vote_average":
    # drop columns with less than 100 votes
    df = df[df["vote_count"] > 100]
    df["label"] = df["vote_average"].apply(np.floor).astype(int)
    # df["label"] = df["vote_average"].apply(lambda x: x / 0.5).astype(int)
    adjustment = 2
    weight_bool = True
elif target_col == "popularity":
    df["label"] = pd.qcut(df["popularity"], q=5, labels=False, duplicates='drop')
    adjustment = 0
    weight_bool = False

display(df.head())


Unnamed: 0,movie_id,cast_ids,crew_ids,vote_average,vote_count,combined_ids,popularity,revenue,final_ids,label
0,862,"[31, 12898, 7167, 12899, 12900, 7907, 8873, 11...","[0, 0, 0, 0, 0, 0, 7883, 7961, 1458006, 174870...",7.7,5415.0,"[16, 35, 10751, 0, 0, 0, 0, 0, 3, 0, 0, 0, 0, ...",21.946943,373554033.0,"[31, 12898, 7167, 12899, 12900, 7907, 8873, 11...",4
1,8844,"[2157, 8537, 205, 145151, 5149, 10739, 58563, ...","[0, 0, 0, 0, 0, 0, 9967, 0, 0, 0, 0, 0, 0, 0, ...",6.9,2413.0,"[12, 14, 10751, 0, 0, 0, 0, 0, 559, 2550, 1020...",17.015539,262797249.0,"[2157, 8537, 205, 145151, 5149, 10739, 58563, ...",4
3,31357,"[8851, 9780, 18284, 51359, 66804, 352, 87118, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",6.1,34.0,"[35, 18, 10749, 0, 0, 0, 0, 0, 306, 0, 0, 0, 0...",3.859495,81452156.0,"[8851, 9780, 18284, 51359, 66804, 352, 87118, ...",3
4,11862,"[67773, 3092, 519, 70696, 59222, 18793, 14592,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",5.7,173.0,"[35, 0, 0, 0, 0, 0, 0, 0, 5842, 9195, 0, 0, 0,...",8.387519,76578911.0,"[67773, 3092, 519, 70696, 59222, 18793, 14592,...",3
5,949,"[1158, 380, 5576, 10127, 3197, 6200, 15851, 15...","[0, 0, 0, 0, 0, 0, 11411, 15843, 13677, 147927...",7.7,1886.0,"[28, 80, 18, 53, 0, 0, 0, 0, 508, 675, 6194, 0...",17.924927,187436818.0,"[1158, 380, 5576, 10127, 3197, 6200, 15851, 15...",4


In [5]:
label_counts = df["label"].value_counts()
print(label_counts)
print(len(df))

label
4    1486
2    1486
0    1486
3    1485
1    1485
Name: count, dtype: int64
7428


In [6]:
# calculate the baseline accuracy
baseline_accuracy = df["label"].value_counts(normalize=True).max()
print(f"Baseline accuracy: {baseline_accuracy:.2f}")

Baseline accuracy: 0.20


In [7]:
train_df, test_df = train_test_split(df, test_size=0.4, random_state=42)
test_df, val_df = train_test_split(test_df, test_size=0.5, random_state=42)

def preprocess(df, x_col=ids_col):
    X = df[x_col]
    # convert string representation of list to actual list
    X = [eval(ids) for ids in X]
    y = list(df["label"])
    # subtract adjustment from y to make it zero-indexed
    y = [label - adjustment for label in y]
    return X, y

class ToDF(Dataset):
    def __init__(self, df):
        self.X, self.y = preprocess(df)

    def __getitem__(self, ix):
        return self.X[ix], self.y[ix]

    def __len__(self):
        return len(self.y)
    
train = ToDF(train_df)
val = ToDF(val_df)
test = ToDF(test_df)

def collate(data):
    X = torch.tensor([d[0] for d in data])
    y = torch.tensor([d[1] for d in data])
    return X, y 

train_loader = DataLoader(train, batch_size=32, shuffle=True, collate_fn=collate)
val_loader = DataLoader(val, batch_size=32, shuffle=False, collate_fn=collate)
test_loader = DataLoader(test, batch_size=32, shuffle=False, collate_fn=collate)

In [8]:
vocab_size = max([item for sublist in df[ids_col].apply(eval).tolist() for item in sublist])
max_len = max([len(item) for item in df[ids_col].apply(eval).tolist()])
print(f"Vocab size: {vocab_size}")
print(f"Max length: {max_len}")

Vocab size: 1908262
Max length: 1018


In [9]:
y_train = train_df["label"].values - adjustment
classes = np.arange(0, max(y_train) + 1)
num_class = len(classes)
weights = compute_class_weight(class_weight='balanced', classes=classes, y=y_train)
weights = torch.tensor(weights, dtype=torch.float)

In [10]:
class AttentionPooling(nn.Module):
    def __init__(self, embedding_dim):
        super().__init__()
        self.attention = nn.Linear(embedding_dim, 1)

    def forward(self, x):
        weights = F.softmax(self.attention(x), dim=1)
        weighted = x * weights
        return weighted.sum(dim=1)

class Model(nn.Module):

    def __init__(self, vocab_size, embedding_dim, max_len, num_class):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size+1, embedding_dim)
        self.position_embedding = nn.Embedding(max_len, embedding_dim)
        self.attn_pool = AttentionPooling(embedding_dim)
        self.fc1 = nn.Linear(embedding_dim, 128)
        self.fc2 = nn.Linear(128, 32)
        self.fc3 = nn.Linear(32, num_class)

        self.dropout = nn.Dropout(0.3)
        self.norm1 = nn.LayerNorm(128)
        self.norm2 = nn.LayerNorm(32)

    def forward(self, X):
        positions = torch.arange(X.size(1)).unsqueeze(0).expand_as(X)
        x = self.embedding(X) + self.position_embedding(positions)
        x = self.attn_pool(x)
        x = self.dropout(F.relu(self.norm1(self.fc1(x))))
        x = self.dropout(F.relu(self.norm2(self.fc2(x))))
        x = self.fc3(x)
        return x

embedding_dim = 25

In [12]:
def train(dataloader, model):
    epoch_start_time = time.time()
    total_acc, total_count = 0, 0
    
    for X, y in dataloader:
        optimizer.zero_grad()
        predicted_label = model(X)
        loss = loss_fn(predicted_label, y)
        loss.backward()
        optimizer.step()
                
        total_acc   += (predicted_label.argmax(1) == y).sum().item()
        total_count += y.size(0)

    val_acc, _, _ = accuracy(val_loader, model)
    print(f'| epoch {epoch:3d} | train accuracy {total_acc/total_count:8.3f} | val accuracy {val_acc:8.3f} | time: {time.time() - epoch_start_time:5.2f}s')

def accuracy(dataloader, model):

    total_acc, total_count = 0, 0
    predictions = []
    labels = []

    with torch.no_grad():
        for X, y in dataloader:
            predicted_label = model(X)
            predictions.extend(predicted_label.argmax(1).tolist())
            labels.extend(y.tolist())
            total_acc += (predicted_label.argmax(1) == y).sum().item()
            total_count += y.size(0)
    
    if adjustment != 0:
        predictions = [x + adjustment for x in predictions]
        labels = [x + adjustment for x in labels]
    return total_acc/total_count, predictions, labels

model = Model(vocab_size, embedding_dim, max_len, num_class).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=.001)

if weight_bool:
    loss_fn = torch.nn.CrossEntropyLoss(weight=weights)
else:
    loss_fn = torch.nn.CrossEntropyLoss()

EPOCHS = 20
for epoch in range(1, EPOCHS + 1):
    train(train_loader, model)

test_acc, predictions, labels = accuracy(test_loader, model)
print(f'test accuracy: {test_acc:.3f}')
print(predictions)
print(labels)

| epoch   1 | train accuracy    0.214 | val accuracy    0.224 | time: 12.84s
| epoch   2 | train accuracy    0.304 | val accuracy    0.312 | time: 13.21s
| epoch   3 | train accuracy    0.353 | val accuracy    0.328 | time: 14.07s
| epoch   4 | train accuracy    0.387 | val accuracy    0.353 | time: 14.07s
| epoch   5 | train accuracy    0.428 | val accuracy    0.368 | time: 13.09s
| epoch   6 | train accuracy    0.485 | val accuracy    0.375 | time: 12.91s
| epoch   7 | train accuracy    0.559 | val accuracy    0.371 | time: 12.32s
| epoch   8 | train accuracy    0.619 | val accuracy    0.376 | time: 12.75s
| epoch   9 | train accuracy    0.683 | val accuracy    0.373 | time: 14.40s
| epoch  10 | train accuracy    0.741 | val accuracy    0.357 | time: 12.46s
| epoch  11 | train accuracy    0.785 | val accuracy    0.378 | time: 13.13s
| epoch  12 | train accuracy    0.837 | val accuracy    0.381 | time: 13.13s
| epoch  13 | train accuracy    0.875 | val accuracy    0.381 | time: 13.32s

In [16]:
# Create a DataFrame with final_ids, predictions, and labels
result_df = pd.DataFrame({
    "final_ids": test_df["final_ids"].tolist(),
    "prediction": predictions,
    "label": labels
})

display(result_df.head())

# Save the DataFrame to a CSV file
result_df.to_csv("results/revenue_results.csv", index=False)

Unnamed: 0,final_ids,prediction,label
0,"[141910, 141911, 141912, 141913, 21709, 141914...",2,0
1,"[26100, 142689, 136761, 143595, 38529, 574833,...",2,0
2,"[40325, 40327, 20664, 40329, 40330, 40331, 167...",4,0
3,"[84223, 12111, 11664, 59919, 55466, 77070, 170...",2,4
4,"[116, 110, 17288, 190895, 1925, 1846, 36463, 4...",0,2


In [702]:
def scores(predictions, labels):
    preds = np.array(predictions)
    labels = np.array(labels)

    average_distance = np.mean(np.abs(preds - labels))

    max_label_distance = labels.max() - labels.min()
    abs_diff = np.abs(preds - labels)
    proximity_scores = 1 - (abs_diff / max_label_distance)
    average_proximity_score = np.mean(proximity_scores)

    return average_distance, average_proximity_score

In [760]:
mae, proximity = scores(predictions, labels)
baseline_predictions = np.array([np.mean(labels)] * len(labels))
baseline_mae, baseline_proximity = scores(baseline_predictions, labels)

print(f"Average distance from correct label: {mae:.3f}")
print(f"Average proximity score: {proximity:.3f}")
print(f"Baseline average distance from correct label: {baseline_mae:.3f}")
print(f"Baseline average proximity score: {baseline_proximity:.3f}")

Average distance from correct label: 1.054
Average proximity score: 0.789
Baseline average distance from correct label: 0.617
Baseline average proximity score: 0.877


In [103]:
# Combine predictions and labels into one DataFrame
df = pd.DataFrame({
    "value": predictions + labels,
    "type": ["Prediction"] * len(predictions) + ["Actual"] * len(labels)
})

# Create grouped histogram
fig = px.histogram(df, x="value", color="type", barmode="group",
                   title="Predictions vs Actual")
fig.update_layout(xaxis_title="Class", yaxis_title="Count")
fig.show()


Notes of what I did:
Use of crew, cast, genres, and production companies as embeddings
Padding the embeddings to the same length
Weighting the training data to account for imbalance
Proximity score to account for multi-class context
Predicting vote average proved difficult. Wasn't able to beat the baseline using just the embeddings.
Predicting popularity marginally beat the baseline using just the embeddings.
Predicting revenue was the most successful, beating the baseline by a larger margin using just the embeddings (and had a better proximity score).
Tried attention pooling with layer normalization which didn't have much of an effect.
Added positional embeddings to capture the order of the embeddings.
Then tried self-attention to capture the relationships between the embeddings.