In [1]:
import torch
import torch.nn as nn
from torchvision import models, transforms
from torch.optim import AdamW
from transformers import BertTokenizer, BertModel

  from .autonotebook import tqdm as notebook_tqdm


In [47]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from torch.optim import AdamW
from transformers import BertTokenizer, BertModel
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import numpy as np
from collections import defaultdict
import pandas as pd

df_test = pd.read_csv('movie_data_test.csv')

df_test['title_overview'] = df_test['original_title'] + ': ' + df_test['overview']

df_test = pd.DataFrame({
    'title_overview': df_test['title_overview'],
    'tags': df_test['tags'].fillna(''),
    'revenue': df_test['revenue']
})

df_test['revenue'] = np.log1p(df_test['revenue'])
device = torch.device("mps" if torch.backends.mps.is_available() else
                      "cuda" if torch.cuda.is_available() else "cpu")
df_test['tags'] = df_test['tags'].apply(lambda x: [tag.strip().lower() for tag in x.split(',') if tag.strip()])

class TagCNNEncoder(nn.Module):
    def __init__(self, vocab_size, embed_dim=300, num_filters=128, kernel_sizes=(2, 3, 4), dropout=0.2):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.convs = nn.ModuleList([
            nn.Conv1d(embed_dim, num_filters, k) for k in kernel_sizes
        ])
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        embedded = self.embedding(x).transpose(1, 2)
        conv_outs = [torch.relu(conv(embedded)).max(dim=2)[0] for conv in self.convs]
        out = torch.cat(conv_outs, dim=1)
        return self.dropout(out)

class BERTWithTagCNNRegressor(nn.Module):
    def __init__(self, tag_vocab_size, dropout=0.3):
        super().__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.tag_encoder = TagCNNEncoder(tag_vocab_size)
        self.dropout = nn.Dropout(dropout)

        self.regressor = nn.Sequential(
            nn.Linear(self.bert.config.hidden_size + 384, 512),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(512, 128),
            nn.ReLU(),
            nn.Linear(128, 1)
        )

    def forward(self, text_input_ids, text_attention_mask, tag_token_ids):

        bert_output = self.bert(input_ids=text_input_ids, attention_mask=text_attention_mask)
        text_cls = bert_output.pooler_output  

        tag_feat = self.tag_encoder(tag_token_ids)

        fused = torch.cat([text_cls, tag_feat], dim=1)
        return self.regressor(self.dropout(fused))
    
class MovieDatasetWithTags(nn.Module):
    def __init__(self, texts, tags, targets, tokenizer, tag_vocab, max_text_len=256, max_tag_len=20):
        self.texts = texts
        self.tags = tags
        self.targets = targets
        self.tokenizer = tokenizer
        self.tag_vocab = tag_vocab
        self.max_text_len = max_text_len
        self.max_tag_len = max_tag_len

    def __len__(self):
        return len(self.texts)

    def encode_tags(self, tag_list):
        # Encode tags as indices from vocab, pad to max_tag_len
        tag_ids = [self.tag_vocab.get(tag.lower(), self.tag_vocab['[UNK]']) for tag in tag_list]
        tag_ids = tag_ids[:self.max_tag_len]
        tag_ids += [self.tag_vocab['[PAD]']] * (self.max_tag_len - len(tag_ids))
        return torch.tensor(tag_ids, dtype=torch.long)

    def __getitem__(self, idx):
        text = self.texts[idx]
        tags = self.tags[idx]  # list of strings
        target = torch.tensor(self.targets[idx], dtype=torch.float)

        # Tokenize title+overview
        text_enc = self.tokenizer(
            text,
            padding='max_length',
            truncation=True,
            max_length=self.max_text_len,
            return_tensors='pt'
        )

        tag_tensor = self.encode_tags(tags)

        return {
            'input_ids': text_enc['input_ids'].squeeze(0),
            'attention_mask': text_enc['attention_mask'].squeeze(0),
            'tags': tag_tensor,
            'target': target
        }

train_texts = df_train['title_overview'].tolist()
train_tags = df_train['tags'].tolist()
train_targets = df_train['revenue'].tolist()

test_texts = df_test['title_overview'].tolist()
test_tags = df_test['tags'].tolist()
test_targets = df_test['revenue'].tolist()

def build_tag_vocab(tag_lists, min_freq=1):
    tag_freq = defaultdict(int)
    for tags in tag_lists:
        for tag in tags:
            tag_freq[tag.lower()] += 1

    vocab = {'[PAD]': 0, '[UNK]': 1}
    for tag, freq in tag_freq.items():
        if freq >= min_freq:
            vocab[tag] = len(vocab)

    return vocab

tag_vocab = build_tag_vocab(train_tags + test_tags)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')


test_dataset = MovieDatasetWithTags(test_texts, test_tags, test_targets, tokenizer, tag_vocab)
test_loader = DataLoader(test_dataset, batch_size=16)

model = BERTWithTagCNNRegressor(tag_vocab_size=len(tag_vocab)).to(device)

criterion = nn.MSELoss()
optimizer = AdamW(model.parameters(), lr=3e-5)
checkpoint = torch.load('title_overview_two_tower_model.pt', map_location=torch.device('cuda'))
model.load_state_dict(checkpoint['model_state_dict'])
model.eval()

predictions = []
actuals = []

with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        tags = batch['tags'].to(device)
        targets = batch['target'].cpu().numpy()
        
        outputs = model(input_ids, attention_mask, tags).squeeze().cpu().numpy()
        
        predictions.extend(outputs)
        actuals.extend(targets)

predictions = np.expm1(predictions)
actuals = np.expm1(actuals)

rmse = np.sqrt(mean_squared_error(actuals, predictions))
r2 = r2_score(actuals, predictions)

print(f"\nFinal Test RMSE: ${rmse/1000000:.2f}M")
print(f"Final Test R²: {r2:.4f}")

title_predictions = predictions

  checkpoint = torch.load('title_overview_two_tower_model.pt', map_location=torch.device('cuda'))



Final Test RMSE: $190.97M
Final Test R²: 0.2004


In [None]:
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

class MultiImageDataset(Dataset):
    def __init__(self, df, poster_dir, backdrop_dir, thumbnail_dir, transform):
        self.df = df
        self.poster_dir = poster_dir
        self.backdrop_dir = backdrop_dir
        self.thumbnail_dir = thumbnail_dir
        self.transform = transform
        self.valid_ids = []

        for idx, row in df.iterrows():
            movie_id = str(int(row['id']))
            if all(os.path.exists(os.path.join(d, f"{movie_id}.jpg")) for d in [poster_dir, backdrop_dir, thumbnail_dir]):
                self.valid_ids.append(idx)

    def __len__(self):
        return len(self.valid_ids)

    def __getitem__(self, idx):
        df_idx = self.valid_ids[idx]
        row = self.df.iloc[df_idx]
        movie_id = str(int(row['id']))
        revenue = np.log1p(row['revenue'])

        def load_image(directory):
            image = Image.open(os.path.join(directory, f"{movie_id}.jpg")).convert("RGB")
            return self.transform(image)

        return {
            "poster": load_image(self.poster_dir),
            "backdrop": load_image(self.backdrop_dir),
            "thumbnail": load_image(self.thumbnail_dir),
            "revenue": torch.tensor(revenue, dtype=torch.float)
        }

def get_resnet_backbone():
    resnet = models.resnet50(pretrained=True)
    for param in resnet.parameters():
        param.requires_grad = False
    for param in resnet.layer4.parameters():
        param.requires_grad = True
    for param in resnet.avgpool.parameters():
        param.requires_grad = True
    return nn.Sequential(*list(resnet.children())[:-1])

class FineTunedEnsemble(nn.Module):
    def __init__(self):
        super().__init__()
        self.poster_net = get_resnet_backbone()
        self.backdrop_net = get_resnet_backbone()
        self.thumbnail_net = get_resnet_backbone()

        self.mlp = nn.Sequential(
            nn.Linear(2048*3, 1024),
            nn.ReLU(),
            nn.Dropout(0.4),
            nn.Linear(1024, 256),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(256, 64),
            nn.ReLU(),
            nn.Linear(64, 1)
        )

    def forward(self, poster, backdrop, thumbnail):
        p = self.poster_net(poster)
        b = self.backdrop_net(backdrop)
        t = self.thumbnail_net(thumbnail)

        x = torch.cat([p.view(p.size(0), -1), b.view(b.size(0), -1), t.view(t.size(0), -1)], dim=1)
        return self.mlp(x)
    

df_test = pd.read_csv('movie_data_test.csv')

model = FineTunedEnsemble().to(device)
checkpoint = torch.load('best_ensemble_model.pt', map_location=torch.device('cuda'))
model.load_state_dict(checkpoint)
model.eval()
criterion = nn.MSELoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)

test_dataset = MultiImageDataset(df_test, "poster_dataset", "backdrop_dataset", "thumbnail_dataset", transform)
test_loader = DataLoader(test_dataset, batch_size=16)

test_preds, test_targets = [], []

with torch.no_grad():
    for batch in tqdm(test_loader, desc="Testing"):
        p = batch["poster"].to(device)
        b = batch["backdrop"].to(device)
        t = batch["thumbnail"].to(device)
        y = batch["revenue"].to(device)
        y_hat = model(p, b, t).squeeze()
        test_preds.extend(y_hat.cpu().view(-1).tolist())
        test_targets.extend(y.cpu().view(-1).tolist())

test_r2 = r2_score(np.expm1(test_targets), np.expm1(test_preds))
test_rmse = np.sqrt(mean_squared_error(np.expm1(test_targets), np.expm1(test_preds)))

print(f"\nTest Results:")
print(f"Test RMSE: ${test_rmse/1e6:.2f}M - Test R²: {test_r2:.4f}")

ensemble_predictions = np.expm1(test_preds)

  checkpoint = torch.load('best_ensemble_model.pt', map_location=torch.device('cuda'))
Testing: 100%|██████████| 48/48 [00:20<00:00,  2.35it/s]


Test Results:
Test RMSE: $230.74M - Test R²: -0.0426





In [55]:
import re
from transformers import AutoImageProcessor

# Load image processor
processor = AutoImageProcessor.from_pretrained("facebook/timesformer-base-finetuned-k400")

def numerical_sort_key(filename):
    return [int(text) if text.isdigit() else text.lower()
            for text in re.split(r'(\d+)', filename)]

class MovieKeyframeDataset(Dataset):
    def __init__(self, dataframe, frame_dir, image_processor, num_frames=8):
        self.dataframe = dataframe
        self.frame_dir = frame_dir
        self.image_processor = image_processor
        self.num_frames = num_frames

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        row = self.dataframe.iloc[idx]
        trailer_id = row['trailer']
        label = torch.tensor(row['log_revenue'], dtype=torch.float32)

        # Get frame folder path
        frame_folder = os.path.join(self.frame_dir, str(trailer_id))

        if not os.path.exists(frame_folder):
            # Return zero tensor if folder doesn't exist

            dummy_frames = torch.zeros((self.num_frames, 3, 224, 224))
            return {
                "pixel_values": dummy_frames,
                "labels": label
            }

        # Get all available frames
        frame_files = sorted([
            f for f in os.listdir(frame_folder) if f.endswith(".jpg")
        ], key=numerical_sort_key)

        selected_frames = frame_files[3:self.num_frames+3]

        frames = []
        for fname in selected_frames:
            img_path = os.path.join(frame_folder, fname)
            img = Image.open(img_path).convert("RGB")
            frames.append(np.array(img))  # Convert to numpy for feature_extractor

        # Preprocess using HuggingFace extractor (returns dict with 'pixel_values')
        pixel_values = self.image_processor(frames, return_tensors="pt")["pixel_values"][0]

        return {
            "pixel_values": pixel_values,  # [T, C, H, W]
            "labels": label                # scalar
        }
    
from transformers import TimesformerForVideoClassification

class TimeSformer(nn.Module):
    def __init__(self, model_name="facebook/timesformer-base-finetuned-k400"):
        super().__init__()
        self.backbone = TimesformerForVideoClassification.from_pretrained(model_name)

        hidden_size = self.backbone.config.hidden_size  # 768

        # Replace classification head with custom regression head
        self.backbone.classifier = nn.Sequential(
            nn.Linear(hidden_size, hidden_size // 2),
            nn.ReLU(),
            nn.Linear(hidden_size // 2, 1)
        )

    def forward(self, pixel_values):  # input: [B, T, C, H, W]
        outputs = self.backbone(pixel_values)
        return outputs.logits.view(-1)  # output: [B]
    
model = TimeSformer().to(device) 
checkpoint = torch.load('best_trailer_model.pt', map_location=torch.device('cuda'))
model.load_state_dict(checkpoint)
model.eval()

df_test = pd.read_csv('movie_data_test.csv')
df_test['log_revenue'] = np.log1p(df_test['revenue'])
test_dataset = MovieKeyframeDataset(df_test, "frames", processor, num_frames=8)
test_loader = DataLoader(test_dataset, batch_size=1, shuffle=False)

y_true_val, y_pred_val = [], []

with torch.no_grad():
    for batch in tqdm(test_loader, desc="Testing"):
        pixel_values = batch["pixel_values"].to(device)
        labels = batch["labels"].to(device)

        outputs = model(pixel_values)
        loss = criterion(outputs, labels)

        y_true_val.extend(labels.cpu().numpy())
        y_pred_val.extend(outputs.cpu().numpy())


val_r2 = r2_score(np.expm1(y_true_val), np.expm1(y_pred_val))
val_rmse = np.sqrt(mean_squared_error(np.expm1(y_true_val), np.expm1(y_pred_val)))

print(f" R²: {val_r2:.4f} | RMSE: {val_rmse:.4f} " )


  checkpoint = torch.load('best_trailer_model.pt', map_location=torch.device('cuda'))
Testing: 100%|██████████| 871/871 [16:27<00:00,  1.13s/it]

 R²: -0.0347 | RMSE: 217239750.5946 





In [59]:
trailer_predictions = np.expm1(y_true_val)

In [103]:
import xgboost as xgb
from sklearn.impute import SimpleImputer

df = pd.read_csv('movie_data_train.csv')

numerical_features = ['budget', 'runtime', 'viewCount', 'likeCount', 'favoriteCount', 'commentCount']

df['release_month'] = pd.to_datetime(df['release_date']).dt.month

df['genres_list'] = df['genres'].str.split(',')

X_numeric = df[numerical_features + ['release_month']]
y = df['revenue']

genres_exploded = df['genres_list'].explode()
unique_genres = genres_exploded.dropna().unique()

for genre in unique_genres:
    df[f'genre_{genre.strip()}'] = df['genres_list'].apply(
        lambda x: 1 if x is not None and genre in [g.strip() for g in x] else 0
    )

genre_columns = [col for col in df.columns if col.startswith('genre_')]

X_combined = pd.concat([X_numeric, df[genre_columns]], axis=1)

X_train, X_val, y_train, y_val = train_test_split(
    X_combined, y, test_size=0.2, random_state=42
)

imputer = SimpleImputer(strategy='constant', fill_value=0)

X_train = imputer.fit_transform(X_train)
X_val = imputer.transform(X_val)
test_df = pd.read_csv('movie_data_test.csv')

test_df['release_month'] = pd.to_datetime(test_df['release_date']).dt.month
test_df['genres_list'] = test_df['genres'].str.split(',')

X_test_numeric = test_df[numerical_features + ['release_month']]

for genre in unique_genres:
    test_df[f'genre_{genre.strip()}'] = test_df['genres_list'].apply(
        lambda x: 1 if x is not None and genre in [g.strip() for g in x] else 0
    )

X_test = pd.concat([X_test_numeric, test_df[genre_columns]], axis=1)

print(f"Test data shape: {X_test.shape}")

X_test = SimpleImputer(strategy='constant', fill_value=0).fit_transform(X_test)

y_test = test_df['revenue']

model = xgb.XGBRegressor(
            n_estimators=2000,
            learning_rate=0.015,
            max_leaves=10,
            subsample=0.5,
            colsample_bytree=0.6,
            reg_alpha=0.1,
            reg_lambda=0.1,
            random_state=42,
            min_child_weight=40,
            tree_method="hist", 
            verbosity=0
        )

model.fit(
    X_train, y_train,
    eval_set=[(X_val, y_val)],
    verbose=False
)

y_train_pred = model.predict(X_train)
y_val_pred = model.predict(X_val)
y_test_pred = model.predict(X_test)


r2_train = r2_score(y_train, y_train_pred)
r2_val = r2_score(y_val, y_val_pred)
r2_test = r2_score(y_test, y_test_pred)

rmse_train = np.sqrt(mean_squared_error(y_train, y_train_pred))
rmse_val = np.sqrt(mean_squared_error(y_val, y_val_pred))
rmse_test = np.sqrt(mean_squared_error(y_test, y_test_pred))

mae_train = mean_absolute_error(y_train, y_train_pred)
mae_val = mean_absolute_error(y_val, y_val_pred)
mae_test = mean_absolute_error(y_test, y_test_pred)

results = {
    'train': {'r2': r2_train, 'rmse': rmse_train, 'mae': mae_train},
    'val': {'r2': r2_val, 'rmse': rmse_val, 'mae': mae_val},
    'test': {'r2': r2_test, 'rmse': rmse_test, 'mae': mae_test}
}

print(f"  R² Train={r2_train:.4f}, Val={r2_val:.4f}, Test={r2_test:.4f}")
print(f"  RMSE Train={rmse_train:.3e}, Val={rmse_val:.3e}, Test={rmse_test:.3e}")
print(f"  MAE Train={mae_train:.3e}, Val={mae_val:.3e}, Test={mae_test:.3e}")

comment_predictions = y_test_pred

Test data shape: (871, 26)
  R² Train=0.7234, Val=0.5760, Test=0.6084
  RMSE Train=9.814e+07, Val=1.435e+08, Test=1.336e+08
  MAE Train=5.034e+07, Val=6.441e+07, Test=6.304e+07


In [105]:
ensemble_predictions = ensemble_predictions.reshape(-1, 1)
title_predictions = title_predictions.reshape(-1, 1)
trailer_predictions = trailer_predictions.reshape(-1, 1)
comment_predictions = comment_predictions.reshape(-1, 1)

actuals = actuals.reshape(-1,1).to(device)

meta_inputs_np = np.concatenate([ensemble_predictions, title_predictions, trailer_predictions, comment_predictions], axis=1)
meta_inputs = torch.from_numpy(meta_inputs_np).to(device)



In [106]:
class MetaLearner(nn.Module):
    def __init__(self, input_dim=3):
        super().__init__()
        self.mlp = nn.Sequential(
            nn.Linear(input_dim, 16),
            nn.ReLU(),
            nn.Linear(16, 1)  # output: predicted revenue
        )

    def forward(self, x):
        return self.mlp(x)

In [107]:
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
import torch
import torch.nn as nn

def train_meta_model(meta_inputs, actuals, input_dim=3, device="cuda", epochs=10, lr=1e-3):
    model = MetaLearner(input_dim=input_dim).to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    criterion = nn.MSELoss()

    for epoch in range(epochs):
        model.train()
        optimizer.zero_grad()

        outputs = model(meta_inputs)
        loss = criterion(outputs, actuals)
        loss.backward()
        optimizer.step()

        # Evaluation
        model.eval()
        with torch.no_grad():
            preds = model(meta_inputs)

        # Metrics
        preds_np = preds.cpu().numpy()
        actuals_np_eval = actuals.cpu().numpy()

        r2 = r2_score(actuals_np_eval, preds_np)
        rmse = np.sqrt(mean_squared_error(actuals_np_eval, preds_np))
        mae = mean_absolute_error(actuals_np_eval, preds_np)

        print(f"Epoch {epoch+1:02d}: Loss = {loss.item():.4f} | R² = {r2:.4f} | RMSE = {rmse/1000000:.4f} | MAE = {mae/1000000:.4f}")

    return model

In [109]:
meta_model = train_meta_model(
    meta_inputs=meta_inputs,  # shape [N, 3]
    actuals=actuals,          # shape [N]
    input_dim=4,
    device=device,
    epochs=100,
    lr=1e-3
)

Epoch 01: Loss = 13835569276649472.0000 | R² = 0.7030 | RMSE = 116.3953 | MAE = 47.3043
Epoch 02: Loss = 13547861228650496.0000 | R² = 0.7092 | RMSE = 115.1702 | MAE = 46.8203
Epoch 03: Loss = 13264166827589632.0000 | R² = 0.7153 | RMSE = 113.9506 | MAE = 46.3499
Epoch 04: Loss = 12984728739119104.0000 | R² = 0.7214 | RMSE = 112.7357 | MAE = 45.8904
Epoch 05: Loss = 12709336509841408.0000 | R² = 0.7273 | RMSE = 111.5255 | MAE = 45.4343
Epoch 06: Loss = 12437938600148992.0000 | R² = 0.7332 | RMSE = 110.3210 | MAE = 44.9893
Epoch 07: Loss = 12170712177442816.0000 | R² = 0.7389 | RMSE = 109.1217 | MAE = 44.5544
Epoch 08: Loss = 11907552015024128.0000 | R² = 0.7446 | RMSE = 107.9290 | MAE = 44.1232
Epoch 09: Loss = 11648663197581312.0000 | R² = 0.7502 | RMSE = 106.7426 | MAE = 43.6949
Epoch 10: Loss = 11393982374346752.0000 | R² = 0.7557 | RMSE = 105.5629 | MAE = 43.2714
Epoch 11: Loss = 11143533167640576.0000 | R² = 0.7611 | RMSE = 104.3903 | MAE = 42.8607
Epoch 12: Loss = 108973273886228