In [18]:
!pip install requests pandas tqdm pillow torch torchvision scikit-learn xgboost



In [19]:
import torch
from torch.utils.data import Dataset
from PIL import Image
import pandas as pd
from torchvision import transforms
import numpy as np
import os

class MoviePosterDataset(Dataset):
    def __init__(self, csv_file, img_dir):
        df = pd.read_csv(csv_file)

        # chỉ giữ những dòng có file ảnh
        self.df = df[
            df["id"].apply(
                lambda x: os.path.exists(f"{img_dir}/{x}.jpg")
            )
        ].reset_index(drop=True)

        print("After filter:", len(df))

        self.img_dir = img_dir
        self.transform = transforms.Compose([
            transforms.Resize((224, 224)),
            transforms.ToTensor(),
            transforms.Normalize(
                mean=[0.485, 0.456, 0.406],
                std=[0.229, 0.224, 0.225]
            )
        ])

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        img = Image.open(f"{self.img_dir}/{row.id}.jpg").convert("RGB")
        img = self.transform(img)

        y = row.revenue
        return img, torch.tensor(y, dtype=torch.float32)


In [20]:
import torchvision.models as models
import torch.nn as nn

device = torch.device("cuda")

resnet = models.resnet50(pretrained=True)
resnet.fc = nn.Identity()
resnet = resnet.to(device)
resnet.eval()



ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): Bottleneck(
      (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (downsample): Sequential(
        (0): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 

In [21]:
from torch.utils.data import DataLoader
import torch
import numpy as np

dataset = MoviePosterDataset("/kaggle/input/movies/movies_dataset_revenue.csv", "/kaggle/input/posters/kaggle/working/posters")
loader = DataLoader(
    dataset,
    batch_size=64,       # GPU T4 chịu tốt
    shuffle=False,
    num_workers=2,       # Kaggle giới hạn
    pin_memory=True
)

X_img = []

with torch.no_grad():
    for imgs, _ in loader:
        imgs = imgs.to(device)
        emb = resnet(imgs)
        X_img.append(emb.cpu().numpy())

X_img = np.vstack(X_img)



After filter: 11187


In [22]:
df = pd.read_csv("/kaggle/input/movies/movies_dataset_revenue.csv")

valid_ids = dataset.df["id"].values

df_tabular = df[df["id"].isin(valid_ids)].reset_index(drop=True)

X_tabular = np.log1p(df_tabular[["budget"]].fillna(0).values)
y = np.log1p(df_tabular["revenue"].values)
X_full = np.hstack([X_tabular, X_img])

In [23]:
X_train, X_test, y_train, y_test = train_test_split(
    X_full, y, test_size=0.2, random_state=42
)

model_img = XGBRegressor(n_estimators=300, max_depth=5)
model_img.fit(X_train, y_train)

pred = model_img.predict(X_test)
print("RMSE + poster:", mean_squared_error(y_test, pred))


RMSE + poster: 4.39279055018976
