# Installations & Configuration

In [3]:
# Install the Kaggle command-line client; the notebook later calls
# `kaggle datasets download` to fetch the poster archive.
!pip install kaggle

import os
import torch
import pandas as pd
import torchvision.transforms as transforms

from PIL import Image
from google.colab import files
from torch.utils.data import Dataset, DataLoader


# Kaggle API Config (Please Upload your kaggle.json file below 👇)
# files.upload() returns a mapping of file name -> file bytes. The bytes are
# written straight to ~/.kaggle/kaggle.json instead of copying ./kaggle.json:
# when a file with that name already exists in the working directory, Colab
# stores the new upload under a different name (e.g. "kaggle (2).json"), so a
# plain `cp kaggle.json` would install a stale credentials file.
uploaded = files.upload()
if not uploaded:
    raise RuntimeError('No file was uploaded; kaggle.json is required to use the Kaggle API.')

kaggle_dir = os.path.expanduser('~/.kaggle')
os.makedirs(kaggle_dir, exist_ok=True)
kaggle_json_path = os.path.join(kaggle_dir, 'kaggle.json')
with open(kaggle_json_path, 'wb') as credentials_file:
    # Only one file is expected; use the first uploaded payload.
    credentials_file.write(next(iter(uploaded.values())))
# Restrict the credentials to owner read/write (equivalent to `chmod 600`).
os.chmod(kaggle_json_path, 0o600)






Saving kaggle.json to kaggle (2).json
Dataset URL: https://www.kaggle.com/datasets/rezaunderfit/48k-imdb-movies-with-posters
License(s): CC0-1.0
Downloading 48k-imdb-movies-with-posters.zip to /content
 99% 617M/621M [00:24<00:00, 30.3MB/s]
100% 621M/621M [00:24<00:00, 26.9MB/s]
unzip:  cannot find or open archive.zip, archive.zip.zip or archive.zip.ZIP.


# Data Setup

In [None]:
# Get the movie posters from kaggle
!kaggle datasets download -d rezaunderfit/48k-imdb-movies-with-posters
!unzip 48k-imdb-movies-with-posters.zip

# Now upload the title.basics.tsv file
files.upload()
title_basics = pd.read_csv('title.basics.tsv', sep='\t', na_values='\\N')

# Preprocessing pipeline applied to every poster: resize, then convert the
# PIL image to a tensor.
# NOTE(review): torchvision's Resize takes (height, width), so posters come out
# 182 px tall by 268 px wide — confirm this orientation is intended.
image_transforms = transforms.Compose(
    [
        transforms.Resize(size=(182, 268)),
        transforms.ToTensor(),
    ]
)

class MovieDataset(Dataset):
    """Map-style dataset yielding ``(poster_image, genre_index)`` pairs.

    Each item is read from ``<img_dir>/<tconst>/<tconst>.jpg`` and labelled
    with the index of the first genre listed in the row's ``genres`` column.

    Args:
        metadata: Table with at least the ``tconst`` and ``genres`` columns.
        img_dir: Root directory holding one sub-directory per ``tconst``.
        transform: Optional callable applied to the RGB ``PIL.Image``.
        genre_to_index: Mapping from genre name to class index. Missing or
            unlisted genres fall back to its ``'Unknown'`` entry. When
            omitted, a mapping is derived from ``metadata`` (first genre of
            every row, sorted alphabetically, followed by ``'Unknown'``).
    """

    def __init__(self, metadata, img_dir, transform=None, genre_to_index=None):
        self.metadata = metadata
        self.img_dir = img_dir
        self.transform = transform
        if genre_to_index is None:
            # Previously a missing mapping only failed later, inside
            # genres_to_tensor; build a usable default instead.
            genre_to_index = self._build_genre_index(metadata)
        self.genre_to_index = genre_to_index

    @staticmethod
    def _build_genre_index(metadata):
        """Derive a deterministic genre -> index mapping from ``metadata``."""
        first_genres = sorted({g.split(',')[0] for g in metadata['genres'].dropna()})
        mapping = {genre: idx for idx, genre in enumerate(first_genres)}
        mapping.setdefault('Unknown', len(mapping))
        return mapping

    def __len__(self) -> int:
        """Return the number of rows in ``metadata``."""
        return len(self.metadata)

    def __getitem__(self, idx):
        """Return the (transformed) poster and the genre index for row ``idx``.

        Raises:
            FileNotFoundError: If the poster file for the row does not exist.
        """
        # Fetch the row once; each ``iloc`` call materialises a new Series.
        row = self.metadata.iloc[idx]
        tconst = row['tconst']
        img_path = os.path.join(self.img_dir, tconst, f"{tconst}.jpg")
        # ``convert`` returns an independent image, so the file handle can be
        # released as soon as the pixels are decoded.
        with Image.open(img_path) as poster:
            image = poster.convert('RGB')
        if self.transform:
            image = self.transform(image)
        genre_tensor = self.genres_to_tensor(row['genres'])
        return image, genre_tensor

    def genres_to_tensor(self, genres) -> torch.Tensor:
        """Encode the first genre of a comma-separated string as a long tensor.

        Missing values (NaN/None) and genres absent from the mapping are
        encoded with the index of ``'Unknown'``.

        Raises:
            KeyError: If the fallback is needed but the mapping has no
                ``'Unknown'`` entry.
        """
        first_genre = genres.split(',')[0] if pd.notna(genres) else 'Unknown'
        genre_index = self.genre_to_index.get(first_genre)
        if genre_index is None:
            # Resolve the fallback lazily so that a mapping without 'Unknown'
            # still works for every genre it does contain.
            genre_index = self.genre_to_index['Unknown']
        return torch.tensor(genre_index, dtype=torch.long)


# Create a mapping from genre to index
# Only the first genre of each title is used as its label. The names are sorted
# before being numbered: iterating a set of strings has no stable order across
# Python sessions, which would silently change each genre's class index from
# one run to the next (and mismatch any previously saved model).
all_genres = {g.split(',')[0] for g in title_basics['genres'].dropna()}
genre_to_index = {genre: idx for idx, genre in enumerate(sorted(all_genres))}
# Extra class for titles whose genre is missing or not in the mapping.
genre_to_index['Unknown'] = len(genre_to_index)

# Root folder of the extracted poster images
img_dir = 'Poster'

# Wrap the metadata table and the poster folder in a Dataset.
# NOTE(review): every row of title_basics becomes a sample — confirm that each
# row has a matching poster under img_dir, otherwise loading that row fails.
movie_dataset = MovieDataset(
    title_basics,
    img_dir,
    transform=image_transforms,
    genre_to_index=genre_to_index,
)

# Serve shuffled batches of 32 samples using 4 worker processes
data_loader = DataLoader(
    dataset=movie_dataset,
    batch_size=32,
    num_workers=4,
    shuffle=True,
)

# Smoke test: fetch a single batch and show the tensor shapes
for images, genres in data_loader:
    # First line:  (batch_size, 3, 182, 268)
    # Second line: (batch_size,)
    print(images.shape, genres.shape, sep='\n')
    break

# Transfer Learning --> Fine Tuning