# Installations & Configuration

In [1]:
!pip install kaggle

import os
import torch
import pandas as pd
import torchvision.transforms as transforms

from PIL import Image
from google.colab import files
from torch.utils.data import Dataset, DataLoader
from google.colab import drive

drive.mount('/content/drive')
kaggle_json_path = '/content/drive/ColabNotebooks/A5/kaggle.json'

# Copy kaggle.json to the correct location
!mkdir -p ~/.kaggle
!cp {kaggle_json_path} ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

# Verify the Kaggle configuration
!kaggle datasets list



Saving kaggle.json to kaggle.json


# Data Setup

In [2]:
# Get the movie posters from kaggle
!kaggle datasets download -d rezaunderfit/48k-imdb-movies-with-posters
!unzip 48k-imdb-movies-with-posters.zip

# Load title basics
tsv_path = '/content/drive/ColabNotebooks/A5/title.basics.tsv'
title_basics = pd.read_csv(tsv_path, sep='\t', na_values='\\N')

# Poster preprocessing pipeline: bring every image to one fixed size, then
# convert it to a tensor.
# NOTE(review): Resize takes (height, width), so this produces images that are
# 182 high by 268 wide — confirm that orientation is intended for the posters.
image_transforms = transforms.Compose(
    [transforms.Resize((182, 268)), transforms.ToTensor()]
)

class MovieDataset(Dataset):
    """Pairs each title's poster image with the index of its first listed genre.

    Args:
        metadata: DataFrame with at least a 'tconst' and a 'genres' column.
        img_dir: Root directory that contains the poster files.
        transform: Optional callable applied to the RGB PIL image.
        genre_to_index: Mapping from genre name to integer label. It needs an
            'Unknown' entry, which is used for missing or unmapped genres.
    """

    def __init__(self, metadata, img_dir, transform=None, genre_to_index=None):
        self.metadata = metadata
        self.img_dir = img_dir
        self.transform = transform
        self.genre_to_index = genre_to_index
        # The extracted archive nests posters below an extra directory level
        # (e.g. Poster/2015/tt3132632/tt3132632.jpg), so the files are located
        # once here instead of guessing the layout on every item access.
        self._poster_paths = self._index_posters(img_dir)

    @staticmethod
    def _index_posters(img_dir):
        """Map each .jpg file stem found anywhere below img_dir to its path."""
        poster_paths = {}
        for dirpath, _dirnames, file_names in os.walk(img_dir):
            for file_name in file_names:
                stem, ext = os.path.splitext(file_name)
                if ext.lower() == '.jpg':
                    # Keep the first hit if the same id shows up more than once.
                    poster_paths.setdefault(stem, os.path.join(dirpath, file_name))
        return poster_paths

    def _path_for(self, tconst):
        """Return the indexed poster path, or the flat-layout path if not indexed."""
        flat_layout = os.path.join(self.img_dir, tconst, f"{tconst}.jpg")
        return self._poster_paths.get(tconst, flat_layout)

    def poster_path(self, idx):
        """Return the path of the poster file for the row at position idx."""
        return self._path_for(self.metadata.iloc[idx]['tconst'])

    def __len__(self):
        """Number of metadata rows."""
        return len(self.metadata)

    def __getitem__(self, idx):
        """Return (image, genre_tensor) for the row at position idx.

        Raises:
            FileNotFoundError: If no poster file exists for the row.
        """
        row = self.metadata.iloc[idx]  # fetch the row once, reuse for both fields
        # The context manager closes the file handle as soon as the pixel data
        # has been copied by convert().
        with Image.open(self._path_for(row['tconst'])) as poster:
            image = poster.convert('RGB')
        if self.transform:
            image = self.transform(image)
        genre_tensor = self.genres_to_tensor(row['genres'])
        return image, genre_tensor

    def genres_to_tensor(self, genres):
        """Convert a comma-separated genre string to a scalar long tensor.

        Only the first genre is used. Missing values and genres that are not in
        the mapping both resolve to the 'Unknown' label.

        Raises:
            KeyError: If the 'Unknown' label is needed but not in the mapping.
        """
        first_genre = genres.split(',')[0] if pd.notna(genres) else 'Unknown'
        # Look up 'Unknown' only when it is needed, so a mapped genre never
        # fails just because the fallback entry is absent.
        if first_genre in self.genre_to_index:
            genre_index = self.genre_to_index[first_genre]
        else:
            genre_index = self.genre_to_index['Unknown']
        return torch.tensor(genre_index, dtype=torch.long)


# Create a mapping from genre to index (first listed genre of every title).
all_genres = set(g.split(',')[0] for g in title_basics['genres'].dropna())
# Number the genres in sorted order: iteration order of a set of strings can
# differ between kernel sessions, which would give the same genre a different
# label after every restart.
genre_to_index = {genre: idx for idx, genre in enumerate(sorted(all_genres))}
# setdefault keeps the labels contiguous even if 'Unknown' is already a genre.
genre_to_index.setdefault('Unknown', len(genre_to_index))

# Directory containing images
img_dir = 'Poster'

# Keep only the titles that have a poster file somewhere below img_dir;
# title_basics may list titles for which no poster was extracted, and those
# rows cannot be loaded. The filter is a no-op when every title has a poster.
poster_ids = {
    os.path.splitext(file_name)[0]
    for _dirpath, _dirnames, file_names in os.walk(img_dir)
    for file_name in file_names
    if file_name.endswith('.jpg')
}
titles_with_posters = title_basics[title_basics['tconst'].isin(poster_ids)].reset_index(drop=True)
assert len(titles_with_posters) > 0, f"No poster below '{img_dir}' matches a tconst in title_basics"

# Create the dataset
movie_dataset = MovieDataset(metadata=titles_with_posters, img_dir=img_dir, transform=image_transforms, genre_to_index=genre_to_index)

# Create the DataLoader
data_loader = DataLoader(movie_dataset, batch_size=32, shuffle=True, num_workers=4)

# Example of using the DataLoader: inspect the first batch only
for images, genres in data_loader:
    print(images.shape)  # Shape: (batch_size, 3, 182, 268)
    print(genres.shape)  # Shape: (batch_size,)
    break

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  inflating: Poster/2015/tt3132632/tt3132632.jpg  
  inflating: Poster/2015/tt3133722/tt3133722.jpg  
  inflating: Poster/2015/tt3136112/tt3136112.jpg  
  inflating: Poster/2015/tt3136646/tt3136646.jpg  
  inflating: Poster/2015/tt3137546/tt3137546.jpg  
  inflating: Poster/2015/tt3137764/tt3137764.jpg  
  inflating: Poster/2015/tt3138192/tt3138192.jpg  
  inflating: Poster/2015/tt3138558/tt3138558.jpg  
  inflating: Poster/2015/tt3139538/tt3139538.jpg  
  inflating: Poster/2015/tt3139764/tt3139764.jpg  
  inflating: Poster/2015/tt3140724/tt3140724.jpg  
  inflating: Poster/2015/tt3142366/tt3142366.jpg  
  inflating: Poster/2015/tt3144266/tt3144266.jpg  
  inflating: Poster/2015/tt3144582/tt3144582.jpg  
  inflating: Poster/2015/tt3148348/tt3148348.jpg  
  inflating: Poster/2015/tt3148502/tt3148502.jpg  
  inflating: Poster/2015/tt3149038/tt3149038.jpg  
  inflating: Poster/2015/tt3150574/tt3150574.jpg  
  inflating: Post

KeyboardInterrupt: 

# Transfer Learning --> Fine Tuning