In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os
import ast
import json
import torch

from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, Trainer, TrainingArguments, AlbertForSequenceClassification, AlbertTokenizer
from sklearn.preprocessing import MultiLabelBinarizer

In [4]:
# Folder that holds the movies dataset; the metadata CSV lives directly inside it.
data_path = 'data/movies'
metadata_csv_path = os.path.join(data_path, 'movies_metadata.csv')

# low_memory=False makes pandas read the file in one pass instead of in chunks.
metadata = pd.read_csv(metadata_csv_path, low_memory=False)

def convert_metadata(metadata):
    """Coerce the date and numeric columns of the metadata frame to proper dtypes.

    ``release_date`` is parsed to datetime and ``budget``, ``revenue`` and
    ``runtime`` are parsed to numbers. Values that cannot be parsed become
    NaT / NaN (``errors='coerce'``) instead of raising.

    The frame is modified in place and the same object is returned.
    """
    metadata['release_date'] = pd.to_datetime(metadata['release_date'], errors='coerce')

    # The three numeric columns share the exact same conversion.
    for numeric_column in ('budget', 'revenue', 'runtime'):
        metadata[numeric_column] = pd.to_numeric(metadata[numeric_column], errors='coerce')

    return metadata

metadata = convert_metadata(metadata)

# Keep only rows where every required field is present after coercion.
# A single dropna(subset=...) is equivalent to the chain of notnull filters,
# and .copy() gives an independent frame so the column assignments below do
# not raise SettingWithCopyWarning on pandas versions without copy-on-write.
required_columns = ['title', 'budget', 'revenue', 'runtime', 'release_date']
metadata = metadata.dropna(subset=required_columns).copy()

# release_date is already datetime after convert_metadata, so it is not re-parsed here.
metadata['year'] = metadata['release_date'].dt.year.astype('Int64')

# 'genres' is stored as the string form of a list of dicts; parse it and keep only each genre's name.
metadata['genre_list'] = metadata['genres'].apply(lambda x: [genre['name'] for genre in ast.literal_eval(x)])
metadata.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 45130 entries, 0 to 45465
Data columns (total 26 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   adult                  45130 non-null  object        
 1   belongs_to_collection  4481 non-null   object        
 2   budget                 45130 non-null  float64       
 3   genres                 45130 non-null  object        
 4   homepage               7766 non-null   object        
 5   id                     45130 non-null  object        
 6   imdb_id                45118 non-null  object        
 7   original_language      45119 non-null  object        
 8   original_title         45130 non-null  object        
 9   overview               44435 non-null  object        
 10  popularity             45130 non-null  object        
 11  poster_path            44808 non-null  object        
 12  production_companies   45130 non-null  object        
 13  p

In [5]:
!pip install transformers torch accelerate -U




In [9]:
import torch
from transformers import AlbertForSequenceClassification, AlbertTokenizer, Trainer, TrainingArguments
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split

# Convert genre_list to multi-label format: one row per movie, one 0/1 column per genre.
# mlb.classes_ keeps the column -> genre name mapping.
mlb = MultiLabelBinarizer()
y = mlb.fit_transform(metadata['genre_list'])

# Load the ALBERT tokenizer and a classification model with one logit per genre.
# problem_type is stated explicitly so the multi-label loss is used by design
# rather than being inferred from the dtype of the labels.
tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
model = AlbertForSequenceClassification.from_pretrained(
    'albert-base-v2',
    num_labels=len(mlb.classes_),
    problem_type="multi_label_classification",
)

# Tokenize the movie titles (padding=True pads every title to the longest one in the batch)
tokenized_inputs = tokenizer(list(metadata['title']), truncation=True, padding=True, return_tensors="pt")
labels = torch.tensor(y, dtype=torch.float32)  # Multi-hot targets as float32 for the multi-label loss

# Split the data into training and evaluation sets (80/20, fixed seed for reproducibility).
# Only input_ids are split here; the attention mask is not carried through the split.
train_inputs, eval_inputs, train_labels, eval_labels = train_test_split(
    tokenized_inputs['input_ids'], labels, test_size=0.2, random_state=42
)

# Define a custom dataset class for training and evaluation
class MovieGenreDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing token-id tensors with multi-hot label tensors.

    Args:
        encodings: Indexable collection of token-id tensors, one per example.
        labels: Indexable collection of label tensors aligned with ``encodings``.
        pad_token_id: Token id that marks padding. Positions holding this id
            get attention mask 0, all others get 1. The default of 0 keeps the
            earlier behaviour, where the mask was ``input_ids.bool()``.
    """

    def __init__(self, encodings, labels, pad_token_id=0):
        self.encodings = encodings
        self.labels = labels
        self.pad_token_id = pad_token_id

    def __getitem__(self, idx):
        """Return the model inputs (ids, attention mask) and labels for example ``idx``."""
        input_ids = self.encodings[idx]
        item = {
            'input_ids': input_ids,
            # Derive the mask from the padding id explicitly, as an integer 0/1 tensor,
            # instead of relying on the padding id happening to be 0.
            'attention_mask': (input_ids != self.pad_token_id).long(),
            'labels': self.labels[idx]
        }
        return item

    def __len__(self):
        """Number of examples, taken from the labels."""
        return len(self.labels)

# Wrap the evaluation and training tensors in the custom dataset
eval_dataset = MovieGenreDataset(eval_inputs, eval_labels)
train_dataset = MovieGenreDataset(train_inputs, train_labels)

# Training configuration: 2 epochs, 64 examples per device per step,
# with an evaluation pass at the end of every epoch
training_args = TrainingArguments(
    output_dir="./results",
    logging_dir="./logs",
    num_train_epochs=2,
    per_device_train_batch_size=64,
    evaluation_strategy="epoch",
)

# Hand the model, the configuration and both datasets to the Trainer
trainer = Trainer(
    args=training_args,
    model=model,
    eval_dataset=eval_dataset,
    train_dataset=train_dataset,
)

# Run the fine-tuning loop
trainer.train()

# Write the fine-tuned model to disk
model.save_pretrained("./fine_tuned_albert_model")


Some weights of AlbertForSequenceClassification were not initialized from the model checkpoint at albert-base-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,0.2876,0.267689
2,0.2663,0.259359


In [20]:
test_title = "Toy Story"

# Use the GPU when one is available, otherwise fall back to the CPU
# (a hard-coded 'cuda' device fails on CPU-only machines).
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

inputs = tokenizer(test_title, return_tensors="pt", truncation=True, padding=True)
inputs = inputs.to(device)  # Move inputs to the selected device

model.to(device)  # Move model to the selected device
model.eval()  # Inference mode: disables dropout so predictions are deterministic
with torch.no_grad():  # No gradients are needed for prediction
    outputs = model(**inputs)
predicted_scores = torch.sigmoid(outputs.logits).cpu().numpy()  # Move predictions back to CPU

# Convert scores to genre labels
# NOTE: sigmoid scores are positive, so a threshold of 0.0 keeps every genre;
# raise it (e.g. to 0.5) to keep only the confident predictions.
threshold = 0.0  # Adjust threshold as needed
predicted_indices = predicted_scores[0] > threshold
predicted_genres = mlb.classes_[predicted_indices]
predicted_probabilities = predicted_scores[0][predicted_indices]

# Create a list of tuples (genre, probability) and sort by probability in descending order
genre_prob_pairs = list(zip(predicted_genres, predicted_probabilities))
sorted_genre_prob_pairs = sorted(genre_prob_pairs, key=lambda x: x[1], reverse=True)

# Print the sorted genres and probabilities
for genre, probability in sorted_genre_prob_pairs:
    print(f"Genre: {genre}, Probability: {probability:.4f}")

Genre: Drama, Probability: 0.5135
Genre: Comedy, Probability: 0.4347
Genre: Romance, Probability: 0.2279
Genre: Documentary, Probability: 0.0789
Genre: Thriller, Probability: 0.0706
Genre: Crime, Probability: 0.0596
Genre: Action, Probability: 0.0591
Genre: Family, Probability: 0.0570
Genre: Music, Probability: 0.0436
Genre: Horror, Probability: 0.0364
Genre: Foreign, Probability: 0.0364
Genre: Adventure, Probability: 0.0349
Genre: Fantasy, Probability: 0.0307
Genre: Mystery, Probability: 0.0300
Genre: History, Probability: 0.0262
Genre: Animation, Probability: 0.0223
Genre: Science Fiction, Probability: 0.0220
Genre: War, Probability: 0.0205
Genre: Western, Probability: 0.0163
Genre: TV Movie, Probability: 0.0160
