# Section 1: Imports & Setup

In [1]:
import pandas as pd
import numpy as np
import os
import torch
from sklearn.preprocessing import LabelEncoder
from transformers import CLIPProcessor, CLIPModel
from torchvision import transforms
from PIL import Image
import matplotlib.pyplot as plt

# Set device

In [3]:
# Run on the GPU when torch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(device)

cuda


# Import and clean data

In [4]:
# Load the MovieLens ratings and the links table (its movieId and imdbId
# columns are the ones used below).
ratings = pd.read_csv('raw_data/ratings.csv')
links = pd.read_csv('raw_data/links.csv')

# Every movie that has at least one rating.
movie_ids = ratings['movieId'].unique()

# Boolean mask over `links` marking the rated movies.
valid_movie_ids = links['movieId'].isin(movie_ids)

# IMDb ids of the rated movies as strings, left-padded with zeros to at
# least 7 characters (longer ids are left unchanged by zfill).
imdb_ids = links.loc[valid_movie_ids, 'imdbId'].astype(str).str.zfill(7).tolist()

In [5]:
# read imdb5000 metadata
metadata_raw = pd.read_csv('raw_data/movie_metadata.csv')

# Keep only the metadata rows whose movie has user ratings.
# The IMDb id is taken as the run of digits that follows "/title/tt" in the
# link instead of a fixed [28:35] character slice, so the match does not
# depend on the exact scheme/host prefix or on the id being exactly 7 digits.
# NOTE(review): assumes links look like ".../title/tt<digits>/..." (what the
# original fixed slice implied) -- confirm against the CSV.
# Rows with a missing or unparsable link yield NaN and are dropped by isin().
# .copy() makes `metadata` an independent frame so that later column
# assignments do not operate on a slice of `metadata_raw`.
metadata = metadata_raw[
    metadata_raw['movie_imdb_link']
    .str.extract(r'/title/tt(\d+)', expand=False)
    .isin(imdb_ids)
].copy()

In [6]:
# Load the poster table (the file is decoded as latin1).
posters = pd.read_csv('raw_data/movie_posters.csv', encoding='latin1')

# Convert imdbId to the same zero-padded string form used in `imdb_ids`,
# then keep only the posters whose movie has user ratings.
posters = posters.assign(imdbId=posters['imdbId'].astype(str).str.zfill(7))
posters = posters.loc[lambda frame: frame['imdbId'].isin(imdb_ids)]

In [7]:
# read plots
plots = pd.read_csv('raw_data/movie_plots.csv')

# get movie titles with trailing whitespace removed.
# The original code dropped the last character of every title unconditionally
# (presumably a trailing padding character in this CSV -- confirm); rstrip()
# removes that padding when present but never eats a real letter from a
# title that is already clean.
titles = metadata['movie_title'].str.rstrip().tolist()

# get plots only from valid movie titles
# NOTE(review): this matches on title text alone, so distinct movies that
# share a title cannot be told apart here.
plots = plots[plots['Title'].isin(titles)]

# Preprocessing

In [11]:
# Show which columns the filtered metadata frame carries.
print(metadata.columns)

Index(['color', 'director_name', 'num_critic_for_reviews', 'duration',
       'director_facebook_likes', 'actor_3_facebook_likes', 'actor_2_name',
       'actor_1_facebook_likes', 'gross', 'genres', 'actor_1_name',
       'movie_title', 'num_voted_users', 'cast_total_facebook_likes',
       'actor_3_name', 'facenumber_in_poster', 'plot_keywords',
       'movie_imdb_link', 'num_user_for_reviews', 'language', 'country',
       'content_rating', 'budget', 'title_year', 'actor_2_facebook_likes',
       'imdb_score', 'aspect_ratio', 'movie_facebook_likes'],
      dtype='object')
