In [19]:
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
import re
from sklearn.preprocessing import LabelEncoder, MultiLabelBinarizer

# get final occurence of a 4 digit number in the title string
def extract_year(title):
    match = re.search(r'\((\d{4})\)$', title)
    return int(match.group(1)) if match else None

def clean_title(title):
    cleaned = re.sub(r'\s*\(\d{4}\)$', '', title).strip()
    if cleaned.endswith(', The'):
        cleaned = 'The ' + cleaned[:-5]
    return cleaned


movies_df = pd.read_csv('../data/raw/ml-32m/movies.csv')
print(f"loaded {len(movies_df):,} movies")

movies_df['year'] = movies_df['title'].apply(extract_year)
movies_df['clean_title'] = movies_df['title'].apply(clean_title)

# take out all movies that don't have years, not a good idea to guess the year
# or leave NaN values, so just deleting
print(f"movies without years: {movies_df['year'].isna().sum()}")
movies_df = movies_df.dropna(subset=['year']).copy()

# was showing as "[year].0" before fix
movies_df['year'] = movies_df['year'].astype(int)

# split genres into list unless no genres listed
movies_df['genre_list'] = movies_df['genres'].apply(
    lambda x: [] if x == '(no genres listed)' else x.split('|')
)

# encode genres into binary for each movie
mlb = MultiLabelBinarizer()
genre_matrix = mlb.fit_transform(movies_df['genre_list'])
genre_columns = [f'genre_{genre.lower().replace("-", "_")}' for genre in mlb.classes_]

genre_df = pd.DataFrame(genre_matrix, columns=genre_columns, index=movies_df.index)
movies_features = pd.concat([
    movies_df[['movieId', 'clean_title', 'year']],
    genre_df
], axis=1)

movies_features = movies_features.drop_duplicates()

print(f"final movies: {len(movies_features):,}")
print(f"genres: {len(genre_columns)}")
print(f"movies with no genres: {(movies_df['genres'] == '(no genres listed)').sum()}")

movies_features.head(20)

loaded 87,585 movies
movies without years: 771
final movies: 86,814
genres: 19
movies with no genres: 6707


Unnamed: 0,movieId,clean_title,year,genre_action,genre_adventure,genre_animation,genre_children,genre_comedy,genre_crime,genre_documentary,...,genre_film_noir,genre_horror,genre_imax,genre_musical,genre_mystery,genre_romance,genre_sci_fi,genre_thriller,genre_war,genre_western
0,1,Toy Story,1995,0,1,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,Jumanji,1995,0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,Grumpier Old Men,1995,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
3,4,Waiting to Exhale,1995,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
4,5,Father of the Bride Part II,1995,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
5,6,Heat,1995,1,0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
6,7,Sabrina,1995,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
7,8,Tom and Huck,1995,0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,9,Sudden Death,1995,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,10,GoldenEye,1995,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [20]:
# load ratings data
ratings_df = pd.read_csv('../data/raw/ml-32m/ratings.csv', nrows=2000000)
print(f"loaded {len(ratings_df):,} ratings")
print(f"users: {ratings_df['userId'].nunique():,}")
print(f"movies: {ratings_df['movieId'].nunique():,}")

# only merge movies with user ratings AND year data
ratings_with_features = ratings_df.merge(movies_features, on='movieId', how='inner')
print(f"ratings with movie features: {len(ratings_with_features):,}")
print(f"movies matched: {ratings_with_features['movieId'].nunique():,}")

ratings_with_features.head()

loaded 2,000,000 ratings
users: 12,773
movies: 36,603
ratings with movie features: 1,997,633
movies matched: 36,350


Unnamed: 0,userId,movieId,rating,timestamp,clean_title,year,genre_action,genre_adventure,genre_animation,genre_children,...,genre_film_noir,genre_horror,genre_imax,genre_musical,genre_mystery,genre_romance,genre_sci_fi,genre_thriller,genre_war,genre_western
0,1,17,4.0,944249077,Sense and Sensibility,1995,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1,1,25,1.0,944250228,Leaving Las Vegas,1995,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
2,1,29,2.0,943230976,"City of Lost Children, The (Cité des enfants p...",1995,0,1,0,0,...,0,0,0,0,1,0,1,0,0,0
3,1,30,5.0,944249077,Shanghai Triad (Yao a yao yao dao waipo qiao),1995,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,32,5.0,943228858,Twelve Monkeys (a.k.a. 12 Monkeys),1995,0,0,0,0,...,0,0,0,0,1,0,1,1,0,0


In [21]:
shuffled = ratings_with_features.sample(frac=1, random_state=42).reset_index(drop=True)
split_idx = int(0.8 * len(shuffled))

train_data = shuffled[:split_idx]
test_data = shuffled[split_idx:]

print(f"split: {len(train_data):,} train, {len(test_data):,} test")

split: 1,598,106 train, 399,527 test
