# duplicate of notebook 05 but intended to export the final model

### training on 5 million ratings (all my laptop could muster)

In [6]:
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
import re
from sklearn.preprocessing import LabelEncoder, MultiLabelBinarizer

# get final occurence of a 4 digit number in the title string
def extract_year(title):
    match = re.search(r'\((\d{4})\)$', title)
    return int(match.group(1)) if match else None

def clean_title(title):
    cleaned = re.sub(r'\s*\(\d{4}\)$', '', title).strip()
    if cleaned.endswith(', The'):
        cleaned = 'The ' + cleaned[:-5]
    return cleaned

movies_df = pd.read_csv('../data/raw/ml-32m/movies.csv')
print(f"loaded {len(movies_df):,} movies")

movies_df['year'] = movies_df['title'].apply(extract_year)
movies_df['clean_title'] = movies_df['title'].apply(clean_title)

# take out all movies that don't have years, not a good idea to guess the year
# or leave NaN values, so just deleting
print(f"movies without years: {movies_df['year'].isna().sum()}")
movies_df = movies_df.dropna(subset=['year']).copy()

# was showing as "[year].0" before fix
movies_df['year'] = movies_df['year'].astype(int)

# split genres into list unless no genres listed
movies_df['genre_list'] = movies_df['genres'].apply(
    lambda x: [] if x == '(no genres listed)' else x.split('|')
)

# encode genres into binary for each movie
mlb = MultiLabelBinarizer()
genre_matrix = mlb.fit_transform(movies_df['genre_list'])
genre_columns = [f'genre_{genre.lower().replace("-", "_")}' for genre in mlb.classes_]

genre_df = pd.DataFrame(genre_matrix, columns=genre_columns, index=movies_df.index)
movies_features = pd.concat([
    movies_df[['movieId', 'clean_title', 'year']],
    genre_df
], axis=1)

movies_features = movies_features.drop_duplicates()

print(f"final movies: {len(movies_features):,}")
print(f"genres: {len(genre_columns)}")
print(f"movies with no genres: {(movies_df['genres'] == '(no genres listed)').sum()}")

movies_features.head(20)

loaded 87,585 movies
movies without years: 771
final movies: 86,814
genres: 19
movies with no genres: 6707


Unnamed: 0,movieId,clean_title,year,genre_action,genre_adventure,genre_animation,genre_children,genre_comedy,genre_crime,genre_documentary,...,genre_film_noir,genre_horror,genre_imax,genre_musical,genre_mystery,genre_romance,genre_sci_fi,genre_thriller,genre_war,genre_western
0,1,Toy Story,1995,0,1,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,Jumanji,1995,0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,Grumpier Old Men,1995,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
3,4,Waiting to Exhale,1995,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
4,5,Father of the Bride Part II,1995,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
5,6,Heat,1995,1,0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
6,7,Sabrina,1995,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
7,8,Tom and Huck,1995,0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,9,Sudden Death,1995,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,10,GoldenEye,1995,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [None]:
# load last 10 million ratings for recency
ratings_original_df = pd.read_csv('../data/raw/ml-32m/ratings.csv')
ratings_df = ratings_original_df.tail(10_000_000)
print(f"loaded {len(ratings_df):,} ratings")
print(f"users: {ratings_df['userId'].nunique():,}")
print(f"movies: {ratings_df['movieId'].nunique():,}")

shuffled_ratings = ratings_df.sample(frac=1, random_state=42).reset_index(drop=True)


# only merge movies with user ratings AND year data
ratings_with_features = ratings_df.merge(movies_features, on='movieId', how='inner')

# encode users and movies to continuous indices
user_encoder = LabelEncoder()
movie_encoder = LabelEncoder()

ratings_with_features['user_idx'] = user_encoder.fit_transform(ratings_with_features['userId'])
ratings_with_features['movie_idx'] = movie_encoder.fit_transform(ratings_with_features['movieId'])

# get decade values, a bit easier to work with than individual years
ratings_with_features['decade'] = (ratings_with_features['year'] // 10) * 10
decade_encoder = LabelEncoder()

# sequential indexes for decades
ratings_with_features['decade_idx'] = decade_encoder.fit_transform(ratings_with_features['decade'])

print(f"ratings with movie features: {len(ratings_with_features):,}")
print(f"movies matched: {ratings_with_features['movieId'].nunique():,}")

ratings_with_features.tail()

loaded 10,000,000 ratings
users: 62,609
movies: 60,588
ratings with movie features: 9,988,308
movies matched: 60,133


Unnamed: 0,userId,movieId,rating,timestamp,clean_title,year,genre_action,genre_adventure,genre_animation,genre_children,...,genre_mystery,genre_romance,genre_sci_fi,genre_thriller,genre_war,genre_western,user_idx,movie_idx,decade,decade_idx
9988303,62609,1272,4.0,945824540,Patton,1970,0,0,0,0,...,0,0,0,0,1,0,62608,1238,1970,10
9988304,62609,1292,4.0,945824479,Being There,1979,0,0,0,0,...,0,0,0,0,0,0,62608,1258,1970,10
9988305,62609,1527,5.0,945824665,The Fifth Element,1997,1,1,0,0,...,0,0,1,0,0,0,62608,1474,1990,12
9988306,62609,1573,4.0,945824719,Face/Off,1997,1,0,0,0,...,0,0,0,1,0,0,62608,1516,1990,12
9988307,62609,1617,4.0,945824479,L.A. Confidential,1997,0,0,0,0,...,1,0,0,1,0,0,62608,1558,1990,12


In [8]:
n_users = len(user_encoder.classes_)
n_movies = len(movie_encoder.classes_)
n_genres = len([col for col in ratings_with_features.columns if col.startswith('genre_')])
n_decades = len(decade_encoder.classes_)

print(decade_encoder.classes_)

print(f"users: {n_users:,}")
print(f"movies: {n_movies:,}")
print(f"genres: {n_genres}")
print(f"decades: {n_decades}")
print(f"ratings: {len(ratings_with_features):,}")

[1870 1880 1890 1900 1910 1920 1930 1940 1950 1960 1970 1980 1990 2000
 2010 2020]
users: 62,609
movies: 60,133
genres: 19
decades: 16
ratings: 9,988,308


In [6]:
split_idx = int(0.8 * len(ratings_with_features))
train_data = ratings_with_features[:split_idx]
test_data = ratings_with_features[split_idx:]

print(f"split: {len(train_data):,} train, {len(test_data):,} test")

# create tensors
train_users = torch.tensor(train_data['user_idx'].values, dtype=torch.long)
train_movies = torch.tensor(train_data['movie_idx'].values, dtype=torch.long)
train_ratings = torch.tensor(train_data['rating'].values, dtype=torch.float)
train_decades = torch.tensor(train_data['decade_idx'].values, dtype=torch.long)
train_genres = torch.tensor(train_data.filter(regex='^genre_').values, dtype=torch.float)

test_users = torch.tensor(test_data['user_idx'].values, dtype=torch.long)
test_movies = torch.tensor(test_data['movie_idx'].values, dtype=torch.long)
test_ratings = torch.tensor(test_data['rating'].values, dtype=torch.float)
test_decades = torch.tensor(test_data['decade_idx'].values, dtype=torch.long)
test_genres = torch.tensor(test_data.filter(regex='^genre_').values, dtype=torch.float)

print(f"training tensor shapes:")
print(f"users: {train_users.shape}")
print(f"movies: {train_movies.shape}")
print(f"ratings: {train_ratings.shape}")
print(f"decades: {train_decades.shape}")
print(f"genres: {train_genres.shape}")

split: 7,990,612 train, 1,997,653 test
training tensor shapes:
users: torch.Size([7990612])
movies: torch.Size([7990612])
ratings: torch.Size([7990612])
decades: torch.Size([7990612])
genres: torch.Size([7990612, 19])


In [7]:
global_bias = train_ratings.mean()

class MatrixFactorization(nn.Module):
    def __init__(self, n_users, n_movies, n_genres, n_decades, n_factors, dropout=0.2):
        super().__init__()

        # collaborative features
        self.user_factors = nn.Embedding(n_users, n_factors)
        self.movie_factors = nn.Embedding(n_movies, n_factors)
        self.user_biases = nn.Embedding(n_users, 1)
        self.movie_biases = nn.Embedding(n_movies, 1)
        self.global_bias = nn.Parameter(torch.tensor([global_bias]))
        
        # content features
        self.user_genre_prefs = nn.Embedding(n_users, n_genres)
        self.user_decade_prefs = nn.Embedding(n_users, n_decades)

        self.dropout = nn.Dropout(dropout)

        self.user_factors.weight.data.normal_(0, 0.1)
        self.movie_factors.weight.data.normal_(0, 0.1)
        self.user_biases.weight.data.normal_(0, 0.01)
        self.movie_biases.weight.data.normal_(0, 0.01)
        self.user_genre_prefs.weight.data.normal_(0, 0.01)
        self.user_decade_prefs.weight.data.normal_(0, 0.01)

    def forward(self, user_ids, movie_ids, genre_features, decade_indices):
        user_vec = self.dropout(self.user_factors(user_ids))
        movie_vec = self.dropout(self.movie_factors(movie_ids))
        collaborative_score = (user_vec * movie_vec).sum(dim=1)

        user_genre_pref = self.user_genre_prefs(user_ids)
        genre_score = (user_genre_pref * genre_features).sum(dim=1)

        user_decade_pref = self.user_decade_prefs(user_ids)
        decade_score = user_decade_pref.gather(1, decade_indices.unsqueeze(1)).squeeze()

        prediction = (
            self.global_bias + 
            self.user_biases(user_ids).squeeze() +
            self.movie_biases(movie_ids).squeeze() +
            collaborative_score + 
            genre_score +
            decade_score
        )

        return prediction

In [8]:
print(f"users={n_users}, movies={n_movies}, genres={n_genres}, decades={n_decades}")
model = MatrixFactorization(n_users=n_users, n_movies=n_movies, n_genres=n_genres, n_decades=n_decades, n_factors=35)

optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=1e-4)
loss_fn = nn.MSELoss()

print(f"model: {n_users:,} users, {n_movies:,} movies, embedding_dim=35")
print(f"total parameters: {sum(p.numel() for p in model.parameters()):,}")

# initial predictions
with torch.no_grad():
    initial_pred = model(train_users[:100], train_movies[:100], train_genres[:100], train_decades[:100])
    print(f"sample initial predictions: {initial_pred[:5]}")
    print(f"sample actual ratings: {train_ratings[:5]}")

users=63199, movies=64719, genres=19, decades=16
model: 63,199 users, 64,719 movies, embedding_dim=35
total parameters: 6,817,014
sample initial predictions: tensor([3.5147, 3.5827, 3.7297, 3.6400, 3.6438])
sample actual ratings: tensor([2.5000, 2.5000, 4.5000, 2.5000, 4.0000])


In [None]:
print("starting training...")

with torch.no_grad():
    initial_pred = model(train_users, train_movies, train_genres, train_decades)
    initial_loss = loss_fn(initial_pred, train_ratings)
    print(f"initial loss: {initial_loss:.4f}")

for epoch in range(100):
    model.train()
    optimizer.zero_grad()
    
    predictions = model(train_users, train_movies, train_genres, train_decades)
    loss = loss_fn(predictions, train_ratings)
    
    loss.backward()
    optimizer.step()
    
    if epoch % 1 == 0:
        print(f"epoch {epoch}, loss: {loss.item():.4f}")

print("training done")

starting training...
initial loss: 1.1223


In [None]:
# evaluate on test set
model.eval()
with torch.no_grad():
    train_pred = model(train_users, train_movies, train_genres, train_decades)
    test_pred = model(test_users, test_movies, test_genres, test_decades)
    
    train_loss = loss_fn(train_pred, train_ratings)
    test_loss = loss_fn(test_pred, test_ratings)
    test_rmse = torch.sqrt(test_loss)
    train_rmse = torch.sqrt(train_loss)
    
    print(f"\nfinal results:")
    print(f"train loss: {train_loss:.4f}")
    print(f"test loss: {test_loss:.4f}")
    print(f"test RMSE: {test_rmse:.4f}")
    print(f"train RMSE: {train_rmse:.4f}")
    
    print(f"\nsample predictions vs actual (20 samples from testset):")
    for i in range(20):
        print(f"predicted: {test_pred[i]:.2f}, actual: {test_ratings[i]:.2f}")