# duplicate of notebook 05 but intended to export the final model

### training on 5 million ratings (all my laptop could muster)

In [2]:
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
import re
from sklearn.preprocessing import LabelEncoder, MultiLabelBinarizer

# get final occurence of a 4 digit number in the title string
def extract_year(title):
    match = re.search(r'\((\d{4})\)$', title)
    return int(match.group(1)) if match else None

def clean_title(title):
    cleaned = re.sub(r'\s*\(\d{4}\)$', '', title).strip()
    if cleaned.endswith(', The'):
        cleaned = 'The ' + cleaned[:-5]
    return cleaned

movies_df = pd.read_csv('../data/raw/ml-32m/movies.csv')
print(f"loaded {len(movies_df):,} movies")

movies_df['year'] = movies_df['title'].apply(extract_year)
movies_df['clean_title'] = movies_df['title'].apply(clean_title)

# take out all movies that don't have years, not a good idea to guess the year
# or leave NaN values, so just deleting
print(f"movies without years: {movies_df['year'].isna().sum()}")
movies_df = movies_df.dropna(subset=['year']).copy()

# was showing as "[year].0" before fix
movies_df['year'] = movies_df['year'].astype(int)

# split genres into list unless no genres listed
movies_df['genre_list'] = movies_df['genres'].apply(
    lambda x: [] if x == '(no genres listed)' else x.split('|')
)

# encode genres into binary for each movie
mlb = MultiLabelBinarizer()
genre_matrix = mlb.fit_transform(movies_df['genre_list'])
genre_columns = [f'genre_{genre.lower().replace("-", "_")}' for genre in mlb.classes_]

genre_df = pd.DataFrame(genre_matrix, columns=genre_columns, index=movies_df.index)
movies_features = pd.concat([
    movies_df[['movieId', 'clean_title', 'year']],
    genre_df
], axis=1)

movies_features = movies_features.drop_duplicates()

print(f"final movies: {len(movies_features):,}")
print(f"genres: {len(genre_columns)}")
print(f"movies with no genres: {(movies_df['genres'] == '(no genres listed)').sum()}")

movies_features.head(20)

loaded 87,585 movies
movies without years: 771
final movies: 86,814
genres: 19
movies with no genres: 6707


Unnamed: 0,movieId,clean_title,year,genre_action,genre_adventure,genre_animation,genre_children,genre_comedy,genre_crime,genre_documentary,...,genre_film_noir,genre_horror,genre_imax,genre_musical,genre_mystery,genre_romance,genre_sci_fi,genre_thriller,genre_war,genre_western
0,1,Toy Story,1995,0,1,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,Jumanji,1995,0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,Grumpier Old Men,1995,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
3,4,Waiting to Exhale,1995,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
4,5,Father of the Bride Part II,1995,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
5,6,Heat,1995,1,0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
6,7,Sabrina,1995,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
7,8,Tom and Huck,1995,0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,9,Sudden Death,1995,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,10,GoldenEye,1995,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [3]:
# load last 10 million ratings for recency
ratings_original_df = pd.read_csv('../data/raw/ml-32m/ratings.csv')
ratings_df = ratings_original_df.tail(10_000_000)
print(f"loaded {len(ratings_df):,} ratings")
print(f"users: {ratings_df['userId'].nunique():,}")
print(f"movies: {ratings_df['movieId'].nunique():,}")

shuffled_ratings = ratings_df.sample(frac=1, random_state=42).reset_index(drop=True)

# only merge movies with user ratings AND year data
ratings_with_features = ratings_df.merge(movies_features, on='movieId', how='inner')

# encode users and movies to continuous indices
user_encoder = LabelEncoder()
movie_encoder = LabelEncoder()

ratings_with_features['user_idx'] = user_encoder.fit_transform(ratings_with_features['userId'])
ratings_with_features['movie_idx'] = movie_encoder.fit_transform(ratings_with_features['movieId'])


user_counts = ratings_with_features['userId'].value_counts()
movie_counts = ratings_with_features['movieId'].value_counts()

active_users = user_counts[user_counts >= 20].index
reviewed_movies= movie_counts[movie_counts >= 50].index

filtered_ratings = ratings_with_features[
    (ratings_with_features['userId'].isin(active_users)) &
    (ratings_with_features['movieId'].isin(reviewed_movies))
].copy()

print(f"ratings with movie features: {len(filtered_ratings):,}")
print(f"movies matched: {filtered_ratings['movieId'].nunique():,}")

filtered_ratings.tail()

loaded 10,000,000 ratings
users: 63,199
movies: 65,206
ratings with movie features: 9,669,907
movies matched: 10,090


Unnamed: 0,userId,movieId,rating,timestamp,clean_title,year,genre_action,genre_adventure,genre_animation,genre_children,...,genre_imax,genre_musical,genre_mystery,genre_romance,genre_sci_fi,genre_thriller,genre_war,genre_western,user_idx,movie_idx
9988260,200948,79702,4.5,1294412589,Scott Pilgrim vs. the World,2010,1,0,0,0,...,0,1,0,1,0,0,0,0,63198,14897
9988261,200948,79796,1.0,1287216292,Centurion,2010,1,1,0,0,...,0,0,0,0,0,1,1,0,63198,14916
9988262,200948,80350,0.5,1294412671,Vampires Suck,2010,0,0,0,0,...,0,0,0,0,0,0,0,0,63198,15041
9988263,200948,80463,3.5,1350423800,The Social Network,2010,0,0,0,0,...,0,0,0,0,0,0,0,0,63198,15068
9988264,200948,87304,4.5,1350423523,Beginners,2010,0,0,0,0,...,0,0,0,0,0,0,0,0,63198,16318


In [4]:
n_users = len(user_encoder.classes_)
n_movies = len(movie_encoder.classes_)

print(f"users: {n_users:,}")
print(f"movies: {n_movies:,}")
print(f"ratings: {len(filtered_ratings):,}")

users: 63,199
movies: 64,719
ratings: 9,669,907


In [5]:
split_idx = int(0.8 * len(filtered_ratings))
train_data = filtered_ratings[:split_idx]
test_data = filtered_ratings[split_idx:]

print(f"split: {len(train_data):,} train, {len(test_data):,} test")

# create tensors
train_users = torch.tensor(train_data['user_idx'].values, dtype=torch.long)
train_movies = torch.tensor(train_data['movie_idx'].values, dtype=torch.long)
train_ratings = torch.tensor(train_data['rating'].values, dtype=torch.float)

test_users = torch.tensor(test_data['user_idx'].values, dtype=torch.long)
test_movies = torch.tensor(test_data['movie_idx'].values, dtype=torch.long)
test_ratings = torch.tensor(test_data['rating'].values, dtype=torch.float)

print(f"training tensor shapes:")
print(f"users: {train_users.shape}")
print(f"movies: {train_movies.shape}")
print(f"ratings: {train_ratings.shape}")

split: 7,735,925 train, 1,933,982 test
training tensor shapes:
users: torch.Size([7735925])
movies: torch.Size([7735925])
ratings: torch.Size([7735925])


In [6]:
global_bias = train_ratings.mean()

class MatrixFactorization(nn.Module):
    def __init__(self, n_users, n_movies, n_factors, dropout=0.2):
        super().__init__()

        # collaborative features
        self.user_factors = nn.Embedding(n_users, n_factors)
        self.movie_factors = nn.Embedding(n_movies, n_factors)
        self.user_biases = nn.Embedding(n_users, 1)
        self.movie_biases = nn.Embedding(n_movies, 1)
        self.global_bias = nn.Parameter(torch.tensor([global_bias]))

        self.dropout = nn.Dropout(dropout)

        self.user_factors.weight.data.normal_(0, 0.1)
        self.movie_factors.weight.data.normal_(0, 0.1)
        self.user_biases.weight.data.normal_(0, 0.01)
        self.movie_biases.weight.data.normal_(0, 0.01)

    def forward(self, user_ids, movie_ids):
        user_vec = self.dropout(self.user_factors(user_ids))
        movie_vec = self.dropout(self.movie_factors(movie_ids))
        collaborative_score = (user_vec * movie_vec).sum(dim=1)

        prediction = (
            self.global_bias + 
            self.user_biases(user_ids).squeeze() +
            self.movie_biases(movie_ids).squeeze() +
            collaborative_score
        )

        return prediction

In [7]:
print(f"users={n_users}, movies={n_movies}")
model = MatrixFactorization(n_users=n_users, n_movies=n_movies, n_factors=50)

optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=1e-4)
loss_fn = nn.MSELoss()

print(f"model: {n_users:,} users, {n_movies:,} movies, embedding_dim=50")
print(f"total parameters: {sum(p.numel() for p in model.parameters()):,}")

# initial predictions
with torch.no_grad():
    initial_pred = model(train_users[:100], train_movies[:100])
    print(f"sample initial predictions: {initial_pred[:5]}")
    print(f"sample actual ratings: {train_ratings[:5]}")

users=63199, movies=64719
model: 63,199 users, 64,719 movies, embedding_dim=50
total parameters: 6,523,819
sample initial predictions: tensor([3.6894, 3.4939, 3.5155, 3.6596, 3.4063])
sample actual ratings: tensor([2.5000, 2.5000, 4.5000, 2.5000, 4.0000])


In [None]:
print("starting training...")

with torch.no_grad():
    initial_pred = model(train_users, train_movies)
    initial_loss = loss_fn(initial_pred, train_ratings)
    print(f"initial loss: {initial_loss:.4f}")

for epoch in range(100):
    model.train()
    optimizer.zero_grad()
    
    predictions = model(train_users, train_movies)
    loss = loss_fn(predictions, train_ratings)
    
    loss.backward()
    optimizer.step()
    
    if epoch % 1 == 0:
        print(f"epoch {epoch}, loss: {loss.item():.4f}")

print("training done")

starting training...
initial loss: 1.1150
epoch 0, loss: 1.1150


In [None]:
# evaluate on test set
model.eval()
with torch.no_grad():
    train_pred = model(train_users, train_movies)
    test_pred = model(test_users, test_movies)
    
    train_loss = loss_fn(train_pred, train_ratings)
    test_loss = loss_fn(test_pred, test_ratings)
    test_rmse = torch.sqrt(test_loss)
    train_rmse = torch.sqrt(train_loss)
    
    print(f"\nfinal results:")
    print(f"train loss: {train_loss:.4f}")
    print(f"test loss: {test_loss:.4f}")

    print(f"train RMSE: {train_rmse:.4f}")
    print(f"test RMSE: {test_rmse:.4f}")
    
    print(f"\nsample predictions vs actual (20 samples from testset):")
    for i in range(59):
        print(f"predicted: {test_pred[i]:.2f}, actual: {test_ratings[i]:.2f}")


final results:
train loss: 0.8116
test loss: 0.9230
train RMSE: 0.9009
test RMSE: 0.9607

sample predictions vs actual (20 samples from testset):
predicted: 3.95, actual: 4.00
predicted: 3.80, actual: 4.50
predicted: 4.16, actual: 3.50
predicted: 4.16, actual: 4.00
predicted: 3.17, actual: 3.50
predicted: 4.17, actual: 4.00
predicted: 3.73, actual: 4.50
predicted: 3.12, actual: 3.50
predicted: 3.48, actual: 4.50
predicted: 3.25, actual: 4.00
predicted: 2.78, actual: 2.50
predicted: 2.67, actual: 2.00
predicted: 3.44, actual: 4.50
predicted: 3.80, actual: 4.50
predicted: 3.20, actual: 3.50
predicted: 3.92, actual: 4.00
predicted: 3.69, actual: 2.00
predicted: 3.68, actual: 4.00
predicted: 3.80, actual: 4.50
predicted: 3.95, actual: 4.50
predicted: 4.23, actual: 4.50
predicted: 3.48, actual: 4.00
predicted: 3.98, actual: 4.00
predicted: 3.71, actual: 4.00
predicted: 4.01, actual: 4.50
predicted: 3.86, actual: 4.00
predicted: 4.07, actual: 3.50
predicted: 3.32, actual: 3.00
predicted: 3.