# duplicate of notebook 05 but intended to export the final model

### training on 5 million ratings (all my laptop could muster)

In [1]:
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
import re
from sklearn.preprocessing import LabelEncoder, MultiLabelBinarizer

# get final occurence of a 4 digit number in the title string
def extract_year(title):
    match = re.search(r'\((\d{4})\)$', title)
    return int(match.group(1)) if match else None

def clean_title(title):
    cleaned = re.sub(r'\s*\(\d{4}\)$', '', title).strip()
    if cleaned.endswith(', The'):
        cleaned = 'The ' + cleaned[:-5]
    return cleaned

movies_df = pd.read_csv('../data/raw/ml-32m/movies.csv')
print(f"loaded {len(movies_df):,} movies")

movies_df['year'] = movies_df['title'].apply(extract_year)
movies_df['clean_title'] = movies_df['title'].apply(clean_title)

# take out all movies that don't have years, not a good idea to guess the year
# or leave NaN values, so just deleting
print(f"movies without years: {movies_df['year'].isna().sum()}")
movies_df = movies_df.dropna(subset=['year']).copy()

# was showing as "[year].0" before fix
movies_df['year'] = movies_df['year'].astype(int)

# split genres into list unless no genres listed
movies_df['genre_list'] = movies_df['genres'].apply(
    lambda x: [] if x == '(no genres listed)' else x.split('|')
)

# encode genres into binary for each movie
mlb = MultiLabelBinarizer()
genre_matrix = mlb.fit_transform(movies_df['genre_list'])
genre_columns = [f'genre_{genre.lower().replace("-", "_")}' for genre in mlb.classes_]

genre_df = pd.DataFrame(genre_matrix, columns=genre_columns, index=movies_df.index)
movies_features = pd.concat([
    movies_df[['movieId', 'clean_title', 'year']],
    genre_df
], axis=1)

movies_features = movies_features.drop_duplicates()

print(f"final movies: {len(movies_features):,}")
print(f"genres: {len(genre_columns)}")
print(f"movies with no genres: {(movies_df['genres'] == '(no genres listed)').sum()}")

movies_features.head(20)

loaded 87,585 movies
movies without years: 771
final movies: 86,814
genres: 19
movies with no genres: 6707


Unnamed: 0,movieId,clean_title,year,genre_action,genre_adventure,genre_animation,genre_children,genre_comedy,genre_crime,genre_documentary,...,genre_film_noir,genre_horror,genre_imax,genre_musical,genre_mystery,genre_romance,genre_sci_fi,genre_thriller,genre_war,genre_western
0,1,Toy Story,1995,0,1,1,1,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,Jumanji,1995,0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,Grumpier Old Men,1995,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
3,4,Waiting to Exhale,1995,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
4,5,Father of the Bride Part II,1995,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
5,6,Heat,1995,1,0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
6,7,Sabrina,1995,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0
7,8,Tom and Huck,1995,0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,9,Sudden Death,1995,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,10,GoldenEye,1995,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [2]:
# load last 5 million ratings for recency
ratings_original_df = pd.read_csv('../data/raw/ml-32m/ratings.csv')
ratings_df = ratings_original_df.tail(5_000_000)
print(f"loaded {len(ratings_df):,} ratings")
print(f"users: {ratings_df['userId'].nunique():,}")
print(f"movies: {ratings_df['movieId'].nunique():,}")

shuffled_ratings = ratings_df.sample(frac=1, random_state=42).reset_index(drop=True)

# only merge movies with user ratings AND year data
ratings_with_features = shuffled_ratings.merge(movies_features, on='movieId', how='inner')

# encode users and movies to continuous indices
user_encoder = LabelEncoder()
movie_encoder = LabelEncoder()

ratings_with_features['user_idx'] = user_encoder.fit_transform(ratings_with_features['userId'])
ratings_with_features['movie_idx'] = movie_encoder.fit_transform(ratings_with_features['movieId'])


user_counts = ratings_with_features['userId'].value_counts()
movie_counts = ratings_with_features['movieId'].value_counts()

active_users = user_counts[user_counts >= 50].index
reviewed_movies= movie_counts[movie_counts >= 75].index

filtered_ratings = ratings_with_features[
    (ratings_with_features['userId'].isin(active_users)) &
    (ratings_with_features['movieId'].isin(reviewed_movies))
].copy()

print()

print(f"ratings with movie features: {len(filtered_ratings):,}")
print(f"movies matched: {filtered_ratings['movieId'].nunique():,}")

if 5465 in filtered_ratings['movieId'].values:
    print(f"Movie ID 5465 is in filtered dataset")

filtered_ratings.tail()

loaded 5,000,000 ratings
users: 31,287
movies: 55,560

ratings with movie features: 4,303,327
movies matched: 6,314


Unnamed: 0,userId,movieId,rating,timestamp,clean_title,year,genre_action,genre_adventure,genre_animation,genre_children,...,genre_imax,genre_musical,genre_mystery,genre_romance,genre_sci_fi,genre_thriller,genre_war,genre_western,user_idx,movie_idx
4993970,179050,1485,4.0,859704852,Liar Liar,1997,0,0,0,0,...,0,0,0,0,0,0,0,0,9388,1438
4993971,183220,58303,5.0,1451755576,"Counterfeiters, The (Die Fälscher)",2007,0,0,0,0,...,0,0,0,0,0,0,1,0,13558,12037
4993972,200478,337,5.0,833817127,What's Eating Gilbert Grape,1993,0,0,0,0,...,0,0,0,0,0,0,0,0,30816,331
4993973,196584,1080,4.0,1174663454,Monty Python's Life of Brian,1979,0,0,0,0,...,0,0,0,0,0,0,0,0,26922,1051
4993974,179791,71838,4.5,1500342314,Law Abiding Citizen,2009,0,0,0,0,...,0,0,0,0,0,1,0,0,10129,13650


In [3]:
n_users = len(user_encoder.classes_)
n_movies = len(movie_encoder.classes_)

print(f"users: {n_users:,}")
print(f"movies: {n_movies:,}")
print(f"ratings: {len(filtered_ratings):,}")

users: 31,287
movies: 55,162
ratings: 4,303,327


In [4]:
split_idx = int(0.8 * len(filtered_ratings))
train_data = filtered_ratings[:split_idx]
test_data = filtered_ratings[split_idx:]

print(f"split: {len(train_data):,} train, {len(test_data):,} test")

# create tensors
train_users = torch.tensor(train_data['user_idx'].values, dtype=torch.long)
train_movies = torch.tensor(train_data['movie_idx'].values, dtype=torch.long)
train_ratings = torch.tensor(train_data['rating'].values, dtype=torch.float)

test_users = torch.tensor(test_data['user_idx'].values, dtype=torch.long)
test_movies = torch.tensor(test_data['movie_idx'].values, dtype=torch.long)
test_ratings = torch.tensor(test_data['rating'].values, dtype=torch.float)

print(f"training tensor shapes:")
print(f"users: {train_users.shape}")
print(f"movies: {train_movies.shape}")
print(f"ratings: {train_ratings.shape}")

split: 3,442,661 train, 860,666 test
training tensor shapes:
users: torch.Size([3442661])
movies: torch.Size([3442661])
ratings: torch.Size([3442661])


In [5]:
global_bias = train_ratings.mean()

class MatrixFactorization(nn.Module):
    def __init__(self, n_users, n_movies, n_factors, dropout=0.2):
        super().__init__()

        # collaborative features
        self.user_factors = nn.Embedding(n_users, n_factors)
        self.movie_factors = nn.Embedding(n_movies, n_factors)
        self.user_biases = nn.Embedding(n_users, 1)
        self.movie_biases = nn.Embedding(n_movies, 1)
        self.global_bias = nn.Parameter(torch.tensor([global_bias]))

        self.dropout = nn.Dropout(dropout)

        self.user_factors.weight.data.normal_(0, 0.1)
        self.movie_factors.weight.data.normal_(0, 0.1)
        self.user_biases.weight.data.normal_(0, 0.01)
        self.movie_biases.weight.data.normal_(0, 0.01)

    def forward(self, user_ids, movie_ids):
        user_vec = self.dropout(self.user_factors(user_ids))
        movie_vec = self.dropout(self.movie_factors(movie_ids))
        collaborative_score = (user_vec * movie_vec).sum(dim=1)

        prediction = (
            self.global_bias + 
            self.user_biases(user_ids).squeeze() +
            self.movie_biases(movie_ids).squeeze() +
            collaborative_score
        )

        return prediction

In [6]:
print(f"users={n_users}, movies={n_movies}")
model = MatrixFactorization(n_users=n_users, n_movies=n_movies, n_factors=75)

optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=1e-4)
loss_fn = nn.MSELoss()

print(f"model: {n_users:,} users, {n_movies:,} movies, embedding_dim=50")
print(f"total parameters: {sum(p.numel() for p in model.parameters()):,}")

# initial predictions
with torch.no_grad():
    initial_pred = model(train_users[:100], train_movies[:100])
    print(f"sample initial predictions: {initial_pred[:5]}")
    print(f"sample actual ratings: {train_ratings[:5]}")

users=31287, movies=55162
model: 31,287 users, 55,162 movies, embedding_dim=50
total parameters: 6,570,125
sample initial predictions: tensor([3.5386, 3.6034, 3.6341, 3.4678, 3.4841])
sample actual ratings: tensor([4.5000, 5.0000, 4.0000, 3.0000, 4.0000])


In [9]:
print("starting training...")

with torch.no_grad():
    initial_pred = model(train_users, train_movies)
    initial_loss = loss_fn(initial_pred, train_ratings)
    print(f"initial loss: {initial_loss:.4f}")

for epoch in range(100):
    model.train()
    optimizer.zero_grad()
    
    predictions = model(train_users, train_movies)
    loss = loss_fn(predictions, train_ratings)
    
    loss.backward()
    optimizer.step()
    
    if epoch % 10 == 0:
        print(f"epoch {epoch}, loss: {loss.item():.4f}")

print("training done")

starting training...
initial loss: 0.9820
epoch 0, loss: 0.9821
epoch 10, loss: 0.8962
epoch 20, loss: 0.8381
epoch 30, loss: 0.7974
epoch 40, loss: 0.7762
epoch 50, loss: 0.7690
epoch 60, loss: 0.7661
epoch 70, loss: 0.7641
epoch 80, loss: 0.7624
epoch 90, loss: 0.7609
training done


In [13]:
# evaluate on test set
model.eval()
with torch.no_grad():
    train_pred = model(train_users, train_movies)
    test_pred = model(test_users, test_movies)
    
    train_loss = loss_fn(train_pred, train_ratings)
    test_loss = loss_fn(test_pred, test_ratings)
    test_rmse = torch.sqrt(test_loss)
    train_rmse = torch.sqrt(train_loss)
    
    print(f"\nfinal results:")
    print(f"train loss: {train_loss:.4f}")
    print(f"test loss: {test_loss:.4f}")

    print(f"train RMSE: {train_rmse:.4f}")
    print(f"test RMSE: {test_rmse:.4f}")
    
    print(f"\nsample predictions vs actual (20 samples from testset):")
    for i in range(90):
        print(f"predicted: {test_pred[i]:.2f}, actual: {test_ratings[i]:.2f}")


final results:
train loss: 0.7598
test loss: 0.7646
train RMSE: 0.8717
test RMSE: 0.8744

sample predictions vs actual (20 samples from testset):
predicted: 3.45, actual: 4.50
predicted: 3.49, actual: 4.00
predicted: 2.76, actual: 1.00
predicted: 3.71, actual: 4.00
predicted: 3.76, actual: 4.00
predicted: 3.42, actual: 5.00
predicted: 3.91, actual: 3.00
predicted: 3.42, actual: 3.50
predicted: 3.26, actual: 3.00
predicted: 2.73, actual: 2.50
predicted: 3.37, actual: 4.00
predicted: 4.00, actual: 3.50
predicted: 3.48, actual: 3.00
predicted: 4.01, actual: 5.00
predicted: 3.90, actual: 4.00
predicted: 3.69, actual: 3.00
predicted: 3.80, actual: 4.50
predicted: 2.94, actual: 3.00
predicted: 3.38, actual: 4.00
predicted: 3.91, actual: 4.50
predicted: 3.45, actual: 3.00
predicted: 3.30, actual: 4.00
predicted: 3.42, actual: 4.50
predicted: 2.96, actual: 1.50
predicted: 3.84, actual: 4.00
predicted: 3.69, actual: 4.00
predicted: 2.86, actual: 2.50
predicted: 3.49, actual: 3.00
predicted: 3.

In [15]:
print(f"\nverbose stats:")
with torch.no_grad():
    for i in range(20):
          user_idx = test_users[i].item()
          movie_idx = test_movies[i].item()

          user_id = user_encoder.inverse_transform([user_idx])[0]
          movie_id = movie_encoder.inverse_transform([movie_idx])[0]

          movie_info = movies_features[movies_features['movieId'] == movie_id]
          if len(movie_info) > 0:
              movie_title = movie_info['clean_title'].iloc[0]
              movie_year = movie_info['year'].iloc[0]
          else:
              movie_title = "Unknown"
              movie_year = "Unknown"

          user_train_count = len(train_data[train_data['user_idx'] == user_idx])
 
          movie_train_count = len(train_data[train_data['movie_idx'] == movie_idx])

          print(f"#{i+1:2d}: user {user_id} (trained on {user_train_count} movies) | " +
                f"movie {movie_id}: '{movie_title}' ({movie_year}) (trained on {movie_train_count} ratings)")
          print(f"     predicted: {test_pred[i]:.2f}, actual: {test_ratings[i]:.2f} | " +
                f"error: {abs(test_pred[i] - test_ratings[i]):.2f}")
          print()


verbose stats:
# 1: user 184503 (trained on 1016 movies) | movie 8042: 'Mean Streets' (1973) (trained on 268 ratings)
     predicted: 3.45, actual: 4.50 | error: 1.05

# 2: user 169678 (trained on 289 movies) | movie 2: 'Jumanji' (1995) (trained on 3283 ratings)
     predicted: 3.49, actual: 4.00 | error: 0.51

# 3: user 200753 (trained on 100 movies) | movie 1544: 'The Lost World: Jurassic Park' (1997) (trained on 2207 ratings)
     predicted: 2.76, actual: 1.00 | error: 1.76

# 4: user 176914 (trained on 129 movies) | movie 7263: 'Miracle' (2004) (trained on 334 ratings)
     predicted: 3.71, actual: 4.00 | error: 0.29

# 5: user 199722 (trained on 306 movies) | movie 2728: 'Spartacus' (1960) (trained on 742 ratings)
     predicted: 3.76, actual: 4.00 | error: 0.24

# 6: user 200939 (trained on 74 movies) | movie 661: 'James and the Giant Peach' (1996) (trained on 1117 ratings)
     predicted: 3.42, actual: 5.00 | error: 1.58

# 7: user 193607 (trained on 138 movies) | movie 720: 'W

In [16]:
import os

models_dir = '../models/'
os.makedirs(models_dir, exist_ok=True)

# export
torch.save({
    # model weights, parameters
    'model_state_dict': model.state_dict(),
    'n_users': n_users,
    'n_movies': n_movies, 
    'n_factors': 75,
    
    # encoders
    'user_encoder': user_encoder,
    'movie_encoder': movie_encoder,
    
    # movie metadata
    'movies_features': movies_features,
    
    # training data
    'filtered_ratings': filtered_ratings,
    
    # metrics
    'test_rmse': test_rmse.item(),
    'train_rmse': train_rmse.item(),
    
    # metadata
    'model_info': {
        'architecture': 'pure_collaborative_filtering',
        'dataset_size': len(filtered_ratings),
        'training_users': n_users,
        'training_movies': n_movies,
        'embedding_dimension': 75,
        'dropout': 0.2,
        'lr': 0.01,
        'weight_decay': 1e-4,
        'epochs': 100
    }
}, f'{models_dir}/trained_model_5M_75factors.pth')

print(f"model exported to {models_dir}/trained_model_5M_75factors.pth")
print(f"performance: train RMSE {train_rmse:.4f}, test RMSE {test_rmse:.4f}")
print(f"ready for production API")

file_size_gb = os.path.getsize(f'{models_dir}/trained_model_5M_75factors.pth') / (1024**3)
print(f"file size: {file_size_gb:.2f} GB")

model exported to ../models//trained_model_5M_75factors.pth
performance: train RMSE 0.8717, test RMSE 0.8744
ready for production API
file size: 0.95 GB
