# matrix factorization with millions of ratings

In [18]:
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

# load 2 million ratings
print("loading 2 million ratings...")
ratings_df = pd.read_csv('../data/raw/ml-32m/ratings.csv', nrows=2000000)
print(f"loaded: {len(ratings_df):,} ratings")
print(f"users: {ratings_df['userId'].nunique():,}")
print(f"movies: {ratings_df['movieId'].nunique():,}")

# one number per user/movie
user_encoder = LabelEncoder()
movie_encoder = LabelEncoder()

ratings_df['user_idx'] = user_encoder.fit_transform(ratings_df['userId'])
ratings_df['movie_idx'] = movie_encoder.fit_transform(ratings_df['movieId'])

n_users = len(user_encoder.classes_)
n_movies = len(movie_encoder.classes_)

print(f"encoded - users: {n_users:,}, movies: {n_movies:,}")
print(f"rating distribution:")
print(ratings_df['rating'].value_counts().sort_index())

loading 2 million ratings...
loaded: 2,000,000 ratings
users: 12,773
movies: 36,603
encoded - users: 12,773, movies: 36,603
rating distribution:
rating
0.5     35379
1.0     60992
1.5     31635
2.0    123674
2.5    103883
3.0    374510
3.5    268803
4.0    523517
4.5    189480
5.0    288127
Name: count, dtype: int64


In [28]:
shuffled = ratings_df.sample(frac=1, random_state=42).reset_index(drop=True)
print(shuffled.head())
split_idx = int(0.8 * len(shuffled))

train_data = shuffled[:split_idx]
test_data = shuffled[split_idx:]

print(f"split: {len(train_data):,} train, {len(test_data):,} test")

train_users = torch.tensor(train_data['user_idx'].values, dtype=torch.long)
train_movies = torch.tensor(train_data['movie_idx'].values, dtype=torch.long)  
train_ratings = torch.tensor(train_data['rating'].values, dtype=torch.float)

test_users = torch.tensor(test_data['user_idx'].values, dtype=torch.long)
test_movies = torch.tensor(test_data['movie_idx'].values, dtype=torch.long)
test_ratings = torch.tensor(test_data['rating'].values, dtype=torch.float)

print(f"train tensors: {train_users.shape}")
print(f"test tensors: {test_users.shape}")
print(f"rating range: {train_ratings.min():.1f} - {train_ratings.max():.1f}")
print(f"average rating: {train_ratings.mean():.2f}")

   userId  movieId  rating   timestamp  user_idx  movie_idx
0   11597      555     4.0   831728797     11596        544
1    7794     1729     4.0  1115993740      7793       1625
2    1271     1292     3.0  1035652014      1270       1225
3   10410      266     4.0  1040314968     10409        261
4    1239     3578     3.5  1225301411      1238       3401
split: 1,600,000 train, 400,000 test
train tensors: torch.Size([1600000])
test tensors: torch.Size([400000])
rating range: 0.5 - 5.0
average rating: 3.54


In [29]:
global_bias = train_ratings.mean()

class MatrixFactorization(nn.Module):
    def __init__(self, n_users, n_movies, n_factors=50, dropout=0.1):
        super().__init__()
        self.user_factors = nn.Embedding(n_users, n_factors)
        self.movie_factors = nn.Embedding(n_movies, n_factors)

        # some users rate higher than others
        self.user_biases = nn.Embedding(n_users, 1)

        # some movies are rated higher than others
        self.movie_biases = nn.Embedding(n_movies, 1)

        # bias globally average rating
        self.global_bias = nn.Parameter(torch.tensor(global_bias))

        # less overfit
        self.dropout = nn.Dropout(dropout)
        
        self.user_factors.weight.data.normal_(0, 0.1)
        self.movie_factors.weight.data.normal_(0, 0.1)
        self.user_biases.weight.data.normal_(0, 0.01)
        self.movie_biases.weight.data.normal_(0, 0.01)
        
    def forward(self, user_ids, movie_ids):
        user_vec = self.user_factors(user_ids)
        movie_vec = self.movie_factors(movie_ids)
        user_bias = self.user_biases(user_ids).squeeze()
        movie_bias = self.movie_biases(movie_ids).squeeze()
        
        # apply dropout
        user_vec = self.dropout(user_vec)
        movie_vec = self.dropout(movie_vec)
        
        # using dot product + bias
        dot_product = (user_vec * movie_vec).sum(dim=1)
        raw_prediction = (
            self.global_bias + 
            user_bias + 
            movie_bias + 
            dot_product
        )
        
        return raw_prediction

In [30]:
model = MatrixFactorization(n_users=n_users, n_movies=n_movies, n_factors=20)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
loss_fn = nn.MSELoss()

print(f"model: {n_users:,} users, {n_movies:,} movies, embedding_dim=20")
print(f"total parameters: {sum(p.numel() for p in model.parameters()):,}")

# initial predictions
with torch.no_grad():
    initial_pred = model(train_users[:100], train_movies[:100])
    print(f"sample initial predictions: {initial_pred[:5]}")
    print(f"sample actual ratings: {train_ratings[:5]}")

model: 12,773 users, 36,603 movies, embedding_dim=20
total parameters: 1,036,897
sample initial predictions: tensor([3.5777, 3.5022, 3.5398, 3.6582, 3.5514])
sample actual ratings: tensor([4.0000, 4.0000, 3.0000, 4.0000, 3.5000])


  self.global_bias = nn.Parameter(torch.tensor(global_bias))


In [31]:
print("starting training...")

with torch.no_grad():
    initial_pred = model(train_users, train_movies)
    initial_loss = loss_fn(initial_pred, train_ratings)
    print(f"initial loss: {initial_loss:.4f}")

for epoch in range(100):
    model.train()
    optimizer.zero_grad()
    
    predictions = model(train_users, train_movies)
    loss = loss_fn(predictions, train_ratings)
    
    loss.backward()
    optimizer.step()
    
    if epoch % 20 == 0:
        print(f"epoch {epoch}, loss: {loss.item():.4f}")

print("training done")

# evaluate on test set
model.eval()
with torch.no_grad():
    train_pred = model(train_users, train_movies)
    test_pred = model(test_users, test_movies)
    
    train_loss = loss_fn(train_pred, train_ratings)
    test_loss = loss_fn(test_pred, test_ratings)
    test_rmse = torch.sqrt(test_loss)
    
    print(f"\nfinal results:")
    print(f"train loss: {train_loss:.4f}")
    print(f"test loss: {test_loss:.4f}")
    print(f"test RMSE: {test_rmse:.4f}")
    
    print(f"\nsample predictions vs actual (10 samples from testset):")
    for i in range(10):
        print(f"predicted: {test_pred[i]:.2f}, actual: {test_ratings[i]:.2f}")

starting training...
initial loss: 1.1356
epoch 0, loss: 1.1357
epoch 20, loss: 0.7546
epoch 40, loss: 0.5649
epoch 60, loss: 0.5013
epoch 80, loss: 0.4704
training done

final results:
train loss: 0.4052
test loss: 0.7181
test RMSE: 0.8474

sample predictions vs actual (10 samples from testset):
predicted: 2.88, actual: 2.00
predicted: 3.70, actual: 4.00
predicted: 3.07, actual: 4.00
predicted: 3.61, actual: 3.00
predicted: 3.15, actual: 3.50
predicted: 4.72, actual: 5.00
predicted: 4.02, actual: 5.00
predicted: 3.69, actual: 4.00
predicted: 1.63, actual: 1.00
predicted: 3.63, actual: 3.00


In [32]:
movies_df = pd.read_csv('../data/raw/ml-32m/movies.csv')
print(f"loaded {len(movies_df):,} movies")

def get_movie_titles(movie_indices, movie_encoder, movies_df):
    original_movie_ids = movie_encoder.inverse_transform(movie_indices.numpy())
    titles = []
    for movie_id in original_movie_ids:
        movie_row = movies_df[movies_df['movieId'] == movie_id]
        if len(movie_row) > 0:
            titles.append(movie_row['title'].iloc[0])
        else:
            titles.append("unknown movie")
    return titles

print(f"\nsample predictions with movie titles:")
sample_indices = range(10)

with torch.no_grad():
    sample_users_idx = test_users[sample_indices]
    sample_movies_idx = test_movies[sample_indices]
    sample_predictions = model(sample_users_idx, sample_movies_idx)
    sample_actual = test_ratings[sample_indices]

movie_titles = get_movie_titles(sample_movies_idx, movie_encoder, movies_df)
original_user_ids = user_encoder.inverse_transform(sample_users_idx.numpy())

for i in range(10):
    print(f"user {original_user_ids[i]}: '{movie_titles[i]}'")
    print(f"  predicted: {sample_predictions[i]:.2f}, actual: {sample_actual[i]:.2f}")
    print()

loaded 87,585 movies

sample predictions with movie titles:
user 4287: 'Mr. & Mrs. Smith (2005)'
  predicted: 2.88, actual: 2.00

user 2364: '20,000 Leagues Under the Sea (1954)'
  predicted: 3.70, actual: 4.00

user 2863: 'American Pie (1999)'
  predicted: 3.07, actual: 4.00

user 2679: 'Batman (1989)'
  predicted: 3.61, actual: 3.00

user 10584: 'Aladdin (1992)'
  predicted: 3.15, actual: 3.50

user 10639: 'Nymphomaniac: Volume II (2013)'
  predicted: 4.72, actual: 5.00

user 4167: 'Her (2013)'
  predicted: 4.02, actual: 5.00

user 10584: 'Twelve Monkeys (a.k.a. 12 Monkeys) (1995)'
  predicted: 3.69, actual: 4.00

user 6424: 'Scary Movie 4 (2006)'
  predicted: 1.63, actual: 1.00

user 8197: 'Kill Bill: Vol. 2 (2004)'
  predicted: 3.63, actual: 3.00



for myself:

i think i've gotten to a pretty good RMSE with just collaborative filtering, but i think i should start to factor in content a bit. might add in genre data for each movie and account for users' genre preferences. same with movie year, but might break it down into decades rather than continuous years