In [339]:
import torch
import torch.nn as nn
import numpy as np

# Toy interaction log as (user id, movie id, rating) triples; the columns are
# split out into three parallel tensors below.
interactions = [
    (1, 1, 5.0),
    (2, 1, 4.0),
    (3, 2, 3.0),
    (1, 2, 4.5),
    (2, 3, 2.0),
]
user_column, movie_column, rating_column = zip(*interactions)

user_ids = torch.tensor(user_column)
movie_ids = torch.tensor(movie_column)
ratings = torch.tensor(rating_column)

print(f"sample data\n")
print(f"users: {user_ids}")
print(f"movies: {movie_ids}")
print(f"ratings: {ratings}")

sample data

users: tensor([1, 2, 3, 1, 2])
movies: tensor([1, 1, 2, 2, 3])
ratings: tensor([5.0000, 4.0000, 3.0000, 4.5000, 2.0000])


In [340]:
# Filtering: a boolean mask that is True wherever the row belongs to user 1,
# applied to the parallel movie and rating tensors.
u1_mask = user_ids == 1
u1_movies = movie_ids[u1_mask]
u1_ratings = ratings[u1_mask]
print(f"user 1's movies: {u1_movies}")
print(f"user 1's ratings: {u1_ratings}")

user 1's movies: tensor([1, 2])
user 1's ratings: tensor([5.0000, 4.5000])


In [341]:
# nn.Embedding draws its initial weights from the global torch RNG. Seed it so
# the tables printed below are the same on every Restart & Run All.
SEED = 42
torch.manual_seed(SEED)

n_users = 4
n_movies = 3
dim = 5

# One learnable `dim`-wide vector per user / per movie.
user_embedding = nn.Embedding(n_users, dim)
movie_embedding = nn.Embedding(n_movies, dim)

# Look up each row by index and show it; detach() drops the autograd graph so
# the tensor can be converted to NumPy for rounding.
for i in range(n_users):
    user_vec = user_embedding(torch.tensor(i))
    print(f"user {i}: {user_vec.detach().numpy().round(3)}")

print('\n')

for i in range(n_movies):
    movie_vec = movie_embedding(torch.tensor(i))
    print(f"movie {i}: {movie_vec.detach().numpy().round(3)}")

user 0: [-0.369  0.64   1.357 -2.306  0.014]
user 1: [ 0.819 -0.166 -0.02   0.334 -0.618]
user 2: [-0.194  0.634  1.155  0.971 -0.898]
user 3: [ 0.093  1.078 -0.156 -0.394  0.302]


movie 0: [-1.103  1.071 -1.077  1.314  0.294]
movie 1: [-0.685  0.576 -0.404 -0.814  0.785]
movie 2: [-0.38   0.997  0.246 -1.995  0.48 ]


In [None]:
# The sample ids start at 1 but embedding rows are addressed from 0, so both
# id tensors are shifted down by one.
user_ids_indexed, movie_ids_indexed = (ids - 1 for ids in (user_ids, movie_ids))

class BasicRecommender(nn.Module):
    """Dot-product (matrix-factorisation style) rating predictor.

    Every user and every movie owns a learnable ``dim``-wide vector. The
    predicted rating is the dot product of the two vectors, squashed with a
    sigmoid and rescaled into ``[min_rating, max_rating]``.

    Args:
        n_users: Number of rows in the user embedding table.
        n_movies: Number of rows in the movie embedding table.
        dim: Width of each embedding vector.
        min_rating: Lower bound of the predicted range (default 0.5).
        max_rating: Upper bound of the predicted range (default 5.0).

    NOTE(review): nn.Embedding initialises weights from N(0, 1), so raw dot
    products can be large and may saturate the sigmoid, which would shrink
    gradients. Presumably a contributor to slow training -- verify.
    """

    def __init__(self, n_users, n_movies, dim, min_rating=0.5, max_rating=5.0):
        super().__init__()
        self.user_embedding = nn.Embedding(n_users, dim)
        self.movie_embedding = nn.Embedding(n_movies, dim)
        self.min_rating = min_rating
        self.max_rating = max_rating

    def forward(self, user_ids, movie_ids):
        """Predict a rating for each (user, movie) index pair.

        Args:
            user_ids: Integer tensor of 0-based user indices, any shape
                (including a 0-d scalar).
            movie_ids: Integer tensor of 0-based movie indices, same shape as
                ``user_ids``.

        Returns:
            Float tensor with the same shape as the inputs, each value in
            ``[min_rating, max_rating]``.
        """
        user_vecs = self.user_embedding(user_ids)
        movie_vecs = self.movie_embedding(movie_ids)
        # The embedding axis is always the last one, whatever the shape of the
        # id tensors; dim=1 only worked for 1-D batches.
        scores = (user_vecs * movie_vecs).sum(dim=-1)
        span = self.max_rating - self.min_rating
        predictions = torch.sigmoid(scores) * span + self.min_rating
        return predictions

# Toy model: 3 users x 3 movies, 10-wide embeddings.
model = BasicRecommender(
    n_users=3,
    n_movies=3,
    dim=10,
)
user_table_shape = model.user_embedding.weight.shape
movie_table_shape = model.movie_embedding.weight.shape
print(f"user embedding shape: {user_table_shape}")
print(f"movie embedding shape: {movie_table_shape}")

user embedding shape: torch.Size([3, 10])
movie embedding shape: torch.Size([3, 10])


In [343]:
optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.9)
loss_fn = nn.MSELoss()

# Baseline: score the untrained model so the improvement can be judged later.
with torch.no_grad():
    predictions = model(user_ids_indexed, movie_ids_indexed)
    print(f"predictions: {predictions.detach().numpy().round(2)}")
    print(f"actual: {ratings.numpy()}")
    initial_loss = loss_fn(predictions, ratings)
    print(f"initial loss: {initial_loss:.4f}")

print("\ntraining...")
# Full-batch training: every epoch is one optimiser step over all ratings.
for epoch in range(100):
    optimizer.zero_grad()
    predictions = model(user_ids_indexed, movie_ids_indexed)
    loss = loss_fn(predictions, ratings)
    loss.backward()
    optimizer.step()

    if epoch % 20 == 0:
        print(f"epoch {epoch}, Loss: {loss.item():.4f}")

# Re-score after training. The result gets its own name so the baseline stored
# in `initial_loss` is not overwritten.
with torch.no_grad():
    predictions = model(user_ids_indexed, movie_ids_indexed)
    print(f"predictions: {predictions.detach().numpy().round(2)}")
    print(f"actual: {ratings.numpy()}")
    final_loss = loss_fn(predictions, ratings)
    print(f"final loss: {final_loss:.4f}")

predictions: [2.48 4.15 1.43 2.07 4.99]
actual: [5.  4.  3.  4.5 2. ]
initial loss: 4.7349

training...
epoch 0, Loss: 4.7349
epoch 20, Loss: 1.9049
epoch 40, Loss: 1.7949
epoch 60, Loss: 0.9944
epoch 80, Loss: 0.3125
predictions: [4.99 3.97 3.01 4.99 2.42]
actual: [5.  4.  3.  4.5 2. ]
final loss: 0.0828


## sample training with 10k ratings

In [344]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Read only the first 10,000 rows of the ratings file for a quick experiment.
ratings_df = pd.read_csv('../data/raw/ml-32m/ratings.csv', nrows=10000)

user_column = ratings_df['userId']
movie_column = ratings_df['movieId']
print(f"read {len(ratings_df)} rows")
print(f"cols: {ratings_df.columns.tolist()}")
print(f"users: {user_column.nunique():,}")
print(f"movies: {movie_column.nunique():,}")

# Map the raw ids onto integer indices usable as embedding rows; one encoder
# per id space so each mapping can be reused later.
user_encoder, movie_encoder = LabelEncoder(), LabelEncoder()

ratings_df['user_idx'] = user_encoder.fit_transform(user_column)
ratings_df['movie_idx'] = movie_encoder.fit_transform(movie_column)

# Table sizes = number of distinct ids each encoder saw.
n_users, n_movies = len(user_encoder.classes_), len(movie_encoder.classes_)

print(f"encoded - Users: {n_users}, Movies: {n_movies}")
print(f"user indices: 0 to {n_users-1}")
print(f"movie indices: 0 to {n_movies-1}")

read 10000 rows
cols: ['userId', 'movieId', 'rating', 'timestamp']
users: 66
movies: 3,965
encoded - Users: 66, Movies: 3965
user indices: 0 to 65
movie indices: 0 to 3964


In [345]:
# 80/20 train/test split: shuffle all rows once with a fixed seed, then cut.
shuffled_data = ratings_df.sample(frac=1, random_state=42).reset_index(drop=True)
split_idx = int(0.8 * len(shuffled_data))

train_data, test_data = shuffled_data[:split_idx], shuffled_data[split_idx:]

print(f"split: {len(train_data):,} train, {len(test_data):,} test")


def _frame_to_tensors(frame):
    """Return (user index, movie index, rating) tensors for one data split.

    Indices are int64 so they can address embedding rows; ratings are float32.
    """
    return (
        torch.tensor(frame['user_idx'].values, dtype=torch.long),
        torch.tensor(frame['movie_idx'].values, dtype=torch.long),
        torch.tensor(frame['rating'].values, dtype=torch.float),
    )


train_users, train_movies, train_ratings = _frame_to_tensors(train_data)
test_users, test_movies, test_ratings = _frame_to_tensors(test_data)

print(f"train tensors: {train_users.shape}")
print(f"test tensors: {test_users.shape}")
print(f"rating range: {train_ratings.min():.1f} - {train_ratings.max():.1f}")

split: 8,000 train, 2,000 test
train tensors: torch.Size([8000])
test tensors: torch.Size([2000])
rating range: 0.5 - 5.0


In [346]:
# Size the embedding tables from the encoded data (n_users / n_movies come from
# the label-encoding cell) rather than repeating the literal counts, so the
# model stays valid if a different sample of ratings is loaded.
model = BasicRecommender(n_users=n_users, n_movies=n_movies, dim=10)
optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.9)
loss_fn = nn.MSELoss()
print(f"total parameters: {sum(p.numel() for p in model.parameters()):,}")

# Baseline loss of the untrained model on the training split.
with torch.no_grad():
    predictions = model(train_users, train_movies)
    initial_loss = loss_fn(predictions, train_ratings)
    print(f"initial train loss: {initial_loss:.4f}")

# NOTE(review): the recorded run shows the loss moving only from ~4.37 to ~4.03
# over 100 epochs. Presumably full-batch SGD with a mean-reduced loss gives each
# embedding row a very small update; mini-batches or an adaptive optimiser may
# be needed -- TODO confirm.
print("\ntraining...")
for epoch in range(100):
    optimizer.zero_grad()
    predictions = model(train_users, train_movies)
    loss = loss_fn(predictions, train_ratings)
    loss.backward()
    optimizer.step()

    if epoch % 20 == 0:
        print(f"epoch {epoch}, Loss: {loss.item():.4f}")

print("\nevaluating...")
with torch.no_grad():
    train_predictions = model(train_users, train_movies)
    test_predictions = model(test_users, test_movies)

    train_loss = loss_fn(train_predictions, train_ratings)
    test_loss = loss_fn(test_predictions, test_ratings)

    print(f"final train loss: {train_loss:.4f}")
    print(f"final test loss: {test_loss:.4f}")

    # Spot-check the first few held-out predictions against the true ratings.
    print(f"\nsample predictions vs actual:")
    for i in range(5):
        print(f"predicted: {test_predictions[i]:.2f}, actual: {test_ratings[i]:.2f}")

total parameters: 40,310
initial train loss: 4.3708

training...
epoch 0, Loss: 4.3708
epoch 20, Loss: 4.3333
epoch 40, Loss: 4.2710
epoch 60, Loss: 4.2005
epoch 80, Loss: 4.1217

evaluating...
final train loss: 4.0315
final test loss: 4.0412

sample predictions vs actual:
predicted: 1.71, actual: 3.00
predicted: 4.71, actual: 3.00
predicted: 4.60, actual: 4.50
predicted: 5.00, actual: 2.50
predicted: 1.94, actual: 4.00


## training with 100k ratings

In [None]:
print("loading 100k ratings...")
ratings_100k = pd.read_csv('../data/raw/ml-32m/ratings.csv', nrows=100000)
print(f"loaded: {len(ratings_100k):,} ratings")
print(f"users: {ratings_100k['userId'].nunique():,}")
print(f"movies: {ratings_100k['movieId'].nunique():,}")

# Encode raw user / movie ids as integer indices, one encoder per id space.
user_encoder_100k, movie_encoder_100k = LabelEncoder(), LabelEncoder()

ratings_100k['user_idx'] = user_encoder_100k.fit_transform(ratings_100k['userId'])
ratings_100k['movie_idx'] = movie_encoder_100k.fit_transform(ratings_100k['movieId'])

n_users_100k, n_movies_100k = (
    len(user_encoder_100k.classes_),
    len(movie_encoder_100k.classes_),
)

print(f"encoded - Users: {n_users_100k:,}, Movies: {n_movies_100k:,}")

# Train/test split: seeded shuffle, first 80% of rows train, remainder test.
shuffled_100k = ratings_100k.sample(frac=1, random_state=42).reset_index(drop=True)
split_idx_100k = int(0.8 * len(shuffled_100k))

train_data_100k = shuffled_100k.iloc[:split_idx_100k]
test_data_100k = shuffled_100k.iloc[split_idx_100k:]

print(f"split: {len(train_data_100k):,} train, {len(test_data_100k):,} test")

# Convert each split to tensors: int64 indices for lookups, float32 ratings.
train_users_100k, train_movies_100k = (
    torch.tensor(train_data_100k[column].to_numpy(), dtype=torch.long)
    for column in ('user_idx', 'movie_idx')
)
train_ratings_100k = torch.tensor(train_data_100k['rating'].to_numpy(), dtype=torch.float)

test_users_100k, test_movies_100k = (
    torch.tensor(test_data_100k[column].to_numpy(), dtype=torch.long)
    for column in ('user_idx', 'movie_idx')
)
test_ratings_100k = torch.tensor(test_data_100k['rating'].to_numpy(), dtype=torch.float)

loading 100k ratings...
loaded: 100,000 ratings
users: 626
movies: 10,225
encoded - Users: 626, Movies: 10,225
split: 80,000 train, 20,000 test


In [None]:
# Build the model for the 100k dataset. The embedding width is bound to one
# name so the constructor and the log line below cannot drift apart.
dim_100k = 20
model_100k = BasicRecommender(n_users=n_users_100k, n_movies=n_movies_100k, dim=dim_100k)
optimizer_100k = torch.optim.SGD(model_100k.parameters(), lr=0.01, momentum=0.9)
loss_fn_100k = nn.MSELoss()

print(f"model: {n_users_100k:,} users, {n_movies_100k:,} movies, dim={dim_100k}")
print(f"total parameters: {sum(p.numel() for p in model_100k.parameters()):,}")

# Baseline loss of the untrained model on the training split.
with torch.no_grad():
    initial_predictions = model_100k(train_users_100k, train_movies_100k)
    initial_loss = loss_fn_100k(initial_predictions, train_ratings_100k)
    print(f"initial train loss: {initial_loss:.4f}")

# NOTE(review): the recorded run barely moves (train loss ~5.09 -> ~5.07, test
# RMSE ~2.26) and several sample predictions sit at the 0.50 / 5.00 bounds.
# Presumably the per-row updates from full-batch SGD are too small and the
# sigmoid starts saturated; mini-batches, an adaptive optimiser or a smaller
# embedding init may be needed -- TODO confirm.
print("\ntraining 100k dataset...")
for epoch in range(100):
    optimizer_100k.zero_grad()
    predictions = model_100k(train_users_100k, train_movies_100k)
    loss = loss_fn_100k(predictions, train_ratings_100k)
    loss.backward()
    optimizer_100k.step()

    if epoch % 20 == 0:
        print(f"epoch {epoch}, Loss: {loss.item():.4f}")

print("\nevaluating 100k model...")
with torch.no_grad():
    final_train_pred = model_100k(train_users_100k, train_movies_100k)
    final_test_pred = model_100k(test_users_100k, test_movies_100k)

    final_train_loss = loss_fn_100k(final_train_pred, train_ratings_100k)
    final_test_loss = loss_fn_100k(final_test_pred, test_ratings_100k)

    print(f"final train loss: {final_train_loss:.4f}")
    print(f"final test loss: {final_test_loss:.4f}")
    # RMSE is in rating units, which is easier to read than the squared error.
    print(f"test RMSE: {torch.sqrt(final_test_loss):.4f}")

    # Spot-check the first few held-out predictions against the true ratings.
    print(f"\nsample predictions vs actual:")
    for i in range(10):
        print(f"predicted: {final_test_pred[i]:.2f}, actual: {test_ratings_100k[i]:.2f}")

model: 626 users, 10,225 movies, dim=20
total parameters: 217,020
initial train loss: 5.0899

training 100k dataset...
epoch 0, Loss: 5.0899
epoch 20, Loss: 5.0870
epoch 40, Loss: 5.0826
epoch 60, Loss: 5.0780
epoch 80, Loss: 5.0733

evaluating 100k model...
final train loss: 5.0686
final test loss: 5.0885
test RMSE: 2.2558

sample predictions vs actual:
predicted: 4.26, actual: 5.00
predicted: 0.65, actual: 4.00
predicted: 5.00, actual: 3.50
predicted: 4.32, actual: 5.00
predicted: 4.93, actual: 5.00
predicted: 3.61, actual: 3.00
predicted: 0.50, actual: 3.00
predicted: 2.12, actual: 4.00
predicted: 0.50, actual: 4.00
predicted: 2.84, actual: 3.50
