In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np

In [2]:

# Load MovieLens Small dataset
data = pd.read_csv("../../datasets/ml-latest-small/ratings.csv", sep=",")
#prefs = pd.read_csv("ratings.csv", sep=",")
#prefs = pd.read_csv("drive/MyDrive/PycharmProjects/datasets/ml-latest-small/ratings.csv", sep=",")

data.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [3]:
# Map user and movie IDs to unique indices
user_ids = data['userId'].unique()
movie_ids = data['movieId'].unique()

user_mapping = {user_id: idx for idx, user_id in enumerate(user_ids)}
movie_mapping = {movie_id: idx for idx, movie_id in enumerate(movie_ids)}

data['userId'] = data['userId'].map(user_mapping)
data['movieId'] = data['movieId'].map(movie_mapping)

In [4]:
class MovieLensDataset(Dataset):
    def __init__(self, dataframe):
        self.data = dataframe[['userId', 'movieId']].values
        self.ratings = dataframe['rating'].values.astype(np.float32)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return torch.LongTensor(self.data[idx]), torch.FloatTensor([self.ratings[idx]])


In [39]:
# Define the matrix factorization model
class MatrixFactorization(nn.Module):
    def __init__(self, num_users, num_movies, embedding_size=10):
        super(MatrixFactorization, self).__init__()
        self.user_embedding = nn.Embedding(num_users, embedding_size)
        self.movie_embedding = nn.Embedding(num_movies, embedding_size)

    def forward(self, X):
        user_embedding = self.user_embedding(X[:,0])
        movie_embedding = self.movie_embedding(X[:,1])
        #print("***********")
        #print(user_embedding)
        #print("***********")
        #print(movie_embedding)
        #print("***********")
        prediction = torch.sum(torch.cat((user_embedding, movie_embedding)), dim=1)
        prediction = torch.sum(user_embedding + movie_embedding, dim=1)
        prediction = torch.sum(user_embedding * movie_embedding, dim=1)
        #print(prediction)
        #print("***********")

        return prediction

In [36]:
# Split the data into training and testing sets
train_data, test_data = train_test_split(data, test_size=0.1)

# Initialize dataset and data loaders
train_dataset = MovieLensDataset(train_data)
test_dataset = MovieLensDataset(test_data)

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

In [30]:
a = torch.Tensor([[-0.4266, -0.4373, -0.4427, -0.7173,  0.1183],
        [ 0.5720,  1.3187, -1.5295, -1.7553,  0.9981]])
b = torch.Tensor([[-0.1013,  0.8980, -0.3187,  0.5583, -1.4790],
        [ 0.1501,  0.8112, -0.4382,  0.5182,  0.3780]])
torch.sum(a * b, dim=1)

tensor([-0.7838,  1.2935])

In [37]:
# Initialize the model and optimizer
num_users = len(user_ids)
num_movies = len(movie_ids)
model = MatrixFactorization(num_users, num_movies, embedding_size=5)
criterion = nn.MSELoss()
optimizer = optim.SGD(model.parameters(), lr=0.03, momentum=0.9)

# Training loop
num_epochs = 20

for epoch in range(num_epochs):
    model.train()
    total_loss = 0.0
    absolute_errors = []

    for batch_idx, (X, y) in enumerate(train_loader):
        optimizer.zero_grad()
        predictions = model(X).squeeze()
        loss = criterion(predictions, y)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
        absolute_errors.extend(torch.abs(predictions - y).tolist()[0])
        


    print(f"Epoch {epoch + 1}/{num_epochs}, Loss: {total_loss / len(train_loader)}")
    print(f"Train MAE: {np.mean(absolute_errors)}")



  return F.mse_loss(input, target, reduction=self.reduction)


Epoch 1/20, Loss: 6.961808997081601
Train MAE: 1.9764682991965115
Epoch 2/20, Loss: 3.4000893672363377
Train MAE: 1.317171505664847
Epoch 3/20, Loss: 2.626991707406024
Train MAE: 1.1751197286382677
Epoch 4/20, Loss: 2.252529650296075
Train MAE: 1.0970759160182215
Epoch 5/20, Loss: 2.027180241680952
Train MAE: 1.0252675947543413
Epoch 6/20, Loss: 1.8730281814617566
Train MAE: 1.01046812196695
Epoch 7/20, Loss: 1.7597913827395069
Train MAE: 0.9989574343653734
Epoch 8/20, Loss: 1.6724679549507093
Train MAE: 0.923383530015382
Epoch 9/20, Loss: 1.60428967197624
Train MAE: 0.9512799793594269
Epoch 10/20, Loss: 1.5478148763875867
Train MAE: 0.9226388404326782
Epoch 11/20, Loss: 1.5018978620365746
Train MAE: 0.9226502189609749
Epoch 12/20, Loss: 1.4619037617833388
Train MAE: 0.91149575994905
Epoch 13/20, Loss: 1.428564539160816
Train MAE: 0.8948324308234789
Epoch 14/20, Loss: 1.399082693083498
Train MAE: 0.8872047227762024
Epoch 15/20, Loss: 1.373966230996405
Train MAE: 0.8943056144475101
Epoc

In [38]:
# Evaluation on the test set
model.eval()
test_loss = 0.0
absolute_errors = []
with torch.no_grad():
    for batch_idx, (X, y) in enumerate(test_loader):
        predictions = model(X).squeeze()
        loss = criterion(predictions, y)
        test_loss += loss.item()
        absolute_errors.extend(torch.abs(predictions - y).tolist()[0])

print(f"Test Loss: {test_loss / len(test_loader)}")
print(f"Test MAE: {np.mean(absolute_errors)}")

Test Loss: 1.561858127388773
Test MAE: 0.8138730464099018


  return F.mse_loss(input, target, reduction=self.reduction)
