### Loading dataset

In [13]:
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import DataLoader

import sys
sys.path.append('..')
from src.dataset import MovieDataset
from src.train import train
from src.model import RecSysModel
from benchmark.evaluate import validate_model
# All raw MovieLens-100k files are read from this one directory. Each path is built
# from it directly; the previous "u.data/../ua.base" form walked *through* a regular
# file, which only resolves where paths are normalised lexically (e.g. Windows) and
# fails with NotADirectoryError on POSIX systems.
data_dir = "./../data/raw/ml-100k"
data_relative_path = f"{data_dir}/u.data"
moives_relative_path = f"{data_dir}/u.item"  # (sic) spelling kept so existing references keep working
user_relative_path = f"{data_dir}/u.user"

# Column layouts for the header-less raw files.
column_names = ['user', 'movie', 'rating', 'time']
users_columns_names = ["id", "age", "gender", "occupation", "zip code"]
movies_columns_names = [
    # descriptive fields (all but "id" are dropped after loading)
    "id", "name", "date", "empty", "url",
    # one flag column per genre
    "unknown", "Action", "Adventure", "Animation", "Children's", "Comedy",
    "Crime", "Documentary", "Drama", "Fantasy", "Film-Noir", "Horror",
    "Musical", "Mystery", "Romance", "Sci-Fi", "Thriller", "War", "Western",
]

# Ratings: the ua.base / ua.test split, tab-separated; the timestamp column is not used.
train_df = pd.read_csv(
    f"{data_dir}/ua.base", sep='\t', header=None, names=column_names
).drop(columns='time')
val_df = pd.read_csv(
    f"{data_dir}/ua.test", sep='\t', header=None, names=column_names
).drop(columns='time')

# Users: drop the zip code and one-hot encode the remaining non-numeric columns.
# NOTE(review): recent pandas versions emit boolean dummy columns — confirm that
# MovieDataset converts them the way it expects.
users_raw = pd.read_csv(user_relative_path, sep="|", header=None, names=users_columns_names)
users_df = pd.get_dummies(users_raw.drop(columns='zip code'))

# Standardise age (zero mean, unit sample std). The statistics are computed once and
# applied to the whole column instead of being recomputed inside a per-row lambda.
age = users_df['age']
users_df['age'] = (age - age.mean()) / age.std()

# Movies: keep the id plus the genre flag columns only.
movies_df = pd.read_csv(
    moives_relative_path, sep='|', header=None, encoding='ISO-8859-1', names=movies_columns_names
).drop(columns=['name', 'date', 'empty', 'url'])

train_dataset = MovieDataset(users_df, movies_df, train_df)
val_dataset = MovieDataset(users_df, movies_df, val_df)

# Only the training split is batched and shuffled here; the validation dataset is
# handed to validate_model directly.
train_dataloader = DataLoader(train_dataset, batch_size=128, shuffle=True)


### Hyper-Parameters

In [14]:
# Tunable settings, gathered in one cell so a run can be reconfigured in one place.
lr = 5e-4                        # learning rate handed to the Adam optimizer
embedding_size = 100             # embedding size argument of RecSysModel
hidden_layers = (256, 128, 64)   # hidden-layer sizes argument of RecSysModel

### Defining The Model 

In [15]:
# Sizes passed to RecSysModel for its user and movie inputs.
# NOTE(review): these look like "largest id + 1" for 1-based ML-100k ids — confirm
# against src/model.py before changing the dataset.
num_users = 944
num_movies = 1683

# Use the GPU when one is available and fall back to the CPU otherwise; a hard-coded
# 'cuda' raises on machines without a CUDA device.
# NOTE(review): train() / validate_model() are not visible here — confirm they do not
# hard-code 'cuda' for the batches themselves.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = RecSysModel(num_users, num_movies, embedding_size, hidden_layers)
model.to(device)

# Regression objective (mean squared error), optimised with Adam at the configured
# learning rate; the LinearLR scheduler is created with the library defaults.
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
scheduler = torch.optim.lr_scheduler.LinearLR(optimizer)


### Training The Model

In [16]:
# Run the project's training routine (src.train.train) with the model, loss, optimizer,
# scheduler and training batches defined above. The number of passes is not set in this
# cell; the recorded run shows 5 iterations (presumably epochs — confirm in src/train.py).
train(model, criterion, optimizer, scheduler, train_dataloader)

100%|████████████████████████████████████████████████████████████████████████████████████| 5/5 [04:52<00:00, 58.51s/it]


### Evaluating The Model

In [17]:
# Evaluate the trained model on the validation dataset. validate_model returns two
# values, bound here as the average RMSE and the average DCG. num_users is passed as
# the third argument (presumably the metrics are computed per user id and averaged —
# confirm in benchmark/evaluate.py).
avg_rmse, avg_dcg = validate_model(model, val_dataset, num_users)

Model Validation Started


100%|███████████████████████████████████████████████████████████████████████████████| 944/944 [00:04<00:00, 215.75it/s]


In [18]:
# Average RMSE returned by validate_model (lower is better).
print(avg_rmse)

0.9983987128579143


In [19]:
# Average DCG returned by validate_model (higher is better).
print(avg_dcg)

16.81041332556034
