In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
%cd ..

/home/iovcharenko/Documents/NotWork/UCU/liner-algebra/ucu-linear-algebra-final-project


In [3]:
import torch
import datetime

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 

from pathlib import Path
from tqdm.cli import tqdm 

from surprise import SVD
from surprise import Dataset
from surprise import Reader

from src.als import AlsSVD
from src.funk import Funk
from src.nnmf import NNMFModel
from src.neural_svd import NeuralSVD
from src.metrics import rmse

In [4]:
data_folder = Path("data")

In [5]:
df = pd.read_csv(data_folder / "subsets" / "low-sparsity" / "records.csv")
# df = pd.read_csv(data_folder / "subsets" / "mid-sparsity" / "records.csv")
# df = pd.read_csv(data_folder / "subsets" / "high-sparsity" / "records.csv")

tr_df = df[df.split == "train"]
val_df = df[df.split == "val"]

In [6]:
results_df = val_df.copy()
scores = {}

## ALS-SVD

In [7]:
model = AlsSVD(k=50, n_epochs=40, reg=0.1)
model.fit(tr_df)

val_preds = model.predict(val_df)
val_rmse = rmse(val_df.rating.values, val_preds)

tr_preds = model.predict(tr_df)
train_rmse = rmse(tr_df.rating.values, tr_preds)

model_name = "als-svd"
results_df[model_name] = val_preds
scores[model_name] = {
    "train": train_rmse,
    "val": val_rmse,
}

train loop: 100%|██████████| 40/40 [01:32<00:00,  2.31s/it]
predict loop: 28952it [00:01, 14556.01it/s]
predict loop: 115805it [00:08, 14370.40it/s]


## Funk SVD

In [8]:
model = Funk(lr=0.001, reg=0.005, n_epochs=100, n_factors=20)
model.fit(tr_df, tr_df.sample(frac = 0.7))

val_preds = model.predict(val_df)
val_rmse = rmse(val_df.rating.values, val_preds)

tr_preds = model.predict(tr_df)
train_rmse = rmse(tr_df.rating.values, tr_preds)

model_name = "funk-svd"
results_df[model_name] = val_preds
scores[model_name] = {
    "train": train_rmse,
    "val": val_rmse,
}

Epoch 1/100:
val_loss: 1.01, val_rmse: 1.00, val_mae: 0.83, took 5.9 sec
Epoch 2/100:
val_loss: 0.96, val_rmse: 0.98, val_mae: 0.81, took 5.8 sec
Epoch 3/100:
val_loss: 0.92, val_rmse: 0.96, val_mae: 0.79, took 5.9 sec
Epoch 4/100:
val_loss: 0.89, val_rmse: 0.95, val_mae: 0.77, took 5.7 sec
Epoch 5/100:
val_loss: 0.87, val_rmse: 0.93, val_mae: 0.76, took 5.9 sec
Epoch 6/100:
val_loss: 0.86, val_rmse: 0.93, val_mae: 0.75, took 5.7 sec
Epoch 7/100:
val_loss: 0.84, val_rmse: 0.92, val_mae: 0.74, took 5.7 sec
Epoch 8/100:
val_loss: 0.83, val_rmse: 0.91, val_mae: 0.73, took 5.8 sec
Epoch 9/100:
val_loss: 0.82, val_rmse: 0.91, val_mae: 0.73, took 5.8 sec
Epoch 10/100:
val_loss: 0.82, val_rmse: 0.90, val_mae: 0.72, took 5.9 sec
Epoch 11/100:
val_loss: 0.81, val_rmse: 0.90, val_mae: 0.72, took 5.8 sec
Epoch 12/100:
val_loss: 0.80, val_rmse: 0.90, val_mae: 0.71, took 5.8 sec
Epoch 13/100:
val_loss: 0.80, val_rmse: 0.89, val_mae: 0.71, took 5.9 sec
Epoch 14/100:
val_loss: 0.80, val_rmse: 0.89, v

## NNMF 

In [9]:
model = NNMFModel(n_components=40, max_iter=1000, epsilon=1e-2, verbose=True)
model.fit(tr_df)

val_preds = model.predict(val_df)
val_rmse = rmse(val_df.rating.values, val_preds)

tr_preds = model.predict(tr_df)
train_rmse = rmse(tr_df.rating.values, tr_preds)


model_name = "nnmf"
results_df[model_name] = val_preds
scores[model_name] = {
    "train": train_rmse,
    "val": val_rmse,
}

  X = generate_sparce_matrix(X)
  self._set_arrayXarray(i, j, x)
train loop: 100%|██████████| 1000/1000 [01:03<00:00, 15.67it/s]
predict loop: 28952it [00:01, 14538.10it/s]
predict loop: 115805it [00:08, 14336.93it/s]


## Neural SVD

In [10]:
model = NeuralSVD(
    k=50, n_epochs=50, reg=0.0,
    lr=0.1, batch_size=128,
    device=torch.device("cuda:0" if torch.cuda.is_available() else "cpu"),
    use_scheduler=True,
    scheduler_step=20,
    scheduler_gamma=0.1,
)
model.fit(tr_df, val_df)


val_preds = model.predict(val_df)
val_rmse = rmse(val_df.rating.values, val_preds)

tr_preds = model.predict(tr_df)
train_rmse = rmse(tr_df.rating.values, tr_preds)

model_name = "neural-svd"
results_df[model_name] = val_preds
scores[model_name] = {
    "train": train_rmse,
    "val": val_rmse,
}

train loop, loss 0.9594854772464069, lr 0.0010000000000000002, val rmse 0.9809690192599654: 100%|██████████| 50/50 [02:14<00:00,  2.69s/it]


## Surprise SVD 

In [11]:
reader = Reader()
data = Dataset.load_from_df(tr_df[['customer_id', 'movie_id', 'rating']], reader)
trainset = data.build_full_trainset()

model = SVD()
model.fit(trainset)

def predict(surprise_model, df):
    preds = []
    for _, item in tqdm(df.iterrows()):
        preds.append(surprise_model.predict(item.customer_id, item.movie_id).est)
    return preds


val_preds = predict(model, val_df)
val_rmse = rmse(val_df.rating.values, val_preds)

tr_preds = predict(model, tr_df)
train_rmse = rmse(tr_df.rating.values, tr_preds)

model_name = "surprise-svd"
results_df[model_name] = val_preds
scores[model_name] = {
    "train": train_rmse,
    "val": val_rmse,
}

28952it [00:02, 11772.01it/s]
115805it [00:09, 11744.05it/s]


## Dummy predictor (mean value)

In [12]:
val_rmse = rmse(val_df.rating.values, np.repeat(val_df.rating.mean(), len(val_df)))\

model_name = "dummy"
scores[model_name] = {
    "val": val_rmse,
}

## Save results table

In [13]:
results_folder = Path("results")
cur_date = datetime.datetime.now().strftime("%d-%m-%y")
cur_results_folder = results_folder / cur_date
cur_results_folder.mkdir(parents=True, exist_ok=True)

In [14]:
scores_df = pd.DataFrame(scores)
scores_df = scores_df.round(3)
scores_df.to_csv(cur_results_folder / "scores.csv", index=True)
scores_df

Unnamed: 0,als-svd,funk-svd,nnmf,neural-svd,surprise-svd,dummy
train,0.629,0.882,0.837,0.961,0.666,
val,0.886,0.914,1.0,0.981,0.895,1.049


In [15]:
results_df.to_csv(cur_results_folder / "val_preds.csv", index=False)
results_df.head()

Unnamed: 0,movie_id,customer_id,rating,date,split,als-svd,funk-svd,nnmf,neural-svd,surprise-svd
1,540,646,3.0,1999-12-31,val,2.880528,2.59978,3.658644,3.270864,2.87707
6,723,646,2.0,2000-01-06,val,3.271187,3.434981,3.869058,3.270864,3.480361
10,398,509,4.0,2000-01-06,val,3.882678,4.064758,4.153707,3.762923,4.03936
11,391,509,4.0,2000-01-06,val,3.999982,4.081499,4.102665,3.762923,4.003301
13,412,646,1.0,2000-01-06,val,2.90866,3.536602,3.888122,3.270864,3.700313
