In [1]:
%cd ..

/Users/antonbabenko/UCU/linear-algebra/ucu-linear-algebra-final-project


In [2]:
%load_ext autoreload
%autoreload 2
%aimport src.nnmf
%aimport src.data

In [3]:
import warnings

import numpy as np
import numpy.linalg as la
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns

from pathlib import Path
from tqdm import tqdm
from sklearn.decomposition import NMF as SKNMF
from surprise import NMF as SurNMF
from surprise import Dataset
from surprise import Reader

from src.nnmf import NNMFModel
from src.metrics import rmse
from src.data import generate_sparce_matrix

warnings.filterwarnings("ignore")

In [4]:
data_folder = Path("data")

In [5]:
df = pd.read_csv(data_folder / "subsets" / "low-sparsity" / "records.csv")
# df = pd.read_csv(data_folder / "subsets" / "mid-sparsity" / "records.csv")
# df = pd.read_csv(data_folder / "subsets" / "high-sparsity" / "records.csv")

tr_df = df[df.split == "train"]
val_df = df[df.split == "val"]

### Check custom NNMF

In [6]:
%%time
model = NNMFModel(n_components=30, max_iter=1000, epsilon=1e-2, verbose=True)
model.fit(tr_df)

train loop: 100%|██████████| 1000/1000 [00:54<00:00, 18.51it/s]

CPU times: user 2min 43s, sys: 31.5 s, total: 3min 15s
Wall time: 54.3 s





<src.nnmf.NNMFModel at 0x107d70190>

In [7]:
val_preds = model.predict(val_df)
print("val rmse:", rmse(val_df.rating.values, np.array(val_preds)))

predict loop: 28952it [00:02, 10561.48it/s]

val rmse: 0.99633628954298





In [8]:
tr_preds = model.predict(tr_df)
print("train rmse:", rmse(tr_df.rating.values, tr_preds))

predict loop: 115805it [00:10, 10955.00it/s]

train rmse: 0.8355799068235551





### Check Dummy predictor (mean value)

In [9]:
rmse(val_df.rating.values, np.repeat(val_df.rating.mean(), len(val_df)))

1.0485102138998519

### Check SVD from Surprise lib

In [10]:
reader = Reader()
data = Dataset.load_from_df(tr_df[['customer_id', 'movie_id', 'rating']], reader)
trainset = data.build_full_trainset()

model = SurNMF()
model.fit(trainset)

def predict(surprise_model, df):
    preds = []
    for _, item in tqdm(df.iterrows()):
        preds.append(surprise_model.predict(item.customer_id, item.movie_id).est)
    return preds

val_preds = predict(model, val_df)
rmse(val_df.rating.values, val_preds)

28952it [00:02, 10240.22it/s]


0.9161318265293671

### Check SVD from Sklearn lib

In [11]:
matrix = generate_sparce_matrix(tr_df)

model = SKNMF(n_components=15, init='random', solver='mu')
W = model.fit_transform(matrix)
H = model.components_

def predict(W, H, df):
    preds = []
    X = W @ H
    for _, item in tqdm(df.iterrows()):
        preds.append(X[item.customer_id, item.movie_id])
    return preds

val_preds = predict(W, H, val_df)
rmse(val_df.rating.values, val_preds)

28952it [00:02, 11906.57it/s]


1.0023079079467958