# Non-negative Matrix Factorization
## Init

In [1]:
import numpy as np
import pandas as pd
from scipy import sparse
from functools import partial

import plotly.express as px


# import implicit # Matrix Factorization

from sklearn.decomposition import NMF
from sklearn.model_selection import KFold
from cmfrec import CMF_implicit

In [2]:
import sys
# Put the directory two levels up on the import path so the `src` package
# imported below can be found. The path is relative to the current working
# directory (presumably the notebook's own folder -- confirm when running
# from elsewhere).
sys.path.append("../../")
from src.data_preprocessing import TrainTestGenerator
from src.evaluator import Evaluator

In [3]:
# Switch for the Plotly figures further down: each plotting cell is guarded by
# `if show_ploty:`, so no figure is built or shown while this is False.
show_ploty = False

In [4]:
# Data directory handed to TrainTestGenerator (relative to the working directory).
data_dir = "../../data/"
# Single data source shared by every Evaluator created in this notebook.
data_generator = TrainTestGenerator(data_dir)

## Model

In [6]:
# Model wrapper

class CMF_recommender:
    """Thin adapter around cmfrec's ``CMF_implicit`` with ``fit``/``recommend``.

    The class (or a ``functools.partial`` of it that pins ``k``) is passed to
    ``Evaluator`` as the model factory.
    """

    # Dataset column name -> column name handed to the cmfrec model.
    _COLUMN_MAP = {
        "userID": "UserId",
        "artistID": "ItemId",
        "weight": "Rating",
    }

    def __init__(self, k=32, niter=100, random_state=1):
        """Create the underlying, not yet fitted, model.

        Args:
            k: Number of latent features.
            niter: Iteration count forwarded to ``CMF_implicit`` as ``niter``.
            random_state: Seed forwarded to ``CMF_implicit`` as
                ``random_state``; the fixed default keeps runs repeatable.
        """
        # verbose, method and lambda_ are not passed, so the library defaults
        # apply for them.
        self.model = CMF_implicit(
            k=k,
            random_state=random_state,
            niter=niter,
        )

    def fit(self, data: pd.DataFrame) -> None:
        """Fit the model on a user/artist/weight interaction table.

        Args:
            data: Frame with ``userID``, ``artistID`` and ``weight`` columns.
                The frame itself is left unmodified.
        """
        # rename() already returns a new frame, so the caller's data stays
        # untouched without paying for an additional defensive copy.
        self.model.fit(data.rename(columns=self._COLUMN_MAP))

    def recommend(self, user_id, n):
        """Return the model's top-``n`` items for ``user_id``.

        The result is passed through unchanged from ``CMF_implicit.topN``.
        """
        return self.model.topN(user_id, n=n)

## Evaluation

In [7]:
# Evaluator (forward chaining)
# Evaluate CMF_recommender with its default arguments and persist the outcome.

evaluator = Evaluator(CMF_recommender, data_generator)
evaluator.evaluate()

# Two output files; judging by their names the first holds the ranks and the
# second the timings -- confirm against Evaluator.save_results.
evaluator.save_results("../../results/cmf_ranks.csv", "../../results/cmf_times.csv")

In [8]:
# Hit rate: one row per evaluation year; the numbered columns (5 ... 500) are
# presumably the top-n cutoffs -- confirm against Evaluator.get_hit_rates.

evaluator.get_hit_rates()

Unnamed: 0,cases,5,10,25,50,500
2008,4556,0.008999,0.014047,0.035338,0.063213,0.219052
2009,4687,0.010241,0.018775,0.037551,0.055046,0.199061
2010,6133,0.016142,0.030491,0.056253,0.082178,0.228762


In [9]:
# Mean Reciprocal Rank, one row per evaluation year

evaluator.get_mrr()

Unnamed: 0,cases,mrr
2008,2608,0.014333
2009,3086,0.013175
2010,4306,0.0212


In [10]:
# Timing summary statistics per task (model fit, model init, per-user recommend)

evaluator.get_times()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
task,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
model_fit,3.0,1.45988,0.361501,1.084808,1.286782,1.488755,1.647417,1.806078
model_init,3.0,2.2e-05,1e-05,1.3e-05,1.7e-05,2.2e-05,2.7e-05,3.3e-05
recommend_user,2336.0,0.001097,0.001784,0.000501,0.000522,0.00078,0.001001,0.030906


In [11]:
# Model fit time for each evaluation year
evaluator.get_fit_per_year_times()

Unnamed: 0_level_0,tag,time
task,Unnamed: 1_level_1,Unnamed: 2_level_1
model_fit,model_fit_2008,1.084808
model_fit,model_fit_2009,1.488755
model_fit,model_fit_2010,1.806078


## Different numbers of latent features (k)

In [12]:
# Sweep the number of latent features (k) and collect one hit-rate table per k.
# NOTE: `evaluator` is rebound on every pass, so once this cell has run it
# refers to the last k in the sweep, not to the default model evaluated earlier.

results = []
for k in (5, 10, 20, 25, 50, 100):
    evaluator = Evaluator(partial(CMF_recommender, k=k), data_generator)
    evaluator.evaluate()

    hit_rates = evaluator.get_hit_rates()
    hit_rates["k"] = k  # tag the rows so the stacked tables stay distinguishable
    results.append(hit_rates)

# Stack the per-k tables into a single frame.
results = pd.concat(results)

In [13]:
# Hit rates of the k sweep: one row per evaluation year and k
results

Unnamed: 0,cases,5,10,25,50,500,k
2008,4556,0.011194,0.020852,0.037533,0.063213,0.233977,5
2009,4687,0.011948,0.021762,0.039258,0.071261,0.253467,5
2010,6133,0.014675,0.021686,0.03897,0.06946,0.206098,5
2008,4556,0.022388,0.033143,0.059263,0.08604,0.240342,10
2009,4687,0.010028,0.019415,0.03563,0.065074,0.195861,10
2010,6133,0.015327,0.026904,0.059351,0.088374,0.247187,10
2008,4556,0.012291,0.021291,0.039728,0.061896,0.209833,20
2009,4687,0.011308,0.022402,0.041391,0.061233,0.192661,20
2010,6133,0.03685,0.047285,0.093592,0.13517,0.278656,20
2008,4556,0.01295,0.020632,0.037972,0.054873,0.227612,25


In [14]:
# Average every column over the evaluation years, separately for each k
# (the `cases` column is averaged as well).
results.groupby("k").mean()

Unnamed: 0_level_0,cases,5,10,25,50,500
k,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
5,5125.333333,0.012606,0.021433,0.038587,0.067978,0.231181
10,5125.333333,0.015914,0.026487,0.051415,0.079829,0.227797
20,5125.333333,0.02015,0.030326,0.058237,0.0861,0.22705
25,5125.333333,0.014546,0.023902,0.044432,0.0653,0.220423
50,5125.333333,0.012014,0.020013,0.037095,0.054702,0.208486
100,5125.333333,0.009572,0.015875,0.030802,0.050738,0.186061


In [15]:
# Long-format table of the year-averaged hit rates: one row per (k, n).
results_data = (
    results.groupby("k")
    .mean()
    .drop(columns="cases")
    .reset_index()
    .melt(id_vars="k", var_name="n", value_name="hit_rate")
)

if show_ploty:
    # Hit rate as a function of k, one line per n.
    fig = px.line(results_data, x="k", y="hit_rate", color="n")
    fig.show()

In [16]:
if show_ploty:
    # Heatmap of the year-averaged hit rates (rows follow k, columns follow the
    # remaining hit-rate columns). An expression nested inside an `if` is not
    # auto-displayed by the notebook, so the figure must be shown explicitly.
    fig = px.imshow(results.groupby("k").mean().drop(columns="cases").values)
    fig.show()

In [17]:
# Long-format table of the per-year hit rates: one row per (year, k, n).
# This replaces the year-averaged `results_data` built earlier.
results_data = (
    results.reset_index()
    .rename(columns={"index": "year"})
    .melt(
        id_vars=["year", "cases", "k"],
        var_name="n",
        value_name="hit_rate",
    )
)

In [18]:
if show_ploty:
    # Hit rate over the evaluation years for k = 5, one line per n.
    fig = px.line(
        results_data.loc[results_data["k"] == 5],
        x="year", y="hit_rate", color="n",
    )
    fig.show()

In [19]:
if show_ploty:
    # Hit rate over the evaluation years for k = 25, one line per n.
    fig = px.line(
        results_data.loc[results_data["k"] == 25],
        x="year", y="hit_rate", color="n",
    )
    fig.show()

In [20]:
if show_ploty:
    # Hit rate over the evaluation years for k = 50, one line per n.
    fig = px.line(
        results_data.loc[results_data["k"] == 50],
        x="year", y="hit_rate", color="n",
    )
    fig.show()