# Non-negative Matrix Factorization
## Init

In [1]:
import numpy as np
import pandas as pd
from scipy import sparse
from functools import partial

import plotly.express as px


# import implicit # Matrix Factorization

from sklearn.decomposition import NMF
from sklearn.model_selection import KFold
from cmfrec import CMF_implicit

In [2]:
import sys
# Add the parent directory (relative to the current working directory) to the
# module search path so the project-local `src` package imported below resolves.
sys.path.append("../")
from src.data_preprocessing import TrainTestGenerator
from src.evaluator import Evaluator

In [3]:
# Switch for the Plotly figures: while False, every `if show_ploty:` guard in
# this notebook skips building and showing its figure.
show_ploty = False

In [4]:
# Data directory handed to TrainTestGenerator (relative to the working directory).
data_dir = "../data/"
# Single generator instance reused by every Evaluator created in this notebook.
# NOTE(review): presumably yields the train/test splits — see src.data_preprocessing.
data_generator = TrainTestGenerator(data_dir)

## Model

In [5]:
# Model wrapper

class CMF_recommender:
    """Thin adapter around cmfrec's ``CMF_implicit``.

    Exposes the ``fit(data)`` / ``recommend(user_id, n)`` pair that the
    Evaluator in this notebook is handed.
    """

    # Incoming column name -> column name passed on to the model.
    _COLUMN_MAP = {
        "userID": "UserId",
        "artistID": "ItemId",
        "weight": "Rating",
    }

    def __init__(self, k=50):
        """Create the underlying model with ``k`` latent factors and seed 1."""
        model_options = {"k": k, "random_state": 1}
        self.model = CMF_implicit(**model_options)

    def fit(self, data: pd.DataFrame):
        """Fit the model on a relabelled copy of ``data``.

        ``data`` must carry the columns ``userID``, ``artistID`` and
        ``weight``; the caller's frame is left untouched.
        """
        relabelled = data.copy().rename(columns=self._COLUMN_MAP)
        self.model.fit(relabelled)

    def recommend(self, user_id, n):
        """Return whatever the model's ``topN`` yields for ``user_id`` with ``n`` items."""
        return self.model.topN(user_id, n=n)

In [6]:
# Evaluator (forward chaining)

# The recommender class itself (not an instance) is passed in.
# NOTE(review): presumably Evaluator builds a fresh model per split — confirm in src.evaluator.
evaluator = Evaluator(CMF_recommender, data_generator)
evaluator.evaluate()

# Persist the outcome of this run (default k) under the results directory.
evaluator.save_results("../results/nmf.csv")

In [7]:
# Hit Rate
# NOTE(review): in the displayed table the numeric columns (5 ... 500) look like
# top-n cutoffs and the rows like evaluation years — confirm in src.evaluator.

evaluator.get_hit_rates()

Unnamed: 0,cases,5,10,25,50,500
2008,4556,0.00878,0.018218,0.031607,0.06014,0.21971
2009,4687,0.019842,0.026243,0.046725,0.065714,0.221037
2010,6133,0.008479,0.020545,0.046633,0.082668,0.196478
2011,1129,0.021258,0.046058,0.088574,0.127547,0.313552


In [8]:
# Mean Reciprocal Rank
# NOTE(review): `cases` in this table differs from `cases` in the hit-rate
# table for the same rows — TODO confirm which cases each metric counts.

evaluator.get_mrr()

Unnamed: 0,cases,mrr
2008,2608,0.015579
2009,3086,0.023489
2010,4306,0.01358
2011,878,0.027059


## Different numbers of latent features (k)

In [9]:
# Sweep the number of latent features (k): run the full evaluation once per
# value and collect the hit-rate table of each run, tagged with its k.

latent_dims = [5, 10, 20, 25, 50, 100]

hit_rate_frames = []
for k in latent_dims:
    # Bind k up front so Evaluator receives a factory, just as when it is
    # handed the bare class.
    model = partial(CMF_recommender, k=k)
    evaluator = Evaluator(model, data_generator)
    evaluator.evaluate()
    # assign() returns a new frame, so the table owned by the evaluator is
    # not modified in place.
    hit_rate_frames.append(evaluator.get_hit_rates().assign(k=k))

# Stack the per-k tables; each keeps its own index, so index labels repeat
# once per k.
results = pd.concat(hit_rate_frames)

In [10]:
# Hit-rates
# Display only: the stacked per-k hit-rate tables; index labels repeat once per k.
results

Unnamed: 0,cases,5,10,25,50,500,k
2008,4556,0.00856,0.014925,0.037752,0.064969,0.214881,5
2009,4687,0.016855,0.028163,0.057606,0.080862,0.248987,5
2010,6133,0.012718,0.031795,0.052503,0.076308,0.216044,5
2011,1129,0.027458,0.046944,0.097431,0.14349,0.424269,5
2008,4556,0.01712,0.034021,0.053556,0.07331,0.218832,10
2009,4687,0.011308,0.021976,0.040964,0.063153,0.223384,10
2010,6133,0.014838,0.027719,0.045492,0.078917,0.221425,10
2011,1129,0.024801,0.050487,0.091231,0.129318,0.370239,10
2008,4556,0.009658,0.018657,0.035338,0.049605,0.208736,20
2009,4687,0.015575,0.023042,0.037977,0.057179,0.200128,20


In [11]:
# Display only: column-wise mean over all rows sharing the same k. The mean is
# unweighted (rows with few cases count as much as rows with many), and the
# `cases` column is averaged along with the hit rates.
results.groupby("k").mean()

Unnamed: 0_level_0,cases,5,10,25,50,500
k,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
5,4126.25,0.016398,0.030457,0.061323,0.091407,0.276045
10,4126.25,0.017017,0.033551,0.057811,0.086175,0.25847
20,4126.25,0.018367,0.031058,0.051569,0.075904,0.239483
25,4126.25,0.018178,0.029777,0.05635,0.081189,0.23438
50,4126.25,0.01459,0.027766,0.053385,0.084017,0.237694
100,4126.25,0.014416,0.024144,0.049089,0.074446,0.22838


In [12]:
# Mean hit rate per k (without the `cases` column), reshaped to long form:
# one row per (k, n) pair with the value in `hit_rate`.
mean_by_k = results.groupby("k").mean().drop(columns="cases")
results_data = mean_by_k.reset_index().melt(
    id_vars="k",
    var_name="n",
    value_name="hit_rate",
)

if show_ploty:
    # One line per n, hit rate plotted against k.
    fig = px.line(results_data, x="k", y="hit_rate", color="n")
    fig.show()

In [13]:
if show_ploty:
    # Heatmap of the mean hit rate: one row per k, one column per n.
    heatmap_data = results.groupby("k").mean().drop(columns="cases")
    fig = px.imshow(
        heatmap_data.values,
        # .values drops the labels, so pass them explicitly; strings keep the
        # axes categorical instead of numerically spaced.
        x=[str(n) for n in heatmap_data.columns],
        y=[str(k) for k in heatmap_data.index],
        labels={"x": "n", "y": "k", "color": "hit_rate"},
    )
    # An expression nested inside `if` is not auto-displayed by the notebook,
    # so the figure has to be shown explicitly.
    fig.show()

In [14]:
# Long form of the full sweep: the index becomes a "year" column and every
# remaining column is folded into (n, hit_rate) pairs, giving one row per
# (year, k, n). Rebinds `results_data`, replacing the per-k averages.
results_data = (
    results
    .reset_index()
    .rename(columns={"index": "year"})
    .melt(
        id_vars=["year", "cases", "k"],
        var_name="n",
        value_name="hit_rate",
    )
)

In [15]:
if show_ploty:
    # Hit rate per year for k = 5, one line per n.
    rows_k5 = results_data[results_data["k"] == 5]
    fig = px.line(rows_k5, x="year", y="hit_rate", color="n")
    fig.show()

In [16]:
if show_ploty:
    # Hit rate per year for k = 25, one line per n.
    rows_k25 = results_data[results_data["k"] == 25]
    fig = px.line(rows_k25, x="year", y="hit_rate", color="n")
    fig.show()

In [17]:
if show_ploty:
    # Hit rate per year for k = 50, one line per n.
    rows_k50 = results_data[results_data["k"] == 50]
    fig = px.line(rows_k50, x="year", y="hit_rate", color="n")
    fig.show()