# NMF - Non-negative matrix factorization

## Init

In [1]:
import numpy as np
import pandas as pd
from scipy import sparse
from functools import partial

import plotly.express as px
from sklearn.preprocessing import OrdinalEncoder

In [2]:
import sys
sys.path.append("../")
from src.data_preprocessing import TrainTestGenerator
from src.evaluator import Evaluator

In [3]:
data_dir = "../data/"
data_generator = TrainTestGenerator(data_dir)

## NMF code

In [4]:
def update_H(W, H, V):
    numerator = W.T.dot(V)
    denominator = W.T.dot(W).dot(H) + 1e-10
    H = H*(numerator / denominator)
    return H

def update_W(W, H, V):
    numerator = V.dot(H.T)
    denominator = W.dot(H).dot(H.T) + 1e-10
    W = W*(numerator / denominator)
    return W

In [5]:
def do_nnmf(V, rank=10, iter=100):
    
    # Initialize 
    n, m = V.shape
    
    W = np.abs(np.random.randn(1, n, rank))[0]
    H = np.abs(np.random.randn(1, rank, m))[0]
    
    loss = []
    for i in range(iter):
        H = update_H(W, H, V)
        W = update_W(W, H, V)
        
        loss.append(sum((V - W.dot(H)).flatten()**2))

    return H, W, loss

## Model

In [6]:
# Model wrapper

class NMF_recommender:
    def __init__(self, rank=32, iter=100):
        self.rank = rank
        self.iter = iter
        
    def fit(self, data: pd.DataFrame):
        np.random.seed(1)
        
        # Default rankings when userID is not in training set
        self.default_recommendation = data["artistID"].value_counts().index.tolist()

        data = data.copy()
        
        self.user_encoder = OrdinalEncoder()
        self.artist_encoder = OrdinalEncoder()
        
        data[["userID_ordinal"]] = self.user_encoder.fit_transform(
            data[["userID"]].values
        ).astype(int)
        data[["artistID_ordinal"]] = self.artist_encoder.fit_transform(
            data[["artistID"]].values
        ).astype(int)

        X = np.zeros([data["userID"].nunique(), data["artistID"].nunique()])
        X = np.zeros([data["userID"].nunique(), data["artistID"].nunique()])
        X[data["userID_ordinal"], data["artistID_ordinal"]] = 1        

        H, W, loss = do_nnmf(X, self.rank, self.iter)
        self.H = H
        self.W = W

    def recommend(self, user_id, n):
        try: 
            user_idx = self.user_encoder.transform(np.array([[user_id]]))[0,0]
        except ValueError:
            recommendations = self.default_recommendation
        else:
            user_idx = int(user_idx)

            scores = self.W[user_idx, :].dot(self.H)

            recommendations_idx = np.argsort(scores)
            recommendations_idx = recommendations_idx[:n]

            recommendations = self.artist_encoder.inverse_transform(recommendations_idx.reshape(-1, 1))[:, 0]
        
        return recommendations

## Evaluation

In [7]:
# Evaluator (forward chaining)

evaluator = Evaluator(NMF_recommender, data_generator)
evaluator.evaluate()
evaluator.save_results("../results/nmf_ranks.csv", "../results/nmf_times.csv")

In [8]:
# Hit Rate

evaluator.get_hit_rates()

Unnamed: 0,cases,5,10,25,50,500
2008,4556,0.018876,0.034021,0.054653,0.088894,0.266023
2009,4687,0.024749,0.041391,0.077235,0.110732,0.3083
2010,6133,0.023969,0.052503,0.083972,0.11805,0.302625
2011,1129,0.009743,0.021258,0.043401,0.065545,0.177148


In [9]:
# Mean Reciprocal Rank

evaluator.get_mrr()

Unnamed: 0,cases,mrr
2008,2608,0.025928
2009,3086,0.028162
2010,4306,0.028798
2011,878,0.013328


In [10]:
# Times

evaluator.get_times()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
task,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
model_fit,4.0,42.462536,36.241049,6.84814,19.639112,36.358149,59.181573,90.285706
model_init,4.0,2.2e-05,2.4e-05,3e-06,1e-05,1.4e-05,2.6e-05,5.7e-05
recommend_user,2622.0,0.000405,0.000249,0.000171,0.000199,0.000291,0.000589,0.002007


In [11]:
evaluator.get_fit_per_year_times()

Unnamed: 0_level_0,tag,time
task,Unnamed: 1_level_1,Unnamed: 2_level_1
model_fit,model_fit_2008,6.84814
model_fit,model_fit_2009,23.902769
model_fit,model_fit_2010,48.813529
model_fit,model_fit_2011,90.285706


In [12]:
# Hit Rate
# rank: 20, iter: 150
evaluator = Evaluator(partial(NMF_recommender, rank=20, iter=150), data_generator)
evaluator.evaluate()

evaluator.get_hit_rates()

Unnamed: 0,cases,5,10,25,50,500
2008,4556,0.019315,0.03446,0.055092,0.089772,0.269535
2009,4687,0.024749,0.041178,0.077022,0.110518,0.307873
2010,6133,0.023969,0.052503,0.084135,0.118213,0.302136
2011,1129,0.009743,0.021258,0.043401,0.065545,0.175376
