In [3]:
import RecSysFramework.Utils.get_holdout as gh
from RecSysFramework.Recommender import Recommender
from RecSysFramework.Utils import check_matrix
from RecSysFramework.Evaluation.Evaluator import EvaluatorHoldout, EvaluatorMetrics

import numpy as np
import pandas as pd
import itertools

from sklearn.metrics.pairwise import cosine_similarity

from scipy.sparse import csr_matrix, diags, isspmatrix
from scipy.sparse.linalg import eigs
from scipy.sparse import eye as seye

from sklearn.utils.extmath import randomized_svd

from maxvolpy.maxvol import rect_maxvol

# Seed NumPy's global RNG; DCT.fit draws from it through np.random.choice
# when it samples a random subset of items.
np.random.seed(42)

In [7]:
# Load the holdout split for the chosen reader. The second returned element
# (the validation split, going by the helper's name) is not used in this notebook view.
d = "Movielens1MReader"
train, _, test, d_name = gh.retrieve_train_validation_test_holdhout_dataset(d)
print("Dataset: {}".format(d))
# Interaction matrices (URM) of the train and test splits; later cells read these globals.
train_urm = train.get_URM()
test_urm = test.get_URM()

DataSplitter: Preloaded data not found or corrupted, reading from original files...
Reader: Loading /Users/alex/NearestNeighborMF-data/splits/Movielens1M/original/URM_all...
Reader: Loading complete!
Reader: Applying <RecSysFramework.DataManager.DatasetPostprocessing.ImplicitURM.ImplicitURM object at 0x1410e8ac0>
Current dataset is: Movielens1M
	Number of items: 3706
	Number of users: 6040
	Number of interactions in URM_all: 836478
	Interaction density: 3.7369%
	Interactions per user:
		 Min: 0
		 Avg: 138.49
		 Max: 1968
	Interactions per item:
		 Min: 0
		 Avg: 225.71
		 Max: 3211

Reader: Applying <RecSysFramework.DataManager.DatasetPostprocessing.KCore.KCore object at 0x140fa71c0>
DataDenseSplit_K_Cores: k-cores extraction will zero out some users and items without changing URM shape
DataDenseSplit_K_Cores: Initial URM desity is 3.74E-02
DataDenseSplit_K_Cores: Iteration 1. URM desity without zeroed-out nodes is 4.19E-02.
Users with less than 5 interactions are 2 ( 0.03%), 
Items w

In [8]:
# Display the dimensions of the training interaction matrix.
train_urm.shape

(6038, 3307)

In [149]:
class DCT(Recommender):
    """
    Decoupled Completion and Transduction (DCT).

    Reference: "Cold-Start Item and User Recommendation with Decoupled
    Completion and Transduction", Iman Barjasteh et al.

    The model combines a truncated SVD of the training URM with the leading
    eigenvectors of an item-item similarity matrix. Scores are computed as
    ``W[users] @ H @ Ub_hat_complete @ Ub.T`` where:

    * ``W`` is ``u * s`` from the SVD,
    * ``H`` is ``vt`` restricted to the columns in ``items_to_keep``,
    * ``Ub`` holds the similarity eigenvectors (one row per item),
    * ``Ub_hat_complete`` is ``Ub_hat @ pinv(Ub_hat.T @ Ub_hat)`` with
      ``Ub_hat = Ub[items_to_keep]``.
    """

    RECOMMENDER_NAME = "DCT"

    def __init__(self, URM_train, item_similarity_matrix):
        """
        :param URM_train: training interaction matrix, forwarded to the base Recommender.
        :param item_similarity_matrix: square item-item similarity matrix; passed
            through check_matrix before being stored.
        """
        super(DCT, self).__init__(URM_train)
        self.item_similarity_matrix = check_matrix(item_similarity_matrix)

    def _compute_item_score(self, user_id_array, items_to_compute=None):
        """
        Compute the predicted scores for the given users.

        :param user_id_array: array of user indices (rows of ``W``).
        :param items_to_compute: optional item indices; when given, only the
            scores of those items are returned, one column per requested item.
        :return: array of shape (len(user_id_array), n_items), or
            (len(user_id_array), len(items_to_compute)) when a subset is requested.
        """
        assert self.W.shape[0] > user_id_array.max(), \
            "{}: Cold users not allowed. " \
            "Users in trained model are {}, requested prediction for users up to {}" \
            .format(self.RECOMMENDER_NAME, self.W.shape[0], user_id_array.max())

        # H's columns are indexed by items_to_keep and must stay aligned with the
        # rows of Ub_hat_complete, so H is never sliced by items_to_compute.
        kept_item_scores = np.dot(self.W[user_id_array], self.H)
        eigen_space_scores = np.dot(kept_item_scores, self.Ub_hat_complete)

        # The restriction to the requested items is applied on the rows of Ub,
        # which is the only factor indexed by the full item set.
        if items_to_compute is not None:
            item_basis = self.Ub[items_to_compute]
        else:
            item_basis = self.Ub

        return np.dot(eigen_space_scores, item_basis.T)

    def fit(self, num_factors=10, num_eigs=10, Ub_hat_size=10, max_vol=False):
        """
        Fit the model.

        :param num_factors: number of components requested from randomized_svd
            on the training URM.
        :param num_eigs: number of eigenvectors requested from scipy's ``eigs`` on
            the item similarity matrix (scipy requires k < N - 1).
        :param Ub_hat_size: number of items sampled uniformly at random (without
            repetition) to build ``Ub_hat``; ``None`` keeps every item that has at
            least one stored entry in the training URM. Ignored when ``max_vol`` is True.
        :param max_vol: when True, the kept items are the rows selected by
            ``rect_maxvol`` on the eigenvector matrix.
        """
        super(DCT, self).fit()

        self.num_factors = num_factors
        self.num_eigs = num_eigs

        self._print("Calculating URM factorization")
        u, s, vt = randomized_svd(self.URM_train, num_factors)

        self._print("Calculating eigenvalues of B")
        _, eigenvectors = eigs(self.item_similarity_matrix, k=num_eigs)
        # eigs always returns a complex array. For a symmetric similarity matrix the
        # imaginary parts are (numerically) zero, so drop them to keep scores and
        # metrics real. If they are not negligible the array is left complex.
        self.Ub = np.real_if_close(eigenvectors)

        if max_vol:
            max_vol_idxs, _ = rect_maxvol(self.Ub)
            self.items_to_keep = max_vol_idxs
            print("Size:", len(max_vol_idxs))

        elif Ub_hat_size is None:
            # Columns of the URM with at least one stored entry.
            stored_per_item = np.ediff1d(self.URM_train.tocsc().indptr)
            self.items_to_keep = np.arange(self.n_items)[stored_per_item > 0]

        else:
            # Sample distinct items: drawing with replacement would duplicate rows of
            # Ub_hat (and columns of H) and over-weight the repeated items.
            sample_size = min(Ub_hat_size, self.n_items)
            self.items_to_keep = np.random.choice(self.n_items, size=sample_size, replace=False)

        # Scale each singular vector by its singular value; same values as
        # u @ diag(s) without building the dense diagonal matrix.
        self.W = u * s
        self.H = vt[:, self.items_to_keep]

        self._print("Calculating Ub complete")
        Ub_hat = self.Ub[self.items_to_keep]
        self.Ub_hat = Ub_hat
        self.Ub_hat_complete = Ub_hat @ np.linalg.pinv(Ub_hat.T @ Ub_hat)

    def _get_dict_to_save(self):
        """
        Return the fitted attributes to persist.

        Ub_hat_complete is included because _compute_item_score reads it in
        addition to W, H and Ub.
        """
        return {"W": self.W,
                "H": self.H,
                "Ub": self.Ub,
                "Ub_hat_complete": self.Ub_hat_complete}

In [150]:
def cosine_similarity_zd(matrix):
    """
    Build a cosine similarity matrix with a zero diagonal.

    :param matrix: sparse or dense matrix whose rows are the vectors to compare.
    :return: CSR matrix of pairwise row cosine similarities with the diagonal
        (self-similarity) removed and no explicitly stored zeros.
    """
    # With dense_output=False sklearn returns a sparse result only when the input
    # is sparse; for dense input it returns an ndarray, which has no setdiag.
    # Converting to CSR up front makes both cases work.
    similarity = csr_matrix(cosine_similarity(matrix, dense_output=False))
    similarity.setdiag(0)
    similarity.eliminate_zeros()
    return similarity

In [151]:
# Item-item cosine similarity with zero diagonal, computed on the transposed URM
# (assumes URM rows are users and columns are items — TODO confirm).
item_similarity = cosine_similarity_zd(train_urm.T)

# Holdout evaluator configured for MAP, NDCG and RMSE at cutoff 5.
evaluator = EvaluatorHoldout(
    cutoff_list=[5],
    metrics_list=[EvaluatorMetrics.MAP, EvaluatorMetrics.NDCG, EvaluatorMetrics.RMSE]
)

In [None]:
# Hyper-parameter grid for DCT: every combination is fitted and evaluated on the test URM.
num_factors_list = [500, 1000, 5000]
num_eigs_list = [500, 1000, 5000]
Ub_hat_size_list = [None] #[10, 25, 50, 100, 172, 250, 500, 1000, 1500, None]

# NOTE(review): never updated in this cell; kept because other cells may read it.
max_ndcg = 0

# Feasibility bounds derived from the data:
# - an SVD of the URM cannot have more components than its smaller dimension;
# - scipy's eigs needs k < N - 1 and raises for a sparse matrix otherwise.
max_num_factors = min(train_urm.shape)
max_num_eigs = item_similarity.shape[0] - 2

for num_factors, num_eigs, Ub_hat_size in itertools.product(
    num_factors_list, num_eigs_list, Ub_hat_size_list
):
    if num_factors > max_num_factors or num_eigs > max_num_eigs:
        # Skip instead of letting one infeasible combination abort the whole search.
        print(f'Skipping num_factors = {num_factors}, num_eigs = {num_eigs}: '
              f'limits are num_factors <= {max_num_factors}, num_eigs <= {max_num_eigs}')
        continue

    dct = DCT(train_urm, item_similarity)
    dct.fit(num_factors=num_factors, num_eigs=num_eigs, Ub_hat_size=Ub_hat_size, max_vol=False)
    res = evaluator.evaluateRecommender(dct, test_urm).get_results_string()
    print(f'Num_factors = {num_factors}, num_eigs = {num_eigs}, Ub_hat_size = {Ub_hat_size}')
    print(res)

DCT: URM Detected 90 (0.27 %) cold items.
DCT: Calculating URM factorization


In [144]:
# Fit a single DCT model with max_vol=True, so fit() selects items_to_keep via
# rect_maxvol (Ub_hat_size is not consulted in that branch), then evaluate on the test URM.
dct = DCT(train_urm, item_similarity)
dct.fit(num_factors=100, num_eigs=100, Ub_hat_size=None, max_vol=True)
res = evaluator.evaluateRecommender(dct, test_urm).get_results_string()
print(res)

DCT: URM Detected 10 (0.35 %) cold items.
DCT: Calculating URM factorization
DCT: Calculating eigenvalues of B
Size: 168
DCT: Calculating Ub complete
EvaluatorHoldout: Processed 1850 ( 100.00% ) in 0.29 sec. Users per second: 6295
CUTOFF: 5 - MAP: 0.0769616, NDCG: 0.0943154, RMSE: 0.8873256+0.0000000j, 



### Shapes:

items_to_keep - (n_kept,), a 1-D array with the indices of the items selected in `fit`

W - (n_users, num_factors)

H - (num_factors, len(items_to_keep))

Ub - (n_items, num_eigs); Ub_hat, Ub_hat_complete - (len(items_to_keep), num_eigs)

### Variables:
num_factors - r