In [2]:
import sys
sys.path.append('../')
from src.memory_based import Memory_based_CF
from src.utils import loading, Spark
from scipy import sparse
import numpy as np
from tqdm import tqdm
import pyspark.sql.functions as F

In [3]:
# create spark session
spark = Spark()

Spark UI address http://127.0.0.1:4040


In [4]:
# `loading` is the project helper imported from src.utils; its result is
# indexed by split name (e.g. 'train_0.5_0.5') in the next cell.
splits = loading(spark, '../data/interim')

In [6]:
# Bind the held-out split to `test`; previously both names pointed at the
# training split, so any evaluation on `test` would have scored training data.
# NOTE(review): assumes `loading` exposes a matching 'test_0.5_0.5' key — confirm.
train, test = splits['train_0.5_0.5'], splits['test_0.5_0.5']

In [7]:
train

DataFrame[userId: string, movieId: string, rating: string]

In [8]:
# Going by the execution counts, this cell (In [8]) ran before the notebook-local
# class definition further down (In [127]), so `Memory_based_CF` here was the
# class imported from src.memory_based.
cf = Memory_based_CF(spark, 'user')

In [9]:
cf.fit(train)

In [10]:
cf.X

<19865x1000 sparse matrix of type '<class 'numpy.int32'>'
	with 221831 stored elements in Compressed Sparse Row format>

In [12]:
cf.similarity_matrix.shape

(19865, 19865)

In [19]:
# Short aliases for the fitted rating matrix and the similarity matrix.
# `x` is rebound and modified in later cells (astype / NaN fill), so from that
# point on it no longer matches cf.X.
x = cf.X
sim = cf.similarity_matrix

In [165]:
(x!=0)

<19865x1000 sparse matrix of type '<class 'numpy.bool_'>'
	with 218659 stored elements in Compressed Sparse Row format>

In [189]:
cf.X[0,0]

2

In [128]:
# Mean of the non-zero entries of every row of cf.X; rows without any non-zero
# entry give 0/0 = NaN, which nan_to_num turns into 0.
mu_iarray = np.array(np.nan_to_num(cf.X.sum(1) / (cf.X != 0).sum(1))).reshape(-1)
# Repeat the row means across all columns so the result has cf.X's shape.
# (The original referenced the undefined name `mu_array`, and sized the result
# from `x`, which later cells rebind.)
mu_imat = np.tile(mu_iarray[:, None], (1, cf.X.shape[1]))

In [129]:
# Column-wise mean over non-zero entries of cf.X (0/0 -> NaN -> 0 for empty columns).
col_sums = cf.X.sum(0)
col_counts = (cf.X != 0).sum(0)
mu_jarray = np.array(np.nan_to_num(col_sums / col_counts)).reshape(-1)
# One copy of the column means per row of x, giving an array shaped like x.
mu_jmat = np.tile(mu_jarray, (x.shape[0], 1))

In [144]:
# Deviation of every cell from its row mean, with negative values clipped to 0.
# Zero (unrated) cells become -row_mean and are zeroed by the clip, but so are
# non-zero cells that lie below the row mean.
# `diff` is reassigned without the clip in the cell executed later (In [175]).
diff = (x - mu_imat).clip(min=0)

In [175]:
# Unclipped deviation from the row mean; cells where x is 0 hold -row_mean here.
diff = x-mu_imat

In [183]:
# Rebind `x` to a float version so that NaN can be stored in it (next cell).
x = x.astype(float)

In [185]:
# Mark zero cells as missing. `np.NaN` was removed in NumPy 2.0; `np.nan` is the
# canonical spelling and works on every NumPy version.
x[x==0] = np.nan

In [187]:
x - mu_imat

matrix([[0.10989011,        nan,        nan, ...,        nan,        nan,
                nan],
        [       nan, 2.37640449, 1.37640449, ...,        nan,        nan,
                nan],
        [       nan, 0.12437811, 1.12437811, ...,        nan,        nan,
                nan],
        ...,
        [       nan,        nan,        nan, ...,        nan,        nan,
                nan],
        [       nan,        nan,        nan, ...,        nan,        nan,
                nan],
        [       nan,        nan,        nan, ...,        nan,        nan,
                nan]])

In [152]:
# For every (row, column) pair: the sum of absolute similarities over those
# neighbours whose `diff` entry in that column is non-zero.
# Which `diff` this used depends on execution order — In [152] ran after the
# clipped version (In [144]) and before the unclipped one (In [175]).
norm = abs(cf.similarity_matrix).dot((diff!=0).astype(int))

In [155]:
norm

matrix([[3.20871013e+02, 2.41221322e+02, 1.29459954e+02, ...,
         4.30607251e-02, 1.89351895e-01, 0.00000000e+00],
        [2.44190951e+02, 3.23786077e+02, 1.43465392e+02, ...,
         3.61426178e-03, 2.05552330e-01, 0.00000000e+00],
        [2.71490106e+02, 2.81703035e+02, 1.67927361e+02, ...,
         1.49834268e-02, 2.24288702e-01, 0.00000000e+00],
        ...,
        [1.78631579e+01, 1.66485480e+01, 9.10540187e+00, ...,
         6.88857942e-03, 1.00610888e-02, 0.00000000e+00],
        [0.00000000e+00, 0.00000000e+00, 0.00000000e+00, ...,
         0.00000000e+00, 0.00000000e+00, 0.00000000e+00],
        [5.87448796e+01, 7.67468028e+01, 2.91503301e+01, ...,
         6.88857942e-03, 1.18732230e-01, 0.00000000e+00]])

In [157]:
# NOTE(review): `sim_norm_mat` is only defined in a cell further down (In [109]);
# on a fresh kernel run top-to-bottom this cell raises NameError.
sim_norm_mat.mean()

1252.0758182185543

In [109]:
# L1 norm of every column of the similarity matrix (sum of absolute values).
sim_norm_array = np.abs(cf.similarity_matrix).sum(axis=0)
# Repeat the norms once per column of x, then transpose so that entry [i, j]
# holds the norm belonging to index i.
sim_norm_mat = np.tile(sim_norm_array, (x.shape[1], 1)).T

In [113]:
# Similarity-weighted sum of the column-mean-centred matrix, divided by the
# per-row similarity L1 norm.
# NOTE(review): clip(min=0) zeroes the -mean values of zero cells, but it also
# zeroes every non-zero cell below its column mean, so only above-mean
# deviations contribute to `w`.
w = cf.similarity_matrix.dot((cf.X - mu_jmat).clip(min=0)) / sim_norm_mat

In [121]:
w = np.nan_to_num(w)

In [123]:
(mu_imat + w).mean()

3.5991332960493696

In [127]:
class Memory_based_CF():
    """Memory-based (neighbourhood) collaborative filtering on a Spark ratings frame.

    ``fit`` turns the ratings into a sparse matrix whose rows are users
    (``base='user'``) or items (``base='item'``), computes a row-by-row Pearson
    similarity matrix and pre-computes a dense prediction for every
    (row, column) pair. ``predict`` looks those predictions up.

    NOTE(review): once this cell runs it shadows the ``Memory_based_CF``
    imported from ``src.memory_based`` at the top of the notebook.

    Attributes set by ``fit``:
        X: scipy CSR matrix of ratings (rows/columns ordered by ``base``).
        idxer: the fitted ``indexTransformer``.
        similarity_matrix: dense ``(n_rows, n_rows)`` ndarray.
        prediction_matrix: dense ``(n_rows, n_cols)`` ndarray.
    """

    # Values of ``base`` that ``_preprocess`` knows how to lay out.
    _BASES = ('user', 'item')

    def __init__(self, spark, base, usercol='userId', itemcol='movieId', ratingcol='rating'):
        """Store configuration; nothing is computed until ``fit``.

        Args:
            spark: object providing ``createDataFrame`` (used in ``predict``).
            base: ``'user'`` or ``'item'``; selects which entity forms the rows.
            usercol, itemcol, ratingcol: column names in the input frames.
        """
        self.base = base
        self.usercol = usercol
        self.itemcol = itemcol
        self.ratingcol = ratingcol
        self.spark = spark
        self.X = None
        self.idxer = None
        self.similarity_matrix = None
        # Initialised here so "not fitted yet" can be detected in predict().
        self.prediction_matrix = None

    def fit(self, _X):
        """Build the rating, similarity and prediction matrices from ``_X``."""
        X = self._preprocess(_X, True)
        self.X = X
        self.similarity_matrix = self._pearson_corr(X)
        self.prediction_matrix = self._get_predict()

    def predict(self, _X):
        """Return ``_X``'s user/item/rating columns plus a ``prediction`` column.

        Raises:
            RuntimeError: if called before ``fit``.
        """
        if self.prediction_matrix is None:
            raise RuntimeError('predict() called before fit()')
        rows, cols = self._preprocess(_X, False)
        # One vectorised lookup instead of a Python loop over every pair.
        # NOTE(review): pairs not seen during fit are not handled here; what the
        # indexer returns for them is not visible in this notebook — confirm.
        scores = np.asarray(self.prediction_matrix)
        preds = scores[np.asarray(rows, dtype=np.intp), np.asarray(cols, dtype=np.intp)]
        # NOTE(review): this is a second Spark action; the predictions are
        # attached positionally, which relies on both collections returning
        # rows in the same order.
        df = self.idxer.transform(_X).select(self.usercol, self.itemcol, self.ratingcol).toPandas()
        df['prediction'] = preds
        return self.spark.createDataFrame(df)

    def _cast_columns(self, df):
        """Select the id columns cast to int and the rating column cast to float."""
        id_cols = [F.col(c).cast('int') for c in (self.usercol, self.itemcol)]
        return df.select(id_cols + [F.col(self.ratingcol).cast('float')])

    def _preprocess(self, X, fit):
        """Index ``X`` and lay it out according to ``self.base``.

        Args:
            X: Spark DataFrame with the configured user/item/rating columns.
            fit: when True, fit a new indexer and return the CSR rating matrix;
                when False, reuse the fitted indexer and return ``(row, col)``
                index arrays.

        Raises:
            NotImplementedError: if ``self.base`` is neither 'user' nor 'item'.
        """
        # Validate before any Spark work is triggered.
        if self.base not in self._BASES:
            raise NotImplementedError(
                "base must be 'user' or 'item', got %r" % (self.base,))

        _X = self._cast_columns(X)
        user_idx = self.usercol + '_idx'
        item_idx = self.itemcol + '_idx'

        if fit:
            # `indexTransformer` is neither defined nor imported in the visible
            # notebook cells; it must already be in the kernel namespace —
            # TODO confirm where it should be imported from.
            self.idxer = indexTransformer(self.usercol, self.itemcol)
            self.idxer.fit(_X)
            pdf = self.idxer.transform(_X).select(user_idx, item_idx, self.ratingcol).toPandas()
        else:
            pdf = self.idxer.transform(_X).select(user_idx, item_idx).toPandas()

        # Read the columns individually so the indices are not upcast to the
        # common dtype of a mixed (index, index, rating) array.
        users = pdf[user_idx].values
        items = pdf[item_idx].values
        row, col = (users, items) if self.base == 'user' else (items, users)

        if not fit:
            return row, col

        data = pdf[self.ratingcol].values.astype(float)
        # Explicit integer indices: sparse constructors expect integral coordinates.
        return sparse.csr_matrix((data, (row.astype(np.intp), col.astype(np.intp))))

    def _pearson_corr(self, A):
        """Pearson correlation between the rows of ``A`` (zeros count as values).

        Rows with zero variance have an undefined correlation; those entries
        (including their diagonal) are returned as 0.

        Returns:
            Dense ``(n_rows, n_rows)`` ndarray.
        """
        n = A.shape[1]

        rowsum = np.asarray(A.sum(1), dtype=float).reshape(-1)
        gram = A.dot(A.T)
        gram = gram.toarray() if sparse.issparse(gram) else np.asarray(gram)

        # 0/0 for constant rows (and n == 1) is expected and cleaned up below.
        with np.errstate(divide='ignore', invalid='ignore'):
            C = (gram - np.outer(rowsum, rowsum) / n) / (n - 1)
            d = np.diag(C)
            coeffs = np.array(C / np.sqrt(np.outer(d, d)), dtype=float)

        coeffs[~np.isfinite(coeffs)] = 0.0
        return coeffs

    @staticmethod
    def _safe_divide(num, den):
        """Element-wise ``num / den`` with 0 wherever ``den`` is 0."""
        num = np.asarray(num, dtype=float)
        den = np.broadcast_to(np.asarray(den, dtype=float), num.shape)
        out = np.zeros_like(num)
        np.divide(num, den, out=out, where=den != 0)
        return out

    def _get_predict(self):
        """Compute the dense prediction for every (row, column) pair.

        prediction[i, j] = mean_i + sum_v sim[i, v] * (X[v, j] - mean_j) / sum_v |sim[i, v]|

        where ``mean_i`` / ``mean_j`` are the row / column means over rated
        (non-zero) entries and the numerator only runs over neighbours ``v``
        that rated column ``j``.

        The earlier version masked unrated cells with ``clip(min=0)``, which
        also discarded every rating below its column mean; here the centring
        is applied to the stored ratings only, so negative deviations are kept.

        NOTE(review): the denominator still sums |sim| over all neighbours,
        not only those that rated column ``j`` — confirm this is intended.

        Returns:
            Dense ``(n_rows, n_cols)`` ndarray; ``self.X`` is left untouched.
        """
        ratings = sparse.csr_matrix(self.X, dtype=float, copy=True)
        ratings.sum_duplicates()
        ratings.eliminate_zeros()   # a stored 0 means "unrated", same as `X != 0`
        n_cols = ratings.shape[1]

        row_counts = np.diff(ratings.indptr)
        col_counts = np.bincount(ratings.indices, minlength=n_cols)
        mu_i = self._safe_divide(np.asarray(ratings.sum(axis=1)).reshape(-1), row_counts)
        mu_j = self._safe_divide(np.asarray(ratings.sum(axis=0)).reshape(-1), col_counts)

        # Centre only the stored (rated) entries; unrated cells stay exactly 0.
        ratings.data -= mu_j[ratings.indices]

        sim = np.asarray(self.similarity_matrix, dtype=float)
        # sim @ deviations, written as (deviations.T @ sim.T).T so the sparse
        # operand is on the left of the product.
        w = np.asarray(ratings.T.dot(sim.T)).T
        # Row i of `w` is weighted by sim[i, :]; for the symmetric matrix that
        # _pearson_corr returns this equals the column-wise norm used before.
        sim_norm = np.abs(sim).sum(axis=1)
        w = self._safe_divide(w, sim_norm[:, None])

        # Broadcasting replaces the explicit (n_rows, n_cols) copies of the means.
        return mu_i[:, None] + w