In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# project modules (e.g. src.utils) are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import sys

# Make the repository root importable so `src.*` resolves from this notebook.
# The membership check keeps the cell idempotent: re-running it no longer
# appends a duplicate '../' entry to sys.path each time.
# NOTE(review): '../' is resolved against the kernel's working directory,
# so this assumes the notebook is launched from its own folder.
if '../' not in sys.path:
    sys.path.append('../')

In [3]:
from src.utils import loading, Spark
import pyspark.ml as M
import pyspark.sql.functions as F
import pyspark.sql.types as T
from pyspark.ml.recommendation import ALS, ALSModel
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit
from tqdm.notebook import tqdm

In [4]:
from scipy import sparse
import numpy as np
import pandas as pd

In [5]:
# Create the Spark handle through the project helper imported from src.utils.
# NOTE(review): presumably wraps a SparkSession (it prints the Spark UI
# address below) -- confirm in src/utils.
spark = Spark()

Spark UI address http://127.0.0.1:4040


In [6]:
# Load the interim datasets with the project helper `loading`.
# The result is indexed by split name in the next cell
# (e.g. 'train_0.75_0.25', 'test_0.75_0.25').
datas = loading(spark, '../data/interim')

In [7]:
def cast_int(df):
    """Project `df` onto (userId, movieId, rating) with numeric dtypes.

    userId and movieId are cast to int and rating to float; every other
    column of `df` is dropped by the select.
    """
    id_columns = [F.col(name).cast('int') for name in ('userId', 'movieId')]
    rating_column = F.col('rating').cast('float')
    return df.select(id_columns + [rating_column])


# Typed views of the train / test splits.
training = cast_int(datas['train_0.75_0.25'])
test = cast_int(datas['test_0.75_0.25'])

In [8]:
class indexTransformer():
    """Map raw user / item ids to integer indices using Spark StringIndexers.

    Two `StringIndexer`s are fitted, one on `usercol` and one on `itemcol`;
    `transform` appends `<usercol>_idx` and `<itemcol>_idx` columns and sorts
    the result by them. Both indexers use ``handleInvalid='skip'``, so rows
    whose user or item was not seen during `fit` are dropped by `transform`
    (see the Spark StringIndexer documentation).

    Parameters
    ----------
    usercol : str
        Name of the user id column.
    itemcol : str
        Name of the item id column.
    cast_all_to_int : bool, default True
        When True (the historical behaviour) *every* column of the
        transformed frame is cast to int.
        NOTE(review): this also truncates a float ``rating`` column
        (3.5 -> 3). Pass False to cast only the id and index columns and
        leave all other columns with their original dtype. The default is
        kept for backward compatibility because downstream cells may rely
        on an all-integer frame.

    Attributes
    ----------
    u_indxer, i_indxer
        The unfitted estimators before `fit`, the fitted models afterwards.
    X
        The frame passed to the most recent `fit` call (None before).
    """

    def __init__(self, usercol='userId', itemcol='movieId', cast_all_to_int=True):
        self.usercol = usercol
        self.itemcol = itemcol
        self.cast_all_to_int = cast_all_to_int
        # The estimators are kept separately from the fitted models so that
        # `fit` can be called more than once (e.g. when the cell is re-run).
        self._u_estimator = M.feature.StringIndexer(inputCol=usercol,
                                                    outputCol=usercol+'_idx',
                                                    handleInvalid='skip')
        self._i_estimator = M.feature.StringIndexer(inputCol=itemcol,
                                                    outputCol=itemcol+'_idx',
                                                    handleInvalid='skip')
        # Public attributes keep their original meaning: estimator until
        # `fit` is called, fitted model afterwards.
        self.u_indxer = self._u_estimator
        self.i_indxer = self._i_estimator
        self.X = None

    def fit(self, X):
        """Fit both indexers on `X` and return `self` (allows chaining)."""
        self.X = X
        # Always fit from the pristine estimators; previously the estimators
        # were overwritten by their models, so a second `fit` failed.
        self.u_indxer = self._u_estimator.fit(X)
        self.i_indxer = self._i_estimator.fit(X)
        return self

    def transform(self, X):
        """Append the index columns to `X`, cast, and sort by (user, item) index."""
        X_ = self.u_indxer.transform(X)
        X_ = self.i_indxer.transform(X_)
        return self._cast_int(X_).orderBy([self.usercol+'_idx', self.itemcol+'_idx'])

    def fit_transform(self, X):
        """Fit on `X`, then return `transform(X)`."""
        self.fit(X)
        return self.transform(X)

    def _cast_int(self, X):
        """Cast columns of `X` to int, keeping column order.

        All columns are cast when `cast_all_to_int` is True; otherwise only
        the id columns and their `_idx` counterparts are cast.
        """
        if getattr(self, 'cast_all_to_int', True):
            int_cols = set(X.columns)
        else:
            int_cols = {self.usercol, self.itemcol,
                        self.usercol+'_idx', self.itemcol+'_idx'}
        return X.select([F.col(c).cast('int') if c in int_cols else F.col(c)
                         for c in X.columns])

In [9]:
# Learn the user / item index mapping from the training split only, then
# apply that same mapping to both splits. The indexers are configured with
# handleInvalid='skip', so the test split keeps only known users and items.
# NOTE(review): `training` and `test` are rebound in place, so this cell is
# not idempotent -- re-run cell 7 before re-running this one.
idxer = indexTransformer()
idxer.fit(training)
training = idxer.transform(training)
test = idxer.transform(test)

In [10]:
from pyspark.mllib.linalg.distributed import CoordinateMatrix, MatrixEntry

In [11]:
from scipy import stats

In [12]:
# Collect the (user index, item index, rating) triplets to the driver as a
# plain NumPy array with one row per rating.
X_train = (
    training
    .select('userId_idx', 'movieId_idx', 'rating')
    .toPandas()
    .values
)

In [13]:
# Split the three triplet columns: user index, item index, rating value.
# Unpacking the transpose yields the same column views as X_train[:, k].
row, col, data = X_train.T

In [14]:
# Build the user x item rating matrix in CSR form from the COO triplets
# (rows come from userId_idx, columns from movieId_idx).
# NOTE(review): `X_train` is rebound here from the dense triplet array to the
# sparse matrix, so cells 12-13 must be re-run before re-running this one.
X_train = sparse.csr_matrix((data, (row, col)))

In [15]:
def pearson_corr(A):
    """Pairwise Pearson correlation between the rows of `A`.

    Every row is treated as one variable observed over all ``A.shape[1]``
    columns. For a sparse matrix this means the implicit zeros count as
    observations too (an unrated item is treated as a rating of 0).

    Parameters
    ----------
    A : scipy sparse matrix, np.matrix or 2-D np.ndarray, shape (m, n)

    Returns
    -------
    (m, m) dense correlation matrix. It is an ``np.matrix`` when ``A.sum(1)``
    yields one (scipy sparse matrices, ``np.matrix`` input) and a plain
    ``np.ndarray`` otherwise. Entries involving a zero-variance row
    (including that row's diagonal entry) are 0; all values lie in [-1, 1].

    Notes
    -----
    The result is dense, so memory grows as m**2.
    """
    n = A.shape[1]

    raw_rowsum = A.sum(1)
    # Remember the container flavour so callers relying on np.matrix
    # semantics (e.g. `*` as matrix product) keep getting an np.matrix.
    return_matrix = isinstance(raw_rowsum, np.matrix)
    # Force a column vector: for a plain ndarray `A.sum(1)` is 1-D and
    # `rowsum.dot(rowsum.T)` would collapse to a scalar instead of an
    # (m, m) outer product.
    rowsum = np.asarray(raw_rowsum, dtype=float).reshape(-1, 1)

    gram = A.dot(A.T)
    if hasattr(gram, 'toarray'):  # sparse product -> dense
        gram = gram.toarray()
    gram = np.asarray(gram, dtype=float)

    # Zero-variance rows produce 0/0 below; silence the warnings and map
    # those entries to 0 explicitly instead of relying on nan_to_num (which
    # would also turn a stray +/-inf into ~1.8e308).
    with np.errstate(divide='ignore', invalid='ignore'):
        # Unnormalised covariance. The usual 1/(n-1) factor cancels in the
        # correlation ratio, so it is omitted (also avoids /0 when n == 1).
        centered = gram - rowsum.dot(rowsum.T) / n
        # Variances; clamp tiny negative round-off to zero.
        d = np.clip(np.diag(centered), 0.0, None)
        denom = np.sqrt(np.outer(d, d))
        coeffs = np.where(denom > 0, centered / denom, 0.0)

    # Guard against round-off pushing values slightly outside [-1, 1].
    coeffs = np.clip(coeffs, -1.0, 1.0)
    return np.asmatrix(coeffs) if return_matrix else coeffs

In [17]:
# User-user similarity: rows of X_train are user indices, so this is a dense
# (n_users x n_users) matrix of pairwise Pearson correlations.
sim = pearson_corr(X_train)

  coeffs = C / np.sqrt(np.outer(d, d))


In [18]:
# Inspect the similarity matrix (NumPy abbreviates the repr of large arrays).
sim

matrix([[ 1.00000000e+00, -2.00924617e-02,  4.46350065e-02, ...,
         -4.53758402e-03, -4.53758402e-03, -4.53758402e-03],
        [-2.00924617e-02,  1.00000000e+00, -1.77460937e-02, ...,
         -4.02180630e-03, -4.02180630e-03, -4.02180630e-03],
        [ 4.46350065e-02, -1.77460937e-02,  1.00000000e+00, ...,
         -4.00769168e-03, -4.00769168e-03, -4.00769168e-03],
        ...,
        [-4.53758402e-03, -4.02180630e-03, -4.00769168e-03, ...,
          1.00000000e+00, -9.08265213e-04, -9.08265213e-04],
        [-4.53758402e-03, -4.02180630e-03, -4.00769168e-03, ...,
         -9.08265213e-04,  1.00000000e+00, -9.08265213e-04],
        [-4.53758402e-03, -4.02180630e-03, -4.00769168e-03, ...,
         -9.08265213e-04, -9.08265213e-04,  1.00000000e+00]])