In [1]:
# Reload imported modules automatically before running each cell, so edits to
# the project's .py files are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import sys
# Add the parent directory to the import path so the `src` package imported
# below can be found from this notebook's folder.
sys.path.append('../')

In [3]:
from src.utils import loading, Spark
from src.memory_based import Memory_based_CF
import pyspark.ml as M
import pyspark.sql.functions as F
import pyspark.sql.types as T
from pyspark.ml.recommendation import ALS, ALSModel
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import ParamGridBuilder, TrainValidationSplit
from tqdm.notebook import tqdm
from sklearn import metrics

In [4]:
from scipy import sparse
import numpy as np
import pandas as pd

In [5]:
# Project helper from src.utils; presumably creates/returns the Spark session
# (it prints the Spark UI address) -- TODO confirm against src/utils.
spark = Spark()

Spark UI address http://127.0.0.1:4040


In [6]:
# Load the interim datasets through the project helper. The result is indexed
# by dataset name below; presumably a dict of Spark DataFrames -- TODO confirm.
datas = loading(spark, '../data/interim')

In [7]:
# Pick the train/test pair out of the loaded datasets (the key names suggest a
# 75% / 25% split).
training = datas['train_0.75_0.25']
test = datas['test_0.75_0.25']

In [11]:
# Display the DataFrame's repr to check the column names and types.
training

DataFrame[userId: string, movieId: string, rating: string]

In [12]:
class Baseline():
    """Mean-based baseline rating predictor.

    The prediction for a (user, item) pair is the average of the user's mean
    training rating and the item's mean training rating.

    Args:
        usercol (str): name of the user id column.
        itemcol (str): name of the item id column.
        ratingcol (str): name of the rating column.
        coldstart (str): what to do with pairs whose user or item was not
            present in the data given to ``fit``. ``'drop'`` (default, the
            historical behaviour) removes those pairs from the predictions.
            ``'fill'`` keeps them: a missing user mean is replaced by the
            item mean (and vice versa), and when both are missing the global
            training mean is used.

    Raises:
        ValueError: if ``coldstart`` is not ``'drop'`` or ``'fill'``.
    """

    _COLDSTART_MODES = ('drop', 'fill')

    def __init__(self, usercol='userId', itemcol='movieId', ratingcol='rating',
                 coldstart='drop'):
        if coldstart not in self._COLDSTART_MODES:
            raise ValueError(
                "coldstart must be one of {}, got {!r}".format(self._COLDSTART_MODES, coldstart))

        self.usercol = usercol
        self.itemcol = itemcol
        self.ratingcol = ratingcol
        self.coldstart = coldstart

        # Set by fit(); None means "not fitted yet".
        self.umean = None
        self.imean = None
        self.gmean = None

    def fit(self, X):
        """Compute the per-user, per-item and global mean ratings.

        Only DataFrame transformations are built here; nothing is collected
        to the driver.

        Args:
            X (Pyspark DataFrame): training set containing the user, item and
                rating columns.

        Returns:
            Baseline: ``self``, so calls can be chained.
        """
        train = self._preprocess(X)

        self.umean = train.groupby(self.usercol).agg(F.mean(self.ratingcol).alias('umean'))
        self.imean = train.groupby(self.itemcol).agg(F.mean(self.ratingcol).alias('imean'))
        # One-row DataFrame, only used as the last fallback in 'fill' mode.
        self.gmean = train.agg(F.mean(self.ratingcol).alias('gmean'))

        return self

    def predict(self, X):
        """Predict a rating for every (user, item) pair of ``X``.

        Args:
            X (Pyspark DataFrame): pairs to score; must contain the user and
                item columns. A rating column is not required.

        Returns:
            Pyspark DataFrame: columns ``usercol``, ``itemcol`` and
            ``prediction``. With ``coldstart='drop'`` pairs whose user or item
            has no training mean are absent from the result.

        Raises:
            RuntimeError: if called before ``fit``.
        """
        if self.umean is None or self.imean is None:
            raise RuntimeError('Baseline.predict() called before fit()')

        pairs = self._preprocess(X).select(self.usercol, self.itemcol)

        # Joining on the column *name* avoids ambiguous column references and
        # keeps a single copy of the key column in the result.
        how = 'inner' if self.coldstart == 'drop' else 'left'
        pred = pairs.join(self.umean, on=self.usercol, how=how) \
                    .join(self.imean, on=self.itemcol, how=how)

        if self.coldstart == 'drop':
            user_part, item_part = F.col('umean'), F.col('imean')
        else:
            # Attach the global mean to every row, then fall back
            # user mean -> item mean -> global mean (and symmetrically).
            pred = pred.crossJoin(self.gmean)
            user_part = F.coalesce(F.col('umean'), F.col('imean'), F.col('gmean'))
            item_part = F.coalesce(F.col('imean'), F.col('umean'), F.col('gmean'))

        return pred.select(F.col(self.usercol), F.col(self.itemcol),
                           ((user_part + item_part)/2).alias('prediction'))

    def _preprocess(self, _X):
        """Cast the id columns to int and, when present, the rating to float.

        Args:
            _X (Pyspark DataFrame): the training or test set.

        Returns:
            Pyspark DataFrame: only the user, item and (if it exists in
            ``_X``) rating columns, with numeric types.
        """
        columns = [F.col(c).cast('int') for c in [self.usercol, self.itemcol]]
        # The rating column is optional so that unlabeled pairs can be scored.
        if self.ratingcol in _X.columns:
            columns.append(F.col(self.ratingcol).cast('float'))
        return _X.select(columns)