# Evaluation Execution Notebook

## Import Libraries

In [1]:
%load_ext autoreload
%autoreload 2 
import sys
sys.path.append('../')

import pyspark.sql.functions as F
import pyspark.sql.types as T
import pandas as pd
import numpy as np
import time
from src.utils import loading, Spark
from src.evaluation import Evaluator, Cross_validate_als
from src.model_based import Als
from src.baseline import Baseline
from src.memory_based import Memory_based_CF
from src.model_based import Als

# Create the Spark session through the project's `Spark` helper (src.utils).
# The same `spark` object is handed to the data loader and to the
# memory-based estimators further down.
# NOTE(review): the recorded output suggests the helper prints the Spark UI
# address on start-up -- confirm in src.utils.
spark = Spark()

Spark UI address http://127.0.0.1:4040


## Loading Split Datasets

In [2]:
# Identifiers of the splits to evaluate; each one selects the
# 'train_<id>' / 'test_<id>' pair from the loaded data.
# NOTE(review): the ids look like train/test fractions -- confirm.
splits = ['0.75_0.25', '0.5_0.5', '0.25_0.75']

# Load every interim dataset and list the keys that are available.
data = loading(spark, '../data/interim')
print([*data.keys()])

['train_0.5_0.5', 'train_0.25_0.75', 'test_0.5_0.5', 'test_0.25_0.75', 'train_0.75_0.25', 'test_0.75_0.25']


## Declaring Evaluators (Metrics)

In [3]:
# Metric label -> configured Evaluator. The labels become the row index of
# the result tables built by `evaluate`.
# NOTE(review): 'converage_k' looks like a misspelling of 'coverage_k'; it is
# kept verbatim because the string is interpreted by src.evaluation.Evaluator
# -- confirm there before renaming.
evaluators = {
    'rmse': Evaluator(metrics='rmse'),
    'accuracy': Evaluator(metrics='accuracy'),
    'coverage_user': Evaluator(
        metrics='converage_k',
        ratingCol='rating',
        predCol='prediction',
        idCol='userId',
        k=10,
    ),
    'coverage_item': Evaluator(
        metrics='converage_k',
        ratingCol='rating',
        predCol='prediction',
        idCol='movieId',
        k=100,
    ),
}

In [4]:
# build evaluation pipeline
def evaluate(train, test, evaluators, model):
    """Fit ``model`` on ``train`` and score its predictions on both datasets.

    Parameters
    ----------
    train, test
        Datasets passed unchanged to ``model.fit`` / ``model.predict``.
    evaluators : dict
        Maps a metric label to an object exposing ``evaluate(predictions)``.
    model
        Estimator exposing ``fit(train)`` and ``predict(dataset)``.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.Series)
        A frame indexed by metric label with ``'train'`` and ``'test'``
        columns, and a series holding the elapsed seconds under
        ``'training time'``, ``'inference train time'`` and
        ``'inference test time'``.
    """
    def _timed(func, *args):
        # perf_counter is monotonic and high-resolution, so the measured
        # duration cannot be skewed (or go negative) by wall-clock changes
        # the way a time.time() difference can.
        begin = time.perf_counter()
        output = func(*args)
        return output, time.perf_counter() - begin

    _, training_time = _timed(model.fit, train)
    # NOTE(review): if `predict` returns a lazily evaluated Spark DataFrame,
    # these two timings cover only the call itself, not the later
    # computation triggered by the evaluators -- confirm.
    train_pred, inference_train = _timed(model.predict, train)
    test_pred, inference_test = _timed(model.predict, test)

    # One row per metric, pre-filled with zeros and overwritten below.
    res = pd.DataFrame(np.zeros((len(evaluators), 2)),
                       columns=['train', 'test'],
                       index=list(evaluators))
    for label, evaluator in evaluators.items():
        res.loc[label, 'train'] = evaluator.evaluate(train_pred)
        res.loc[label, 'test'] = evaluator.evaluate(test_pred)

    timings = pd.Series({'training time': training_time,
                         'inference train time': inference_train,
                         'inference test time': inference_test})
    return res, timings

def evaluate_pipeline(data, splits, model):
    """Run ``evaluate`` for ``model`` on every requested split.

    Uses the notebook-level ``evaluators`` mapping and ``evaluate`` function.

    Parameters
    ----------
    data : mapping
        Must provide ``'train_<split>'`` and ``'test_<split>'`` entries for
        every id in ``splits``.
    splits : iterable of str
        Split identifiers, processed in the given order.
    model
        Estimator forwarded to ``evaluate``.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        The per-split metric tables stacked on top of each other with an
        extra ``'split'`` column, and a timing table with one column per
        split.
    """
    splits = list(splits)
    metric_frames = []
    timings = {}  # named `timings` so the `time` module is not shadowed
    for split in splits:
        train, test = data['train_' + split], data['test_' + split]
        res, timings[split] = evaluate(train, test, evaluators, model)
        # assign() returns a tagged copy instead of mutating `res` in place.
        metric_frames.append(res.assign(split=split))

    if metric_frames:
        # Single concat: DataFrame.append no longer exists in pandas >= 2.0,
        # and growing a frame inside a loop is quadratic anyway.
        result = pd.concat(metric_frames)
    else:
        # pd.concat rejects an empty list; keep the empty three-column frame.
        result = pd.DataFrame(columns=['train', 'test', 'split'])
    return result, pd.DataFrame(timings, columns=splits)

## Declaring Estimators

In [5]:
# One estimator per recommender family, all configured with the same
# user / item / rating column names.
baseline = Baseline(
    usercol='userId',
    itemcol='movieId',
    ratingcol='rating',
)
userbased = Memory_based_CF(
    spark,
    base='user',
    usercol='userId',
    itemcol='movieId',
    ratingcol='rating',
)
itembased = Memory_based_CF(
    spark,
    base='item',
    usercol='userId',
    itemcol='movieId',
    ratingcol='rating',
)
# Fixed seed for the ALS estimator.
modelbased = Als(
    userCol='userId',
    itemCol='movieId',
    ratingCol='rating',
    regParam=0.15,
    seed=0,
    rank=10,
)

## Running the Evaluation Pipeline

In [6]:
%%time
# Run the baseline estimator over every split; evaluate_pipeline returns the
# stacked metric table and the per-split timing table.
baseline_res, baseline_time_res = evaluate_pipeline(data, splits, baseline)

CPU times: user 119 ms, sys: 42 ms, total: 161 ms
Wall time: 31.3 s


In [7]:
%%time
# Run the user-based memory CF estimator over every split; evaluate_pipeline
# returns the stacked metric table and the per-split timing table.
userbased_res, userbased_time_res = evaluate_pipeline(data, splits, userbased)

CPU times: user 4min 5s, sys: 37.1 s, total: 4min 43s
Wall time: 2min 51s


In [8]:
%%time
# Run the item-based memory CF estimator over every split; evaluate_pipeline
# returns the stacked metric table and the per-split timing table.
itembased_res, itembased_time_res = evaluate_pipeline(data, splits, itembased)

CPU times: user 58.2 s, sys: 2.56 s, total: 1min
Wall time: 1min 9s


In [9]:
%%time
# Run the ALS estimator over every split; evaluate_pipeline returns the
# stacked metric table and the per-split timing table.
modelbased_res, modelbased_time_res = evaluate_pipeline(data, splits, modelbased)

CPU times: user 132 ms, sys: 46.3 ms, total: 178 ms
Wall time: 39.4 s


## Saving the Results

In [10]:
# Persist every estimator's metric table, then every timing table, as CSV.
# Header row and index labels are written explicitly for each file.
metric_outputs = [
    (baseline_res, '../data/processed/baseline_res.csv'),
    (userbased_res, '../data/processed/userbased_res.csv'),
    (itembased_res, '../data/processed/itembased_res.csv'),
    (modelbased_res, '../data/processed/modelbased_res.csv'),
]
timing_outputs = [
    (baseline_time_res, '../data/processed/baseline_time_res.csv'),
    (userbased_time_res, '../data/processed/userbased_time_res.csv'),
    (itembased_time_res, '../data/processed/itembased_time_res.csv'),
    (modelbased_time_res, '../data/processed/modelbased_time_res.csv'),
]

for frame, path in metric_outputs + timing_outputs:
    frame.to_csv(path, header=True, index=True)