# Coverage Execution

## Loading libraries

In [1]:
%load_ext autoreload
%autoreload 2 
import sys
sys.path.append('../')

import pyspark.sql.functions as F
import pyspark.sql.types as T
import pyspark.sql.window as W
import pandas as pd
import numpy as np
import time
from tqdm.notebook import tqdm
from src.utils import loading, Spark
from src.model_based import Als
from src.baseline import Baseline
from src.memory_based import Memory_based_CF
from src.model_based import Als

# Create the Spark session through the project's `Spark` helper (imported from
# src.utils above). The resulting `spark` object is handed to `loading` and to
# the memory-based recommenders in later cells.
# NOTE(review): the "Spark UI address" line in this cell's output presumably
# comes from that helper -- confirm in src/utils.
spark = Spark()

Spark UI address http://127.0.0.1:4041


In [2]:
# Load the raw data through the project's `loading` helper and keep only the
# 'sample' entry, which is the dataset used by the rest of this notebook.
datas = loading(spark, '../data/raw')
sample = datas['sample']

# Catalogue size: the number of distinct movies present in the sample. It is the
# denominator used when item counts are turned into coverage ratios.
distinct_movies = sample.select('movieId').distinct()
num_items = distinct_movies.count()

## Training Recommender Systems

In [3]:
# Column names shared by the baseline and the memory-based recommenders.
column_args = dict(usercol='userId', itemcol='movieId', ratingcol='rating')

baseline = Baseline(**column_args, make_recommend=True)
userbased = Memory_based_CF(spark, base='user', **column_args, make_recommend=True)
itembased = Memory_based_CF(spark, base='item', **column_args, make_recommend=True)

# The ALS wrapper spells its column arguments in camelCase, so they are passed
# explicitly rather than through `column_args`.
modelbased = Als(
    userCol='userId',
    itemCol='movieId',
    ratingCol='rating',
    regParam=.15,
    seed=0,
    rank=10,
    make_recommend=True,
)

In [4]:
# Fit every recommender on the same sample so that their coverage figures are
# computed from identical training data.
models_to_fit = (baseline, userbased, itembased, modelbased)
for recommender in models_to_fit:
    recommender.fit(sample)

## Computing Coverage

In [5]:
def compute_coverage(k, model):
    """Count the distinct movies found in a model's recommendations for ``k``.

    Args:
        k: Value forwarded unchanged to ``model.recommend`` (presumably the
            number of items recommended per user -- confirm against the
            recommender classes).
        model: Recommender object exposing ``recommend(k)``. The returned
            object must support ``select('movieId')``, ``distinct()`` and
            ``count()``.

    Returns:
        The result of ``count()`` over the distinct ``movieId`` values, i.e.
        how many different movies appear in the recommendations.
    """
    recommendations = model.recommend(k)
    recommended_movies = recommendations.select('movieId').distinct()
    return recommended_movies.count()

In [7]:
# Recommenders under evaluation, keyed by the column name they get in the
# results table (insertion order fixes the column order).
models_by_name = {
    'baseline': baseline,
    'userbased': userbased,
    'itembased': itembased,
    'modelbased': modelbased,
}

# One list of distinct-item counts per recommender; position i holds the count
# obtained for k = i + 1.
coverage_counts = {name: [] for name in models_by_name}
for k in tqdm(range(1, 101)):
    for name, recommender in models_by_name.items():
        coverage_counts[name].append(compute_coverage(k, recommender))

# Divide the counts by the number of distinct movies in the sample so each
# value becomes a coverage ratio.
result = pd.DataFrame(coverage_counts) / num_items

HBox(children=(HTML(value=''), FloatProgress(value=0.0), HTML(value='')))




In [11]:
# Persist the coverage table as CSV. The row index is not written, so row order
# is the only record of which k each row belongs to.
coverage_csv_path = '../data/processed/evaluation_result/coverage.csv'
result.to_csv(coverage_csv_path, index=False, header=True)