# Important

Run `make scores` before executing any cell of this notebook.

# Imports

In [None]:
import pandas as pd
import seaborn as sns

# Visualization settings

In [None]:
# Global seaborn theme for every figure in this notebook: 'paper' context,
# 'ticks' style, 'muted' palette, plus rc overrides for label sizes and font.
sns.set(
    context='paper',
    style='ticks',
    palette='muted',
    font_scale=1.2,
    rc={
        "axes.labelsize": 16,
        "xtick.labelsize": 14,
        "ytick.labelsize": 14,
        "font.family": "sans-serif",
    },
)

# Accuracy

## Results

In [None]:
# Load the accuracy results table, indexed by its 'model' column.
df_accuracy = pd.read_csv(
    "../results/cf-accuracy-results.csv",
    index_col='model',
)

In [None]:
# Show the models ordered by ascending 'rmse' (lowest value first).
df_accuracy.sort_values(by='rmse', ascending=True)

## Distribution

In [None]:
# Load the test-set prediction files of the three models, in the order
# SlopeOne, KNN, SVD, and bind each one to its own DataFrame.
df_pred_so, df_pred_knn, df_pred_svd = (
    pd.read_csv(predictions_csv)
    for predictions_csv in (
        "../models/predictions/cf-results/testset/slopeone-testset-predictions.csv",
        "../models/predictions/cf-results/testset/knn-testset-predictions.csv",
        "../models/predictions/cf-results/testset/svd-testset-predictions.csv",
    )
)

In [None]:
# Add an 'err' column to each prediction table: the absolute difference
# between the estimate ('est') and the value in the 'rating' column.
df_pred_so['err'] = (df_pred_so['est'] - df_pred_so['rating']).abs()
df_pred_knn['err'] = (df_pred_knn['est'] - df_pred_knn['rating']).abs()
df_pred_svd['err'] = (df_pred_svd['est'] - df_pred_svd['rating']).abs()

Because the KNN and SVD results are very similar, the comparison below focuses on the SlopeOne and KNN results.

## Worst and best scenarios

In [None]:
# The five SlopeOne predictions with the largest absolute error
# (ascending sort, so the worst rows end up at the bottom).
df_pred_so.sort_values(by='err', ascending=True).tail(5)

In [None]:
# The five SVD predictions with the largest absolute error
# (ascending sort, so the worst rows end up at the bottom).
df_pred_svd.sort_values(by='err', ascending=True).tail(5)

In [None]:
# First five KNN rows at the positions where the SlopeOne error is >= 3.5.
# The mask comes from the SlopeOne table and is matched by row index, so this
# presumes both prediction files list the test set in the same order -- verify.
df_pred_knn[df_pred_so['err'].ge(3.5)].head(5)

In [None]:
# KNN predictions whose absolute error is >= 3.5, largest error first;
# only the top five are displayed.
(
    df_pred_knn[df_pred_knn['err'].ge(3.5)]
    .sort_values(by='err', ascending=False)
    .head(5)
)

In [None]:
# First five SlopeOne rows at the positions where the KNN error is >= 3.5.
# The mask comes from the KNN table and is matched by row index, so this
# presumes both prediction files list the test set in the same order -- verify.
df_pred_so[df_pred_knn['err'].ge(3.5)].head(5)

## Estimates distributions

In [None]:
# Summary statistics of the SlopeOne estimates.
df_pred_so['est'].describe()

In [None]:
# Summary statistics of the KNN estimates.
df_pred_knn['est'].describe()

In [None]:
# Histogram of the SlopeOne estimates; kde=False turns off the density curve.
# NOTE(review): seaborn deprecated distplot in 0.11 in favour of
# histplot/displot -- consider migrating once the binning has been checked.
so_dist_plot = sns.distplot(df_pred_so['est'], kde=False)
so_dist_plot.set(
    xlabel='Ratings estimation',
    ylabel='Frequency',
)

In [None]:
# Histogram of the KNN estimates; kde=False turns off the density curve.
# NOTE(review): seaborn deprecated distplot in 0.11 in favour of
# histplot/displot -- consider migrating once the binning has been checked.
knn_dist_plot = sns.distplot(df_pred_knn['est'], kde=False)
knn_dist_plot.set(
    xlabel='Ratings estimation',
    ylabel='Frequency',
)

In [None]:
# Summary statistics of the SlopeOne absolute errors.
df_pred_so['err'].describe()

In [None]:
# Summary statistics of the KNN absolute errors.
df_pred_knn['err'].describe()

## Neighbors requirement

In [None]:
import ast

# Each 'details' cell is expected to hold the text of a Python dict literal.
# Parse it with ast.literal_eval, which accepts literals only, instead of
# eval, which would execute any expression that ends up in the CSV file.
# The parsed dicts are then expanded into one column per key.
# NOTE(review): literal_eval rejects non-literal tokens (e.g. a bare `nan`);
# confirm the 'details' strings contain plain literals only.
k_vals = (
    df_pred_knn['details']
    .apply(lambda raw: dict(ast.literal_eval(raw)))
    .apply(pd.Series)
)

In [None]:
# Attach the expanded 'details' columns to the KNN predictions, matching the
# two tables on their row index.
df_pred_knn_full = df_pred_knn.merge(
    k_vals,
    left_index=True,
    right_index=True,
)

In [None]:
# Preview the first five rows of the merged KNN table.
df_pred_knn_full.head(5)

In [None]:
# Summary statistics of the 'actual_k' values parsed from 'details'.
k_vals.actual_k.describe()

In [None]:
# Fraction of all predictions whose 'actual_k' is below 10.
# The comparison is restricted to 'actual_k': applying `< 10` to the whole
# frame would also compare every other column expanded from 'details',
# which is meaningless for flags and can raise on text columns.
# Rows with a missing 'actual_k' count as "not below 10", and the
# denominator is still the total number of predictions.
k_vals[['actual_k']].lt(10).sum() / len(k_vals)

In [None]:
# Summary statistics of 'actual_k' over the first 1000 rows (in table order)
# whose absolute error is at least 3.
(
    df_pred_knn_full[df_pred_knn_full['err'].ge(3)]
    .head(1000)['actual_k']
    .describe()
)

# Effectiveness

In [None]:
# Load the effectiveness results table, indexed by its 'model' column.
df_eff = pd.read_csv(
    "../results/cf-effectiveness-results-n.csv",
    index_col='model',
)

In [None]:
# Effectiveness rows of every model where n equals 20.
df_eff[df_eff['n'].eq(20)]

In [None]:
# Keep only the rows indexed 'svd-predictions.csv' and reshape them to long
# form: 'n' stays as the identifier, the name of every other column goes to
# 'cols' and its value to 'vals'.
eff_over_n = (
    df_eff[df_eff.index == 'svd-predictions.csv']
    .melt(id_vars='n', var_name='cols', value_name='vals')
)

In [None]:
import matplotlib.pyplot as plt

# Plot every metric ('cols') against n for the SVD rows. The automatic legend
# is disabled so that one with readable names can be built at the end.
g = sns.catplot(x="n", y="vals", hue='cols', data=eff_over_n, aspect=1.2, legend=False)

# Thin out the x axis: keep the first tick label and every 10th one, and
# blank all the others. Labels are passed as plain strings.
for ax in g.axes.flat:
    sparse_labels = [
        tick.get_text() if (pos == 0 or (pos + 1) % 10 == 0) else ''
        for pos, tick in enumerate(ax.get_xticklabels())
    ]
    ax.set_xticklabels(sparse_labels)

g.set(xlabel='Number of recommendations', ylabel='Metric value')

# Human-readable legend entries.
# NOTE(review): these are matched to the plotted series by position, so the
# order must agree with the hue levels (presumably the column order of
# df_eff) -- verify against the CSV.
new_labels = ['Precision - to read', 'Precision - ratings', 'Recall - to read', 'Recall - ratings']

# Take the handles from the grid's own axes rather than from the variable
# left over by the loop above, and create the legend once.
handles = g.ax.get_legend_handles_labels()[0]
g.ax.legend(loc=2, handles=handles, labels=new_labels, fontsize='12')