In [1]:
import pandas as pd
import numpy as np
import random
%matplotlib inline
import matplotlib.pyplot as plt
import surprise
from collections import defaultdict
from surprise import SVD, SVDpp, NMF
from surprise import Dataset
from surprise import Reader
from surprise import evaluate, print_perf
from surprise import KNNBasic
from surprise import AlgoBase, BaselineOnly
from get_top_n import get_top_n
from model.hybrid_model import HybridModel
from model.evaluation import evaluation

In [2]:
df = pd.read_csv('sample_data.csv',index_col=0)

In [3]:
# Turn data frame into dictionary
df_records = df[['reviewerID','productID']].to_dict('records')
df_dict = defaultdict(list)
for row in df_records:
    df_dict[row['reviewerID']].append(row['productID'])

In [4]:
# Select % as holdout data
holdout = []
for reviewer in df_dict:
    hd_product = df_dict[reviewer][:max(1,int(0.25*len(df_dict[reviewer])))]
    for product in hd_product:
        holdout.append((reviewer,product))
df_tupleindex = df.set_index(['reviewerID','productID'])

In [5]:
# Develop training and test data
testdata = df_tupleindex.loc[holdout].reset_index()
traindata = df_tupleindex[~df_tupleindex.index.isin(holdout)].reset_index()

## Hybrid Model

In [6]:
# Divide training data into dense and sparse data, and handle them separately 
sparse_dt, dense_dt = HybridModel.divide_data(traindata,5)

In [7]:
sparse_dt.describe()

Unnamed: 0,rating
count,1552.0
mean,3.87049
std,1.08473
min,1.0
25%,3.0
50%,4.0
75%,5.0
max,5.0


In [8]:
dense_dt.describe()

Unnamed: 0,rating
count,3153.0
mean,3.901047
std,1.038156
min,1.0
25%,3.0
50%,4.0
75%,5.0
max,5.0


For dense data, we use a mixed hybrid model with our own Content-based model, 
along with the Matrix Factorization model, neighborhood based model we built in Part I

In [9]:
# Content-based model and its set of prediction


In [10]:
# Part I model: SVD model and its set of prediction
reader = Reader(rating_scale=(1,5))
df_surprise = Dataset.load_from_df(dense_dt[['reviewerID','productID','rating']],reader)

In [11]:
#Construct missing ratings we need to predict
dense_data_select = dense_dt.pivot_table('rating',index=['reviewerID','productID'],dropna=False)
dense_data_select = dense_data_select.loc[dense_data_select['rating'].isnull()]
dense_data_select.reset_index(inplace=True)
missing_values = dense_data_select[['reviewerID','productID']].values

In [12]:
#Function to predict missing values based on an algorithm
def mv_prediction(algo,missing_values):
    predictions = [algo.predict(uid, iid)
                       for (uid, iid) in missing_values]
    return predictions

In [13]:
algo1 = SVD()
evaluate(algo1,df_surprise,measures=['RMSE','MAE'], verbose= 0)

CaseInsensitiveDefaultDict(list,
                           {'mae': [0.7448540593841253,
                             0.70959026684826021,
                             0.70721691208136495,
                             0.78509054267732614,
                             0.69412464950708552],
                            'rmse': [0.96388088018646711,
                             0.88396764062866751,
                             0.90205229246524865,
                             0.98571383411272184,
                             0.87716995383146779]})

In [14]:
# Extract product recommendation list for each user
def extract_topk_surpise(prediction):
    topk = get_top_n(prediction)
    topk_norating = defaultdict(list)
    for user, i_r in topk.items():
        for item, rating in i_r:
            topk_norating[user].append(item)
    return topk_norating

In [15]:
# Get top k recommendation from SVD model
mv_svdprediction = mv_prediction(algo1,missing_values)
svd_topk = extract_topk_surpise(mv_svdprediction)

In [16]:
# Part I Model: KNN and its set of prediction
algo_name = KNNBasic ##  KNNWithMeans,KNNBaseline
sim_option={'name': 'cosine', ## cosine, msd, pearson, personbaseline
                 'user_based': 'False', ## False for item-based
                 'min_surpport':0 }##  if |Iuv|<min_support then sim(u,v)=0
max_k = 10 ## The (max) number of neighbors to take into account for aggregation
min_k = 7 ##  If there are not enough neighbors, the prediction is set the the global mean of all ratings
knn_default = algo_name(k = max_k, min_k = min_k, sim_options=sim_option)
#Train model
evaluate(knn_default, df_surprise, measures=['RMSE','MAE'], verbose= 1)

Evaluating RMSE, MAE of algorithm KNNBasic.

------------
Fold 1
Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 1.0384
MAE:  0.8041
------------
Fold 2
Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 0.9397
MAE:  0.7563
------------
Fold 3
Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 0.9536
MAE:  0.7537
------------
Fold 4
Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 1.0320
MAE:  0.8190
------------
Fold 5
Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 0.9461
MAE:  0.7570
------------
------------
Mean RMSE: 0.9820
Mean MAE : 0.7780
------------
------------


CaseInsensitiveDefaultDict(list,
                           {'mae': [0.8040615116117551,
                             0.7563107556383426,
                             0.75368403565055664,
                             0.8189524408330654,
                             0.75695571243776361],
                            'rmse': [1.0384211911554688,
                             0.93974853169298933,
                             0.95362582354355641,
                             1.0319986851986822,
                             0.94607092539861803]})

In [17]:
# Get top k recommendation from KNN
mv_knnprediction = mv_prediction(knn_default,missing_values)
knn_topk = extract_topk_surpise(mv_knnprediction)

For sparse data, we use a mixed hybrid model of our own Association rule and Content-based model.
We already trained the content-based model above, now we need to train the Association rule model

In [25]:
# Final set of recommendation as mixed recommendation of Content-based, SVD, KNN for dense matrix,
# and mixed recommendation of Content-based, Association rule for sparse matrix
# The recommendations are presented side-by-side to each user
sparse_prediction = HybridModel.recommendation_mixer(svd_topk,knn_topk,n=7)
dense_prediction = HybridModel.recommendation_mixer(svd_topk,knn_topk,n=7)
hybrid_prediction = HybridModel.combine_prediction(sparse_prediction,dense_prediction)
hybrid_prediction

{'A100WO06OQR8BQ': ['B004U8XT10',
  'B00DS842HS',
  'B004GWRP1A',
  'B00DUQDIMI',
  'B00BJH5A26',
  'B009WPKGQC',
  'B009M515HG'],
 'A10ZBR6O8S8OCY': ['B00DS842HS',
  'B00DUQDIMI',
  'B003EXXPWA',
  'B00DBSG6V0',
  'B00BJH5A26',
  'B00DUQDIFA',
  'B003EXVM4S'],
 'A11I1I9QLMAM1A': ['B00DS842HS',
  'B004U8XT10',
  'B004GWRP1A',
  'B003EXXPWA',
  'B003EXVM4S',
  'B00DBSG6V0',
  'B00BJH5A26'],
 'A11LNY2OLQSUSV': ['B003EXVM4S',
  'B00DS842HS',
  'B004GWRP1A',
  'B004U8XT10',
  'B003EXXPWA',
  'B00BNR7I18',
  'B00BJH5A26'],
 'A11OTLEDSW8ZXD': ['B00DS842HS',
  'B004U8XT10',
  'B00BJH5A26',
  'B00DBSG6V0',
  'B00BJH59ZO',
  'B003EXVM4S',
  'B009M515DA'],
 'A13MKSASQ6YWL7': ['B00DS842HS',
  'B003EXXPWA',
  'B004U8XT10',
  'B004GWRP1A',
  'B00DUQDIMI',
  'B00BJH59ZO',
  'B00BNR7I18'],
 'A13WOT3RSXKRD5': ['B00DS842HS',
  'B00DUQDIMI',
  'B00BJH5A26',
  'B004U8XT10',
  'B003EXVM4S',
  'B004GWRP1A',
  'B003EXXPWA'],
 'A15T9G38F589KM': ['B00DS842HS',
  'B00DUQDIMI',
  'B004GWRP1A',
  'B003EXXPWA',
 

## Evaluation

Recall at top-k:
For each user, check if the prediction contains any of products in the holdout set. If yes, we count
the prediction as a success, and a failure otherwise. Recall at top-k is measured as percentage of
users with sucessful recommendation out of total number of users. This measurement is based on the 
same idea as in this paper: https://arxiv.org/pdf/1703.02344.pdf

In [26]:
# Recall at top k
evaluation.recall_at_topk(hybrid_prediction,testdata)

0.28413284132841327

In [27]:
# Coverage ratio
# Coverage ratio is measured as number of products recommended over total number of products
evaluation.coverage_ratio(hybrid_prediction,df)


0.4714285714285714