In [2]:
import pandas as pd
import numpy as np
from collections import defaultdict

In [6]:
class HybridModel:
    """
    Inputs: data frame of user-item rating pair
    Output: ranking of recommended items

    All methods are stateless helpers and are exposed as static methods, so
    they can be called either on the class or on an instance.
    """

    @staticmethod
    def divide_data(df, threshold):
        """
        Split a ratings frame into users with few vs. many ratings.

        Inputs:
            df: data frame with at least 'reviewerID' and 'rating' columns
            threshold: users with strictly fewer than `threshold` counted
                ratings go to the sparse group, all others to the dense group
        Outputs:
            (sparse_df, dense_df): the rows of `df` belonging to each group
        """
        # Number of non-null ratings per reviewer, indexed by reviewerID.
        rating_count = df.pivot_table('rating', index=['reviewerID'], dropna=False, aggfunc='count')

        # Read the reviewer ids straight off the index instead of mutating a
        # slice of `rating_count` in place with reset_index(inplace=True).
        sparse_users = rating_count.loc[rating_count['rating'] < threshold].index
        dense_users = rating_count.loc[rating_count['rating'] >= threshold].index

        sparse_df = df.loc[df['reviewerID'].isin(sparse_users)]
        dense_df = df.loc[df['reviewerID'].isin(dense_users)]

        return sparse_df, dense_df

    @staticmethod
    def _round_robin(ranked_lists, n):
        """Interleave ranked lists rank by rank, skipping duplicates, up to n items."""
        merged = []
        depth = max((len(ranked) for ranked in ranked_lists), default=0)
        for rank in range(depth):
            for ranked in ranked_lists:
                if len(merged) >= n:
                    return merged
                # A shorter list simply sits out the remaining rounds; a
                # duplicate is skipped but still uses up that list's turn.
                if rank < len(ranked) and ranked[rank] not in merged:
                    merged.append(ranked[rank])
        return merged

    @staticmethod
    def recommendation_mixer(prediction1, prediction2, prediction3 = None, n = 6):
        """
        Inputs: predictions from sub-models. Each is a mapping from user to a
            ranked list of items; prediction3 is optional. n is the maximum
            number of items to return per user.
        Outputs: final list of recommended products, as a defaultdict(list)
            keyed by every user of prediction1.

        Items are taken in turn from prediction1, prediction2 (and
        prediction3), one rank at a time, dropping items already selected.
        If the sub-models run out of candidates before n unique items are
        found, the user gets a shorter list instead of an IndexError. A user
        absent from one of the other sub-models is treated as having no
        candidates from that sub-model.
        """
        prediction_list = [prediction1, prediction2]
        if prediction3 is not None:
            prediction_list.append(prediction3)

        predictionF = defaultdict(list)
        for user in prediction1:
            ranked_lists = [
                prediction[user] if user in prediction else []
                for prediction in prediction_list
            ]
            predictionF[user] = HybridModel._round_robin(ranked_lists, n)

        return predictionF

    @staticmethod
    def recombine_data(sparse_prediction,dense_prediction):
        """
        Combine prediction of 2 cases: sparse and dense

        Returns a new plain dict; if a user appears in both inputs, the
        dense prediction wins.
        """
        final_prediction = {**sparse_prediction,**dense_prediction}
        return final_prediction

In [11]:
# Toy ranked recommendations keyed by user id.
# prediction1-3 cover users 1 and 2; prediction4-5 cover users 3 and 4.
prediction1 = {
    1: ['a', 'b', 'c', 'd'],
    2: ['d', 'm', 'n', 'x'],
}
prediction2 = {
    1: ['a', 'c', 'e', 'h'],
    2: ['m', 'n', 'q', 'y'],
}
prediction3 = {
    1: ['a', 'c', 'b', 'f'],
    2: ['d', 'n', 'q', 'z'],
}
prediction4 = {
    3: ['o', 'p', 'k', 'l'],
    4: ['r', 's', 't', 'u'],
}
prediction5 = {
    3: ['v', 'w', 'k', 'y'],
    4: ['z', 'f', 'w', 'o'],
}

In [12]:
# Blend three sub-model outputs for users 1-2 and two sub-model outputs for users 3-4.
dense_p = HybridModel.recommendation_mixer(
    prediction1, prediction2, prediction3=prediction3
)
sparse_p = HybridModel.recommendation_mixer(prediction4, prediction5)

In [13]:
# Merge the two per-user prediction dicts and print the combined result.
print(
    HybridModel.recombine_data(
        sparse_prediction=sparse_p,
        dense_prediction=dense_p,
    )
)

{3: ['o', 'v', 'p', 'w', 'k', 'l'], 4: ['r', 'z', 's', 'f', 't', 'w'], 1: ['a', 'b', 'c', 'e', 'd', 'h'], 2: ['d', 'm', 'n', 'q', 'x', 'y']}


In [38]:
# Load the sample ratings (first CSV column is the index) and split users
# at 20 ratings: fewer than 20 -> sparse group, 20 or more -> dense group.
df = pd.read_csv('sample_data.csv', index_col=0)
sparsedf, densedf = HybridModel.divide_data(df, threshold=20)

In [63]:
# Display summary statistics for the sparse-user subset.
sparsedf.describe()

Unnamed: 0,rating
count,4442.0
mean,3.947771
std,1.056167
min,1.0
25%,3.0
50%,4.0
75%,5.0
max,5.0


In [42]:
class evaluation:
    """
    Some ratios to measure performance of recommendation models

    All methods are stateless and exposed as static methods.
    """

    @staticmethod
    def recall_at_topk(prediction,holdout):
        """
        For each user, check if the prediction contains any of products in the holdout set. If yes, we count
        the prediction as a success, and a failure otherwise. Recall at top-k is measured as percentage of
        users with successful recommendation out of total number of users. This measurement is based on the
        same idea as in this paper: https://arxiv.org/pdf/1703.02344.pdf

        Inputs:
            prediction: mapping from user to a list of recommended products
            holdout: either a data frame with 'reviewerID' and 'productID'
                columns, or a dict mapping user to an iterable of held-out
                products
        Outputs:
            fraction of users in `prediction` with at least one hit, as a
            float; 0.0 when `prediction` is empty
        """
        # Normalise the holdout set to {user: set(products)} for fast lookups.
        if isinstance(holdout, dict):
            holdout_items = {user: set(items) for user, items in holdout.items()}
        else:
            holdout_items = {}
            for reviewer, product in zip(holdout['reviewerID'], holdout['productID']):
                holdout_items.setdefault(reviewer, set()).add(product)

        # No users to score: report 0.0 rather than dividing by zero.
        if not prediction:
            return 0.0

        # .get() keeps users without holdout rows from being added to the map.
        success_count = sum(
            1
            for user, items in prediction.items()
            if any(item in holdout_items.get(user, ()) for item in items)
        )
        return float(success_count)/float(len(prediction))

    @staticmethod
    def coverage_ratio(prediction,dataset):
        """
        Coverage ratio is measured as number of products recommended over total number of products

        Inputs:
            prediction: mapping from user to a list of recommended products
            dataset: data frame (or mapping) whose 'productID' entry lists
                the products of the catalogue
        Outputs:
            number of distinct recommended products divided by the number of
            distinct products in `dataset`; 0.0 when the catalogue is empty.
            Recommended products are not checked against the catalogue, so
            the ratio can exceed 1.
        """
        recommended_product = set()
        for items in prediction.values():
            recommended_product.update(items)

        catalogue = set(dataset['productID'])
        # Empty catalogue: report 0.0 rather than dividing by zero.
        if not catalogue:
            return 0.0
        return float(len(recommended_product))/float(len(catalogue))

In [34]:
# Toy held-out products per user id.
holdout = {
    1: ['a', 'g', 'c'],
    2: ['a', 'f', 'd'],
}

In [43]:
# NOTE(review): the ratings frame `df` is passed as the holdout set here; the
# toy `holdout` dict defined in the preceding cell is not used - confirm intent.
evaluation.recall_at_topk(prediction=prediction1, holdout=df)

0.0

In [39]:
# Share of the products in `df` covered by the toy recommendations.
evaluation.coverage_ratio(prediction=prediction1, dataset=df)

0.1