In [1]:
from collections import Counter
import warnings

import numpy as np
import pandas as pd
from pandarallel import pandarallel
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import pairwise_distances
from tqdm import tqdm

from src.models import cf

pandarallel.initialize()
tqdm.pandas()
warnings.filterwarnings('ignore')

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.




# Load Data

In [2]:
# global variables
DATA_PATH = "data/evaluation"
CATEGORY = "Pet_Supplies"

# load train dataset
train = pd.read_csv(f"{DATA_PATH}/{CATEGORY}_train.csv")

In [3]:
# checking train dataframe
train.head()

Unnamed: 0,index,asin,title,categories,reviewerID,overall,reviewText,reviewTime,processedReviewText
0,0,1223000893,"Cat Sitter DVD Trilogy - Vol 1, Vol 2 and Vol 3",[],A14CK12J7C7JRK,3.0,I purchased the Trilogy with hoping my two cat...,2011-01-12,purchase trilogy hop cat age interested yr old...
1,2,1223000893,"Cat Sitter DVD Trilogy - Vol 1, Vol 2 and Vol 3",[],A2CR37UY3VR7BN,4.0,I bought the triliogy and have tested out all ...,2012-12-19,buy triliogy test dvd appear volume receive re...
2,3,1223000893,"Cat Sitter DVD Trilogy - Vol 1, Vol 2 and Vol 3",[],A2A4COGL9VW2HY,4.0,My female kitty could care less about these vi...,2011-05-12,female kitty care video care little male dig a...
3,4,1223000893,"Cat Sitter DVD Trilogy - Vol 1, Vol 2 and Vol 3",[],A2UBQA85NIGLHA,3.0,"If I had gotten just volume two, I would have ...",2012-03-05,volume star trilogy star read review know vol ...
4,5,B00005MF9U,LitterMaid LM900 Mega Self-Cleaning Litter Box,"['Pet Supplies', 'Cats', 'Litter &amp; Housebr...",A2BH04B9G9LOYA,1.0,"First off, it seems that someone is spamming t...",2006-12-31,spamming review glow reviewer review amazon ba...


In [4]:
# get user rating history
train_user_rating_history = train.groupby(["reviewerID"])["asin"].progress_apply(list)
print(train_user_rating_history)

100%|██████████████████████████████████████████████████████| 19058/19058 [00:00<00:00, 63565.27it/s]


reviewerID
A04173782GDZSQ91AJ7OD                 [B0002AT464, B0002AT464, B00078ZK2S]
A042274212BJJVOBS4Q85                 [B001AT9B8M, B001E8LD3K, B00II7195M]
A0436342QLT4257JODYJ     [B000255NCI, B000255NCI, B000255O90, B000255O9...
A04795073FIBKY8GSLZYI                             [B000O39TDC, B000O39TE6]
A06658082A27F4VB5UG8E                             [B0006MU8WC, B000FS4OYA]
                                               ...                        
AZYJE40XW6MFG                                     [B0002IEYIE, B0002IEYIE]
AZZ56WF4X19G2                                                 [B0010P0YSW]
AZZNK89PXD006            [B0002DHV16, B0006N9D4A, B0018CIPS8, B001Q9EGK...
AZZV9PDNMCOZW                         [B0006N9LN8, B004PU7SBU, B00DIIKLCI]
AZZYW4YOE1B6E            [B0002ARQV4, B0002H3R2E, B0002H3R2E, B000MLG4K...
Name: asin, Length: 19058, dtype: object


# Utility Functions

In [5]:
def get_top_n(predictions: dict, user_rating_history: pd.DataFrame, n: int=10) -> dict:
    """Return the top-N recommendations for each user based on cosine similarity.
    
    Args:
    
    Returns:
        ([dict]): A dictionary of top-N recommendations for each unique user, sorted by
            cosine similarties.
    """
    
    # retrieve a 200 items candidate list based on similarities
    top_ns = {}
    for user in predictions:
        top_ns[user] = predictions[user][:n]
        
    return top_ns

def recall_at_k(asins, predicted_asins, k=10):
    # number of relevant items
    set_actual = set(asins)
    set_preds = set(predicted_asins)
    num_relevant = len(set_actual.intersection(set_preds))
    
    # calculating recall@K - relevant / total relevant items
    recall_at_k = num_relevant / len(asins)
    
    return recall_at_k

def novelty_at_k(item_popularity, predicted_asins, k=10):
    """
    """
    # finding avg novelty
    popularity_sum = item_popularity.loc[predicted_asins].sum()
    novelty_at_k = ((k*1) - popularity_sum) / k
    
    return novelty_at_k


def generate_item_popularity(train: pd.DataFrame) -> pd.DataFrame:
    """
    """
    
    # create a mapping of item popularatity
    # based on sum(item's review / max reviews) / no items
    max_reviews = (train.groupby(['asin'])
                   .agg({'processedReviewText': 'count'})
                   .max()
                   .values[0])
    item_popularity = (train.groupby(['asin'])
                       .agg({'processedReviewText': 'count'})
                       .apply(lambda x: x/max_reviews))
    
    return item_popularity
    

def evaluate_recommendations(top_ns: dict, user_rating_history: pd.DataFrame, item_popularity: pd.DataFrame, k=10) -> pd.DataFrame:
    """
    
    Args:
        top_ns
        user_rating_history
    """
    
    test_recommendations = pd.DataFrame(top_ns.items(), columns=["reviewerID", "pred_asin"])
    
    # combined test history and recommendations
    test_merged = pd.merge(user_rating_history, test_recommendations, on="reviewerID", how="inner")
    
    # generating recall@k metrics
    test_merged["recall@k"] = test_merged.apply(lambda x: recall_at_k(x.asin, x.pred_asin, k=k), axis=1)
    test_merged["novelty@k"] = test_merged.apply(lambda x: novelty_at_k(item_popularity, x.pred_asin, k=k), axis=1)
    average_recall_at_k = test_merged["recall@k"].mean()
    average_novelty_at_k = test_merged["novelty@k"].mean()
    
    print(f"The MEM-ECF has an average recall@{k}: {average_recall_at_k:.5f}, average novelty@{k}: {average_novelty_at_k:.5f}")
    
    return test_merged

# Generate N-Recommendations = {10, 25, 30, 45}

## Load Test Data

In [6]:
# loading test dataset
test = pd.read_csv(f"{DATA_PATH}/{CATEGORY}_test.csv")

In [7]:
test.head().append(test.tail())

Unnamed: 0,index,asin,title,categories,reviewerID,overall,reviewText,reviewTime,processedReviewText
0,1,1223000893,"Cat Sitter DVD Trilogy - Vol 1, Vol 2 and Vol 3",[],A39QHP5WLON5HV,5.0,There are usually one or more of my cats watch...,2013-09-14,usually cat watch tv stay trouble dvd play lik...
1,104,B00005MF9V,LitterMaid Universal Cat Privacy Tent (LMT100),"['Pet Supplies', 'Cats', 'Litter & Housebreaki...",A366V0GCEPH5CX,5.0,My cats love it and so do I. I no longer have ...,2013-02-02,cat love longer cat litter fly floor litter fl...
2,133,B00005MF9T,LitterMaid LM500 Automated Litter Box,"['Pet Supplies', 'Cats', 'Litter & Housebreaki...",ALWWS8QBYN80B,1.0,I have one female cat that weighs under 10 pou...,2004-11-17,female cat weigh pound year old use everclean ...
3,153,B00005MF9W,LitterMaid Waste Receptacles Automatic Litter ...,"['Pet Supplies', 'Cats', 'Litter & Housebreaki...",A3PVI3NE7OY1SP,5.0,I love these. They make the clean up so much e...,2013-09-26,love clean easy clean box manually use issue w...
4,154,B00005MF9W,LitterMaid Waste Receptacles Automatic Litter ...,"['Pet Supplies', 'Cats', 'Litter & Housebreaki...",A2H83XMHUVDLJY,4.0,"I love this litter box. I do not use the lids,...",2014-06-26,love litter box use lid use receptacle tear cr...
41564,111601,B00KJGFGFO,Curry Brush with Coarse or Fine Bristles. High...,[],AV34KNYW82YSS,4.0,Pulled lots of hair out of my Labs coat. Didn'...,2014-07-18,pulled lot hair labs coat think prove wrong co...
41565,111603,B00KJGFGFO,Curry Brush with Coarse or Fine Bristles. High...,[],A1YMNTFLNDYQ1F,5.0,I have been trying to find a rubber bristle br...,2014-07-16,try rubber bristle brush persian year lose glo...
41566,111604,B00KJGFGFO,Curry Brush with Coarse or Fine Bristles. High...,[],A1FQ3HRVXA4A5B,5.0,Great product to use on your pets knowing this...,2014-07-11,great product use pet know gentle rubber damag...
41567,111605,B00KJGFGFO,Curry Brush with Coarse or Fine Bristles. High...,[],A3OP6CI0XCRQXO,5.0,I bought a second one because I have two cats ...,2014-07-22,buy second cat american short hair buy brush m...
41568,111606,B00KJGFGFO,Curry Brush with Coarse or Fine Bristles. High...,[],A11LC938XF35XN,5.0,Our dogs love getting brushed with this. It m...,2014-07-17,dog love brush massage remove heavy undercoat ...


In [8]:
# generating test history
test_user_history = (pd.DataFrame(test.groupby(['reviewerID'])['asin']
                                  .apply(list).reset_index()))

In [9]:
print(test_user_history)

                  reviewerID                                  asin
0      A04173782GDZSQ91AJ7OD              [B0090Z9AYS, B00CPDWT2M]
1      A042274212BJJVOBS4Q85              [B005AZ4M3Q, B00771WQIY]
2       A0436342QLT4257JODYJ  [B0018CDR68, B003SJTM8Q, B00474A3DY]
3      A04795073FIBKY8GSLZYI              [B001PKT30M, B005DGI2RY]
4      A06658082A27F4VB5UG8E              [B000TZ1TTM, B0019VUHH0]
...                      ...                                   ...
18993          AZYJE40XW6MFG              [B00HVAKJZS, B00IDZT294]
18994          AZZ56WF4X19G2                          [B004A7X218]
18995          AZZNK89PXD006  [B0002DHV16, B005BP8MQ8, B009RTX4SU]
18996          AZZV9PDNMCOZW              [B007EQL390, B00ISBWVT6]
18997          AZZYW4YOE1B6E  [B0002AQPA2, B0002AQPA2, B0002ARQV4]

[18998 rows x 2 columns]


## Instantiate Memory-based Embedding CF (User-based)

In [10]:
class UserBasedCF:
    """
    """

    def __init__(self):
        self._rating_history = None
        self._mean_ratings = None
        self._k_neighbourhood = None
        self.utility_matrix = None
        self.sim_matrix = None

    def __get_utility_matrix(self, trainset: pd.DataFrame):
        """
        """
        self._mean_ratings = trainset.groupby(['reviewerID'], as_index=False)['overall'].mean()
        self._mean_ratings.columns = ['reviewerID', 'mean_overall']

        # creating utility matrix
        train = pd.merge(trainset, self._mean_ratings, on='reviewerID')
        # deviation from user's average rating
        train['dev_overall'] = train['overall'] - train['mean_overall']
        utility_matrix = train.pivot_table(index='reviewerID', columns='asin', values='dev_overall')

        return utility_matrix.fillna(utility_matrix.mean(axis=0))

    def __get_similarities_matrix(self):
        """
        """
        cosine_sim = cosine_similarity(self.utility_matrix)
        np.fill_diagonal(cosine_sim, 0)
        # generate user similarity matrix
        users_sim = pd.DataFrame(cosine_sim, index=self.utility_matrix.index)
        users_sim.columns = self.utility_matrix.index

        return users_sim

    def __get_k_neighbourhood(self, k_neighbours: float):
        """
        """
        # sim_order = np.argsort(self.sim_matrix.values, axis=1)[:, :k_neighbours]
        neighbours = (self.sim_matrix
                      .apply(lambda x: pd.Series(x.sort_values(ascending=False)
                                                 .iloc[:k_neighbours]
                                                 .index,
                                                 index=['top{}'.format(i) for i in range(1, k_neighbours+1)]),
                             axis=1))

        return neighbours

    def __predict_rating(self, user):
        """
        """
        # retrieve user rating history
        user_rating_history = self._rating_history[user]

        # list of K-neighbourhood of similar users
        sim_users = (self._k_neighbourhood[self._k_neighbourhood.index == user]
                     .values
                     .squeeze()
                     .tolist())
        # retrieve similar user rating history
        sim_users_rating_history = [j for i in self._rating_history[sim_users] for j in i]
        # find items rated by similar users by not by target user
        item_under_consideration = set(sim_users_rating_history) - set(user_rating_history)

        # retrieve target user mean rating
        user_mean_rating = self._mean_ratings.loc[self._mean_ratings['reviewerID'] == user, 'mean_overall'].values[0]

        candidate_items = {}
        for item in item_under_consideration:
            # retrieve item norm ratings
            item_norm_ratings = self.utility_matrix.loc[:, item]
            # retrieve norm ratings from similar users
            sim_norm_ratings = item_norm_ratings[item_norm_ratings.index.isin(sim_users)]
            # retrieve target user and similar user cosine similarities
            corrs = self.sim_matrix.loc[user, sim_users]

            # combined item norm ratings and user corrs - cosine similarities
            user_corrs = pd.concat([sim_norm_ratings, corrs], axis=1)
            user_corrs.columns = ['dev_overall', 'correlation']
            user_corrs['overall'] = user_corrs.apply(lambda x: x['dev_overall'] * x['correlation'], axis=1)

            # compute predicted ratings
            numerator = user_corrs['overall'].sum()
            denominator = user_corrs['correlation'].sum()
            predict_rating = user_mean_rating + (numerator/denominator)

            candidate_items[item] = predict_rating

        # retrieve counts of items appearing in similar user rating history
        item_counts = pd.DataFrame.from_dict(Counter(sim_users_rating_history), orient='index', columns=['count'])
        candidate_items = pd.DataFrame.from_dict(candidate_items, orient='index', columns=['pred_overall'])
        # merge predicted ratings and counts
        candidate_items = candidate_items.merge(item_counts, left_index=True, right_index=True)

        return candidate_items.sort_values(by=['count', 'pred_overall'], ascending=False).index.tolist()

    def fit(self, trainset: pd.DataFrame, k_neighbours: float=50):
        """

        Args:
            trainset ([pd.DataFrame]):
            k_neighbours ([int]):
        """
        # generate user rating history
        self._rating_history = trainset.groupby(['reviewerID'])['asin'].apply(list)
        self.utility_matrix = self.__get_utility_matrix(trainset)
        self.sim_matrix = self.__get_similarities_matrix()
        self._k_neighbourhood = self.__get_k_neighbourhood(k_neighbours)

    def test(self):
        """
        """
        # retrieve unique users
        unique_users = self._rating_history.reset_index()['reviewerID'].tolist()

        predictions = {}
        for user in tqdm(unique_users):
            predictions[user] = self.__predict_rating(user)

        return predictions

In [13]:
# instantiate model
ub_cf = UserBasedCF()

In [14]:
# fit learning algorithm to training data
ub_cf.fit(train, k_neighbours=50)

In [16]:
# generate candidates items ranked by predicted ratings
candidate_items = ub_cf.test()

100%|█████████████████████████████████████████████████████████| 19058/19058 [54:18<00:00,  5.85it/s]


## Loop through N = {10, 25, 30, 45}

In [17]:
# generate item popularity
item_popularity = generate_item_popularity(train)

In [18]:
n_recommendations = {}
for n in [10, 25, 30, 45]:
    # retrieve the top-n items based on similarities
    top_ns = get_top_n(candidate_items, train_user_rating_history, n)
    # evaluate how well the recommended items predicted the future purchases
    n_recommended_items = evaluate_recommendations(top_ns, test_user_history, item_popularity, n)
    # saving the n-value and recommended items
    n_recommendations[n] = (top_ns, n_recommended_items)

The MEM-ECF has an average recall@10: 0.01085, average novelty@10: 0.91555
The MEM-ECF has an average recall@25: 0.02534, average novelty@25: 0.94071
The MEM-ECF has an average recall@30: 0.02772, average novelty@30: 0.94731
The MEM-ECF has an average recall@45: 0.03680, average novelty@45: 0.95870


# Evaluate N-Recommendations 

In [19]:
def retrieve_recommendations(train: pd.DataFrame, top_ns: dict):
    """
    """
    # generating a random user
    random_user = np.random.choice(list(train['reviewerID'].unique()), 1)[0]
    print(f"For user: {random_user}:")
    print(f"Purchase History:\n{train[train['reviewerID'] == random_user][['asin', 'title']]}")

    # find the recommendations
    print(f"\nRecommending:\n")
    recommendations = (train[train['asin']
                             .isin(top_ns[random_user])][['asin', 'title']]
                       .drop_duplicates(subset='asin')
                       .set_index('asin'))
    print(f"{recommendations.loc[top_ns[random_user]].reset_index()}")

## N=10

In [20]:
top_ns_10 = n_recommendations[10][0]

In [35]:
retrieve_recommendations(train, top_ns_10)

For user: A1NV76C0QZ1HEI:
Purchase History:
             asin                                              title
41078  B000S120H2                           Cat Catcher Refill Mouse
41613  B000VAK20C  GoCat Da Bird Pull 2 Piece Pull Apart Rod &amp...

Recommending:

         asin                                              title
0  B00027CL5S  Halo Liv-A-Littles Grain Free Natural Dog Trea...
1  B00153RC9I  Halo Holistic Natural Dry Cat Food for Adult Cats
2  B0002AS5PU  Kaytee CritterTrail Fun-nels Tubes Accessories...
3  B0002UL3F4                Lixit Top Fill Water Tank, 32-Ounce
4  B0002DK6HM           Van Ness X-Small Auto Waterer, 1.5 Liter
5  B004YTZYV8  Bravo! Bonus Bites All Natural Freeze Dried Ve...
6  B0002ASC1M  Pureness Heavyweight Large Crock Pet Dish, 52-...
7  B0002AQ228  Prevue Hendryx Pet Products Wrought Iron Fligh...
8  B0002AQ444  Coralife 05508 Mini Flourescent Colormax Lamp,...
9  B0048Z73TY   DERMagic Shampoo Bar, 3.75 oz, Certified Organic


## N=25

In [22]:
top_ns_25 = n_recommendations[25][0]

In [23]:
retrieve_recommendations(train, top_ns_25)

For user: A1QPEL4PENBQN8:
Purchase History:
             asin                                          title
58095  B003RQVGKC  Catit Jumbo Hooded Cat Litter Pan - Warm Gray
64362  B005U6GMJW      Contech ProCone Recovery Collar for Pets.

Recommending:

          asin                                              title
0   B0002ASM94  Nylabone Dura Chew Pooch Pacifier Chew Bone Va...
1   B00153RC9I  Halo Holistic Natural Dry Cat Food for Adult Cats
2   B0002APRKQ                                  Lee's Net Breeder
3   B00078Y3US        Multipet Nuts for Knots Ball Medium Dog Toy
4   B0002AS5PU  Kaytee CritterTrail Fun-nels Tubes Accessories...
5   B0002UL3F4                Lixit Top Fill Water Tank, 32-Ounce
6   B0002DK6HM           Van Ness X-Small Auto Waterer, 1.5 Liter
7   B009MKCJAI                                           KONG TUG
8   B004YTZYV8  Bravo! Bonus Bites All Natural Freeze Dried Ve...
9   B0002ASC1M  Pureness Heavyweight Large Crock Pet Dish, 52-...
10  B0048Z73TY   DE

## N=30

In [24]:
top_ns_30 = n_recommendations[30][0]

In [25]:
retrieve_recommendations(train, top_ns_30)

For user: A8U1DKUR11OJY:
Purchase History:
             asin                                              title
26020  B000633Y4A          Red Barn Peanut Butter Filled Bone-Single
68232  B00DNQ2VVE  Hartz Frisky Frolic Latex Squeakable Dog Toy a...

Recommending:

          asin                                              title
0   B0002ASM94  Nylabone Dura Chew Pooch Pacifier Chew Bone Va...
1   B00153RC9I  Halo Holistic Natural Dry Cat Food for Adult Cats
2   B0002AR0II                        KONG Extreme Dog Toy, Black
3   B00078Y3US        Multipet Nuts for Knots Ball Medium Dog Toy
4   B0002AS5PU  Kaytee CritterTrail Fun-nels Tubes Accessories...
5   B0002UL3F4                Lixit Top Fill Water Tank, 32-Ounce
6   B0002DK6HM           Van Ness X-Small Auto Waterer, 1.5 Liter
7   B009MKCJAI                                           KONG TUG
8   B004YTZYV8  Bravo! Bonus Bites All Natural Freeze Dried Ve...
9   B0002ASC1M  Pureness Heavyweight Large Crock Pet Dish, 52-...
10  B000

## N=45

In [26]:
top_ns_45 = n_recommendations[45][0]

In [33]:
retrieve_recommendations(train, top_ns_45)

For user: A2WFZWRYWLC20S:
Purchase History:
             asin                                              title
41417  B000U39044                    Nina Ottosson Wooden Dog, Smart
48929  B001F2I9YG  Rachael Ray Nutrish Natural Dry Dog Food, Real...
54739  B002CJIPEK                     Purina Chef Michael'S Dog Food
66163  B007M0JHGE  Pet Naturals of Vermont - Scoot Bars, Natural ...

Recommending:

          asin                                              title
0   B004X3VQ5I  FunBites FlatSticks Calcibone, 60-Count (Pack ...
1   B0002H3R2E       Wellness Natural Wellbars Crunchy Dog Treats
2   B0002AR0II                        KONG Extreme Dog Toy, Black
3   B00008JOL0                       Zuke's Hip Action Dog Treats
4   B00008O36H  Purina Beneful Healthy Weight with Real Chicke...
5   B0011DKX0C                              Pet Naturals Hairball
6   B001BCVY8I  Petite Cuisine Variety Pack (Sesame Chicken &a...
7   B004X3VS5Q  FunBites Skin and Coat Congos 5-Inches, 4-Coun...
8