In [1]:
from collections import Counter
import warnings

import numpy as np
import pandas as pd
from pandarallel import pandarallel
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import pairwise_distances
from tqdm import tqdm

from src.models import cf

pandarallel.initialize()
tqdm.pandas()
warnings.filterwarnings('ignore')

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use standard multiprocessing data transfer (pipe) to transfer data between the main process and workers.




# Load Data

In [2]:
# global variables
DATA_PATH = "data/evaluation"
CATEGORY = "Clothing_Shoes_and_Jewelry"

# load train dataset
train = pd.read_csv(f"{DATA_PATH}/{CATEGORY}_train.csv")

In [3]:
# checking train dataframe
train.head()

Unnamed: 0,index,asin,title,categories,reviewerID,overall,reviewText,reviewTime,processedReviewText
0,0,31887,Ballet Dress-Up Fairy Tutu,"[['Clothing, Shoes & Jewelry', 'Girls', 'Cloth...",A1KLRMWW2FWPL4,5.0,This is a great tutu and at a really great pri...,2011-02-12,great tutu great price look cheap glad look am...
1,1,31887,Ballet Dress-Up Fairy Tutu,"[['Clothing, Shoes & Jewelry', 'Girls', 'Cloth...",A2G5TCU2WDFZ65,5.0,I bought this for my 4 yr old daughter for dan...,2013-01-19,buy yr old daughter dance class wear today tim...
2,2,31887,Ballet Dress-Up Fairy Tutu,"[['Clothing, Shoes & Jewelry', 'Girls', 'Cloth...",A1RLQXYNCMWRWN,5.0,What can I say... my daughters have it in oran...,2013-01-04,daughter orange black white pink think buy fuc...
3,3,31887,Ballet Dress-Up Fairy Tutu,"[['Clothing, Shoes & Jewelry', 'Girls', 'Cloth...",A8U3FAMSJVHS5,5.0,"We bought several tutus at once, and they are ...",2014-04-27,buy tutu high review sturdy seemingly girl wea...
4,4,31887,Ballet Dress-Up Fairy Tutu,"[['Clothing, Shoes & Jewelry', 'Girls', 'Cloth...",A3GEOILWLK86XM,5.0,Thank you Halo Heaven great product for Little...,2014-03-15,thank halo heaven great product little girls g...


In [4]:
# get user rating history
train_user_rating_history = train.groupby(["reviewerID"])["asin"].progress_apply(list)
print(train_user_rating_history)

100%|██████████████████████████████████████████████████████| 39386/39386 [00:00<00:00, 70710.30it/s]

reviewerID
A001114613O3F18Q5NVR6     [B0016JNS44, B001T54XA8, B004AZXO1I, B004QJWKLS]
A00146182PNM90WNNAZ5Q     [B000JJX7C0, B000MX3SH2, B003CO205E, B008JXDFCU]
A00165422B2GAUE3EL6Z0     [B007WADN4G, B007WAEBPQ, B007WAT3I6, B008G51WHQ]
A00338282E99B8OR2JYTZ                 [B002FA5B8O, B003F06XQW, B00768LFYY]
A00354001GE099Q1FL0TU                 [B00387EEYA, B003RYZY8E, B0058XN9ZC]
                                               ...                        
AZZMQ85DPFEG3            [B0007KPPAI, B005EYUQ7E, B005VEMVI4, B007LOTZ5...
AZZNK89PXD006                         [B000KGOHLM, B005C3DH00, B005PQPLLC]
AZZT1ERHBSNQ8            [B000UANLGU, B002ATSG8C, B002UTJVMM, B007P83XJ...
AZZTOUKVTUMVM             [B000196UJ0, B009GE3XQ4, B00C2DJ66C, B00C4NV6LS]
AZZYW4YOE1B6E                         [B003V4AKTS, B004G8GOW0, B007LTV82W]
Name: asin, Length: 39386, dtype: object





# Utility Functions

In [119]:
def get_top_n(predictions: dict, user_rating_history: pd.DataFrame, n: int=10) -> dict:
    """Return the top-N recommendations for each user based on cosine similarity.
    
    Args:
    
    Returns:
        ([dict]): A dictionary of top-N recommendations for each unique user, sorted by
            cosine similarties.
    """
    
    # retrieve a 200 items candidate list based on similarities
    top_ns = {}
    for user in predictions:
        top_ns[user] = predictions[user][:n]
        
    return top_ns

def recall_at_k(asins, predicted_asins, k=10):
    # number of relevant items
    set_actual = set(asins)
    set_preds = set(predicted_asins)
    num_relevant = len(set_actual.intersection(set_preds))
    
    # calculating recall@K - relevant / total relevant items
    recall_at_k = num_relevant / len(asins)
    
    return recall_at_k

def novelty_at_k(item_popularity, predicted_asins, k=10):
    """
    """
    # finding avg novelty
    popularity_sum = item_popularity.loc[predicted_asins].sum()
    novelty_at_k = ((k*1) - popularity_sum) / k
    
    return novelty_at_k


def generate_item_popularity(train: pd.DataFrame) -> pd.DataFrame:
    """
    """
    
    # create a mapping of item popularatity
    # based on sum(item's review / max reviews) / no items
    max_reviews = (train.groupby(['asin'])
                   .agg({'processedReviewText': 'count'})
                   .max()
                   .values[0])
    item_popularity = (train.groupby(['asin'])
                       .agg({'processedReviewText': 'count'})
                       .apply(lambda x: x/max_reviews))
    
    return item_popularity
    

def evaluate_recommendations(top_ns: dict, user_rating_history: pd.DataFrame, item_popularity: pd.DataFrame, k=10) -> pd.DataFrame:
    """
    
    Args:
        top_ns
        user_rating_history
    """
    
    test_recommendations = pd.DataFrame(top_ns.items(), columns=["reviewerID", "pred_asin"])
    
    # combined test history and recommendations
    test_merged = pd.merge(user_rating_history, test_recommendations, on="reviewerID", how="inner")
    
    # generating recall@k metrics
    test_merged["recall@k"] = test_merged.apply(lambda x: recall_at_k(x.asin, x.pred_asin, k=k), axis=1)
    test_merged["novelty@k"] = test_merged.apply(lambda x: novelty_at_k(item_popularity, x.pred_asin, k=k), axis=1)
    average_recall_at_k = test_merged["recall@k"].mean()
    average_novelty_at_k = test_merged["novelty@k"].mean()
    
    print(f"The MEM-ECF has an average recall@{k}: {average_recall_at_k:.5f}, average novelty@{k}: {average_novelty_at_k:.5f}")
    
    return test_merged

# Generate N-Recommendations = {10, 25, 30, 45}

## Load Test Data

In [6]:
# loading test dataset
test = pd.read_csv(f"{DATA_PATH}/{CATEGORY}_test.csv")

In [7]:
test.head().append(test.tail())

Unnamed: 0,index,asin,title,categories,reviewerID,overall,reviewText,reviewTime,processedReviewText
0,6,0000031887,Ballet Dress-Up Fairy Tutu,"[['Clothing, Shoes & Jewelry', 'Girls', 'Cloth...",A16GFPNVF4Y816,5.0,Bought this as a backup to the regular ballet ...,2014-05-03,bought backup regular ballet outfit daughter w...
1,17,0000031887,Ballet Dress-Up Fairy Tutu,"[['Clothing, Shoes & Jewelry', 'Girls', 'Cloth...",A2XJ13PIXVJFJH,1.0,Never GOT this item - but gave a 1 STAR becaus...,2014-05-12,item star reply supplier great try send item r...
2,23,0123456479,SHINING IMAGE HUGE PINK LEATHER JEWELRY BOX / ...,"[['Clothing, Shoes & Jewelry', 'Novelty, Costu...",A2WNN1DQVL4LH5,5.0,The minute I saw this my heart skipped a beat....,2013-11-07,minute saw heart skip beat nice case sort coll...
3,24,0123456479,SHINING IMAGE HUGE PINK LEATHER JEWELRY BOX / ...,"[['Clothing, Shoes & Jewelry', 'Novelty, Costu...",A1ZPOCG2ST2CY3,5.0,Love this Jewelry Box so well put together ho...,2014-01-19,love jewelry box hold plendy love pink look ni...
4,27,0123456479,SHINING IMAGE HUGE PINK LEATHER JEWELRY BOX / ...,"[['Clothing, Shoes & Jewelry', 'Novelty, Costu...",A1JC50F14SLAEV,3.0,I wanted to have the title summarize my though...,2014-05-12,want title summarize thought decide read entir...
99497,278341,B00KF9180W,[2 PACK] Multi-Purpose Sports Balaclava - For ...,"[['Clothing, Shoes & Jewelry', 'Men', 'Accesso...",A1EVV74UQYVKRY,4.0,I go walking a lot in all kinds of weather and...,2014-06-16,walk lot kind weather know lot like block cold...
99498,278342,B00KF9180W,[2 PACK] Multi-Purpose Sports Balaclava - For ...,"[['Clothing, Shoes & Jewelry', 'Men', 'Accesso...",ABUE0ALHKWKHC,5.0,This two pack of Balaclavas makes for a very n...,2014-06-09,pack balaclava nice purchase balaclava fit snu...
99499,278343,B00KF9180W,[2 PACK] Multi-Purpose Sports Balaclava - For ...,"[['Clothing, Shoes & Jewelry', 'Men', 'Accesso...",A1PI8VBCXXSGC7,5.0,"Well, the first thing I did was try the balacl...",2014-06-13,thing try balaclava hubby try fit average size...
99500,278344,B00KF9180W,[2 PACK] Multi-Purpose Sports Balaclava - For ...,"[['Clothing, Shoes & Jewelry', 'Men', 'Accesso...",A2XX2A4OJCDNLZ,5.0,While balaclavas can be used for a variety of ...,2014-06-13,balaclava use variety thing use mainly late fa...
99501,278346,B00KF9180W,[2 PACK] Multi-Purpose Sports Balaclava - For ...,"[['Clothing, Shoes & Jewelry', 'Men', 'Accesso...",A3UJRNI8UR4871,4.0,"Nice material, but not as nice as silk or mer...",2014-06-09,nice material nice silk merino wool course mat...


In [8]:
# generating test history
test_user_history = (pd.DataFrame(test.groupby(['reviewerID'])['asin']
                                  .apply(list).reset_index()))

In [9]:
print(test_user_history)

                  reviewerID                                              asin
0      A001114613O3F18Q5NVR6              [B000J6ZYL0, B005BXP7R2, B0093STGGO]
1      A00146182PNM90WNNAZ5Q              [B006Y4QDVQ, B00823Y41S, B00BQJV1LG]
2      A00165422B2GAUE3EL6Z0                          [B008SBGKP2, B00BLW2PZY]
3      A00338282E99B8OR2JYTZ                          [B003F8BKGW, B00DVFNNQE]
4      A00354001GE099Q1FL0TU                          [B0058YTOP0, B00BTWAZ0I]
...                      ...                                               ...
39358          AZZMQ85DPFEG3  [B000S7O8AS, B009N0CSXU, B00CKGB85I, B00DUW4VAA]
39359          AZZNK89PXD006                          [B004L7J7IO, B008ZBPQJ6]
39360          AZZT1ERHBSNQ8              [B00856U6BE, B0089GNZNQ, B00CPK44DM]
39361          AZZTOUKVTUMVM                          [B000VL04LS, B0053XF2U2]
39362          AZZYW4YOE1B6E                          [B007UNSF3O, B008J4RESK]

[39363 rows x 2 columns]


## Instantiate Memory-based Embedding CF (User-based)

In [10]:
class UserBasedCF:
    """
    """

    def __init__(self):
        self._rating_history = None
        self._mean_ratings = None
        self._k_neighbourhood = None
        self.utility_matrix = None
        self.sim_matrix = None

    def __get_utility_matrix(self, trainset: pd.DataFrame):
        """
        """
        self._mean_ratings = trainset.groupby(['reviewerID'], as_index=False)['overall'].mean()
        self._mean_ratings.columns = ['reviewerID', 'mean_overall']

        # creating utility matrix
        train = pd.merge(trainset, self._mean_ratings, on='reviewerID')
        # deviation from user's average rating
        train['dev_overall'] = train['overall'] - train['mean_overall']
        utility_matrix = train.pivot_table(index='reviewerID', columns='asin', values='dev_overall')

        return utility_matrix.fillna(utility_matrix.mean(axis=0))

    def __get_similarities_matrix(self):
        """
        """
        cosine_sim = cosine_similarity(self.utility_matrix)
        np.fill_diagonal(cosine_sim, 0)
        # generate user similarity matrix
        users_sim = pd.DataFrame(cosine_sim, index=self.utility_matrix.index)
        users_sim.columns = self.utility_matrix.index

        return users_sim

    def __get_k_neighbourhood(self, k_neighbours: float):
        """
        """
        # sim_order = np.argsort(self.sim_matrix.values, axis=1)[:, :k_neighbours]
        neighbours = (self.sim_matrix
                      .apply(lambda x: pd.Series(x.sort_values(ascending=False)
                                                 .iloc[:k_neighbours]
                                                 .index,
                                                 index=['top{}'.format(i) for i in range(1, k_neighbours+1)]),
                             axis=1))

        return neighbours

    def __predict_rating(self, user):
        """
        """
        # retrieve user rating history
        user_rating_history = self._rating_history[user]

        # list of K-neighbourhood of similar users
        sim_users = (self._k_neighbourhood[self._k_neighbourhood.index == user]
                     .values
                     .squeeze()
                     .tolist())
        # retrieve similar user rating history
        sim_users_rating_history = [j for i in self._rating_history[sim_users] for j in i]
        # find items rated by similar users by not by target user
        item_under_consideration = set(sim_users_rating_history) - set(user_rating_history)

        # retrieve target user mean rating
        user_mean_rating = self._mean_ratings.loc[self._mean_ratings['reviewerID'] == user, 'mean_overall'].values[0]

        candidate_items = {}
        for item in item_under_consideration:
            # retrieve item norm ratings
            item_norm_ratings = self.utility_matrix.loc[:, item]
            # retrieve norm ratings from similar users
            sim_norm_ratings = item_norm_ratings[item_norm_ratings.index.isin(sim_users)]
            # retrieve target user and similar user cosine similarities
            corrs = self.sim_matrix.loc[user, sim_users]

            # combined item norm ratings and user corrs - cosine similarities
            user_corrs = pd.concat([sim_norm_ratings, corrs], axis=1)
            user_corrs.columns = ['dev_overall', 'correlation']
            user_corrs['overall'] = user_corrs.apply(lambda x: x['dev_overall'] * x['correlation'], axis=1)

            # compute predicted ratings
            numerator = user_corrs['overall'].sum()
            denominator = user_corrs['correlation'].sum()
            predict_rating = user_mean_rating + (numerator/denominator)

            candidate_items[item] = predict_rating

        # retrieve counts of items appearing in similar user rating history
        item_counts = pd.DataFrame.from_dict(Counter(sim_users_rating_history), orient='index', columns=['count'])
        candidate_items = pd.DataFrame.from_dict(candidate_items, orient='index', columns=['pred_overall'])
        # merge predicted ratings and counts
        candidate_items = candidate_items.merge(item_counts, left_index=True, right_index=True)

        return candidate_items.sort_values(by=['count', 'pred_overall'], ascending=False).index.tolist()

    def fit(self, trainset: pd.DataFrame, k_neighbours: float=50):
        """

        Args:
            trainset ([pd.DataFrame]):
            k_neighbours ([int]):
        """
        # generate user rating history
        self._rating_history = trainset.groupby(['reviewerID'])['asin'].apply(list)
        self.utility_matrix = self.__get_utility_matrix(trainset)
        self.sim_matrix = self.__get_similarities_matrix()
        self._k_neighbourhood = self.__get_k_neighbourhood(k_neighbours)

    def predict(self):
        """
        """
        # retrieve unique users
        unique_users = self._rating_history.reset_index()['reviewerID'].tolist()

        predictions = {}
        for user in tqdm(unique_users):
            predictions[user] = self.__predict_rating(user)

        return predictions

In [11]:
# instantiate model
ub_cf = cf.UserBasedCF()

In [12]:
# fit learning algorithm to training data
ub_cf.fit(train, k_neighbours=50)

In [13]:
# generate candidates items ranked by predicted ratings
candidate_items = ub_cf.test()

100%|███████████████████████████████████████████████████████| 39386/39386 [7:54:43<00:00,  1.38it/s]


## Loop through N = {10, 25, 30, 45}

In [104]:
# generate item popularity
item_popularity = generate_item_popularity(train)

In [120]:
n_recommendations = {}
for n in [10, 25, 30, 45]:
    # retrieve the top-n items based on similarities
    top_ns = get_top_n(candidate_items, train_user_rating_history, n)
    # evaluate how well the recommended items predicted the future purchases
    n_recommended_items = evaluate_recommendations(top_ns, test_user_history, item_popularity, n)
    # saving the n-value and recommended items
    n_recommendations[n] = (top_ns, n_recommended_items)

The MEM-ECF has an average recall@10: 0.00453, average novelty@10: 0.25549
The MEM-ECF has an average recall@25: 0.00762, average novelty@25: 0.13104
The MEM-ECF has an average recall@30: 0.00826, average novelty@30: 0.11785
The MEM-ECF has an average recall@45: 0.01074, average novelty@45: 0.09977


# Evaluate N-Recommendations 

In [40]:
def retrieve_recommendations(train: pd.DataFrame, top_ns: dict):
    """
    """
    # generating a random user
    random_user = np.random.choice(list(train['reviewerID'].unique()), 1)[0]
    print(f"For user: {random_user}:")
    print(f"Purchase History:\n{train[train['reviewerID'] == random_user][['asin', 'title']]}")

    # find the recommendations
    print(f"\nRecommending:\n")
    recommendations = (train[train['asin']
                             .isin(top_ns[random_user])][['asin', 'title']]
                       .drop_duplicates(subset='asin')
                       .set_index('asin'))
    print(f"{recommendations.loc[top_ns[random_user]].reset_index()}")

## N=10

In [36]:
top_ns_10 = n_recommendations[10][0]

In [73]:
retrieve_recommendations(train, top_ns_10)

For user: A2BAPJKLRD0GAS:
Purchase History:
              asin                                              title
24485   B000KR3D5W      Merrell Men's Moab Ventilator Multisport Shoe
108096  B0058XIMMM                   Skechers Women's Go Walk Slip-On
150495  B008S2QIEY  Rockport Men's RocSports Lite Summer 3 Strap S...

Recommending:

         asin                                              title
0  B009ZDEXQK         Skechers Women's Go Walk 2 Fashion Sneaker
1  B0000ANHST         Carhartt Men's Workwear Pocket T-Shirt K87
2  B00A0D4V7K                           Skechers Men's Go Walk 2
3  B004Y53ETQ     Skechers Women's Go Walk Ultimate Walking Shoe
4  B0007YVP1W                  Levi's Men's 550 Relaxed Fit Jean
5  B0001YSBEW  Carhartt Men's Long Sleeve Workwear Pocket T-S...
6  B0067LGP3W  Christmas Time Elf Costume Junior's Green T-shirt
7  B00E9CC25Q      Skechers Women's Go Walk 2-Spark Walking Shoe
8  B001QTW98A  Bulova Women's 98R112 Diamond Accented Two-Ton...
9  B001B35

## N=25

In [18]:
top_ns_25 = n_recommendations[25][0]

In [42]:
retrieve_recommendations(train, top_ns_25)

For user: A27I8ABM3ED3DI:
Purchase History:
             asin                                              title
50049  B001J62WNM               The Flash Classic Logo Men's T-shirt
66428  B002TLUFDA  Fruit of the Loom Boys 8-20 Assorted Boxer Bri...
70075  B00332FWHI        Nautica Men's Woven Mediterranean Dot Short

Recommending:

          asin                                              title
0   B0000ANHST         Carhartt Men's Workwear Pocket T-Shirt K87
1   B0001YSBEW  Carhartt Men's Long Sleeve Workwear Pocket T-S...
2   B005LERHD8  Vintage, Retro Colorful Crystal Owl Pendant an...
3   B0018OM1TU         Levi's Men's 559 Relaxed Straight Leg Jean
4   B0078FXHNM  Antique Alloy with Colour Crystal Owl Long Pre...
5   B0007YVP1W                  Levi's Men's 550 Relaxed Fit Jean
6   B0006LMBJ6                     Levi's Men's 511 Slim Fit Jean
7   B0000C321X                 Levi's Men's 501 Original Fit Jean
8   B0006LJLMG                Levi's Men's 501 Shrink To Fit Jean
9   

## N=30

In [20]:
top_ns_30 = n_recommendations[30][0]

In [61]:
retrieve_recommendations(train, top_ns_30)

For user: A2J3LNBFYLGU3G:
Purchase History:
              asin                                              title
19779   B000GB1RC8  Casio Women's LQ139A-9B3 Black Casual Classic ...
88173   B0046ZWZ7G  Surgical Steel 2mm Domed Wedding Band Thumb / ...
154384  B0099UKW96                      SODA WOMENS DOME COMBAT BOOTS

Recommending:

          asin                                              title
0   B005LERHD8  Vintage, Retro Colorful Crystal Owl Pendant an...
1   B006R0QGWS  1.00 Carat Total Weight Amazing Black Cubic Zi...
2   B0088I46XI  Authentic Diamond Color Crystal Disco Ball Stu...
3   B002JCSXZQ                    crocs Unisex Crocband Flip Flop
4   B0081IZ3UA   925 Sterling Silver Cubic Zirconia Stud Earrings
5   B000SZKHNC  Timex Unisex T5G841 1440 Sports Digital Silver...
6   B004Q7AB4I  Super King Jewelry Exquisite 2.00 Carat Cubic ...
7   B008RKYS44  Authentic Diamond Color Crystals Ball Pendant,...
8   B0067GUM2W  Vintage Owl Necklace Long Pattern Necklace Coa...


## N=45

In [22]:
top_ns_45 = n_recommendations[45][0]

In [54]:
retrieve_recommendations(train, top_ns_45)

For user: A2MDH0E6N5GC4A:
Purchase History:
             asin                                              title
5514   B0002USAW8  Capezio Daisy 205 Ballet Shoe (Toddler/Little ...
16386  B000ECXZHE                       i play. Ultimate Swim Diaper
65441  B002R0FD7G            Capezio Girls 7-16 Short Sleeve Leotard

Recommending:

          asin                                              title
0   B0001YSBEW  Carhartt Men's Long Sleeve Workwear Pocket T-S...
1   B005LERHD8  Vintage, Retro Colorful Crystal Owl Pendant an...
2   B0000ANHST         Carhartt Men's Workwear Pocket T-Shirt K87
3   B0007YVP1W                  Levi's Men's 550 Relaxed Fit Jean
4   B004Q7AB4I  Super King Jewelry Exquisite 2.00 Carat Cubic ...
5   B0018OM1TU         Levi's Men's 559 Relaxed Straight Leg Jean
6   B0078FXHNM  Antique Alloy with Colour Crystal Owl Long Pre...
7   B000NGKLNC  Sterling Silver Three-Stone Cubic Zirconia Rin...
8   B0006LMBJ6                     Levi's Men's 511 Slim Fit Jean
9   