In [2]:
import pandas as pd
import numpy as np
import itertools
import pickle
from collections import defaultdict

# display options
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

# visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go

# custom helper functions
%load_ext autoreload
%autoreload 2
from helper.general_helper import *

# lightfm
from lightfm.data import Dataset
from lightfm import LightFM
from lightfm.cross_validation import random_train_test_split
from lightfm.evaluation import auc_score


# view plotly in jupyter 
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)

# ignore warnings
import warnings
warnings.filterwarnings("ignore")



In [3]:
df = pd.read_csv('../../merged_df.csv')

In [4]:
df.head()

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime,description,title,imUrl,related,salesRank,categories,price,brand,prod_review_counts,review_count,reviewer_cat
0,ALC5GH8CAMAI7,159985130X,AnnN,"[1, 1]",This is a great little gadget to have around. ...,5.0,Handy little gadget,1294185600,"01 5, 2011","The Pocket Magnifier is so popular, we are hav...","Lightwedge Lighted Pocket Magnifier, Plum",http://ecx.images-amazon.com/images/I/31eVZtZ%...,"{'also_bought': ['B002DGPUM2', 'B00524H8MC', '...",0.0,living-equipment-aids-supplies-visual-impairme...,24.95,,medium,1,alergy
1,AHKSURW85PJUE,159985130X,"AZ buyer ""AZ buyer""","[1, 1]",I would recommend this for a travel magnifier ...,4.0,Small & may need to encourage battery,1329523200,"02 18, 2012","The Pocket Magnifier is so popular, we are hav...","Lightwedge Lighted Pocket Magnifier, Plum",http://ecx.images-amazon.com/images/I/31eVZtZ%...,"{'also_bought': ['B002DGPUM2', 'B00524H8MC', '...",0.0,living-equipment-aids-supplies-visual-impairme...,24.95,,medium,1,alergy
2,A38RMU1Y5TDP9,159985130X,"Bob Tobias ""Robert Tobias""","[75, 77]",What I liked was the quality of the lens and t...,4.0,Very good but not great,1275955200,"06 8, 2010","The Pocket Magnifier is so popular, we are hav...","Lightwedge Lighted Pocket Magnifier, Plum",http://ecx.images-amazon.com/images/I/31eVZtZ%...,"{'also_bought': ['B002DGPUM2', 'B00524H8MC', '...",0.0,living-equipment-aids-supplies-visual-impairme...,24.95,,medium,1,scale
3,A1XZUG7DFXXOS4,159985130X,Cat lover,"[56, 60]",Love the Great point light pocket magnifier! ...,4.0,great addition to your purse,1202428800,"02 8, 2008","The Pocket Magnifier is so popular, we are hav...","Lightwedge Lighted Pocket Magnifier, Plum",http://ecx.images-amazon.com/images/I/31eVZtZ%...,"{'also_bought': ['B002DGPUM2', 'B00524H8MC', '...",0.0,living-equipment-aids-supplies-visual-impairme...,24.95,,medium,1,scale
4,A1MS3M7M7AM13X,159985130X,Cricketoes,"[1, 1]",This is very nice. You pull out on the magnifi...,5.0,Very nice and convenient.,1313452800,"08 16, 2011","The Pocket Magnifier is so popular, we are hav...","Lightwedge Lighted Pocket Magnifier, Plum",http://ecx.images-amazon.com/images/I/31eVZtZ%...,"{'also_bought': ['B002DGPUM2', 'B00524H8MC', '...",0.0,living-equipment-aids-supplies-visual-impairme...,24.95,,medium,1,massage


##### Create Usier/Item/Review dataset plus item dataset

In [5]:
def generate_int_id(dataframe, id_col_name):
    """
    Generate unique integer id for users, questions and answers

    Parameters
    ----------
    dataframe: Dataframe
        Pandas Dataframe for Users or Q&A. 
    id_col_name : String 
        New integer id's column name.
        
    Returns
    -------
    Dataframe
        Updated dataframe containing new id column 
    """
    new_dataframe=dataframe.assign(
        int_id_col_name=np.arange(len(dataframe))
        ).reset_index(drop=True)
    return new_dataframe.rename(columns={'int_id_col_name': 'int_id'})

In [6]:
df = generate_int_id(df, 'reviewerID')

In [7]:
idx_dict = defaultdict(str)
for real_id, num_id in zip(df.reviewerID, df.int_id):
    idx_dict[num_id] = real_id

In [8]:
cut_down = df.groupby('reviewerID').filter(lambda x: len(x) >= 15)
cut_down = df.groupby('asin').filter(lambda x: len(x)> 15)

##### set user ids as integers

In [9]:
# pass 1
cut_down['related'] = cut_down.related.apply(lambda x: x.split()[1:13] if isinstance(x, str) else 'unknown')

# pass 2
cut_down['related'] = cut_down.related.apply(lambda x: " ".join(x).strip('[,'))

# final pass
cut_down['related'] = cut_down.related.apply(lambda x: " ".join(sorted(x.split(','))))

In [10]:
df_uir = cut_down[['int_id', 'asin', 'overall']]

In [11]:
df_uir.head()

Unnamed: 0,int_id,asin,overall
0,0,159985130X,5.0
1,1,159985130X,4.0
2,2,159985130X,4.0
3,3,159985130X,4.0
4,4,159985130X,5.0


In [12]:
df_uf = cut_down[['int_id','reviewer_cat']]

In [13]:
df_uf.head()

Unnamed: 0,int_id,reviewer_cat
0,0,alergy
1,1,alergy
2,2,scale
3,3,scale
4,4,massage


In [16]:
df_if =  cut_down[['asin', 'prod_review_counts']]

In [17]:
df_if.head()

Unnamed: 0,asin,prod_review_counts
0,159985130X,medium
1,159985130X,medium
2,159985130X,medium
3,159985130X,medium
4,159985130X,medium


##### generating feature to unique values lists

In [None]:
item_feature_list = []

# format should be 'feature:value'

# selecting unique values
prod_lst = [f"prod_review_counts:{cat}" for cat in df_if.prod_review_counts.unique()]
rank_lst = [f"salesRank:{num}" for num in df_if.salesRank.unique()]
price_lst = [f"price:{num}" for num in df_if.price.unique()]

# appending list
for cat in prod_lst:
    item_feature_list.append(cat)
for cat in rank_lst:
    item_feature_list.append(cat)
for cat in price_lst:
    item_feature_list.append(cat)



In [24]:

# user feature list
user_feature_list = []

rev_cat_lst = [f"reviewer_cat:{cat}" for cat in df_uf.reviewer_cat.unique()]



for cat in rev_cat_lst:
    user_feature_list.append(cat)

##### setting LightFM dataset correctly

In [36]:
dataset = Dataset()
dataset.fit(cut_down['int_id'].unique(),
            cut_down['asin'].unique())
#             item_features = item_feature_list)

##### set weights and build interactions

In [134]:

weight_dictionary = dict(df.groupby('asin').count()['reviewerID'].apply(lambda x: 1 / x))

In [135]:
df_uir['weights'] = df_uir['asin'].apply(lambda x: weight_dictionary[x])

In [37]:
(interactions, weights) = dataset.build_interactions([(x[0], x[1], x[2]) for x in df_uir.values])

##### building interactions

In [20]:
df_if.head(2)

Unnamed: 0,asin,prod_review_counts
0,159985130X,medium
1,159985130X,medium


In [138]:
df_uf.head(2)

Unnamed: 0,int_id,reviewer_cat
0,0,alergy
1,1,alergy


##### build item_feature for lightfm

In [139]:
# map each item category and values to app
ad_subset = df_if[['prod_review_counts', 'salesRank', 'price']] 
ad_list = [list(x) for x in ad_subset.values]

# helper function
def feature_colon_value(my_list):
    """
    Takes as input a list and prepends the columns names to respective values in the list.
    For example: if my_list = [1,1,0,'del'],
    resultant output = ['f1:1', 'f2:1', 'f3:0', 'loc:del']
   
    """
    result = []
    ll = ['prod_review_counts:', 'salesRank:', 'price:']
    aa = my_list
    for x, y in zip(ll, aa):
        res = str(x) +""+ str(y)
        result.append(res)
    return result

feature_list = []
for item in ad_list:
    feature_list.append(feature_colon_value(item))
print(f'Final output: {feature_list[:2]}')

Final output: [['prod_review_counts:medium', 'salesRank:0.0', 'price:24.95'], ['prod_review_counts:medium', 'salesRank:0.0', 'price:24.95']]


In [140]:
# combining product with each distinct feature subset
item_tuple = list(zip(df_if['asin'], feature_list))
item_features = dataset.build_item_features(item_tuple, normalize=False)

##### build user feature for lightfm

In [27]:
ad_subset = df_uf[['reviewer_cat']] 
ad_list = [list(x) for x in ad_subset.values]

# helper function
def feature_colon_value(my_list):
    """
    Takes as input a list and prepends the columns names to respective values in the list.
    For example: if my_list = [1,1,0,'del'],
    resultant output = ['f1:1', 'f2:1', 'f3:0', 'loc:del']
   
    """
    result = []
    ll = ['reviewer_cat:']
    aa = my_list
    for x, y in zip(ll, aa):
        res = str(x) +""+ str(y)
        result.append(res)
    return result

feature_list = []
for item in ad_list:
    feature_list.append(feature_colon_value(item))
print(f'Final output: {feature_list[:5]}')

Final output: [['reviewer_cat:alergy'], ['reviewer_cat:alergy'], ['reviewer_cat:scale'], ['reviewer_cat:scale'], ['reviewer_cat:massage']]


In [28]:
user_tuple = list(zip(df_uf['int_id'], feature_list))
user_features = dataset.build_user_features(user_tuple, normalize=False)

##### model validation with hyperparameter tuning

In [38]:
train, test = random_train_test_split(interactions)

In [39]:
model = LightFM(loss='warp', no_components=20)
model.fit(train, epochs=20, num_threads=1)

<lightfm.lightfm.LightFM at 0x2457e566bb0>

In [33]:
from lightfm.evaluation import precision_at_k

In [40]:
k = 20
print('Train precision at k={}:\t{:.4f}'.format(k, precision_at_k(model, train, k=k).mean()))
print('Test precision at k={}:\t\t{:.4f}'.format(k, precision_at_k(model, test, k=k).mean()))

Train precision at k=20:	0.0500
Test precision at k=20:		0.0019


In [144]:
def sample_hyperparameters():
    """
    Yield possible hyperparameter choices.
    """

    while True:
        yield {
            "no_components": np.random.randint(10, 100),
            "learning_schedule": np.random.choice(["adagrad", "adadelta"]),
            "loss": np.random.choice(["bpr", "warp", "warp-kos"]),
            "learning_rate": np.random.exponential(0.05),
            "item_alpha": np.random.exponential(1e-8),
            "max_sampled": np.random.randint(5, 15),
            "num_epochs": np.random.randint(5, 100),
        }
        
def random_search(train, test, num_samples=10):
    """
    Sample random hyperparameters, fit a LightFM model, and evaluate it
    on the test set.

    Parameters
    ----------

    train: np.float32 coo_matrix of shape [n_users, n_items]
        Training data.
    test: np.float32 coo_matrix of shape [n_users, n_items]
        Test data.
    num_samples: int, optional
        Number of hyperparameter choices to evaluate.


    Returns
    -------

    generator of (auc_score, hyperparameter dict, fitted model)

    """

    for hyperparams in itertools.islice(sample_hyperparameters(), num_samples):
        num_epochs = hyperparams.pop("num_epochs")

        model = LightFM(**hyperparams)
        model.fit(train, epochs=num_epochs, user_features=user_features, item_features= item_features)

        score = auc_score(model,
                      test, train_interactions=train,
                      user_features=user_features,
                      item_features=item_features
                     ).mean()

        hyperparams["num_epochs"] = num_epochs

        yield (score, hyperparams, model)

In [145]:
(score, hyperparams, model) = max(random_search(train, test), key=lambda x: x[0])

In [None]:
optimized auc: 0.7297078967094421
hyperparameters: {'no_components': 16, 'learning_schedule': 'adadelta', 'loss': 'warp', 'learning_rate': 0.004692430910889622, 'item_alpha': 1.0857673877772748e-08, 'max_sampled': 10, 'num_epochs': 25

In [57]:
# with open('../../lightfm_cut.pickle', 'wb') as fle:
#     pickle.dump(model, fle)

##### Known User Prediction

In [None]:
user_x = user_id_map['u3']
n_users, n_items = interactions.shape # no of users * no of items
model.predict(user_x, np.arange(n_items)) # means predict for all

##### Unknown Users

In [None]:

from scipy import sparse
def format_newuser_input(user_feature_map, user_feature_list):
  num_features = len(user_feature_list)
  normalised_val = 1.0 
  target_indices = []
  for feature in user_feature_list:
    try:
        target_indices.append(user_feature_map[feature])
    except KeyError:
        print("new user feature encountered '{}'".format(feature))
        pass

  new_user_features = np.zeros(len(user_feature_map.keys()))
  for i in target_indices:
    new_user_features[i] = normalised_val
  new_user_features = sparse.csr_matrix(new_user_features)
  return(new_user_features
         

new_user_features = format_newuser_input(user_feature_map, user_feature_list)
model.predict(0, np.arange(n_items), user_features=new_user_features)