In [56]:
import pandas as pd
import numpy as np
import itertools
import pickle

# display options
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

# visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go

# custom helper functions
%load_ext autoreload
%autoreload 2
from helper.general_helper import *

# lightfm
from lightfm.data import Dataset
from lightfm import LightFM
from lightfm.cross_validation import random_train_test_split
from lightfm.evaluation import auc_score


# view plotly in jupyter 
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)

# ignore warnings
import warnings
warnings.filterwarnings("ignore")

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
df = pd.read_csv('../../merged_df.csv')

In [60]:
df.head()

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime,description,title,imUrl,related,salesRank,categories,price,brand,prod_review_counts
0,ALC5GH8CAMAI7,159985130X,AnnN,"[1, 1]",This is a great little gadget to have around. ...,5.0,Handy little gadget,1294185600,"01 5, 2011","The Pocket Magnifier is so popular, we are hav...","lightwedge-lighted-pocket-magnifier,-plum",http://ecx.images-amazon.com/images/I/31eVZtZ%...,"{'also_bought': ['B002DGPUM2', 'B00524H8MC', '...",0.0,Medical Supplies & Equipment Daily Living Aids...,24.95,misc,medium
1,AHKSURW85PJUE,159985130X,"AZ buyer ""AZ buyer""","[1, 1]",I would recommend this for a travel magnifier ...,4.0,Small & may need to encourage battery,1329523200,"02 18, 2012","The Pocket Magnifier is so popular, we are hav...","lightwedge-lighted-pocket-magnifier,-plum",http://ecx.images-amazon.com/images/I/31eVZtZ%...,"{'also_bought': ['B002DGPUM2', 'B00524H8MC', '...",0.0,Medical Supplies & Equipment Daily Living Aids...,24.95,misc,medium
2,A38RMU1Y5TDP9,159985130X,"Bob Tobias ""Robert Tobias""","[75, 77]",What I liked was the quality of the lens and t...,4.0,Very good but not great,1275955200,"06 8, 2010","The Pocket Magnifier is so popular, we are hav...","lightwedge-lighted-pocket-magnifier,-plum",http://ecx.images-amazon.com/images/I/31eVZtZ%...,"{'also_bought': ['B002DGPUM2', 'B00524H8MC', '...",0.0,Medical Supplies & Equipment Daily Living Aids...,24.95,misc,medium
3,A1XZUG7DFXXOS4,159985130X,Cat lover,"[56, 60]",Love the Great point light pocket magnifier! ...,4.0,great addition to your purse,1202428800,"02 8, 2008","The Pocket Magnifier is so popular, we are hav...","lightwedge-lighted-pocket-magnifier,-plum",http://ecx.images-amazon.com/images/I/31eVZtZ%...,"{'also_bought': ['B002DGPUM2', 'B00524H8MC', '...",0.0,Medical Supplies & Equipment Daily Living Aids...,24.95,misc,medium
4,A1MS3M7M7AM13X,159985130X,Cricketoes,"[1, 1]",This is very nice. You pull out on the magnifi...,5.0,Very nice and convenient.,1313452800,"08 16, 2011","The Pocket Magnifier is so popular, we are hav...","lightwedge-lighted-pocket-magnifier,-plum",http://ecx.images-amazon.com/images/I/31eVZtZ%...,"{'also_bought': ['B002DGPUM2', 'B00524H8MC', '...",0.0,Medical Supplies & Equipment Daily Living Aids...,24.95,misc,medium


In [3]:
df.brand.fillna(value='misc', inplace=True)
df.brand = df.brand.apply(lambda x: "-".join(x.lower().split()))
df.brand.value_counts()

misc                     77372
now-foods                 8660
jarrow                    7604
philips                   5672
panasonic                 3183
                         ...  
shamtastic                   5
sundesa-blenderbottle        5
pure-touch-skin-care         5
fc2                          5
laerdal                      5
Name: brand, Length: 3825, dtype: int64

##### Create Usier/Item/Review dataset plus item dataset

In [4]:
cut_down = df.groupby('reviewerID').filter(lambda x: len(x) >= 10)

In [6]:
df_uir = cut_down[['reviewerID', 'asin', 'overall']]

In [7]:
df_uir.head()

Unnamed: 0,reviewerID,asin,overall
2,A38RMU1Y5TDP9,159985130X,4.0
5,AXO4PQU0XG3TG,159985130X,5.0
6,A28X0LT2100RL1,159985130X,4.0
8,A1JQDCX4LDKBZ3,159985130X,3.0
9,A3RNRXOM5J2C93,159985130X,4.0


In [8]:
df_if =  cut_down[['asin', 'price', 'prod_review_counts', 'categories', 'brand']]

In [9]:
df_if.head()

Unnamed: 0,asin,price,prod_review_counts,categories,brand
2,159985130X,24.95,medium,Medical Supplies & Equipment Daily Living Aids...,misc
5,159985130X,24.95,medium,Medical Supplies & Equipment Daily Living Aids...,misc
6,159985130X,24.95,medium,Medical Supplies & Equipment Daily Living Aids...,misc
8,159985130X,24.95,medium,Medical Supplies & Equipment Daily Living Aids...,misc
9,159985130X,24.95,medium,Medical Supplies & Equipment Daily Living Aids...,misc


##### generating feature to unique values lists

In [24]:
feature_list = []

# format should be 'feature:value'

# selecting unique values
# price_lst = [f"price:{val}" for val in df_if.price.unique()]
sales_lst = [f"prod_review_counts:{cat}" for cat in df_if.prod_review_counts.unique()]
cat_lst = [f"categories:{cat}" for cat in df_if.categories.unique()]
brand_lst = [f"brand:{brand}" for brand in df_if.brand.unique()]

# appending list
# for price in price_lst:
#     feature_list.append(price)
for sales in sales_lst:
    feature_list.append(sales)
for cat in cat_lst:
    feature_list.append(cat)
for brand in brand_lst:
    feature_list.append(brand)
    


##### setting LightFM dataset correctly

In [39]:
dataset = Dataset()
dataset.fit(cut_down['reviewerID'].unique(),
             cut_down['asin'].unique(),
             item_features = feature_list)

In [40]:
(interactions, weights) = dataset.build_interactions([(x[0], x[1], x[2]) for x in df_uir.values])

##### building interactions

In [48]:
# map each item category and values to app
ad_subset = df_if[['prod_review_counts', 'categories', 'brand']] 
ad_list = [list(x) for x in ad_subset.values]


feature_list = []
for item in ad_list:
    feature_list.append(feature_colon_value(item))
print(f'Final output: {feature_list[:10]}')

Final output: [['price:24.95', 'prod_review_counts:medium', 'categories:Medical Supplies & Equipment Daily Living Aids Visual Impairment Aids Magnifiers', 'brand:misc'], ['price:24.95', 'prod_review_counts:medium', 'categories:Medical Supplies & Equipment Daily Living Aids Visual Impairment Aids Magnifiers', 'brand:misc'], ['price:24.95', 'prod_review_counts:medium', 'categories:Medical Supplies & Equipment Daily Living Aids Visual Impairment Aids Magnifiers', 'brand:misc'], ['price:24.95', 'prod_review_counts:medium', 'categories:Medical Supplies & Equipment Daily Living Aids Visual Impairment Aids Magnifiers', 'brand:misc'], ['price:24.95', 'prod_review_counts:medium', 'categories:Medical Supplies & Equipment Daily Living Aids Visual Impairment Aids Magnifiers', 'brand:misc'], ['price:24.95', 'prod_review_counts:medium', 'categories:Medical Supplies & Equipment Daily Living Aids Visual Impairment Aids Magnifiers', 'brand:misc'], ['price:24.95', 'prod_review_counts:medium', 'categorie

In [49]:
# combining product with each distinct feature subset
item_tuple = list(zip(df_if.asin, feature_list))
item_features = dataset.build_item_features(item_tuple, normalize=False)

##### model validation with hyperparameter tuning

In [50]:
train, test = random_train_test_split(interactions)

In [51]:
def sample_hyperparameters():
    """
    Yield possible hyperparameter choices.
    """

    while True:
        yield {
            "no_components": np.random.randint(16, 100),
            "learning_schedule": np.random.choice(["adagrad", "adadelta"]),
            "loss": np.random.choice(["bpr", "warp", "warp-kos"]),
            "learning_rate": np.random.exponential(0.05),
            "item_alpha": np.random.exponential(1e-8),
            "max_sampled": np.random.randint(5, 15),
            "num_epochs": np.random.randint(5, 100),
        }
        
def random_search(train, test, num_samples=10):
    """
    Sample random hyperparameters, fit a LightFM model, and evaluate it
    on the test set.

    Parameters
    ----------

    train: np.float32 coo_matrix of shape [n_users, n_items]
        Training data.
    test: np.float32 coo_matrix of shape [n_users, n_items]
        Test data.
    num_samples: int, optional
        Number of hyperparameter choices to evaluate.


    Returns
    -------

    generator of (auc_score, hyperparameter dict, fitted model)

    """

    for hyperparams in itertools.islice(sample_hyperparameters(), num_samples):
        num_epochs = hyperparams.pop("num_epochs")

        model = LightFM(**hyperparams)
        model.fit(train, epochs=num_epochs, item_features= item_features)

        score = auc_score(model,
                      test, train_interactions=train,
                      item_features=item_features
                     ).mean()

        hyperparams["num_epochs"] = num_epochs

        yield (score, hyperparams, model)

In [52]:
(score, hyperparams, model) = max(random_search(train, test), key=lambda x: x[0])

In [54]:
print(f"optimized auc: {score}")
print(f"hyperparameters: {hyperparams}")

optimized auc: 0.7297078967094421
hyperparameters: {'no_components': 16, 'learning_schedule': 'adadelta', 'loss': 'warp', 'learning_rate': 0.004692430910889622, 'item_alpha': 1.0857673877772748e-08, 'max_sampled': 10, 'num_epochs': 25}


In [None]:
optimized auc: 0.7297078967094421
hyperparameters: {'no_components': 16, 'learning_schedule': 'adadelta', 'loss': 'warp', 'learning_rate': 0.004692430910889622, 'item_alpha': 1.0857673877772748e-08, 'max_sampled': 10, 'num_epochs': 25

In [57]:
# with open('../../lightfm_cut.pickle', 'wb') as fle:
#     pickle.dump(model, fle)

##### Known User Prediction

In [None]:
user_x = user_id_map['u3']
n_users, n_items = interactions.shape # no of users * no of items
model.predict(user_x, np.arange(n_items)) # means predict for all

##### Unknown Users

In [None]:
user_feature_list = ['f1:1', 'f2:1', 'f3:0', 'loc:del']

from scipy import sparse
def format_newuser_input(user_feature_map, user_feature_list):
  num_features = len(user_feature_list)
  normalised_val = 1.0 
  target_indices = []
  for feature in user_feature_list:
    try:
        target_indices.append(user_feature_map[feature])
    except KeyError:
        print("new user feature encountered '{}'".format(feature))
        pass

  new_user_features = np.zeros(len(user_feature_map.keys()))
  for i in target_indices:
    new_user_features[i] = normalised_val
  new_user_features = sparse.csr_matrix(new_user_features)
  return(new_user_features
         

new_user_features = format_newuser_input(user_feature_map, user_feature_list)
model.predict(0, np.arange(n_items), user_features=new_user_features)