In [1]:
import pandas as pd
import numpy as np
import itertools

# display options
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

# visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go

# custom helper functions
%load_ext autoreload
%autoreload 2
from helper.general_helper import *

# lightfm
from lightfm.data import Dataset
from lightfm import LightFM
from lightfm.cross_validation import random_train_test_split
from lightfm.evaluation import auc_score


# view plotly in jupyter 
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)

# ignore warnings
import warnings
warnings.filterwarnings("ignore")



In [2]:
df = pd.read_csv('../../merged_df.csv')

In [3]:
df.head()

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime,related,categories,price,prod_review_counts,free,percent_helpful
0,A1N4O8VOJZTDVB,B004A9SDD8,Annette Yancey,"[1, 1]","Loves the song, so he really couldn't wait to ...",3.0,Really cute,1383350400,2013-11-02,"{'also_bought': ['B006M3K874', 'B00F85SMOI', '...",Kids,0.0,medium,1,1.0
1,A2HQWU6HUKIEC7,B004A9SDD8,"Audiobook lover ""Kathy""","[0, 0]","Oh, how my little grandson loves this app. He'...",5.0,2-year-old loves it,1323043200,2011-12-05,"{'also_bought': ['B006M3K874', 'B00F85SMOI', '...",Kids,0.0,medium,1,0.0
2,A1SXASF6GYG96I,B004A9SDD8,Barbara Gibbs,"[0, 0]",I found this at a perfect time since my daught...,5.0,Fun game,1337558400,2012-05-21,"{'also_bought': ['B006M3K874', 'B00F85SMOI', '...",Kids,0.0,medium,1,0.0
3,A2B54P9ZDYH167,B004A9SDD8,"Brooke Greenstreet ""Babylove""","[3, 4]",My 1 year old goes back to this game over and ...,5.0,We love our Monkeys!,1354752000,2012-12-06,"{'also_bought': ['B006M3K874', 'B00F85SMOI', '...",Kids,0.0,medium,1,0.75
4,AFOFZDTX5UC6D,B004A9SDD8,C. Galindo,"[1, 1]",There are three different versions of the song...,5.0,This is my granddaughters favorite app on my K...,1391212800,2014-02-01,"{'also_bought': ['B006M3K874', 'B00F85SMOI', '...",Kids,0.0,medium,1,1.0


##### Create Usier/Item/Review dataset plus item dataset

In [4]:
df_uir = df[['reviewerID', 'asin', 'overall']]

In [5]:
df_uir.head()

Unnamed: 0,reviewerID,asin,overall
0,A1N4O8VOJZTDVB,B004A9SDD8,3.0
1,A2HQWU6HUKIEC7,B004A9SDD8,5.0
2,A1SXASF6GYG96I,B004A9SDD8,5.0
3,A2B54P9ZDYH167,B004A9SDD8,5.0
4,AFOFZDTX5UC6D,B004A9SDD8,5.0


In [6]:
df_if =  df[['asin', 'free', 'prod_review_counts', 'categories']]

In [7]:
df_if.head()

Unnamed: 0,asin,free,prod_review_counts,categories
0,B004A9SDD8,1,medium,Kids
1,B004A9SDD8,1,medium,Kids
2,B004A9SDD8,1,medium,Kids
3,B004A9SDD8,1,medium,Kids
4,B004A9SDD8,1,medium,Kids


##### generating feature to unique values lists

In [8]:
feature_list = []

# format should be 'feature:value'

# selecting unique values
price_lst = [f"free:{val}" for val in df_if.free.unique()]
sales_lst = [f"prod_review_counts:{cat}" for cat in df_if.prod_review_counts.unique()]
cat_lst = [f"categories:{cat}" for cat in df_if.categories.unique()]

# appending list
for price in price_lst:
    feature_list.append(price)
for sales in sales_lst:
    feature_list.append(sales)
for cat in cat_lst:
    feature_list.append(cat)

##### setting LightFM dataset correctly

In [9]:
dataset = Dataset()
dataset.fit(df['reviewerID'].unique(),
             df['asin'].unique(),
             item_features = feature_list)

In [10]:
(interactions, weights) = dataset.build_interactions([(x[0], x[1], x[2]) for x in df_uir.values])

##### building interactions

In [11]:
# map each item category and values to app
ad_subset = df_if[['free','prod_review_counts', 'categories']] 
ad_list = [list(x) for x in ad_subset.values]

feature_list = []
for item in ad_list:
    feature_list.append(feature_colon_value(item))
print(f'Final output: {feature_list[:10]}')

Final output: [['free:1', 'prod_review_counts:medium', 'categories:Kids'], ['free:1', 'prod_review_counts:medium', 'categories:Kids'], ['free:1', 'prod_review_counts:medium', 'categories:Kids'], ['free:1', 'prod_review_counts:medium', 'categories:Kids'], ['free:1', 'prod_review_counts:medium', 'categories:Kids'], ['free:1', 'prod_review_counts:medium', 'categories:Kids'], ['free:1', 'prod_review_counts:medium', 'categories:Kids'], ['free:1', 'prod_review_counts:medium', 'categories:Kids'], ['free:1', 'prod_review_counts:medium', 'categories:Kids'], ['free:1', 'prod_review_counts:medium', 'categories:Kids']]


In [12]:
# combining app with each distinct feature subset
item_tuple = list(zip(df_if.asin, feature_list))
item_features = dataset.build_item_features(item_tuple, normalize= False)

##### model validation with hyperparameter tuning

In [13]:
train, test = random_train_test_split(interactions)

In [14]:
def sample_hyperparameters():
    """
    Yield possible hyperparameter choices.
    """

    while True:
        yield {
            "no_components": np.random.randint(16, 100),
            "learning_schedule": np.random.choice(["adagrad", "adadelta"]),
            "loss": np.random.choice(["bpr", "warp", "warp-kos"]),
            "learning_rate": np.random.exponential(0.05),
            "item_alpha": np.random.exponential(1e-8),
            "max_sampled": np.random.randint(5, 15),
            "num_epochs": np.random.randint(5, 100),
        }
        
def random_search(train, test, num_samples=10):
    """
    Sample random hyperparameters, fit a LightFM model, and evaluate it
    on the test set.

    Parameters
    ----------

    train: np.float32 coo_matrix of shape [n_users, n_items]
        Training data.
    test: np.float32 coo_matrix of shape [n_users, n_items]
        Test data.
    num_samples: int, optional
        Number of hyperparameter choices to evaluate.


    Returns
    -------

    generator of (auc_score, hyperparameter dict, fitted model)

    """

    for hyperparams in itertools.islice(sample_hyperparameters(), num_samples):
        num_epochs = hyperparams.pop("num_epochs")

        model = LightFM(**hyperparams)
        model.fit(train, epochs=num_epochs, item_features= item_features)

        score = auc_score(model,
                      test, train_interactions=train,
                      item_features=item_features
                     ).mean()

        hyperparams["num_epochs"] = num_epochs

        yield (score, hyperparams, model)

In [15]:
(score, hyperparams, model) = max(random_search(train, test), key=lambda x: x[0])

KeyboardInterrupt: 

In [None]:
print(f"optimized auc: {score}")
print(f"hyperparameters: {hyperparams})

##### Known User Prediction

In [None]:
user_x = user_id_map['u3']
n_users, n_items = interactions.shape # no of users * no of items
model.predict(user_x, np.arange(n_items)) # means predict for all

##### Unknown Users

In [None]:
user_feature_list = ['f1:1', 'f2:1', 'f3:0', 'loc:del']

from scipy import sparse
def format_newuser_input(user_feature_map, user_feature_list):
  num_features = len(user_feature_list)
  normalised_val = 1.0 
  target_indices = []
  for feature in user_feature_list:
    try:
        target_indices.append(user_feature_map[feature])
    except KeyError:
        print("new user feature encountered '{}'".format(feature))
        pass

  new_user_features = np.zeros(len(user_feature_map.keys()))
  for i in target_indices:
    new_user_features[i] = normalised_val
  new_user_features = sparse.csr_matrix(new_user_features)
  return(new_user_features
         

new_user_features = format_newuser_input(user_feature_map, user_feature_list)
model.predict(0, np.arange(n_items), user_features=new_user_features)