### 1) Importing Libraries

In [2]:
import pandas as pd
import os
from scipy.sparse import csr_matrix
import numpy as np
from IPython.display import display_html
import warnings

import matplotlib.pyplot as plt
from matplotlib.gridspec import GridSpec
import seaborn as sns
%matplotlib inline

from lightfm.cross_validation import random_train_test_split
from lightfm.evaluation import auc_score, precision_at_k, recall_at_k
from lightfm import LightFM
#from skopt import forest_minimize
from sklearn.metrics.pairwise import cosine_similarity
import pickle
import joblib
from scipy import sparse

def display_side_by_side(*args):
    '''
    Function to render several DataFrames next to each other in the cell output
    Required Input -
        - *args = Pandas DataFrames to show, ordered left to right
    Expected Output -
        - None; the combined HTML is sent to the front end through display_html
    '''
    html_str = ''.join(df.to_html() for df in args)
    # Patch only the opening tag. Replacing every occurrence of the word
    # "table" would also rewrite the closing tags and any cell text that
    # happens to contain it (for example "Vegetable").
    display_html(html_str.replace(
        '<table', '<table style="display:inline"'), raw=True)


# update the working directory to the root of the project
# NOTE: the path is relative to the current directory, so re-running this cell
# in the same kernel moves one more level up every time it is executed.
os.chdir('..')
# Silence every warning category for the rest of the session.
warnings.filterwarnings("ignore")



In [3]:
%%time
# User-item interaction records; the path is resolved against the current
# working directory.
ratings = pd.read_csv('final_snack_data.csv')

# Item catalogue. Despite the variable name, the columns displayed further
# down are ItemId / SnackName / Category, i.e. snack metadata.
movies = pd.read_csv('Itemcategory.csv', encoding= 'unicode_escape')

Wall time: 937 ms


In [4]:
ratings.head()

Unnamed: 0,user_id,Item_id,Rating,click,I1,I2,I3,I4,I5,I6,...,I12,C1,C2,C3,C4,C5,C6,C7,C8,C9
0,50,4,3,0,1005,1,1,0,20596,320,...,157,5ee41ff2,17d996e6,f028772b,ecad2386,7801e8d9,07d7df22,a99f214a,224d24e9,d14c4241
1,39,1,2,0,1005,0,1,0,15705,320,...,79,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,07d7df22,a99f214a,afa320bb,8a4875bd
2,18,13,2,0,1005,0,1,0,20366,320,...,157,543a539e,c7ca3108,3e814130,ecad2386,7801e8d9,07d7df22,a99f214a,67287d76,24f6b932
3,40,10,2,0,1002,0,0,0,20077,300,...,70,f528e1e7,0f095819,50e219e0,ecad2386,7801e8d9,07d7df22,465ead49,b888c4b6,021b641d
4,46,15,2,0,1005,0,1,0,15703,320,...,79,1fbe01fe,f3845767,28905ebd,ecad2386,7801e8d9,07d7df22,a99f214a,c64f8ed3,a5bce124


### 2) Importing Data

In [5]:
# Keep only the three columns used to build the interaction matrix; the name
# `ratings` is rebound to this narrower frame.
ratings = ratings[['user_id', 'Item_id', 'Rating']]
# No random_state is passed, so the sampled rows differ between runs.
ratings.sample(5)

Unnamed: 0,user_id,Item_id,Rating
46646,56,4,2
63049,32,8,2
53893,85,1,2
5215,21,1,3
39446,89,10,5


In [6]:
movies.columns.values


array(['ItemId', 'SnackName', 'Category'], dtype=object)

In [7]:
# Explicit column selection; the frame already shows exactly these three
# columns in the previous output, so this only pins the set and order.
movies = movies[['ItemId', 'SnackName', 'Category']]

In [8]:

def create_interaction_matrix(df,user_col, item_col, rating_col, norm= False, threshold = None):
    '''
    Function to create an interaction matrix dataframe from transactional type interactions
    Required Input -
        - df = Pandas DataFrame containing user-item interactions
        - user_col = column name containing user's identifier
        - item_col = column name containing item's identifier
        - rating col = column name containing user feedback on interaction with a given item
        - norm (optional) = True if a normalization of ratings is needed
        - threshold (required if norm = True) = value above which the rating is favorable
    Expected output - 
        - Pandas dataframe indexed by user with one column per item. Ratings of
          repeated (user, item) pairs are summed and missing pairs are filled
          with 0. With norm = True every cell becomes 1 when the summed rating
          is strictly greater than threshold, otherwise 0.
    Raises -
        - ValueError when norm = True is requested without a threshold
    '''
    # Fail early with a clear message instead of a TypeError raised from the
    # element-wise comparison against None further down.
    if norm and threshold is None:
        raise ValueError("threshold must be provided when norm=True")

    interactions = (
        df.groupby([user_col, item_col])[rating_col]
        .sum()
        .unstack()
        .reset_index()
        .fillna(0)
        .set_index(user_col)
    )
    if norm:
        # Vectorised equivalent of mapping `1 if x > threshold else 0` over
        # every cell.
        interactions = (interactions > threshold).astype(int)
    return interactions

In [9]:
# Build the user x item matrix from the raw ratings.
# NOTE(review): `norm` is left at its default (False), so `threshold` is never
# read by create_interaction_matrix and the summed ratings are kept as they
# are. If normalisation is ever switched on, pass a number here: comparing
# numeric cells with the string '5' raises TypeError.
interactions = create_interaction_matrix(df = ratings,
                                         user_col = 'user_id',
                                         item_col = 'Item_id',
                                         rating_col = 'Rating',
                                         threshold = '5')
interactions.shape

(100, 15)

### 3) Preprocessing

#### 3.1) Create interaction matrix

In [10]:
def create_item_dict(df,id_col,name_col):
    '''
    Function to create an item dictionary based on their item_id and item name
    Required Input - 
        - df = Pandas dataframe with Item information
        - id_col = Column name containing unique identifier for an item
        - name_col = Column name containing name of the item
    Expected Output -
        item_dict = Dictionary type output containing item_id as key and item_name as value.
                    If an id occurs more than once the last row wins.
    '''
    # Pair the two columns positionally. Looking rows up by the labels
    # 0..n-1 raises KeyError as soon as df does not carry a default
    # RangeIndex (for example after filtering rows).
    return dict(zip(df[id_col], df[name_col]))

In [11]:
# Lookup table from ItemId to SnackName, used later to print readable names.
item_dict = create_item_dict(df = movies,
                               id_col = 'ItemId',
                               name_col = 'SnackName')

In [12]:
item_dict

{1: 'Snakku',
 2: 'Love with food',
 3: 'Candy Club',
 4: 'Nature Box',
 5: 'Snack Nation',
 6: 'Zen Pop',
 7: 'Yummy Bazzar World Sampler',
 8: 'Fitsnack',
 9: 'Bokksu',
 10: 'MunchPak',
 11: 'Keto krate',
 12: 'Universal yums',
 13: 'Vegan cut snack box',
 14: 'TokyoTreat',
 15: 'Try the world snacks'}

In [13]:
# Persist the lookup table to a file named 'item_dict' (relative path, so it
# lands in the current working directory).
joblib.dump(item_dict, 'item_dict')

['item_dict']

In [14]:
def create_user_dict(interactions):
    '''
    Function to map every user id found on the interaction matrix index to its row position
    Required Input - 
        interactions - dataset created by create_interaction_matrix (user ids on the index)
    Expected Output -
        user_dict - Dictionary type output containing user_id as key and the
                    0-based row position in interactions as value
    '''
    return {uid: position for position, uid in enumerate(interactions.index)}

# user_id = list(interactions.index)
# user_dict = {}
# counter = 0 
# for i in user_id:
#     user_dict[i] = counter
#     counter += 1

In [15]:
# Bare name: this only displays the function object, it does not call it.
create_user_dict

<function __main__.create_user_dict(interactions)>

In [16]:
# Map each user id on the interaction matrix index to its row position.
user_dict = create_user_dict(interactions)

In [17]:
user_dict

{1: 0,
 2: 1,
 3: 2,
 4: 3,
 5: 4,
 6: 5,
 7: 6,
 8: 7,
 9: 8,
 10: 9,
 11: 10,
 12: 11,
 13: 12,
 14: 13,
 15: 14,
 16: 15,
 17: 16,
 18: 17,
 19: 18,
 20: 19,
 21: 20,
 22: 21,
 23: 22,
 24: 23,
 25: 24,
 26: 25,
 27: 26,
 28: 27,
 29: 28,
 30: 29,
 31: 30,
 32: 31,
 33: 32,
 34: 33,
 35: 34,
 36: 35,
 37: 36,
 38: 37,
 39: 38,
 40: 39,
 41: 40,
 42: 41,
 43: 42,
 44: 43,
 45: 44,
 46: 45,
 47: 46,
 48: 47,
 49: 48,
 50: 49,
 51: 50,
 52: 51,
 53: 52,
 54: 53,
 55: 54,
 56: 55,
 57: 56,
 58: 57,
 59: 58,
 60: 59,
 61: 60,
 62: 61,
 63: 62,
 64: 63,
 65: 64,
 66: 65,
 67: 66,
 68: 67,
 69: 68,
 70: 69,
 71: 70,
 72: 71,
 73: 72,
 74: 73,
 75: 74,
 76: 75,
 77: 76,
 78: 77,
 79: 78,
 80: 79,
 81: 80,
 82: 81,
 83: 82,
 84: 83,
 85: 84,
 86: 85,
 87: 86,
 88: 87,
 89: 88,
 90: 89,
 91: 90,
 92: 91,
 93: 92,
 94: 93,
 95: 94,
 96: 95,
 97: 96,
 98: 97,
 99: 98,
 100: 99}

In [18]:
# Persist the user lookup to a file named 'user_dict' (relative path, so it
# lands in the current working directory).
joblib.dump(user_dict, 'user_dict')

['user_dict']

#### 3.2) Create User Dict

In [19]:
# convert to csr matrix
# convert to csr matrix
# NOTE(review): the "book" in the name looks like a leftover from another
# project, and this variable is not referenced again in the visible cells -
# runMF builds its own CSR matrix from `interactions`.
user_book_interaction_csr = csr_matrix(interactions.values)
user_book_interaction_csr

<100x15 sparse matrix of type '<class 'numpy.int64'>'
	with 1500 stored elements in Compressed Sparse Row format>

In [20]:
user_dict

{1: 0,
 2: 1,
 3: 2,
 4: 3,
 5: 4,
 6: 5,
 7: 6,
 8: 7,
 9: 8,
 10: 9,
 11: 10,
 12: 11,
 13: 12,
 14: 13,
 15: 14,
 16: 15,
 17: 16,
 18: 17,
 19: 18,
 20: 19,
 21: 20,
 22: 21,
 23: 22,
 24: 23,
 25: 24,
 26: 25,
 27: 26,
 28: 27,
 29: 28,
 30: 29,
 31: 30,
 32: 31,
 33: 32,
 34: 33,
 35: 34,
 36: 35,
 37: 36,
 38: 37,
 39: 38,
 40: 39,
 41: 40,
 42: 41,
 43: 42,
 44: 43,
 45: 44,
 46: 45,
 47: 46,
 48: 47,
 49: 48,
 50: 49,
 51: 50,
 52: 51,
 53: 52,
 54: 53,
 55: 54,
 56: 55,
 57: 56,
 58: 57,
 59: 58,
 60: 59,
 61: 60,
 62: 61,
 63: 62,
 64: 63,
 65: 64,
 66: 65,
 67: 66,
 68: 67,
 69: 68,
 70: 69,
 71: 70,
 72: 71,
 73: 72,
 74: 73,
 75: 74,
 76: 75,
 77: 76,
 78: 77,
 79: 78,
 80: 79,
 81: 80,
 82: 81,
 83: 82,
 84: 83,
 85: 84,
 86: 85,
 87: 86,
 88: 87,
 89: 88,
 90: 89,
 91: 90,
 92: 91,
 93: 92,
 94: 93,
 95: 94,
 96: 95,
 97: 96,
 98: 97,
 99: 98,
 100: 99}

In [21]:

def runMF(interactions, n_components=30, loss='warp', k=15, epoch=30,n_jobs = 4, random_state=None):
    '''
    Function to run matrix-factorization algorithm
    Required Input -
        - interactions = dataset create by create_interaction_matrix
        - n_components = number of embeddings you want to create to define Item and user
        - loss = loss function passed to LightFM; other options are logistic, bpr
        - k = forwarded unchanged to LightFM's `k` argument
              (per the LightFM docs it is used by k-OS training - TODO confirm
              it has any effect with loss = 'warp')
        - epoch = number of epochs to run 
        - n_jobs = number of threads passed to fit as num_threads
        - random_state (optional) = seed forwarded to LightFM so that runs can
              be repeated; None keeps the previous unseeded behaviour
    Expected Output  -
        Model - Trained model
    '''
    x = sparse.csr_matrix(interactions.values)
    model = LightFM(no_components= n_components, loss=loss,k=k,
                    random_state=random_state)
    model.fit(x,epochs=epoch,num_threads = n_jobs)
    return model

In [22]:
# Train the model. The values repeat runMF's defaults except n_jobs, which is
# lowered from 4 to 2.
mf_model = runMF(interactions = interactions,
                 n_components = 30,
                 loss = 'warp',
                 k = 15,
                 epoch = 30,
                 n_jobs = 2)

In [23]:
def sample_recommendation_item(model,interactions,Item_id,user_dict,item_dict,number_of_user):
    '''
    Function to produce a list of top N interested users for a given item
    Required Input -
        - model = Trained matrix factorization model (must expose predict(user_ids, item_ids))
        - interactions = dataset used for training the model
        - Item_id = item ID for which we need to generate recommended users
        - user_dict = not used by this function; kept so existing callers keep working
        - item_dict = not used by this function; kept so existing callers keep working
        - number_of_user = Number of users needed as an output
    Expected Output -
        - user_list = List of user ids taken from interactions.index, ordered
                      from highest to lowest predicted score
    Raises -
        - KeyError when Item_id is not one of the columns of interactions
    '''
    n_users = interactions.shape[0]
    # Resolve the column position by exact label lookup. searchsorted would
    # silently hand back an insertion point for an unknown id (or for
    # unsorted columns) and the scores would belong to a different item.
    # Assumes the item columns are unique, as produced by an unstack.
    item_position = interactions.columns.get_loc(Item_id)
    scores = pd.Series(model.predict(np.arange(n_users),
                                     np.full(n_users, item_position)))
    # The Series index is the row position, which maps back to the user id.
    top_positions = scores.sort_values(ascending=False).head(number_of_user).index
    user_list = list(interactions.index[top_positions])
    return user_list

In [24]:
# Top 15 users for item 5, ordered by predicted score.
sample_recommendation_item(model = mf_model,
                           interactions = interactions,
                           Item_id = 5,
                           user_dict = user_dict,
                           item_dict = item_dict,
                           number_of_user = 15)

[38, 79, 64, 4, 6, 86, 74, 48, 78, 29, 87, 90, 22, 57, 73]

In [25]:
def create_item_emdedding_distance_matrix(model,interactions):
    '''
    Function to build the item-item table from the item embeddings of a trained model
    Required Input -
        - model = Trained matrix factorization model (its item_embeddings are read)
        - interactions = dataset used for training the model; its columns label the result
    Expected Output -
        - item_emdedding_distance_matrix = Pandas dataframe of pairwise cosine
          similarity between items, with the item ids on both the rows and the
          columns (larger value = more similar)
    '''
    embeddings = sparse.csr_matrix(model.item_embeddings)
    item_labels = interactions.columns
    return pd.DataFrame(cosine_similarity(embeddings),
                        index=item_labels,
                        columns=item_labels)

In [26]:
# Pairwise item table computed from the trained model's item embeddings,
# labelled with the item ids from `interactions`.
item_item_dist = create_item_emdedding_distance_matrix(model = mf_model,
                                                       interactions = interactions)

In [27]:
def item_item_recommendation(item_emdedding_distance_matrix, Item_id,
                             item_dict, n_items = 10, show = True):
    '''
    Function to create item-item recommendation
    Required Input - 
        - item_emdedding_distance_matrix = Pandas dataframe of item-item scores
          (items on both axes); a larger value is treated as more similar
        - Item_id  = item ID for which we need to generate recommended items
        - item_dict = Dictionary type input containing item_id as key and item_name as value
        - n_items = Number of items needed as an output
        - show = when truthy, print the item of interest and the numbered recommendations
    Expected Output -
        - recommended_items = List of up to n_items item ids, most similar
          first, never containing Item_id itself
    '''
    ranked = item_emdedding_distance_matrix.loc[Item_id, :].sort_values(ascending = False)
    # Remove the query item by label instead of assuming it is ranked first:
    # with a tie or a larger off-diagonal value, skipping position 0 would drop
    # a real neighbour and keep the query item in its own recommendations.
    recommended_items = ranked.drop(Item_id, errors = 'ignore').head(n_items).index.tolist()
    if show:
        print("Item of interest :{0}".format(item_dict[Item_id]))
        print("Item similar to the above item:")
        for counter, i in enumerate(recommended_items, start = 1):
            print(str(counter) + '- ' +  item_dict[i])
    return recommended_items

In [28]:
# Five items closest to item 14; `show` keeps its default, so the names are
# printed as well as returned.
rec3_list = item_item_recommendation(item_emdedding_distance_matrix = item_item_dist,
                                    Item_id = 14,
                                    item_dict = item_dict,
                                    n_items = 5)

Item of interest :TokyoTreat
Item similar to the above item:
1- MunchPak
2- Snack Nation
3- Yummy Bazzar World Sampler
4- Candy Club
5- Snakku


In [29]:
# Persist the interaction matrix to a file named 'interactions' (relative
# path, so it lands in the current working directory).
joblib.dump(interactions, 'interactions')

['interactions']

In [30]:
# Persist the trained model to a file named 'mf_model' (relative path, so it
# lands in the current working directory).
joblib.dump(mf_model, 'mf_model')

['mf_model']