In [1]:
from scipy.sparse import coo_matrix, csr_matrix
from sklearn.metrics.pairwise import cosine_similarity
from datetime import datetime
from pathlib import Path
from numpy import bincount, log, sqrt

import scipy.sparse as sparse
import pandas as pd
import numpy as np
import pickle
import time

In [8]:
### Helper Functions

def sparsity(matrix):
    """
    Returns `(1 - matrix.size / (rows * cols)) * 100` for a 2-D `matrix`.

    `rows` and `cols` are the first two entries of `matrix.shape`.
    For SciPy sparse matrices `matrix.size` is the number of stored entries,
    so the result is the percentage of cells without a stored value.
    """
    n_rows, n_cols = matrix.shape[0], matrix.shape[1]
    filled_fraction = matrix.size / (n_rows * n_cols)
    return (1 - filled_fraction) * 100


def get_k_popular(k, df_merged_order_products_prior):
    """
    Lists the ids of the `k` products that occur most often in the
    `product_id` column of the given frame, most frequent first.
    """
    purchase_counts = df_merged_order_products_prior["product_id"].value_counts()
    top_k = purchase_counts.head(k)
    return list(top_k.index)


def make_prior_data():
    """
    Builds the per-user purchase history from the "prior" orders.

    Reads `order_products__prior.csv` from the working directory and uses the
    module-level `df_orders` frame to map each order to its user.

    Returns
    -------
    df_prior_user_products : pandas.DataFrame
        Columns `user_id` and `products`; `products` is one flat list with the
        product ids of all prior orders of that user (duplicates kept).
    df_product_frequency : pandas.Series
        `value_counts()` of `product_id` over all prior order lines.

    Raises
    ------
    AssertionError
        If the orders flagged "prior" in `df_orders` and the orders found in the
        CSV do not line up one to one.
    """
    from itertools import chain

    # Read prior order csv
    df_order_products_prior = pd.read_csv("order_products__prior.csv")
    is_prior = df_orders.eval_set == "prior"
    current_order_user_df = df_orders.loc[is_prior, ["order_id", "user_id"]].reset_index(drop=True)

    assert (
        current_order_user_df["order_id"].nunique(dropna=False)
        == df_order_products_prior["order_id"].nunique(dropna=False)
    )

    # Group product_id for each order into products
    df_order_products_prior = df_order_products_prior[["order_id", "product_id"]]
    df_product_frequency = df_order_products_prior["product_id"].value_counts()
    df_order_products_prior = (
        df_order_products_prior.groupby("order_id")["product_id"]
        .apply(list)
        .reset_index()
        .rename(columns={"product_id": "products"})
    )

    # Both frames have two columns, so equal length means one row per order on each side
    assert len(current_order_user_df) == len(df_order_products_prior)

    df_prior_user_products = pd.merge(current_order_user_df, df_order_products_prior, on="order_id")

    # Concatenate the per-order lists of each user. The previous `.agg(sum)` relied on
    # pandas silently replacing the builtin `sum` with its own group sum (deprecated) and
    # built every user's list by repeated list addition; chaining is linear and explicit.
    df_prior_user_products = (
        df_prior_user_products.groupby("user_id")["products"]
        .apply(lambda order_lists: list(chain.from_iterable(order_lists)))
        .reset_index()
    )

    return df_prior_user_products, df_product_frequency

def make_test_data():
    """
    Builds the per-user product lists from the "train" orders, which this
    notebook uses as its test set.

    Reads `order_products__train.csv` from the working directory and uses the
    module-level `df_orders` frame to map each order to its user.

    Returns
    -------
    df_test_user_products : pandas.DataFrame
        Columns `user_id` and `products`; `products` is one flat list with the
        product ids of all "train" orders of that user (duplicates kept).
    df_product_frequency : pandas.Series
        `value_counts()` of `product_id` over all "train" order lines.

    Raises
    ------
    AssertionError
        If the orders flagged "train" in `df_orders` and the orders found in the
        CSV do not line up one to one.
    """
    from itertools import chain

    # Read train order csv
    df_order_products_test = pd.read_csv("order_products__train.csv")
    is_train = df_orders.eval_set == "train"
    current_order_user_df = df_orders.loc[is_train, ["order_id", "user_id"]].reset_index(drop=True)

    assert (
        current_order_user_df["order_id"].nunique(dropna=False)
        == df_order_products_test["order_id"].nunique(dropna=False)
    )

    # Group product_id for each order into products
    df_order_products_test = df_order_products_test[["order_id", "product_id"]]
    df_product_frequency = df_order_products_test["product_id"].value_counts()
    df_order_products_test = (
        df_order_products_test.groupby("order_id")["product_id"]
        .apply(list)
        .reset_index()
        .rename(columns={"product_id": "products"})
    )

    # Both frames have two columns, so equal length means one row per order on each side
    assert len(current_order_user_df) == len(df_order_products_test)

    df_test_user_products = pd.merge(current_order_user_df, df_order_products_test, on="order_id")

    # Concatenate the per-order lists of each user. The previous `.agg(sum)` relied on
    # pandas silently replacing the builtin `sum` with its own group sum (deprecated) and
    # built every user's list by repeated list addition; chaining is linear and explicit.
    df_test_user_products = (
        df_test_user_products.groupby("user_id")["products"]
        .apply(lambda order_lists: list(chain.from_iterable(order_lists)))
        .reset_index()
    )

    return df_test_user_products, df_product_frequency

# NOTE(review): the triple-quoted string below is not a docstring. It holds a disabled
# variant of `make_test_data` that takes the frames as arguments and returns one row per
# order without aggregating per user. As a bare string expression it has no effect when
# the cell runs; consider deleting it.
'''
def make_test_data(df_orders, df_order_products_train):
    """
    Generates the test dataset and saves it to disk at the given path
    """
    start = time.time()
    print("Creating test data ...")

    # Read train csv
    df_order_user_current = df_orders.loc[(df_orders.eval_set == "train")].reset_index()
    df_order_user_current = df_order_user_current[["order_id", "user_id"]]
    
    # Sanity check #1: `current_order_user_df` and `df_order_products_train` should have the same number of 
    # unique order ids
    assert len(df_order_user_current["order_id"].unique()) == len(df_order_products_train["order_id"].unique())

    # Convert train dataframe to a similar format
    df_order_products_test = df_order_products_train[["order_id", "product_id"]]
    df_order_products_test = df_order_products_test.groupby("order_id")["product_id"].apply(list).reset_index().rename(columns={"product_id": "products"})

    # Sanity check #2: `df_order_products_test` and `df_order_user_current` should have the same number of 
    # records before attempting to merge them
    assert df_order_products_test.size == df_order_user_current.size

    # Merge on order id
    df_user_products_test = pd.merge(df_order_user_current, df_order_products_test, on="order_id")
    df_user_products_test = df_user_products_test[["user_id", "products"]]
    
    print("Completed in {:.2f}s".format(time.time() - start))
    return df_user_products_test
'''

def save_data_to_disk(dataframe, df_name):
    """
    Pickles `dataframe` into the working directory as `df_<df_name>.pkl`
    by calling its `to_pickle` method.
    """
    dataframe.to_pickle("df_{}.pkl".format(df_name))


In [3]:

# Raw tables, read from the working directory
df_orders = pd.read_csv("orders.csv")
df_products = pd.read_csv("products.csv")
df_order_products_prior = pd.read_csv("order_products__prior.csv")
df_order_products_train = pd.read_csv("order_products__train.csv")

# Attach the product columns to every prior order line (left join keeps all order lines)
df_merged_order_products_prior = df_order_products_prior.merge(
    df_products, on="product_id", how="left"
)


# In[5]:


# Preview the first rows of the prior order lines joined with the product table
df_merged_order_products_prior.head(5)


# In[6]:


# Build the prior purchase history once and cache it on disk; later runs reuse the pickles
# instead of requiring this cell to be skipped by hand.
# Original timing notes: about 3 min to build, about 2 min to save.
# Set REBUILD_PRIOR_DATA = True to force a rebuild (e.g. after the input CSVs changed).
REBUILD_PRIOR_DATA = False
prior_pickle_paths = [Path("df_user_products.pkl"), Path("df_product_frequency.pkl")]
if REBUILD_PRIOR_DATA or not all(path.is_file() for path in prior_pickle_paths):
    df_prior_user_products, df_product_frequency = make_prior_data()
    save_data_to_disk(df_prior_user_products, "user_products")
    save_data_to_disk(df_product_frequency, "product_frequency")
else:
    df_prior_user_products = pd.read_pickle(prior_pickle_paths[0])
    df_product_frequency = pd.read_pickle(prior_pickle_paths[1])


In [4]:
# Preview the per-user product lists built from the prior orders
df_prior_user_products.head(5)

Unnamed: 0,user_id,products
0,1,"[196, 14084, 12427, 26088, 26405, 196, 10258, ..."
1,2,"[32792, 47766, 20574, 12000, 48110, 22474, 165..."
2,3,"[9387, 17668, 15143, 16797, 39190, 47766, 2190..."
3,4,"[36606, 7350, 35469, 2707, 42329, 7160, 1200, ..."
4,5,"[15349, 21413, 48775, 28289, 8518, 11777, 3171..."


In [5]:
# Preview the purchase counts per product id (the output below is in descending count order)
df_product_frequency.head(5)

24852    472565
13176    379450
21137    264683
21903    241921
47209    213584
Name: product_id, dtype: int64

In [6]:
# Read user_products and product_frequency from the disk
df_prior_user_products = pd.read_pickle("df_user_products.pkl")
# Name the counts explicitly instead of renaming a column called "product_id":
# the name pandas gives a `value_counts()` result differs between pandas versions,
# so the old column rename could silently do nothing.
df_product_frequency = pd.read_pickle("df_product_frequency.pkl").rename("frequency").to_frame()

In [9]:
# Per-user product lists and product counts for the "train" orders (the test set of this notebook)
df_test_user_products, df_product_frequency_test = make_test_data()

In [10]:
# Preview the per-user product lists built from the "train" orders
df_test_user_products.head()

Unnamed: 0,user_id,products
0,1,"[196, 25133, 38928, 26405, 39657, 10258, 13032..."
1,2,"[22963, 7963, 16589, 32792, 41787, 22825, 1364..."
2,5,"[15349, 19057, 16185, 21413, 20843, 20114, 482..."
3,7,"[12053, 47272, 37999, 13198, 43967, 40852, 176..."
4,8,"[15937, 5539, 10960, 23165, 22247, 4853, 27104..."


In [11]:
# Load Product Item Matrix

def get_user_product_test_df(filepath, df_orders, df_order_products_train):
    """
    Builds the (user, product, quantity) table for the "train" orders, which this
    notebook uses as its test set, and writes it to `filepath` as CSV.

    Parameters
    ----------
    filepath : str or path-like
        Destination CSV file.
    df_orders : pandas.DataFrame
        Needs the columns `order_id`, `user_id` and `eval_set`.
    df_order_products_train : pandas.DataFrame
        Needs the columns `order_id` and `product_id`.

    Returns
    -------
    None
        The table is only written to disk; elapsed time is printed.
    """
    start = time.time()
    print("Creating test user product data frame ...")

    # Keep only the "train" orders, reduced to the order -> user mapping
    is_train = df_orders.eval_set == "train"
    df_order_user_train = df_orders.loc[is_train, ["order_id", "user_id"]]

    # Attach the user to every order line, then count the lines per (user, product)
    df_merged = pd.merge(df_order_user_train, df_order_products_train[["order_id", "product_id"]], on="order_id")
    df_user_product_train = (
        df_merged.groupby(["user_id", "product_id"])
        .size()
        .reset_index(name="quantity")
    )

    # index_label=False writes the row index without a header field, so the header row
    # names only user_id, product_id and quantity
    df_user_product_train.to_csv(filepath, index_label=False)

    print("Completed in {:.2f}s".format(time.time() - start))


# Load the (user, product, quantity) table of the "train" orders; the CSV is
# (re)built first when it is missing or when REBUILD_MATRIX_DF is set
REBUILD_MATRIX_DF = False
matrix_df_path = "user_products__test.csv"
if REBUILD_MATRIX_DF or not Path(matrix_df_path).is_file():
    get_user_product_test_df(matrix_df_path, df_orders, df_order_products_train)
df_user_product_test = pd.read_csv(matrix_df_path).astype(
    {"user_id": "category", "product_id": "category"}
)



In [12]:
# Load Product Item Matrix

def get_user_product_prior_df(filepath, df_orders, df_order_products_prior):
    """
    Counts how many prior order lines each user has per product and writes the
    resulting (user_id, product_id, quantity) table to `filepath` as CSV.

    `df_orders` must provide `order_id`, `user_id` and `eval_set`;
    `df_order_products_prior` must provide `order_id` and `product_id`.
    Nothing is returned; progress and elapsed time are printed.
    """
    start = time.time()
    print("Creating prior user product data frame ...")

    # Order -> user mapping, restricted to the "prior" orders
    prior_orders = df_orders[df_orders["eval_set"] == "prior"]
    order_to_user = prior_orders[["order_id", "user_id"]]

    # One row per order line with its user, then the line count per (user, product)
    order_lines = df_order_products_prior[["order_id", "product_id"]]
    lines_with_user = order_to_user.merge(order_lines, on="order_id")
    quantities = (
        lines_with_user[["user_id", "product_id"]]
        .groupby(["user_id", "product_id"])
        .size()
        .reset_index(name="quantity")
    )

    # Write to disk
    quantities.to_csv(filepath, index_label=False)

    print("Completed in {:.2f}s".format(time.time() - start))


# Load the (user, product, quantity) table of the "prior" orders; the CSV is
# (re)built first when it is missing or when REBUILD_MATRIX_DF is set
REBUILD_MATRIX_DF = False
matrix_df_path = "user_products__prior.csv"
if REBUILD_MATRIX_DF or not Path(matrix_df_path).is_file():
    get_user_product_prior_df(matrix_df_path, df_orders, df_order_products_prior)
df_user_product_prior = pd.read_csv(matrix_df_path).astype(
    {"user_id": "category", "product_id": "category"}
)



In [13]:
def build_product_user_matrix(matrix_path, df_user_product_prior):
    """
    Generates a sparse utility matrix of purchase quantities and writes it to disk.

    Rows and columns represent products and users respectively. Row and column
    positions are the category codes of `product_id` and `user_id`, not the raw ids.

    Parameters
    ----------
    matrix_path : str or path-like
        Destination handed to `scipy.sparse.save_npz`.
    df_user_product_prior : pandas.DataFrame
        Needs the columns `user_id`, `product_id` and `quantity`. The frame is
        not modified.

    Returns
    -------
    None
        The matrix is only written to disk; elapsed time is printed.
    """
    start = time.time()
    print("Creating product user matrix ...")

    # Derive the integer codes on local Series so the caller's columns keep their dtype
    # (the previous version cast the caller's columns to category in place)
    product_codes = df_user_product_prior["product_id"].astype("category").cat.codes
    user_codes = df_user_product_prior["user_id"].astype("category").cat.codes

    product_user_matrix = sparse.coo_matrix(
        (
            df_user_product_prior["quantity"].to_numpy(),
            (product_codes.to_numpy(), user_codes.to_numpy()),
        )
    )

    sparse.save_npz(matrix_path, product_user_matrix)

    print("Completed in {:.2f}s".format(time.time() - start))

    

# Get the `product x user` matrix
# The .npz file is (re)built only when it is missing or when REBUILD_MATRIX is set;
# otherwise the cached file is loaded as-is.
REBUILD_MATRIX = False
matrix_path = "product_user_matrix.npz"
if REBUILD_MATRIX or not Path(matrix_path).is_file():
    build_product_user_matrix(matrix_path, df_user_product_prior)
# Convert to CSR so single cells can be indexed
product_user_matrix = sparse.load_npz(matrix_path).tocsr()

In [14]:
# User=1 bought product=196 10 times
# NOTE(review): this indexes by category code and assumes code == id - 1 for both axes,
# which presumably holds only while the ids are contiguous and start at 1 - verify
assert product_user_matrix[195, 0] == 10

In [15]:
# Sparsity of the product x user matrix, in percent
sparsity(product_user_matrix)

99.8700882953749

In [16]:
# NOTE(review): a cell displays only its last expression, so the first preview below is
# computed and discarded; the output shown comes from df_user_product_test
df_user_product_prior.head(5)
df_user_product_test.head(5)

Unnamed: 0,user_id,product_id,quantity
0,1,196,1
1,1,10258,1
2,1,13032,1
3,1,25133,1
4,1,26088,1


In [17]:
def create_data_dummy(data):
    """
    Returns a copy of `data` with an extra column `quantity_dummy` set to 1
    on every row; `data` itself is left unchanged.
    """
    return data.assign(quantity_dummy=1)
# NOTE(review): `data_dummy` is not referenced again in the visible part of the notebook
data_dummy = create_data_dummy(df_user_product_prior)

In [18]:
# (rows, columns) of the prior user / product / quantity table
df_user_product_prior.shape

(13307953, 3)

In [49]:
# NOTE(review): this import sits mid-notebook; the other imports live in the first cell
from sklearn.preprocessing import minmax_scale
# Work on a copy so the raw purchase counts in df_user_product_prior stay untouched
data_norm = df_user_product_prior.copy()
# Rescale the `quantity` column with scikit-learn's minmax_scale (default arguments)
data_norm[['quantity']] = minmax_scale(data_norm[['quantity']])                                                  

In [62]:


# The id columns are cast to plain int64 in all three tables
# (prior and test were loaded with category dtype) before they are handed on
for _frame in (df_user_product_prior, data_norm, df_user_product_test):
    for _id_column in ('product_id', 'user_id'):
        _frame[_id_column] = _frame[_id_column].astype(np.int64)

df_user_product_prior.dtypes

user_id       int64
product_id    int64
quantity      int64
dtype: object

In [63]:
# NOTE(review): this import sits mid-notebook; the other imports live in the first cell
import turicreate as tc
# Wrap the three pandas tables as Turi Create SFrames, keeping only the modelling columns
train_data = tc.SFrame(df_user_product_prior[['user_id','product_id','quantity']])
# NOTE(review): `data_norm` is rebound from a pandas DataFrame to an SFrame here, so
# re-running this cell presumably needs the scaling cell to be re-run first - confirm
data_norm = tc.SFrame(data_norm[['user_id','product_id','quantity']])
test_data = tc.SFrame(df_user_product_test[['user_id','product_id','quantity']])

In [64]:
# constant variables to define field names include:
# (column names passed to the recommenders, plus recommendation/display sizes)
user_id = 'user_id'
product_id = 'product_id'
target = 'quantity'
# One entry per row of df_prior_user_products, i.e. every user with prior orders
users_to_recommend = list(df_prior_user_products[user_id])
n_rec = 10 # number of items to recommend
n_display = 30 # to display the first few rows in an output dataset

In [65]:
def model(train_data, name, user_id, item_id, target, users_to_recommend, n_rec, n_display):
    """
    Trains one Turi Create recommender, prints a sample of its recommendations
    and returns the trained model.

    Parameters
    ----------
    train_data : turicreate.SFrame
        Training interactions.
    name : str
        'popularity' for `tc.popularity_recommender`, or 'cosine' / 'pearson' for
        `tc.item_similarity_recommender` with that similarity type.
    user_id, item_id, target : str
        Column names forwarded to the recommender's `create` call.
    users_to_recommend : list
        Users for which recommendations are generated and printed.
    n_rec : int
        Number of recommendations per user.
    n_display : int
        Number of recommendation rows to print.

    Returns
    -------
    The trained recommender object.

    Raises
    ------
    ValueError
        If `name` is not one of the supported model names (previously this
        surfaced as an UnboundLocalError further down).
    """
    if name == 'popularity':
        recommender = tc.popularity_recommender.create(train_data,
                                                       user_id=user_id,
                                                       item_id=item_id,
                                                       target=target)
    elif name in ('cosine', 'pearson'):
        # Both similarity models differ only in `similarity_type`, which equals `name`
        recommender = tc.item_similarity_recommender.create(train_data,
                                                            user_id=user_id,
                                                            item_id=item_id,
                                                            target=target,
                                                            similarity_type=name)
    else:
        raise ValueError(
            "Unknown model name {!r}; expected 'popularity', 'cosine' or 'pearson'".format(name)
        )

    recom = recommender.recommend(users=users_to_recommend, k=n_rec)
    recom.print_rows(n_display)
    return recommender

In [66]:
# Popularity recommender on the raw purchase counts
# NOTE(review): the printed scores (17.0, 14.5, 12.625, ...) look like per-item averages
# of `quantity` rather than purchase totals - confirm this is the intended notion of popularity
name = 'popularity'
popularity = model(train_data, name, user_id, product_id, target, users_to_recommend, n_rec, n_display)

+---------+------------+--------------------+------+
| user_id | product_id |       score        | rank |
+---------+------------+--------------------+------+
|    1    |    6433    |        17.0        |  1   |
|    1    |    2075    |        14.5        |  2   |
|    1    |   43553    |        13.0        |  3   |
|    1    |   27740    |       12.625       |  4   |
|    1    |   14609    | 11.666666666666666 |  5   |
|    1    |   13875    |       11.25        |  6   |
|    1    |   39992    |        11.0        |  7   |
|    1    |    5868    |        10.0        |  8   |
|    1    |   35604    |        10.0        |  9   |
|    1    |   31418    |        10.0        |  10  |
|    2    |    6433    |        17.0        |  1   |
|    2    |    2075    |        14.5        |  2   |
|    2    |   43553    |        13.0        |  3   |
|    2    |   27740    |       12.625       |  4   |
|    2    |   14609    | 11.666666666666666 |  5   |
|    2    |   13875    |       11.25        | 

In [67]:
# Popularity recommender on the min-max scaled purchase counts
name = 'popularity'
target = 'quantity'
pop_norm = model(data_norm, name, user_id, product_id, target, users_to_recommend, n_rec, n_display)

+---------+------------+---------------------+------+
| user_id | product_id |        score        | rank |
+---------+------------+---------------------+------+
|    1    |    6433    | 0.16326530612244897 |  1   |
|    1    |    2075    |  0.1377551020408163 |  2   |
|    1    |   43553    | 0.12244897959183672 |  3   |
|    1    |   27740    | 0.11862244897959183 |  4   |
|    1    |   14609    | 0.10884353741496598 |  5   |
|    1    |   13875    | 0.10459183673469385 |  6   |
|    1    |   39992    |  0.1020408163265306 |  7   |
|    1    |    5868    | 0.09183673469387754 |  8   |
|    1    |   35604    | 0.09183673469387754 |  9   |
|    1    |   31418    | 0.09183673469387754 |  10  |
|    2    |    6433    | 0.16326530612244897 |  1   |
|    2    |    2075    |  0.1377551020408163 |  2   |
|    2    |   43553    | 0.12244897959183672 |  3   |
|    2    |   27740    | 0.11862244897959183 |  4   |
|    2    |   14609    | 0.10884353741496598 |  5   |
|    2    |   13875    | 0.1

In [68]:
# Item-similarity recommender (cosine) on the raw purchase counts
name = 'cosine'
target = 'quantity'
cos = model(train_data, name, user_id, product_id, target, users_to_recommend, n_rec, n_display)

+---------+------------+---------------------+------+
| user_id | product_id |        score        | rank |
+---------+------------+---------------------+------+
|    1    |   37710    |  0.689309322171741  |  1   |
|    1    |    6184    |  0.6667063170009189 |  2   |
|    1    |   11759    |  0.5261774957180023 |  3   |
|    1    |   41400    | 0.46802163786358303 |  4   |
|    1    |   18023    |  0.4671127398808797 |  5   |
|    1    |   45051    | 0.46666069163216484 |  6   |
|    1    |    8843    |  0.4628487295574612 |  7   |
|    1    |   13575    |  0.4477006097634633 |  8   |
|    1    |   31651    | 0.44746193289756775 |  9   |
|    1    |   46562    | 0.43764450152715045 |  10  |
|    2    |   21137    | 0.30878585635447037 |  1   |
|    2    |   21903    |  0.2719903547389835 |  2   |
|    2    |   26209    | 0.23687154522129134 |  3   |
|    2    |   47626    | 0.22843356401312584 |  4   |
|    2    |    8277    | 0.20857900556396036 |  5   |
|    2    |   22935    | 0.2

In [69]:
# Item-similarity recommender (cosine) on the min-max scaled purchase counts
name = 'cosine' 
target = 'quantity' 
cos_norm = model(data_norm, name, user_id, product_id, target, users_to_recommend, n_rec, n_display)

+---------+------------+-----------------------+------+
| user_id | product_id |         score         | rank |
+---------+------------+-----------------------+------+
|    1    |   37710    | 0.0040131476190355085 |  1   |
|    1    |    6184    | 0.0036865373452504477 |  2   |
|    1    |   31651    |  0.003203484747144911 |  3   |
|    1    |   13424    | 0.0031906399461958143 |  4   |
|    1    |   13042    |  0.002799040741390652 |  5   |
|    1    |   41400    | 0.0026418169339497886 |  6   |
|    1    |   45051    | 0.0026229421297709146 |  7   |
|    1    |   13575    |  0.002553145090738932 |  8   |
|    1    |   31759    | 0.0025433897972106934 |  9   |
|    1    |   18023    | 0.0025031036800808376 |  10  |
|    2    |   21137    | 0.0009905123243144915 |  1   |
|    2    |   18926    | 0.0007580173950569302 |  2   |
|    2    |   21903    | 0.0007492432407304353 |  3   |
|    2    |   33787    | 0.0006588714964249555 |  4   |
|    2    |   47626    | 0.0006080556149576224 |

In [70]:
# Item-similarity recommender (pearson) on the raw purchase counts
# NOTE(review): the rows printed below match the popularity model's output for the same
# users - worth confirming that this is expected
name = 'pearson'
target = 'quantity'
pear = model(train_data, name, user_id, product_id, target, users_to_recommend, n_rec, n_display)

+---------+------------+--------------------+------+
| user_id | product_id |       score        | rank |
+---------+------------+--------------------+------+
|    1    |    6433    |        17.0        |  1   |
|    1    |    2075    |        14.5        |  2   |
|    1    |   43553    |        13.0        |  3   |
|    1    |   27740    | 12.625000000000004 |  4   |
|    1    |   14609    | 11.666666666666668 |  5   |
|    1    |   13875    |       11.25        |  6   |
|    1    |   39992    |        11.0        |  7   |
|    1    |    5868    |        10.0        |  8   |
|    1    |   35604    |        10.0        |  9   |
|    1    |   31418    |        10.0        |  10  |
|    2    |    6433    |        17.0        |  1   |
|    2    |    2075    |        14.5        |  2   |
|    2    |   43553    |        13.0        |  3   |
|    2    |   27740    | 12.625000000000004 |  4   |
|    2    |   14609    | 11.666666666666668 |  5   |
|    2    |   13875    |       11.25        | 

In [71]:
# Item-similarity recommender (pearson) on the min-max scaled purchase counts
name = 'pearson' 
target = 'quantity' 
pear_norm = model(data_norm, name, user_id, product_id, target, users_to_recommend, n_rec, n_display)

+---------+------------+---------------------+------+
| user_id | product_id |        score        | rank |
+---------+------------+---------------------+------+
|    1    |    6433    | 0.16326530612244897 |  1   |
|    1    |    2075    |  0.1377551020408163 |  2   |
|    1    |   43553    | 0.12244897959183672 |  3   |
|    1    |   27740    | 0.11862244897959183 |  4   |
|    1    |   14609    | 0.10884353741496597 |  5   |
|    1    |   13875    | 0.10459183673469387 |  6   |
|    1    |   39992    |  0.1020408163265306 |  7   |
|    1    |    5868    | 0.09183673469387754 |  8   |
|    1    |   35604    | 0.09183673469387754 |  9   |
|    1    |   31418    | 0.09183673469387754 |  10  |
|    2    |    6433    | 0.16326530612244897 |  1   |
|    2    |    2075    |  0.1377551020408163 |  2   |
|    2    |   43553    | 0.12244897959183672 |  3   |
|    2    |   27740    | 0.11862244897959183 |  4   |
|    2    |   14609    | 0.10884353741496597 |  5   |
|    2    |   13875    | 0.1

In [72]:
# Trained models and their display names for the comparison;
# each names list is in the same order as its models list

models_w_counts = [popularity, cos, pear]
models_w_norm = [pop_norm, cos_norm, pear_norm]

names_w_counts = ['Popularity Model on Purchase Counts', 'Cosine Similarity on Purchase Counts', 'Pearson Similarity on Purchase Counts']
names_w_norm = ['Popularity Model on Scaled Purchase Counts', 'Cosine Similarity on Scaled Purchase Counts', 'Pearson Similarity on Scaled Purchase Counts']

In [73]:
# Evaluate both model families against the held-out test interactions.
# The scaled-count results get their own name; they previously overwrote `eval_counts`,
# which discarded the results for the raw-count models.
eval_counts = tc.recommender.util.compare_models(test_data, models_w_counts, model_names=names_w_counts)
# NOTE(review): the scaled models were trained on min-max scaled quantities while
# `test_data` holds raw counts, so their RMSE is presumably not comparable to the
# raw-count models' RMSE - confirm before drawing conclusions from it
eval_norm = tc.recommender.util.compare_models(test_data, models_w_norm, model_names=names_w_norm)

PROGRESS: Evaluate model Popularity Model on Purchase Counts



Precision and recall summary statistics by cutoff
+--------+------------------------+------------------------+
| cutoff |     mean_precision     |      mean_recall       |
+--------+------------------------+------------------------+
|   1    |          0.0           |          0.0           |
|   2    |          0.0           |          0.0           |
|   3    |          0.0           |          0.0           |
|   4    |          0.0           |          0.0           |
|   5    |          0.0           |          0.0           |
|   6    |          0.0           |          0.0           |
|   7    |          0.0           |          0.0           |
|   8    |          0.0           |          0.0           |
|   9    | 8.468253786791372e-07  | 1.9053571020280559e-06 |
|   10   | 1.5242856816224407e-06 | 2.8580356530421078e-06 |
+--------+------------------------+------------------------+
[10 rows x 3 columns]


Overall RMSE: 2.0280016929439006

Per User RMSE (best)
+---------+-----


Precision and recall summary statistics by cutoff
+--------+----------------------+-----------------------+
| cutoff |    mean_precision    |      mean_recall      |
+--------+----------------------+-----------------------+
|   1    | 0.028359335106585414 | 0.0028755054769682743 |
|   2    | 0.02618341729606966  |  0.00523525575030573  |
|   3    | 0.024401273286639047 |  0.007308051590745943 |
|   4    | 0.022892865580866638 |  0.009037669087297893 |
|   5    | 0.02188264524537186  |  0.010710097864766606 |
|   6    | 0.020969090026853085 |  0.01226208555887533  |
|   7    | 0.02017609854724713  |  0.013732185024421244 |
|   8    | 0.019415588869666094 |  0.015121424326050013 |
|   9    | 0.018793595629025897 |  0.01644956400207042  |
|   10   | 0.018225883895159476 |  0.017665892717098043 |
+--------+----------------------+-----------------------+
[10 rows x 3 columns]


Overall RMSE: 0.9459703697860621

Per User RMSE (best)
+---------+-----------------------+-------+
| user_id |   


Precision and recall summary statistics by cutoff
+--------+------------------------+------------------------+
| cutoff |     mean_precision     |      mean_recall       |
+--------+------------------------+------------------------+
|   1    |          0.0           |          0.0           |
|   2    |          0.0           |          0.0           |
|   3    |          0.0           |          0.0           |
|   4    |          0.0           |          0.0           |
|   5    |          0.0           |          0.0           |
|   6    |          0.0           |          0.0           |
|   7    |          0.0           |          0.0           |
|   8    |          0.0           |          0.0           |
|   9    |          0.0           |          0.0           |
|   10   | 1.5242856816224445e-06 | 2.8580356530421344e-06 |
+--------+------------------------+------------------------+
[10 rows x 3 columns]


Overall RMSE: 2.030022365385414

Per User RMSE (best)
+---------+------


Precision and recall summary statistics by cutoff
+--------+------------------------+------------------------+
| cutoff |     mean_precision     |      mean_recall       |
+--------+------------------------+------------------------+
|   1    |          0.0           |          0.0           |
|   2    |          0.0           |          0.0           |
|   3    |          0.0           |          0.0           |
|   4    |          0.0           |          0.0           |
|   5    |          0.0           |          0.0           |
|   6    |          0.0           |          0.0           |
|   7    |          0.0           |          0.0           |
|   8    | 1.9053571020280544e-06 | 2.8580356530421213e-06 |
|   9    | 1.6936507573582852e-06 | 2.8580356530421213e-06 |
|   10   | 1.5242856816224453e-06 | 2.8580356530421213e-06 |
+--------+------------------------+------------------------+
[10 rows x 3 columns]


Overall RMSE: 0.982508374344506

Per User RMSE (best)
+---------+------


Precision and recall summary statistics by cutoff
+--------+----------------------+-----------------------+
| cutoff |    mean_precision    |      mean_recall      |
+--------+----------------------+-----------------------+
|   1    | 0.02592809944439788  | 0.0024394330005602285 |
|   2    | 0.023367299499272156 |  0.004372622803198902 |
|   3    | 0.021527994776781394 |  0.005978250965165464 |
|   4    | 0.020172015639170867 |  0.007497353587844249 |
|   5    | 0.019131309590043468 |  0.00884110185007241  |
|   6    | 0.018239348418680267 |  0.010070647258227709 |
|   7    | 0.01749117819661815  |  0.011217175720080269 |
|   8    | 0.016895754102233797 |  0.01236452325698452  |
|   9    | 0.01634881076077933  |  0.013429879346618318 |
|   10   | 0.015842663231942656 |  0.014453724221330394 |
+--------+----------------------+-----------------------+
[10 rows x 3 columns]


Overall RMSE: 0.9997439751144391

Per User RMSE (best)
+---------+------------------+-------+
| user_id |       r


Precision and recall summary statistics by cutoff
+--------+------------------------+------------------------+
| cutoff |     mean_precision     |      mean_recall       |
+--------+------------------------+------------------------+
|   1    |          0.0           |          0.0           |
|   2    |          0.0           |          0.0           |
|   3    |          0.0           |          0.0           |
|   4    |          0.0           |          0.0           |
|   5    |          0.0           |          0.0           |
|   6    |          0.0           |          0.0           |
|   7    |          0.0           |          0.0           |
|   8    |          0.0           |          0.0           |
|   9    |          0.0           |          0.0           |
|   10   | 1.5242856816224426e-06 | 2.8580356530421073e-06 |
+--------+------------------------+------------------------+
[10 rows x 3 columns]


Overall RMSE: 0.982488939646044

Per User RMSE (best)
+---------+------

In [74]:
def create_output(model, users_to_recommend, n_rec, print_csv=True,
                  user_col='user_id', item_col='product_id',
                  output_path='option1_recommendation.csv'):
    """
    Collapses a model's recommendations into one row per user.

    Parameters
    ----------
    model : object
        Trained recommender exposing `recommend(users=..., k=...)` whose result has
        a `to_dataframe()` method.
    users_to_recommend : list
        Users to generate recommendations for.
    n_rec : int
        Number of recommendations per user.
    print_csv : bool, default True
        When true, the result is also written to `output_path` as CSV.
    user_col, item_col : str
        Names of the user and item columns in the recommendation table. They used
        to be read from module-level variables in some places and hard-coded in others.
    output_path : str
        CSV destination used when `print_csv` is true.

    Returns
    -------
    pandas.DataFrame
        Indexed by `user_col` (ascending), with a single column
        `recommendedProducts` holding the user's recommended item ids joined by '|'
        in the order the model returned them.
    """
    recommendation = model.recommend(users=users_to_recommend, k=n_rec)
    df_rec = recommendation.to_dataframe()

    # One joined string per user; groupby sorts the users, so no separate
    # drop_duplicates / sort_values / set_index steps are needed
    df_output = (
        df_rec.groupby(user_col)[item_col]
        .agg(lambda items: '|'.join(items.astype(str)))
        .to_frame('recommendedProducts')
    )

    if print_csv:
        df_output.to_csv(output_path)
        # The old message pointed to an 'output' folder that is never used
        print("Recommendations written to '{}'".format(output_path))
    return df_output

In [77]:
# Export the recommendations of the pearson model trained on scaled counts
# NOTE(review): the rows displayed below are identical for users 1-5, and the comparison
# output above shows clearly higher precision/recall for two of the six models
# (presumably the cosine ones) - confirm that pear_norm is the intended model here
df_output = create_output(pear_norm, users_to_recommend, n_rec, print_csv=True)
print(df_output.shape)
df_output.head()

An output file can be found in 'output' folder with name 'option1_recommendation.csv'
(206209, 1)


Unnamed: 0_level_0,recommendedProducts
user_id,Unnamed: 1_level_1
1,6433|2075|43553|27740|14609|13875|39992|5868|3...
2,6433|2075|43553|27740|14609|13875|39992|5868|3...
3,6433|2075|43553|27740|14609|13875|39992|5868|3...
4,6433|2075|43553|27740|14609|13875|39992|5868|3...
5,6433|2075|43553|27740|14609|13875|39992|5868|3...


In [79]:
def customer_recomendation(customer_id):
    """
    Looks up the recommendation row for `customer_id` in the module-level `df_output`.

    Returns the matching row when the id is in the index. Otherwise prints
    'Customer not found.' and returns `customer_id` unchanged.
    """
    if customer_id in df_output.index:
        return df_output.loc[customer_id]
    print('Customer not found.')
    return customer_id

In [81]:
# Example lookup for a single user
customer_recomendation(4)

recommendedProducts    6433|2075|43553|27740|14609|13875|39992|5868|3...
Name: 4, dtype: object

In [None]:
# NOTE(review): this cell has no execution count (`In [None]`), so it has not been run
customer_recomendation(21)