Run Import_Data_Process.ipynb before running this notebook to get pre-processed data

## Import Packages/Dataset & Data Pre-Processing

In [1]:
# Importing Necessary Packages

from implicit.nearest_neighbours import tfidf_weight
from scipy.sparse import coo_matrix, csr_matrix
from sklearn.metrics.pairwise import cosine_similarity
from datetime import datetime
from pathlib import Path
from numpy import bincount, log, sqrt

import scipy.sparse as sparse
import implicit
import pandas as pd
import numpy as np
import pickle
import time
import heapq

In [2]:
# Read data files
df_order_products_prior = pd.read_csv("order_products__prior.csv")
df_order_products_train = pd.read_csv("order_products__train.csv")
df_orders = pd.read_csv("orders.csv") 
df_products = pd.read_csv("products.csv")

# Merge prior orders and products
df_merged_prior = pd.merge(df_order_products_prior, df_products, on="product_id", how="left")

In [3]:
# Read user_products and product_frequency from disk
df_prior_user_products = pd.read_pickle("df_user_products_prior.pkl")
df_product_frequency = pd.read_pickle("df_product_frequency.pkl")
df_product_frequency = pd.DataFrame(df_product_frequency).rename(columns={"product_id": "frequency"})

In [5]:
# Read from saved test data
test_data_path = "user_products__test.csv"
df_user_products_test = pd.read_csv(test_data_path)
df_user_products_test.head()

Unnamed: 0,user_id,product_id
0,1,"[196, 25133, 38928, 26405, 39657, 10258, 13032..."
1,2,"[22963, 7963, 16589, 32792, 41787, 22825, 1364..."
2,5,"[15349, 19057, 16185, 21413, 20843, 20114, 482..."
3,7,"[12053, 47272, 37999, 13198, 43967, 40852, 176..."
4,8,"[15937, 5539, 10960, 23165, 22247, 4853, 27104..."


## Create Necessary Matrices

In [6]:
# Make user_product dataframe

In [18]:
def user_product_prior(filepath, df_orders, df_order_products_prior):
    """
    Generates a dataframe of users and their purchase of products
    """
    order_user = df_orders.loc[df_orders.eval_set == "prior"]
    order_user = order_user[["order_id", "user_id"]]
    
    
    # merge order:duplic user_id with duplic order_id:product_id on order_id
    # take out order id so only duplic user_id: product_id remains
    # Add quantity column
    df_merged = pd.merge(order_user, df_order_products_prior[["order_id", "product_id"]], on="order_id")
    user_product = df_merged[["user_id", "product_id"]]
    user_product = user_product.groupby(["user_id", "product_id"]).size().reset_index()
    user_product = user_product.rename(columns={0:"quantity"})
    
    # Write to disk
    user_product.to_csv(filepath, index_label=False)


In [19]:
# Build dataframe of users, products and quantity bought (prior)
matrix_df_path = "user_products__prior.csv"
if not Path(matrix_df_path).is_file():
    user_product_prior(matrix_df_path, df_orders, df_order_products_prior)
df_user_product_prior = pd.read_csv(matrix_df_path)
df_user_product_prior["user_id"] = df_user_product_prior["user_id"].astype("category")
df_user_product_prior["product_id"] = df_user_product_prior["product_id"].astype("category")

In [20]:
df_user_product_prior.head()

Unnamed: 0,user_id,product_id,quantity
0,1,196,10
1,1,10258,9
2,1,10326,1
3,1,12427,10
4,1,13032,3


In [24]:
# Make weighted utility matrix

In [25]:
def product_user_matrix(matrix_path, df_user_product_prior):
    """
    Generates utility matrix based on purchase history. Rows: products Columns: users
    """
    # Make the dataframe a sparse matrix
    product_user_matrix = sparse.coo_matrix((df_user_product_prior["quantity"],
                                            (df_user_product_prior["product_id"].cat.codes.copy(),
                                             df_user_product_prior["user_id"].cat.codes.copy())))
    
    sparse.save_npz(matrix_path, product_user_matrix)

In [26]:
# Get the `product x user` matrix
matrix_path = "product_user_matrix.npz"
if not Path(matrix_path).is_file():
    product_user_matrix(matrix_path, df_user_product_prior)
product_user_matrix = sparse.load_npz(matrix_path).tocsr()

In [27]:
# User=1 bought product=196 10 times
product_user_matrix[195, 0]

10

In [28]:
# Make user x product matrix
user_product_matrix = product_user_matrix.T

In [29]:
# Should be 10
user_product_matrix[0,195]

10

In [30]:
def tfidf(tf):
    """
    Generates TF-IDF weight matrix with given user x product matrix
    Document = user
    Term = product
    tf = count of term in document, squared (common practice)
    idf = log(# of documents/# of documents with t + 1). Plus one on denominator to avoid dividing by 0.
    """
    tf_idf = coo_matrix(tf)

    # Number of users
    N = float(tf_idf.shape[0])
    
    # bincount = nonzero elements
    # bincount(tf_idf.col) = # of users who bought the product
    no_users_prod = bincount(tf_idf.col)
    idf = log(N / (1 + no_users_prod))

    # Squaring tf is a common practice
    tf_idf.data = sqrt(tf_idf.data) * idf[tf_idf.col]
    
    return tf_idf

In [31]:
tf_idf = tfidf(user_product_matrix)
# convert to Compressed Sparse Row format
tf_idf = tf_idf.tocsr()

## Make Recommendation

In [90]:
# Example Recommendation
def recommend(target_user, cos_sim, K, N) :
    """
    Arguments: target_user (row of tf_idf matrix), cosine similarity vector, number of similar users to consider (K),
    number of products to recommend (N)
    Generates N recommendations for target user
    """
    
    # Select K similar users with the highest cosine similarity score (most similar)
    K_similar = heapq.nlargest(K+1, range(len(cos_sim)), cos_sim.take)
    
    # Find products bought by the target user
    products_target_user = df_prior_user_products.loc[df_prior_user_products['user_id'] == target_user_id].product_id
    products_target_user = set(products_target_user.tolist()[0])

    recommendations = []
    # Make recommended items list of length N
    # Ensures recommendations from users who are most similar are included
    for similar_user in K_similar:
        products_similar_user = df_prior_user_products.loc[df_prior_user_products['user_id'] == similar_user + 1]
        product_id_sim_user = products_similar_user['product_id']
        product_id_sim_user = product_id_sim_user.tolist()[0]
        # Look at all products bought by the similar user the target user did not buy
        sim_recs = set(product_id_sim_user) - products_target_user
        # Skip if looking at target user or if there are no recommendations from similar user
        if similar_user == target_user_id or not sim_recs: 
            continue
        # Add recommended items to total recommendation list
        recommendations.extend(sim_recs)
        if len(recommendations) > N:
            break
        
    # Pick the top N popularity (overall sales) to recommend
    heap = []
    for product in recommendations:
        heapq.heappush(heap, (df_product_frequency.loc[product]['frequency'], product))
        if len(heap) > N:
            heapq.heappop(heap)
            
    return products_target_user, [item[1] for item in heap]

In [91]:
# Test one user

target_user_index = 10
target_user = tf_idf[target_user_index - 1]

# Cosine similarity vector of target user
cos_sim = cosine_similarity(tf_idf, target_user, False).toarray()
# Pick 20 neighbors and 10 products to recommend
# Returns products the target user already bought and their recommendations
products_target, recommendations = recommend(target_user, cos_sim, 20, 10)

In [92]:
print('10 Recommendations for User {}:'.format(target_user_index))
print(recommendations)
print()
print('User {} already bought:'.format(target_user_index))
print(products_target)

10 Recommendations for User 10:
[8518, 4605, 45066, 22935, 5876, 39275, 8277, 47209, 21903, 47766]

User 10 already bought:
{36865, 20995, 13829, 43014, 11782, 18441, 47626, 5646, 22035, 27156, 15392, 32299, 34358, 15937, 7746, 19019, 48204, 45664, 30305, 26209, 24184, 9339, 23165, 42625, 35973, 48775, 5769, 9871, 14992, 42647, 20632, 40604, 15011, 28842, 47788, 21174, 5818, 13512, 19678, 18656, 42736, 28928, 40706, 41220, 260, 31506, 24852, 47380, 32537, 30489, 13083, 8988, 22825, 37687, 4920, 16185, 28986, 26940, 13629, 11068, 44359, 23879, 21833, 5450, 25931, 34126, 34128, 36695, 43352, 42342, 44910, 28535, 36735, 17794, 46979, 35725, 13198, 38293, 13212, 16797, 17828, 47526, 20920, 15290, 47042, 39877, 45007, 7632, 16857, 27104, 31717, 47591, 23541, 1529}


## Evaluation

In [102]:
def k_popular(k, df_merged_prior):
    """
    Returns the `k` most popular products based on purchase count
    """
    pop_prods = df_merged_prior["product_id"].value_counts()[0:10]
    pop_prods_id = pop_prods.index
    return pop_prods_id

In [103]:
# Get the 10 most popular products
popular_products = k_popular(10, df_merged_prior)
popular_products

Int64Index([24852, 13176, 21137, 21903, 47209, 47766, 47626, 16797, 26209,
            27845],
           dtype='int64')

### Baseline F-1 Score

In [113]:
def new_products(row):
    """
    Given a row in the test dataset
    Returns the list of new products purchased
    """
    actual = row["product_id"][1:-1]
    actual = set([int(p.strip()) for p in actual.strip().split(",")])
    products_target_user = df_prior_user_products.loc[df_prior_user_products['user_id'] == row["user_id"]].product_id
    liked = set(products_target_user.tolist()[0])
    return actual - liked

def recall_score(actual, pred):
    if len(actual) == 0:
        return 0
    actual, pred = set(actual), set(pred)
    return len(actual.intersection(pred)) / len(actual)

def precision_score(actual, pred):
    if len(actual) == 0:
        return 0
    actual, pred = set(actual), set(pred)
    return len(actual.intersection(pred)) / len(pred)

def popular_recommend(row):
    """
    Given a row in the test dataset
    Returns the f1 score when popular products are recommended
    """
    actual = new_products(row)
    recall = recall_score(actual, popular_products)
    precision = precision_score(actual, popular_products)

    # Avoid division by 0
    if precision+recall == 0:
        f1 = 0
    else:
        f1 = 2*precision*recall/(precision+recall)
    return f1

In [115]:
# Baseline F1 score
def baseline_df(filepath, df_user_products_test, subset=None):
    start = time.time()
    df_eval = df_user_products_test.copy()
    
    if subset:
        df_eval = df_eval.iloc[subset[0]-1:subset[1]-1]
    df_eval["popular_score"] = df_eval.apply(popular_recommend, axis=1)
    df_eval.to_csv(filepath)
    
    print("Completed in {:.2f}s".format(time.time() - start))

In [116]:
# Get baseline numbers
REBUILD_EVAL_DF = True
subset = [1,20000]

base_path = "eval_tfidf_baseline.csv"
if REBUILD_EVAL_DF or not Path(eval_path).exists():
    baseline_df(base_path, df_user_products_test, subset=subset)
df_eval = pd.read_csv(base_path)

# Mean f1 score for baseline
baseline_mean_f1 = np.mean(df_eval["popular_score"])
print("Baseline: {:.2f}%".format(baseline_mean_f1 * 100))

Completed in 46.84s
Baseline: 1.53%


### Model F-1 Score

In [117]:
def tfidf_recommend(row):
    """
    Given a row in the test dataset
    Returns the F1 score when our model recommends products
    """
    actual = row["product_id"][1:-1]
    actual = [int(p.strip()) for p in actual.strip().split(",")]
    target_user = tf_idf[row["user_id"] - 1]
    similarities = cosine_similarity(tf_idf, target_user, False)
    cos_vec = similarities.toarray()
    productset_target_user, recommended = recommend(target_user, cos_vec, 20, 10)

    cur_recall_score = recall_score(actual, recommended)
    precision = precision_score(actual,recommended)
    
    if precision+cur_recall_score == 0:
        f1 = 0
    else:
        f1 = 2*precision*cur_recall_score/(precision+cur_recall_score)
    
    global count, f1_sum
    count += 1; f1_sum += f1
    if count%1000 == 0:
        print("{:.1f}th iteration, current mean of recall = {}".format(count, f1_sum / count))   
    
    return f1

def build_eval_df(filepath, df_user_products_test, subset=None):
    """
    Builds a dataframe of f1 values of the baseline and our model for all the users
    in the test data, and saves its to disk at `filepath`
    """
    start = time.time()
    print("Building dataframe with f1 values ...")
    
    df_eval = df_user_products_test.copy()
    if subset:
        df_eval = df_eval.iloc[subset[0]-1:subset[1]-1]
    df_eval["tfidf_score"] = df_eval.apply(tfidf_recommend, axis=1)
    df_eval.to_csv(filepath)
    
    print("Completed in {:.2f}s".format(time.time() - start))    


In [118]:
REBUILD_EVAL_DF = True
subset = [1,20000]

# Counter
count = 0
f1_sum = 0

# Estimated 6-7 hours to complete
eval_path = "eval_tfidf_{}.csv".format(subset[1] if subset is not None else "full")
if REBUILD_EVAL_DF or not Path(eval_path).exists():
    build_eval_df(eval_path, df_user_products_test, subset=subset)
df_eval = pd.read_csv(eval_path)

Building dataframe with f1 values ...
Completed in 57.84s


In [119]:
# Mean F-1 Score
model_mean_f1 = np.mean(df_eval["tfidf_score"])
print("Model: {:.2f}%".format(model_mean_f1 * 100))

Model: 17.89%
