# Recommender System
### Matrix Factorization for implicit feedback data using Alternating Least Squares

The matrix factorization performed in this notebook is based on the paper [Collaborative Filtering for Implicit Feedback Datasets](http://yifanhu.net/PUB/cf.pdf) by Yifan Hu, Yehuda Koren, and Chris Volinsky, which explores an alternative way to represent utility matrices with implicit feedback. We use the [`implicit`](https://github.com/benfred/implicit) library, which implements the algorithm outlined in the paper.

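*Note:* Datafiles are built from scratch in this notebook only if they don't exist on disk. To force a rebuild of any datafile, set the `REBUILD_*` constant in the respective cell to `True`. In this saved version every `REBUILD_*` constant is already set to `True`, so all datafiles and the model are rebuilt on each run; set them to `False` to reuse the files on disk.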

In [1]:
# Imports
from implicit.als import AlternatingLeastSquares
from datetime import datetime
from pathlib import Path

import scipy.sparse as sparse
import implicit
import pandas as pd
import numpy as np
import pickle
import time

# Initialize datasets

In [2]:
# Raw order tables: line items of the prior and train orders, plus the order headers
df_order_products_prior = pd.read_csv("../data/order_products__prior.csv")
df_order_products_train = pd.read_csv("../data/order_products__train.csv")
df_orders = pd.read_csv("../data/orders.csv")

# Product catalogue
df_products = pd.read_csv("../data/products.csv")

# Attach the product details to every prior order line (left join keeps all order lines)
df_merged_order_products_prior = df_order_products_prior.merge(df_products, on="product_id", how="left")

# Number of products to recommend per user
rec_items = 10

In [3]:
def make_test_data(filepath, df_orders, df_order_products_train):
    """
    Build the test dataset and persist it as CSV at `filepath`.

    Every order flagged `eval_set == "train"` in `df_orders` is paired with the list of
    product ids it contains (taken from `df_order_products_train`). The resulting
    `user_id, products` table is written to disk; nothing is returned.
    """
    started_at = time.time()
    print("Creating test data ...")

    # (order, user) pairs of the orders that belong to the train split
    is_train_order = df_orders.eval_set == "train"
    order_users = df_orders.loc[is_train_order, ["order_id", "user_id"]].reset_index(drop=True)

    # Sanity check #1: both inputs must describe the same number of distinct order ids
    n_orders_in_headers = order_users["order_id"].nunique(dropna=False)
    n_orders_in_lines = df_order_products_train["order_id"].nunique(dropna=False)
    assert n_orders_in_headers == n_orders_in_lines

    # Collapse the order lines into one row per order holding the list of its product ids
    order_products = (
        df_order_products_train[["order_id", "product_id"]]
        .groupby("order_id")["product_id"]
        .apply(list)
        .reset_index()
        .rename(columns={"product_id": "products"})
    )

    # Sanity check #2: both frames must hold the same number of cells before merging
    assert order_products.size == order_users.size

    # Join on the order id and keep only the user and the purchased products
    user_products = pd.merge(order_users, order_products, on="order_id")
    user_products[["user_id", "products"]].to_csv(filepath, index_label=False, index=False)

    print("Completed in {:.2f}s".format(time.time() - started_at))


# (Re)build the test CSV when forced via REBUILD_TEST_DATA or when it is not on disk yet
REBUILD_TEST_DATA = True
test_data_path = "../data/user_products__test.csv"
test_data_missing = not Path(test_data_path).is_file()
if REBUILD_TEST_DATA or test_data_missing:
    make_test_data(test_data_path, df_orders, df_order_products_train)

# `products` comes back from the CSV as the string form of a list, e.g. "[1, 2]"
df_user_products_test = pd.read_csv(test_data_path)
df_user_products_test.head(20)

Creating test data ...
Completed in 2.54s


Unnamed: 0,user_id,products
0,1,[1]
1,3,[9]
2,5,[20]
3,6,[26]
4,7,[35]
5,8,[46]
6,8,[75]
7,8,[50]
8,8,[41]
9,8,[56]


# Utility Matrix

In [4]:
def get_user_product_prior_df(filepath, df_orders, df_order_products_prior):
    """
    Count how often each user bought each product in their prior orders.

    The resulting `user_id, product_id, quantity` table is written as CSV to
    `filepath`; nothing is returned.
    """
    started_at = time.time()
    print("Creating prior user-product data frame ...")

    # (order, user) pairs of the orders flagged as "prior"
    is_prior_order = df_orders.eval_set == "prior"
    prior_order_users = df_orders.loc[is_prior_order, ["order_id", "user_id"]]

    # Attach the purchased products to those orders via the order id
    prior_order_lines = df_order_products_prior[["order_id", "product_id"]]
    purchases = prior_order_users.merge(prior_order_lines, on="order_id")

    # One row per (user, product); `quantity` is the number of order lines for that pair
    purchase_counts = (
        purchases[["user_id", "product_id"]]
        .groupby(["user_id", "product_id"])
        .size()
        .reset_index(name="quantity")
    )

    # Write to disk
    purchase_counts.to_csv(filepath, index_label=False, index=False)

    print("Completed in {:.2f}s".format(time.time() - started_at))


# Per-user purchase counts of the prior orders; the utility matrix is built from these.
REBUILD_MATRIX_DF = True
matrix_df_path = "../data/user_products__prior.csv"
matrix_df_missing = not Path(matrix_df_path).is_file()
if REBUILD_MATRIX_DF or matrix_df_missing:
    get_user_product_prior_df(matrix_df_path, df_orders, df_order_products_prior)

# Categorical ids give every user and product a 0-based code, which is later used
# as its position in the utility matrix
df_user_product_prior = pd.read_csv(matrix_df_path).astype(
    {"user_id": "category", "product_id": "category"}
)

Creating prior user-product data frame ...
Completed in 1.66s


In [5]:
def build_product_user_matrix(matrix_path, df_user_product_prior):
    """
    Build the sparse utility matrix of purchase quantities and save it to `matrix_path`.

    Rows are products and columns are users; both are addressed by the category codes
    of the `product_id` / `user_id` columns, so those columns must be categorical.
    Nothing is returned.
    """
    started_at = time.time()
    print("Creating product user matrix ...")

    # Category codes serve as 0-based coordinates of each purchase count
    product_codes = df_user_product_prior["product_id"].cat.codes.copy()
    user_codes = df_user_product_prior["user_id"].cat.codes.copy()
    quantities = df_user_product_prior["quantity"]

    utility_matrix = sparse.coo_matrix((quantities, (product_codes, user_codes)))
    sparse.save_npz(matrix_path, utility_matrix)

    print("Completed in {:.2f}s".format(time.time() - started_at))


# Build (or reuse) the `product x user` utility matrix and load it in CSR form
REBUILD_MATRIX = True
matrix_path = "../data/product_user_matrix.npz"
matrix_missing = not Path(matrix_path).is_file()
if REBUILD_MATRIX or matrix_missing:
    build_product_user_matrix(matrix_path, df_user_product_prior)

product_user_matrix = sparse.load_npz(matrix_path).tocsr()

Creating product user matrix ...
Completed in 0.26s


In [6]:
# How sparse is the utility matrix?
def sparsity(matrix):
    """
    Return the sparsity of `matrix` as a percentage in [0, 100].

    Computed as `(1 - matrix.size / (rows * columns)) * 100`. For the SciPy sparse
    matrices used in this notebook `matrix.size` is the number of stored entries.
    """
    n_rows, n_cols = matrix.shape[0], matrix.shape[1]
    n_cells = n_rows * n_cols
    n_stored = matrix.size
    fill_ratio = n_stored / n_cells
    return (1 - fill_ratio) * 100

# Sparsity (in percent) of the product x user utility matrix
sparsity(product_user_matrix)

99.99555128671498

# Implicit Matrix Factorization using ALS

In [7]:
def confidence_matrix(prod_user_matrix, alpha):
    """
    Convert a utility matrix into a confidence matrix.

    Every entry is scaled linearly by `alpha` and the result is cast to double
    precision. Background on confidence weighting: http://yifanhu.net/PUB/cf.pdf
    """
    scaled = prod_user_matrix * alpha
    return scaled.astype("double")
    

def build_imf(prod_user_matrix, **kwargs):
    """
    Given the utility matrix and model parameters,
    builds an ALS model and writes it to disk.

    Required keyword arguments:
        alpha: scaling factor handed to `confidence_matrix`
        path: file the fitted model is pickled to; when given as a string or `Path`,
              missing parent directories are created

    Raises `KeyError` before any training starts if `alpha` or `path` is missing.
    Nothing is returned; load the model back from `path`.
    """
    start = time.time()

    # Read both parameters up front so that a missing one fails before the (slow) fit
    alpha, model_path = kwargs["alpha"], kwargs["path"]

    # Build model
    print("Building IMF model with alpha: {} ...".format(alpha))
    model = AlternatingLeastSquares()
    # NOTE(review): presumably switches off approximate similar-item lookup in the
    # `implicit` version this notebook was written against -- confirm for newer versions
    model.approximate_similar_items = False

    model.fit(confidence_matrix(prod_user_matrix, alpha))

    # Save model to disk, creating the target directory on first use
    if isinstance(model_path, (str, Path)):
        Path(model_path).parent.mkdir(parents=True, exist_ok=True)
    with open(model_path, "wb") as f:
        pickle.dump(model, f, pickle.HIGHEST_PROTOCOL)

    print("Completed in {:.2f}s".format(time.time() - start))

    
# Model parameters.
## Alphas in the range [10, 50] with a step size of 5 were tried; alpha = 15 was found to
## give the best overall recall value.
model_params = {"alpha": 15}
model_params.update(path="../models/imf/{}.imf".format(model_params["alpha"]))

# (Re)train when forced via REBUILD_MODEL or when no pickled model is on disk yet
REBUILD_MODEL = True
model_missing = not Path(model_params["path"]).exists()
if REBUILD_MODEL or model_missing:
    build_imf(product_user_matrix, **model_params)

# NOTE: unpickling executes code stored in the file; only load model files written by this notebook
with open(model_params["path"], "rb") as f:
    imf_model = pickle.load(f)

Building IMF model with alpha: 15 ...


100%|████████████████████████████████████████████████████████████████████████████████| 15.0/15 [00:08<00:00,  1.71it/s]


Completed in 9.32s


# Example Recommendation

In [8]:
# The utility matrix is addressed by 0-based indices (the category codes), so two lookups
# are needed to translate between dataset ids and matrix positions. For example, the
# smallest `product_id` in the dataset is represented by row 0 of the utility matrix.
user_categories = df_user_product_prior["user_id"].cat.categories
product_categories = df_user_product_prior["product_id"].cat.categories

# Maps user_id: user index
u_dict = dict(zip(user_categories, range(len(user_categories))))

# Maps product_index: product id
p_dict = {product_idx: product_id for product_idx, product_id in enumerate(product_categories)}

In [9]:
# For every user listed in users.csv, collect the products they actually bought (test data)
# next to the products recommended by the IMF model, together with the products' image URLs.

# users.csv is read from the same lower-case `../data` directory as every other input;
# a differently-cased directory name only resolves on case-insensitive file systems.
users = pd.read_csv("../data/users.csv")

col = ['model','reviewerID','act_products','act_imurl','rec_products','rec_imurl']
model='imf'

# The model is queried with the `user x product` matrix: transpose once, not once per user
user_items = product_user_matrix.T.tocsr()


def _ids_and_image_urls(product_ids):
    """
    Look up each id of `product_ids` in `df_products` and return two parallel lists:
    the matching product ids and their image URLs. Ids missing from the catalogue
    contribute nothing. (Use `product_name` instead of `product_id` to show names.)
    """
    ids, image_urls = [], []
    for pid in product_ids:
        matches = df_products.loc[df_products.product_id == pid]
        ids.extend(matches.product_id.tolist())
        image_urls.extend(matches.imUrl.tolist())
    return ids, image_urls


rec_rows = []
for _, value in users.iterrows():
    # The first column of users.csv holds the user id; `.iloc` makes the positional access explicit
    user_id = value.iloc[0]

    recommendations = imf_model.recommend(u_dict[user_id], user_items, N = rec_items)

    # Actual purchases: `products` holds the string form of a list, e.g. "[12, 7]".
    # NOTE(review): when a user has several test rows only the first one is used -- confirm
    # that this is intended.
    row = df_user_products_test.loc[df_user_products_test.user_id == user_id]
    if row.empty:
        actual = []  # user has no test order
    else:
        actual = [int(p.strip()) for p in row["products"].iloc[0][1:-1].strip().split(",")]
    act_products, act_imurl = _ids_and_image_urls(actual)
    print("Actual products bought by user {}\n{}".format(user_id, act_products))

    # Recommended: take each recommended product code and map it to its product_id
    recommended_ids = [p_dict[rec[0]] for rec in recommendations]
    rec_products, rec_imurl = _ids_and_image_urls(recommended_ids)
    print("\nRecommendations for user {}\n{}".format(user_id, rec_products))

    rec_rows.append(dict(zip(col, [model, user_id, act_products, act_imurl, rec_products, rec_imurl])))

# Build the frame in one go instead of appending row by row
imf_rec_items = pd.DataFrame(rec_rows, columns=col)
imf_rec_items.head()

Actual products bought by user 34418
[9996]

Recommendations for user 34418
[3040, 2396, 1645, 1184, 8238, 2571, 10569, 393, 4716, 13289]
Actual products bought by user 17284
[9993]

Recommendations for user 17284
[1919, 5325, 1921, 5327, 17879, 9993, 1665, 13805, 16039, 2449]
Actual products bought by user 8132
[12286]

Recommendations for user 8132
[889, 12286, 1696, 5326, 170, 834, 5033, 2789, 1375, 1551]
Actual products bought by user 36218
[194]

Recommendations for user 36218
[194, 647, 12397, 10000, 5807, 5845, 2501, 1686, 788, 1849]
Actual products bought by user 9127
[1804]

Recommendations for user 9127
[1664, 4091, 1804, 2410, 5629, 117, 1580, 1907, 1663, 116]
Actual products bought by user 41538
[3831]

Recommendations for user 41538
[2529, 1645, 16106, 2853, 536, 1658, 1184, 1944, 2474, 4716]
Actual products bought by user 7356
[4619]

Recommendations for user 7356
[7575, 4619, 23978, 3691, 3897, 5326, 1397, 8932, 170, 1804]
Actual products bought by user 7588
[79]

Recomm

In [10]:
# Persist the actual-vs-recommended table (without the row index)
imf_rec_items.to_csv('../data/imf_rec_items.csv', index=False)

In [11]:
# # Recommend items for a user 1
# user_id = 1
# recommendations = imf_model.recommend(u_dict[user_id], product_user_matrix.T.tocsr(), N = rec_items)

In [12]:
# # Actual 
# row = df_user_products_test.loc[df_user_products_test.user_id == user_id]
# actual = list(row["products"])
# actual = actual[0][1:-1]
# actual = list(np.array([p.strip() for p in actual.strip().split(",")]).astype(np.int64))
# act_products = []
# for pid in actual:
#     act_products.extend((df_products.loc[df_products.product_id == pid].product_name).tolist())
# print("Actual products bought by user {}\n{}".format(user_id, act_products))

# # Recommended
# r = [p_dict[r[0]] for r in recommendations] # Takes the product_cat_code and maps to product_id
# rec_products = []
# for pid in r:
#     rec_products.extend((df_products.loc[df_products.product_id == pid].product_name).tolist())
# print("\nRecommendations for user {}\n{}".format(user_id, rec_products))

Our recommender is discovery-based: it recommends only products that the user has never purchased before. Hence, for evaluation, we remove previously purchased products from the user's current purchase, which is represented by `actual`.

## Evaluation using `Recall`

In [13]:
def get_k_popular(k, df_merged_order_products_prior):
    """
    Return the ids of the `k` most frequently purchased products, most popular first.

    Popularity is the number of rows a `product_id` has in `df_merged_order_products_prior`.
    """
    purchase_counts = df_merged_order_products_prior["product_id"].value_counts()
    top_k = purchase_counts.head(k)
    return list(top_k.index)

In [14]:
# `user x product` view of the utility matrix (transpose of the product_user matrix)
user_product_matrix = product_user_matrix.transpose().tocsr()

# Number of recommendations to make for every user
N_REC = rec_items

# Baseline recommendation: the `N_REC` most popular products
popular_products = get_k_popular(
    k=N_REC,
    df_merged_order_products_prior=df_merged_order_products_prior,
)

In [15]:
def recall_score(actual, pred):
    """
    Return the recall of `pred` with respect to `actual`.

    Both arguments are collections of items; duplicates are ignored. The result is the
    fraction of distinct actual items that also occur in `pred`, or 0 when `actual`
    is empty.
    """
    if not len(actual):
        return 0
    relevant = set(actual)
    hits = relevant & set(pred)
    return len(hits) / len(relevant)


def new_products(row):
    """
    Return the set of products in a test row that the user had not purchased before.

    `row["products"]` is the string form of a list of product ids (e.g. "[1, 2]");
    the user's purchase history is read from `user_product_matrix`.
    """
    # Products purchased currently: drop the surrounding brackets, then parse the ids
    id_text = row["products"][1:-1]
    purchased_now = {int(token.strip()) for token in id_text.strip().split(",")}

    # User's purchase history: stored column indices of the user's row, mapped to product ids
    user_idx = u_dict[row["user_id"]]
    history_codes = user_product_matrix[user_idx].indices
    purchased_before = {p_dict[code] for code in history_codes}

    # Keep only the products that are new to the user
    return purchased_now - purchased_before


def popular_recommend(row):
    """
    Baseline score for one test row: the recall obtained when the globally most
    popular products (`popular_products`) are recommended to the row's user.
    """
    return recall_score(new_products(row), popular_products)

             
def imf_recommend(row):
    """
    Model score for one test row: the recall obtained when the IMF model's top
    `N_REC` recommendations are compared with the user's newly purchased products.
    """
    unseen_purchases = new_products(row)
    user_idx = u_dict[row["user_id"]]
    top_n = imf_model.recommend(user_idx, user_product_matrix, N=N_REC)
    # The first element of every recommendation is the product code; map it to the product id
    recommended_ids = [p_dict[entry[0]] for entry in top_n]
    return recall_score(unseen_purchases, recommended_ids)

             
def build_eval_df(df_user_products_test, filepath=None, subset=None):
    """
    Builds a dataframe of recall values of the baseline and our model for the users
    in the test data, and saves it to disk at `filepath`.

    Parameters:
        df_user_products_test: test dataframe with `user_id` and `products` columns
        filepath: CSV destination. When given as a string or `Path`, missing parent
                  directories are created. When `None`, nothing is written.
        subset: optional fraction of the rows to evaluate; the rows are sampled with a
                fixed `random_state` of 7. A falsy value evaluates every row.

    Nothing is returned; read the scores back from `filepath`.
    """
    start = time.time()
    print("Building dataframe with recall values ...")

    df_eval = df_user_products_test.copy()
    if subset:
        df_eval = df_eval.sample(n=int(len(df_eval) * subset), random_state=7)
    df_eval["popular_score"] = df_eval.apply(popular_recommend, axis=1)
    df_eval["imf_score"] = df_eval.apply(imf_recommend, axis=1)

    # `to_csv(None)` would only render the CSV into a discarded string, so skip it
    if filepath is not None:
        if isinstance(filepath, (str, Path)):
            # Create the output directory on first use instead of failing after the evaluation
            Path(filepath).parent.mkdir(parents=True, exist_ok=True)
        df_eval.to_csv(filepath, index=False)

    print("Completed in {:.2f}s".format(time.time() - start))


# Recall values of the baseline and the model, (re)built when forced or not on disk yet
REBUILD_EVAL_DF = True
subset = 0.2  # Evaluate on `subset x 100`% of the test dataset
subset_label = "full" if subset is None else subset
eval_path = "../data/eval/eval_discovery_{}_{}.csv".format(subset_label, N_REC)
eval_missing = not Path(eval_path).exists()
if REBUILD_EVAL_DF or eval_missing:
    build_eval_df(df_user_products_test, filepath=eval_path, subset=subset)
df_eval = pd.read_csv(eval_path)

Building dataframe with recall values ...
Completed in 43.10s


# Results

In [16]:
# Mean recall over all evaluated test rows, reported as percentages
model_mean_recall = np.mean(df_eval["imf_score"])
baseline_mean_recall = np.mean(df_eval["popular_score"])
print("Model: {:.2f}%".format(model_mean_recall * 100))
print("Baseline: {:.2f}%".format(baseline_mean_recall * 100))

Model: 2.05%
Baseline: 0.66%


Recommendations through matrix factorization achieve roughly 3 times the mean recall of the popularity baseline (2.05% vs. 0.66%).