## Import Packages/Dataset & Data Pre-Processing

In [11]:
# Importing Necessary Packages

from implicit.nearest_neighbours import tfidf_weight
from scipy.sparse import coo_matrix, csr_matrix
from sklearn.metrics.pairwise import cosine_similarity
from datetime import datetime
from pathlib import Path
from numpy import bincount, log, sqrt

import scipy.sparse as sparse
import implicit
import pandas as pd
import numpy as np
import pickle
import time
import heapq

In [12]:
# Load the raw tables from the working directory
df_order_products_prior = pd.read_csv("order_products__prior.csv")
df_order_products_train = pd.read_csv("order_products__train.csv")
df_orders = pd.read_csv("orders.csv")
df_products = pd.read_csv("products.csv")

# Attach the product columns to every prior order line; the left join keeps
# each order line even when its product_id has no match in products.csv
df_merged_prior = df_order_products_prior.merge(df_products, how="left", on="product_id")

In [15]:
def prior_data(order_products=None, orders=None):
    """
    Generates prior_user_products and product_frequency.

    Parameters
    ----------
    order_products : pandas.DataFrame, optional
        Order lines with at least ``order_id`` and ``product_id`` columns.
        When omitted, "order_products__prior.csv" is read from the working
        directory.
    orders : pandas.DataFrame, optional
        Orders table with ``order_id``, ``user_id`` and ``eval_set`` columns.
        When omitted, the notebook-level ``df_orders`` is used.

    Returns
    -------
    prior_user_products : pandas.DataFrame
        One row per user with columns ``user_id`` and ``product_id``, where
        ``product_id`` is a single list holding the product ids of all of the
        user's "prior" orders (duplicates kept, orders concatenated in the
        row order of ``orders``).
    product_frequency : pandas.Series
        ``value_counts()`` of ``product_id`` over all rows of ``order_products``.
    """
    if order_products is None:
        order_products = pd.read_csv("order_products__prior.csv")
    if orders is None:
        orders = df_orders

    order_products = order_products[["order_id", "product_id"]]

    # Frequency of each product id
    product_frequency = order_products["product_id"].value_counts()

    # One row per order: order_id, [list of products]
    products_per_order = (
        order_products.groupby("order_id")["product_id"].apply(list).reset_index()
    )

    # Order id and user id of prior orders (a user id repeats once per order)
    order_user_df = orders.loc[orders.eval_set == "prior", ["order_id", "user_id"]]

    # Inner join: only prior orders that actually have order lines survive
    prior_user_products = pd.merge(order_user_df, products_per_order, on="order_id")
    prior_user_products = prior_user_products[["user_id", "product_id"]]

    # Flatten each user's per-order lists into one list in a single pass.
    # The builtin ``sum`` was only working because pandas swapped it for its
    # own groupby sum (an aliasing that is deprecated), and summing lists is
    # repeated concatenation, i.e. quadratic in the number of products.
    prior_user_products = (
        prior_user_products.groupby("user_id")["product_id"]
        .agg(lambda baskets: [pid for basket in baskets for pid in basket])
        .reset_index()
    )

    # Returns user id:[product id] and the frequency of each product id.
    return prior_user_products, product_frequency

In [16]:
# Build the per-user product lists and the per-product counts, then preview
# the first five users
(df_prior_user_products, df_product_frequency) = prior_data()
df_prior_user_products.head(5)

Unnamed: 0,user_id,product_id
0,1,"[196, 14084, 12427, 26088, 26405, 196, 10258, ..."
1,2,"[32792, 47766, 20574, 12000, 48110, 22474, 165..."
2,3,"[9387, 17668, 15143, 16797, 39190, 47766, 2190..."
3,4,"[36606, 7350, 35469, 2707, 42329, 7160, 1200, ..."
4,5,"[15349, 21413, 48775, 28289, 8518, 11777, 3171..."


In [17]:
def save_data_to_disk(dataframe, df_name):
    """
    Pickle ``dataframe`` to "df_<df_name>.pkl" in the working directory.

    Saving the object lets later cells reload it instead of recomputing it.
    ``dataframe`` may be any object exposing ``to_pickle``.
    """
    dataframe.to_pickle("df_{}.pkl".format(df_name))


In [18]:
# Persist both prior-data outputs so later sessions can reload them
save_data_to_disk(dataframe=df_prior_user_products, df_name="user_products_prior")
save_data_to_disk(dataframe=df_product_frequency, df_name="product_frequency")

In [19]:
# Read user_products and product_frequency back from the pickles written by
# save_data_to_disk. NOTE: loading a pickle can execute arbitrary code, so only
# read files produced by this notebook.
df_prior_user_products = pd.read_pickle("df_user_products_prior.pkl")
df_product_frequency = pd.read_pickle("df_product_frequency.pkl")
# Name the single column explicitly rather than renaming "product_id": the name
# of a value_counts() Series differs across pandas versions ("product_id" before
# 2.0, "count" from 2.0 on), so a rename keyed on the old name can be a no-op.
df_product_frequency = df_product_frequency.to_frame(name="frequency")

In [20]:
def test_data(test_data_path, df_orders, df_order_products_train):
    """
    Generates test dataset
    """

    # Read train csv: equivalent test dataset (naming convention says train but is a test dataset)
    df_order_user_test = df_orders.loc[(df_orders.eval_set == "train")]
    df_order_user_test = df_order_user_test[["order_id", "user_id"]]

    # Convert to similar format as before
    df_order_products_test = df_order_products_train[["order_id", "product_id"]]
    df_order_products_test = df_order_products_test.groupby("order_id")["product_id"].apply(list)

    # Merge on order id and get user and product ids
    df_user_products_test = pd.merge(df_order_user_test, df_order_products_test, on="order_id")
    df_user_products_test = df_user_products_test[["user_id", "product_id"]]

    # Write to disk
    df_user_products_test.to_csv(test_data_path, index_label=False)

In [23]:
# Create the test CSV only when it is not already on disk
test_data_path = "user_products__test.csv"
if not Path(test_data_path).is_file():
    test_data(
        test_data_path=test_data_path,
        df_orders=df_orders,
        df_order_products_train=df_order_products_train,
    )
