# Personalised recommendations to increase AOV of Instacart loyalists

---

## Part 4: Data preparation for RecSys

In this notebook, I will be preparing the datasets that will be used to build our recommendation systems in Part 5.

---

### Load libraries and datasets

In [2]:
from scipy.sparse import coo_matrix, csr_matrix
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
import scipy.sparse as sparse
import pandas as pd
import numpy as np
import pickle
import heapq
import random
from IPython.display import Markdown, display, display_html
# show full cell contents instead of truncating long strings; the fully-qualified
# option key does not depend on pandas' short-name pattern matching
pd.set_option('display.max_colwidth', None)

In [2]:
# show full cell contents instead of truncating long strings; the fully-qualified
# option key does not depend on pandas' short-name pattern matching
pd.set_option('display.max_colwidth', None)

In [3]:
# main order/product-level dataframe that is split into train and test sets below
df = pd.read_pickle('../datasets/full_combined_df_reduced.pkl')
# NOTE(review): products_df is not referenced again in this notebook — confirm it is still needed
products_df = pd.read_pickle('../datasets/products_df_reduced.pkl')

### Train test split: Set aside each user's last 5 orders as test set

In [4]:
# load the per-user dataframe of each user's last 20 orders
# (its `last_20_order_nums` column, read below, holds a list of order numbers per user)
user_last_20 = pd.read_pickle('../datasets/user_last_20_orders.pkl')

In [6]:
# keep only the final five order numbers from each user's list
user_last_5 = user_last_20['last_20_order_nums'].map(lambda order_nums: order_nums[-5:])

In [7]:
# preview the held-out order numbers per user
user_last_5

user_id
17        [37, 38, 39, 40, 41]
21        [30, 31, 32, 33, 34]
27        [78, 79, 80, 81, 82]
37        [20, 21, 22, 23, 24]
50        [64, 65, 66, 67, 68]
                  ...         
206200    [20, 21, 22, 23, 24]
206201    [28, 29, 30, 31, 32]
206202    [16, 18, 20, 21, 22]
206206    [53, 54, 61, 64, 65]
206208    [45, 46, 47, 48, 49]
Name: last_20_order_nums, Length: 41549, dtype: object

In [11]:
# to split the main df into train and test sets, build a list of (user_id, order_number)
# tuples identifying the held-out orders of every user

# repeat each user id once per held-out order of that user. Using the actual list length
# (rather than a fixed 5) keeps ids and order numbers aligned even when a user has fewer
# than five orders; a fixed repeat count would shift every following pair in the zip below
user_list = [
    user_id
    for user_id, order_nums in user_last_5.items()
    for _ in order_nums
]

# flattened list of all held-out order numbers, in the same user order as user_list
last_5_orders_list = [
    order_num
    for order_nums in user_last_5.values
    for order_num in order_nums
]

# pair them up into (user_id, order_number) tuples
tuple_filt_list = list(zip(user_list, last_5_orders_list))

In [12]:
# create test set: keep only the rows whose (user_id, order_number) pair is a held-out order.
# A MultiIndex membership test is vectorised, whereas apply(tuple, axis=1) builds a Python
# tuple for every row of df.
row_keys = pd.MultiIndex.from_frame(df[['user_id', 'order_number']])
df_test = df[row_keys.isin(tuple_filt_list)]

In [14]:
# persist the test split to disk for use in Part 5
df_test.to_pickle('../datasets/df_test.pkl')

### Train test split: Create train set with users' first 15 of their last 20 orders

In [15]:
# every order that is not part of the test split goes into the training split
is_test_order = df['order_id'].isin(df_test['order_id'].values)
df_train = df.loc[~is_test_order]

In [16]:
# persist the training split to disk for use in Part 5
df_train.to_pickle('../datasets/df_train.pkl')

### Create dataframes for use in building RecSys in Part 5

1. `user_products_train`, `user_products_test`
    - columns: 
        - `user_id`
        - `product_id` (list of all prior product_ids)


2. `product_frequency`
    - index: `product_id`
    - column: `frequency` – number of times the product was purchased across all prior orders
    
    
3. `user_product_matrix` – for user-based collaborative filtering

4. `product_user_matrix` – for item-based collaborative filtering

5. `product_product_matrix` - for content-based filtering

#### `product_frequency`

In [17]:
# Frequency of each product id in train df (number of times each product was purchased);
# the result is a Series indexed by product id, most frequent first
product_frequency_train = df_train['product_id'].value_counts()

In [18]:
# preview the purchase counts
product_frequency_train

24852    113792
13176     98040
21903     58240
47209     55083
21137     54471
          ...  
31578         7
36487         7
19286         6
16838         3
38957         3
Name: product_id, Length: 16859, dtype: int64

In [19]:
# put into a single-column dataframe called `frequency`.
# The column is named by assignment rather than rename(columns={"product_id": ...}):
# the name pandas gives the value_counts() result depends on the pandas version
# ("product_id" in 1.x, "count" from 2.0), and a rename on the wrong key is a silent no-op.
product_frequency_train = pd.DataFrame(product_frequency_train)
product_frequency_train.columns = ['frequency']

In [20]:
# persist the product purchase counts for later use
product_frequency_train.to_pickle('../datasets/product_frequency_train.pkl')

In [28]:
# drop split intermediates that are not referenced again in this notebook
del tuple_filt_list, user_last_20, user_last_5

#### `user_products_train`

In [29]:
# collect, per user, the distinct products bought in df_train
user_products_train = (
    df_train
    .groupby('user_id')['product_id']
    .unique()
    .to_frame(name='unique')
    .reset_index()
)

In [30]:
# preview the first few users and their product lists
user_products_train.head()

Unnamed: 0,user_id,unique
0,17,"[7350, 18534, 38618, 16797, 47141, 21553, 1946..."
1,21,"[44156, 33548, 37940, 33819, 46388, 23729, 468..."
2,27,"[1194, 35958, 20118, 33705, 5322, 17224, 38656..."
3,37,"[17948, 47500, 46969, 17794, 45200, 28278, 115..."
4,50,"[6182, 13176, 23165, 47018, 19678, 20367, 1083..."


In [31]:
# give the aggregated column a meaningful name
user_products_train = user_products_train.rename(columns={'unique': 'product_id'})

#### `user_products_test`

In [32]:
# collect, per user, the distinct products bought in df_test
user_products_test = (
    df_test
    .groupby('user_id')['product_id']
    .unique()
    .to_frame(name='unique')
    .reset_index()
)

In [33]:
# give the aggregated column a meaningful name
user_products_test = user_products_test.rename(columns={'unique': 'product_id'})

In [34]:
# persist both per-user product tables for later use
user_products_train.to_pickle('../datasets/user_products_train.pkl')
user_products_test.to_pickle('../datasets/user_products_test.pkl')

#### Pivot table – Utility Matrix (User x Product)

In [35]:
# count purchases per (user, product) pair, then spread products across the columns;
# pairs that never occur in df_train are left as NaN
pivot_user_product = (
    df_train
    .groupby(['user_id', 'product_id'])
    .size()
    .unstack()
)

In [36]:
# preview the user x product count table
pivot_user_product

product_id,1,3,4,8,9,10,12,23,25,26,...,49655,49659,49664,49667,49668,49670,49677,49678,49680,49683
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
17,,,,,,,,,,,...,,,,,,,,,,
21,,,,,,,,,,,...,,,,,,,,,,
27,,,,,,,,,,,...,,,,,,,,,,
37,,,,,,,,,,,...,,,,,,,,,,
50,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
206200,,,,,,,,,,,...,,,,,,,,,,
206201,,,,,,,,,,,...,,,,,,,,,,1.0
206202,,,,,,,,,,,...,,,,,,,,,,3.0
206206,,,,,,,,,,,...,,,,,,,,,,


In [37]:
# downcast to float32 to reduce memory; astype converts the whole frame in one call
# instead of invoking np.float32 column by column through apply
pivot_user_product = pivot_user_product.astype(np.float32)

In [38]:
# persist the raw (not yet de-meaned) utility matrix
pivot_user_product.to_pickle('../datasets/pivot_user_product.pkl')

#### De-mean the utility matrix

Collaborative filtering algorithms must measure the similarity of rows and/or columns of the utility matrix. It is often useful to normalize the utility matrix by subtracting the average value (either by row, by column, or both) before measuring the cosine distance. 

I will be de-meaning the matrix row-wise below.

In [40]:
# my kernel ran out of memory when de-meaning the entire pivot table due to its huge size.
# thus, im splitting the pivot table into chunks so that we can perform the operation on each chunk iteratively

# function for splitting dataframe
def split_dataframe(df, chunk_size=10000):
    """
    input - df: a DataFrame, chunk_size: maximum number of rows per chunk (positive int)
    output - a list of DataFrames in the original row order; every chunk has
             `chunk_size` rows except possibly the last one. An empty DataFrame
             yields a list holding one empty chunk.
    purpose - splits the DataFrame into smaller row-wise chunks
    raises - ValueError if chunk_size is not positive
    """
    if chunk_size <= 0:
        raise ValueError("chunk_size must be a positive integer")
    # ceiling division: `len(df) // chunk_size + 1` appended a useless empty chunk whenever
    # len(df) was an exact multiple of chunk_size. max(1, ...) still returns one (empty)
    # chunk for an empty frame so that pd.concat on the result keeps working.
    num_chunks = max(1, -(-len(df) // chunk_size))
    # iloc makes the positional row slicing explicit
    return [df.iloc[i * chunk_size:(i + 1) * chunk_size] for i in range(num_chunks)]

In [41]:
# split the pivot table into row-wise chunks of at most 10,000 rows (users) each
pivot_user_product_chunks = split_dataframe(pivot_user_product, chunk_size = 10000)

# function for de-meaning the utility matrix row-wise
def normalize_df(df_chunk):
    """
    input - df_chunk: a DataFrame whose rows are centred independently of each other
    output - a new DataFrame with the same index and columns, where each row has had
             its own mean subtracted from every entry
    purpose - row-wise de-meaning; NaN entries are skipped when computing the row mean
              (pandas' default) and remain NaN in the result
    """
    row_means = df_chunk.mean(axis=1)
    # subtract along the row axis directly instead of transposing the chunk twice
    return df_chunk.sub(row_means, axis=0)

In [42]:
# drop the name of the full pivot table; the work continues on the chunk list
del pivot_user_product

In [43]:
# de-mean the pivot table chunks
# each list slot is overwritten with its de-meaned version, one chunk at a time
for i in range(len(pivot_user_product_chunks)):
    pivot_user_product_chunks[i] = normalize_df(pivot_user_product_chunks[i])

In [44]:
# make sure every de-meaned chunk is stored as float32 to reduce memory usage;
# astype converts a whole chunk in one call instead of applying np.float32 per column
for i in range(len(pivot_user_product_chunks)):
    pivot_user_product_chunks[i] = pivot_user_product_chunks[i].astype(np.float32)

In [47]:
# stitch the row-wise chunks back together into a single dataframe
pivot_user_product_normed = pd.concat(
    objs=pivot_user_product_chunks,
    axis='index',
)

In [48]:
# preview the de-meaned utility matrix
pivot_user_product_normed

product_id,1,3,4,8,9,10,12,23,25,26,...,49655,49659,49664,49667,49668,49670,49677,49678,49680,49683
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
17,,,,,,,,,,,...,,,,,,,,,,
21,,,,,,,,,,,...,,,,,,,,,,
27,,,,,,,,,,,...,,,,,,,,,,
37,,,,,,,,,,,...,,,,,,,,,,
50,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
206200,,,,,,,,,,,...,,,,,,,,,,
206201,,,,,,,,,,,...,,,,,,,,,,-0.825581
206202,,,,,,,,,,,...,,,,,,,,,,0.446429
206206,,,,,,,,,,,...,,,,,,,,,,


In [49]:
# the chunk list is no longer needed once the chunks have been concatenated
del pivot_user_product_chunks

In [50]:
# persist the de-meaned utility matrix
pivot_user_product_normed.to_pickle('../datasets/pivot_user_product_normed.pkl')

In [51]:
# drop the dense de-meaned frame now that it has been pickled
# NOTE(review): later cells still reference this name, so it has to be reloaded before they run
del pivot_user_product_normed

#### User-product matrix

This will be used in user-based collaborative filtering.

coo_matrix is the best and fastest format for constructing a new sparse matrix using large arrays and row/column indices. Once the sparse matrix is constructed, one can easily cast it from coo to other sparse matrix formats.

In [54]:
# `pivot_user_product_normed` was deleted earlier after being pickled, so reload it here;
# without this the cell raises NameError on a fresh Restart & Run All
pivot_user_product_normed = pd.read_pickle('../datasets/pivot_user_product_normed.pkl')

# unobserved (NaN) entries become 0, which the sparse matrix does not store
user_product_matrix = sparse.coo_matrix(pivot_user_product_normed.fillna(0))
print(user_product_matrix)

  (0, 361)	-1.5526316
  (0, 613)	2.4473684
  (0, 1014)	-1.5526316
  (0, 1547)	-1.5526316
  (0, 1737)	-0.5526316
  (0, 1958)	-1.5526316
  (0, 2503)	9.447369
  (0, 3055)	-0.5526316
  (0, 3190)	-0.5526316
  (0, 3283)	-1.5526316
  (0, 4286)	-1.5526316
  (0, 4775)	0.44736838
  (0, 5502)	-1.5526316
  (0, 5719)	2.4473684
  (0, 5803)	-0.5526316
  (0, 6337)	10.447369
  (0, 6449)	-0.5526316
  (0, 6658)	-1.5526316
  (0, 7234)	-0.5526316
  (0, 7365)	-1.5526316
  (0, 7897)	-1.5526316
  (0, 8646)	-1.5526316
  (0, 9099)	0.44736838
  (0, 10507)	-1.5526316
  (0, 12340)	-1.5526316
  :	:
  (41548, 12610)	-0.90196073
  (41548, 12701)	2.0980392
  (41548, 12807)	-0.90196073
  (41548, 12946)	-0.90196073
  (41548, 13471)	0.09803927
  (41548, 13582)	-0.90196073
  (41548, 14072)	-0.90196073
  (41548, 14439)	0.09803927
  (41548, 14675)	-0.90196073
  (41548, 14864)	-0.90196073
  (41548, 14956)	-0.90196073
  (41548, 14971)	0.09803927
  (41548, 15098)	-0.90196073
  (41548, 15126)	-0.90196073
  (41548, 15156)	-0.901

In [55]:
# save the sparse user x product matrix for Part 5
# (scipy appends the .npz extension when the file name does not already have it)
sparse.save_npz('../datasets/user_product_normed_sparse_matrix', user_product_matrix)

In [56]:
# cast from COO (used for construction) to Compressed Sparse Row format
user_product_matrix = user_product_matrix.tocsr()

In [57]:
# show the matrix summary (shape, dtype, number of stored elements)
user_product_matrix

<41549x16859 sparse matrix of type '<class 'numpy.float32'>'
	with 3136874 stored elements in Compressed Sparse Row format>

#### Product-user matrix

This will be used in item-based collaborative filtering.

In [58]:
# transpose the sparse user x product matrix built above rather than running fillna(0)
# on the dense pivot table a second time; the result holds the same values in COO format
product_user_matrix = user_product_matrix.T.tocoo()

In [59]:
# save the sparse product x user matrix for Part 5
# (scipy appends the .npz extension when the file name does not already have it)
sparse.save_npz('../datasets/product_user_normed_sparse_matrix', product_user_matrix)

#### Product-product matrix

This will be used in content-based filtering.

In [3]:
# reload the training split pickled earlier in this notebook
df_train = pd.read_pickle('../datasets/df_train.pkl')

In [4]:
# preview the training rows and their product metadata columns
df_train.head()

Unnamed: 0,order_id,user_id,order_number,order_dow,order_hour_of_day,days_since_prior_order,product_id,add_to_cart_order,reordered,product_name,department,aisle,organic
4522179,3141030,17,21,2,12,4.0,7350,1,1,Natural Lime Flavor Sparkling Mineral Water,beverages,water seltzer sparkling water,0
10664805,3141030,17,21,2,12,4.0,18534,2,1,Grade A Extra Large Eggs,dairy eggs,eggs,0
10673916,3141030,17,21,2,12,4.0,38618,4,1,Organic Chunky Vegetable Soup,canned goods,soup broth bouillon,1
1423571,3141030,17,21,2,12,4.0,16797,5,1,Strawberries,produce,fresh fruits,0
10652616,603376,17,22,6,16,4.0,47141,1,1,Cola,beverages,soft drinks,0


In [7]:
# one row per distinct (product_name, department, aisle) combination seen in df_train
# NOTE(review): rows keep their order of first appearance in df_train and product_id is not
# carried along — confirm Part 5 can map the TF-IDF rows back to the right products
df_train_products = df_train[['product_name', 'department', 'aisle']].drop_duplicates()

In [8]:
# build one text field per product by joining department and aisle with a space
department_text = df_train_products['department'].astype(str)
aisle_text = df_train_products['aisle'].astype(str)
df_train_products['dept_aisle'] = department_text + ' ' + aisle_text

In [9]:
# renumber the rows 0..n-1, discarding the index inherited from df_train
df_train_products = df_train_products.reset_index(drop=True)

In [10]:
# eyeball five random rows (no random_state is set, so the rows differ on every run)
df_train_products.sample(5)

Unnamed: 0,product_name,department,aisle,dept_aisle
5941,Organic Multigrain Tortilla Chips Sub Lime,snacks,chips pretzels,snacks chips pretzels
263,"Super Spinach! Baby Spinach, Baby Bok Choy, Sweet Baby Kale",produce,packaged vegetables fruits,produce packaged vegetables fruits
9288,Organic Campari Cocktail Tomatoes,produce,packaged vegetables fruits,produce packaged vegetables fruits
6456,Carrot Bunch,produce,fresh vegetables,produce fresh vegetables
11165,Artisan Blends Parmesan Shredded Cheese,dairy eggs,packaged cheese,dairy eggs packaged cheese


In [11]:
# TF-IDF vectorization of `dept_aisle` for feature extraction
# (word unigrams and bigrams, English stop words removed).
# min_df=1 keeps every term, exactly as the previous integer min_df=0 did, but recent
# scikit-learn versions validate parameters and reject an integer min_df below 1.
tf = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), min_df=1, stop_words='english')
tfidf_matrix = tf.fit_transform(df_train_products['dept_aisle'])
tfidf_matrix.shape

(16859, 493)

In [12]:
# make sure the TF-IDF matrix is in CSR format before saving
# NOTE(review): fit_transform presumably already returns a CSR matrix, in which case
# this conversion changes nothing — confirm
tfidf_matrix = sparse.csr_matrix(tfidf_matrix)

In [13]:
# save the TF-IDF matrix in scipy's .npz format for later use
# (scipy appends the .npz extension when the file name does not already have it)
sparse.save_npz('../datasets/product_product_tfidf_matrix', tfidf_matrix)