# Location based recommendation - Data preprocessing

To reach a fully working system we need to extract features from the data and create product embeddings

In [1]:
import pandas as pd
import numpy as np
import os
import pickle as pkl
import glob

## Configurations

In [2]:
# General configurations
BASE_DIR = os.getcwd()
DATA_DIR = os.path.join(BASE_DIR, 'data/')
MODELS_DIR = os.path.join(BASE_DIR, 'models/')
RANDOM_STATE = 2017

In [3]:
%env DATA_DIR={DATA_DIR}
%env MODELS_DIR={MODELS_DIR}
%env RANDOM_STATE={RANDOM_STATE}

env: DATA_DIR=/User/demos/location-based-recommendation/data/
env: MODELS_DIR=/User/demos/location-based-recommendation/models/
env: RANDOM_STATE=2017


## Data loading

In [4]:
aisles_raw = pd.read_csv(DATA_DIR + 'aisles.csv', dtype={'aisle_id': np.uint8,
                                                         'aisle': 'category'})

departments_raw = pd.read_csv(DATA_DIR + 'departments.csv', dtype={'department_id': np.uint8,
                                                                   'department': 'category'})

order_prior_raw = pd.read_csv(DATA_DIR + 'order_products__prior.csv', dtype={'order_id': np.uint32,
                                                                             'product_id': np.uint16,
                                                                             'add_to_cart_order': np.uint8,
                                                                             'reordered': bool})

order_train_raw = pd.read_csv(DATA_DIR + 'order_products__train.csv', dtype={'order_id': np.uint32,
                                                                             'product_id': np.uint16,
                                                                             'add_to_cart_order': np.uint8,
                                                                             'reordered': bool})

orders_raw = pd.read_csv(DATA_DIR + 'orders.csv', dtype={'order_id': np.uint32,
                                                         'user_id': np.uint32,
                                                         'eval_set': 'category',
                                                         'order_number': np.uint8,
                                                         'order_dow': np.uint8,
                                                         'order_hour_of_day': np.uint8})

products_raw = pd.read_csv(DATA_DIR + 'products.csv', dtype={'product_id': np.uint16,
                                                             'aisle_id': np.uint8,
                                                             'department_id': np.uint8})

## Prepare data
### Create previous products

In [5]:
orders = orders_raw.loc[orders_raw.eval_set == 'prior', :]
orders_user = orders[['order_id', 'user_id']]
labels = pd.merge(order_prior_raw, orders_user, on='order_id')
labels = labels.loc[:, ['user_id', 'product_id']].drop_duplicates()

In [6]:
labels.to_pickle(DATA_DIR + 'previous_products.pkl')

### Generate chunks (folds) for later CV

In [7]:
FOLDS = 1

In [8]:
%env FOLDS={FOLDS}

env: FOLDS=5


In [9]:
orders = orders_raw.loc[(orders_raw.eval_set == 'train') | (orders_raw.eval_set == 'test'), :]
labels_2 = pd.merge(labels, orders[['order_id', 'user_id', 'eval_set']], on='user_id').drop(['user_id'], axis=1)

order_train = order_train_raw.drop(['add_to_cart_order'], axis=1)
orders = np.unique(labels_2.order_id)

In [10]:
orders.shape

(206209,)

In [11]:
size = orders.shape[0] // FOLDS

for fold in range(FOLDS):
    current = orders[fold * size:(fold + 1) * size]
    current = labels_2.loc[np.in1d(labels_2.order_id, current), :]
    current = pd.merge(order_train, current, on=['order_id', 'product_id'], how='right')
    current.reordered.fillna(False, inplace=True)
    print(current.columns)
    print(current.shape)

    current.to_pickle(DATA_DIR + 'chunk_{}.pkl'.format(fold))

Index(['order_id', 'product_id', 'reordered', 'eval_set'], dtype='object')
(2668945, 4)
Index(['order_id', 'product_id', 'reordered', 'eval_set'], dtype='object')
(2654629, 4)
Index(['order_id', 'product_id', 'reordered', 'eval_set'], dtype='object')
(2653526, 4)
Index(['order_id', 'product_id', 'reordered', 'eval_set'], dtype='object')
(2653889, 4)
Index(['order_id', 'product_id', 'reordered', 'eval_set'], dtype='object')
(2676766, 4)


### Generate Orders CumSum

In [12]:
order_prior = order_prior_raw
orders = orders_raw
products = products_raw
user_product = labels
labels = labels_2

In [13]:
order_comsum = orders[['user_id', 'order_number', 'days_since_prior_order']].groupby(['user_id', 'order_number']) \
    ['days_since_prior_order'].sum().groupby(level=[0]).cumsum().reset_index().rename(
    columns={'days_since_prior_order': 'days_since_prior_order_comsum'})

order_comsum.to_pickle(DATA_DIR + 'orders_comsum.pkl')

order_comsum = pd.merge(order_comsum, orders, on=['user_id', 'order_number'])[
    ['user_id', 'order_number', 'days_since_prior_order_comsum', 'order_id']]

order_product = pd.merge(order_prior, orders, on='order_id')[['order_id', 'product_id', 'eval_set']]
order_product_train_test = labels[['order_id', 'product_id', 'eval_set']]

order_product = pd.concat([order_product, order_product_train_test])

order_product = pd.merge(order_product, order_comsum, on='order_id')

print(f'order_products_columns:\n{order_product.columns}')
print(f'user_product_columns:\n{user_product.columns}')

order_product = pd.merge(order_product, user_product, on=['user_id',
                                                          'product_id'])  # user_id, order_id, product_id, eval_set, order_id, Days_since_prior (comsum)

print('Summing order distances')
temp = order_product.groupby(['user_id', 'product_id', 'order_number'])[
    'days_since_prior_order_comsum'].sum().groupby(level=[0, 1]).apply(lambda x: np.diff(np.nan_to_num(x)))
temp = temp.to_frame('periods').reset_index()

# temp.to_pickle(DATA_DIR + 'product_period.pkl')

print('Adding aggregations')
aggregated = temp.copy()
aggregated['last'] = aggregated.periods.apply(lambda x: x[-1])
aggregated['prev1'] = aggregated.periods.apply(lambda x: x[-2] if len(x) > 1 else np.nan)
aggregated['prev2'] = aggregated.periods.apply(lambda x: x[-3] if len(x) > 2 else np.nan)
aggregated['median'] = aggregated.periods.apply(lambda x: np.median(x[:-1]))
aggregated['mean'] = aggregated.periods.apply(lambda x: np.mean(x[:-1]))
aggregated.drop('periods', axis=1, inplace=True)

aggregated.to_pickle(DATA_DIR + 'product_periods_stat.pkl')

order_products_columns:
Index(['order_id', 'product_id', 'eval_set', 'user_id', 'order_number',
       'days_since_prior_order_comsum'],
      dtype='object')
user_product_columns:
Index(['user_id', 'product_id'], dtype='object')
Summing order distances
Adding aggregations


  out=out, **kwargs)


### Generate user-product rank

In [14]:
orders_products = pd.merge(orders, order_prior, on="order_id")

orders_products_products = pd.merge(orders_products, products[['product_id', 'department_id', 'aisle_id']],
                                    on='product_id')

user_dep_stat = orders_products_products.groupby(['user_id', 'department_id']).agg(
    {'product_id': lambda x: x.nunique(),
     'reordered': 'sum'
     })
print(user_dep_stat.columns)
user_dep_stat.rename(columns={'product_id': 'dep_products',
                              'reordered': 'dep_reordered'}, inplace=True)
user_dep_stat.reset_index(inplace=True)
print(user_dep_stat.columns)
user_dep_stat.to_pickle(DATA_DIR + 'user_department_products.pkl')

user_aisle_stat = orders_products_products.groupby(['user_id', 'aisle_id']).agg(
    {'product_id': lambda x: x.nunique(),
     'reordered': 'sum'
     })
print(user_aisle_stat.columns)
user_aisle_stat.rename(columns={'product_id': 'aisle_products',
                                'reordered': 'aisle_reordered'}, inplace=True)
user_aisle_stat.reset_index(inplace=True)
user_aisle_stat.to_pickle(DATA_DIR + 'user_aisle_products.pkl')

Index(['product_id', 'reordered'], dtype='object')
Index(['user_id', 'department_id', 'dep_products', 'dep_reordered'], dtype='object')
Index(['product_id', 'reordered'], dtype='object')


### Generate prod2vec

In [15]:
order_product = pd.merge(order_prior, orders, on='order_id')
prod2vec = order_prior.sort_values(['order_id']).groupby('order_id')['product_id']\
    .apply(lambda x: x.tolist()).to_frame('products').reset_index()
prod2vec = pd.merge(prod2vec, orders, on='order_id')
prod2vec.to_pickle(os.path.join(DATA_DIR, 'prod2vec.pkl'))