# LGBM Training

In [17]:
import gc
from concurrent.futures import ThreadPoolExecutor

import pandas as pd
import numpy as np
import os
import lightgbm as lgb
import json
import sklearn.metrics
from sklearn.metrics import f1_score, roc_auc_score
from sklearn.model_selection import train_test_split
from scipy.sparse import dok_matrix, coo_matrix
from sklearn.utils.multiclass import type_of_target
import pickle as pkl

### Basic definitions

In [3]:
BASE_DIR = os.getcwd()
DATA_DIR = os.path.join(BASE_DIR, 'data/')
MODELS_DIR = os.path.join(BASE_DIR, 'models/')
RANDOM_STATE = 2017

### Load data

In [3]:
aisles = pd.read_csv(os.path.join(DATA_DIR, 'aisles.csv'), dtype={'aisle_id': np.uint8, 'aisle': 'category'})
departments = pd.read_csv(DATA_DIR + 'departments.csv', dtype={'department_id': np.uint8,
                                                               'department': 'category'})
order_prior = pd.read_csv(DATA_DIR + 'order_products__prior.csv', dtype={'order_id': np.uint32,
                                                                         'product_id': np.uint16,
                                                                         'add_to_cart_order': np.uint8,
                                                                         'reordered': bool})
order_train = pd.read_csv(DATA_DIR + 'order_products__train.csv', dtype={'order_id': np.uint32,
                                                                         'product_id': np.uint16,
                                                                         'add_to_cart_order': np.uint8,
                                                                         'reordered': bool})
orders = pd.read_csv(DATA_DIR + 'orders.csv', dtype={'order_id': np.uint32,
                                                     'user_id': np.uint32,
                                                     'eval_set': 'category',
                                                     'order_number': np.uint8,
                                                     'order_dow': np.uint8,
                                                     'order_hour_of_day': np.uint8})
products = pd.read_csv(DATA_DIR + 'products.csv', dtype={'product_id': np.uint16,
                                                         'aisle_id': np.uint8,
                                                         'department_id': np.uint8})
product_embeddings = pd.read_pickle(DATA_DIR + 'product_embeddings.pkl')

product_periods = pd.read_pickle(DATA_DIR + 'product_periods_stat.pkl').fillna(9999)

embedings = list(range(32))
product_embeddings = product_embeddings[embedings + ['product_id']]

user_dep_stat = pd.read_pickle(DATA_DIR + 'user_department_products.pkl')
user_aisle_stat = pd.read_pickle(DATA_DIR + 'user_aisle_products.pkl')

### Create weights and probabilities

In [14]:
print('Create weights')
weights = order_train.groupby('order_id')['reordered'].sum().to_frame('weights')
weights.reset_index(inplace=True)

print('creating probabilities')
prob = pd.merge(order_prior, orders, on='order_id')

prob = prob.groupby(['product_id', 'user_id']) \
    .agg({'reordered': 'sum',
          'user_id': 'size'})

prob.rename(columns={'sum': 'reordered',
                     'user_id': 'total'}, inplace=True)

prob.reordered = (prob.reordered > 0).astype(np.float32)
prob.total = (prob.total > 0).astype(np.float32)
prob['reorder_prob'] = prob.reordered / prob.total
prob = prob.groupby('product_id').agg({'reorder_prob': 'mean'}).rename(columns={'mean': 'reorder_prob'}) \
    .reset_index()
print(prob.columns)
prob.head(5)

Create weights
creating probabilities
Index(['product_id', 'reorder_prob'], dtype='object')


Unnamed: 0,product_id,reorder_prob
0,1,0.385475
1,2,0.102564
2,3,0.486486
3,4,0.351648
4,5,0.666667


### Create product stats

In [None]:
print('creating prod_stat')
prod_stat = order_prior.groupby('product_id').agg({'reordered': ['sum', 'size'],
                                                   'add_to_cart_order': 'mean'})
prod_stat.columns = prod_stat.columns.levels[1]
prod_stat.rename(columns={'sum': 'prod_reorders',
                          'size': 'prod_orders',
                          'mean': 'prod_add_to_card_mean'}, inplace=True)
prod_stat.reset_index(inplace=True)

prod_stat['reorder_ration'] = prod_stat['prod_reorders'] / prod_stat['prod_reorders']

prod_stat = pd.merge(prod_stat, prob, on='product_id')
print(prod_stat.columns)
prod_stat.head(5)

### Create user stats

In [None]:
print('creating user_stat')
user_stat = orders.loc[orders.eval_set == 'prior', :].groupby('user_id').agg({'order_number': 'max',
                                                                              'days_since_prior_order': ['sum',
                                                                                                         'mean',
                                                                                                         'median']})
user_stat.columns = user_stat.columns.droplevel(0)
user_stat.rename(columns={'max': 'user_orders',
                          'sum': 'user_order_starts_at',
                          'mean': 'user_mean_days_since_prior_order',
                          'median': 'user_median_days_since_prior'}, inplace=True)
user_stat.reset_index(inplace=True)

orders_products = pd.merge(orders, order_prior, on='order_id')

user_order_stat = orders_products.groupby('user_id').agg({'user_id': 'size',
                                                          'reordered': 'sum',
                                                          'product_id': lambda x: x.nunique()})
user_order_stat.rename(columns={'user_id': 'user_total_products',
                                'product_id': 'user_distinct_products',
                                'reordered': 'user_reorder_ratio'}, inplace=True)
user_order_stat.reset_index(inplace=True)
user_order_stat.user_reorder_ratio = user_order_stat.user_reorder_ratio / user_order_stat.user_total_products

user_stat = pd.merge(user_stat, user_order_stat, on='user_id')
user_stat['user_average_basket'] = user_stat.user_total_products / user_stat.user_orders
print(user_stat.columns)
print(user_stat.head(5))
print('user order stat')
print(user_order_stat.columns)
user_order.head(5)

### Create product-user stats

In [7]:
print('creating product user')
prod_usr = orders_products.groupby(['product_id']).agg({'user_id': lambda x: x.nunique()})
prod_usr.rename(columns={'user_id': 'prod_users_unq'}, inplace=True)
prod_usr.reset_index(inplace=True)
print(prod_usr.columns)

print('creating product user reordered')
prod_usr_reordered = orders_products.loc[orders_products.reordered, :].groupby(['product_id']).agg(
    {'user_id': lambda x: x.nunique()})
prod_usr_reordered.rename(columns={'user_id': 'prod_users_unq_reordered'}, inplace=True)
prod_usr_reordered.reset_index(inplace=True)
print(prod_usr_reordered.columns)

order_stat = orders_products.groupby('order_id').agg({'order_id': 'size'}) \
    .rename(columns={'order_id': 'order_size'}).reset_index()

creating product user
Index(['product_id', 'prod_users_unq'], dtype='object')
creating product user reordered
Index(['product_id', 'prod_users_unq_reordered'], dtype='object')


### Create order-product stats

In [8]:
print('creating order products')
orders_products = pd.merge(orders_products, order_stat, on='order_id')
orders_products['add_to_cart_order_inverted'] = orders_products.order_size - orders_products.add_to_cart_order
orders_products['add_to_cart_order_relative'] = orders_products.add_to_cart_order / orders_products.order_size
print(orders_products.columns)

creating order products
Index(['order_id', 'user_id', 'eval_set', 'order_number', 'order_dow',
       'order_hour_of_day', 'days_since_prior_order', 'product_id',
       'add_to_cart_order', 'reordered', 'order_size',
       'add_to_cart_order_inverted', 'add_to_cart_order_relative'],
      dtype='object')


### Create day-of-week stats

In [9]:
print('creating data_dow')
data_dow = orders_products.groupby(['user_id', 'product_id', 'order_dow']).agg({'reordered': ['sum', 'size']})
data_dow.columns = data_dow.columns.droplevel(0)
data_dow.columns = ['reordered_dow', 'reordered_dow_size']
data_dow['reordered_dow_ration'] = data_dow.reordered_dow / data_dow.reordered_dow_size
data_dow.reset_index(inplace=True)
print(data_dow.columns)

creating data_dow
Index(['user_id', 'product_id', 'order_dow', 'reordered_dow',
       'reordered_dow_size', 'reordered_dow_ration'],
      dtype='object')


### Create final dataset

In [10]:
print('creating data')
data = orders_products.groupby(['user_id', 'product_id']).agg({'user_id': 'size',
                                                               'order_number': ['min', 'max'],
                                                               'add_to_cart_order': ['mean', 'median'],
                                                               'days_since_prior_order': ['mean', 'median'],
                                                               'order_dow': ['mean', 'median'],
                                                               'order_hour_of_day': ['mean', 'median'],
                                                               'add_to_cart_order_inverted': ['mean', 'median'],
                                                               'add_to_cart_order_relative': ['mean', 'median'],
                                                               'reordered': ['sum']})
# data.columns = data.columns.droplevel(0)
data.columns = ['up_orders', 'up_first_order', 'up_last_order',
                'up_mean_cart_position', 'up_median_cart_position',
                'days_since_prior_order_mean', 'days_since_prior_order_median',
                'order_dow_mean', 'order_dow_median',
                'order_hour_of_day_mean', 'order_hour_of_day_median',
                'add_to_cart_order_inverted_mean', 'add_to_cart_order_inverted_median',
                'add_to_cart_order_relative_mean', 'add_to_cart_order_relative_median',
                'reordered_sum']
data['user_product_reordered_ratio'] = (data.reordered_sum + 1.0) / data.up_orders
data.reset_index(inplace=True)

data = pd.merge(data, prod_stat, on='product_id')
data = pd.merge(data, user_stat, on='user_id')

data['up_order_rate'] = data.up_orders / data.user_orders
data['up_orders_since_last_order'] = data.user_orders - data.up_last_order
data['up_order_rate_since_first_order'] = data.user_orders / (data.user_orders - data.up_first_order + 1)
print(data.columns)

creating data
Index(['user_id', 'product_id', 'up_orders', 'up_first_order', 'up_last_order',
       'up_mean_cart_position', 'up_median_cart_position',
       'days_since_prior_order_mean', 'days_since_prior_order_median',
       'order_dow_mean', 'order_dow_median', 'order_hour_of_day_mean',
       'order_hour_of_day_median', 'add_to_cart_order_inverted_mean',
       'add_to_cart_order_inverted_median', 'add_to_cart_order_relative_mean',
       'add_to_cart_order_relative_median', 'reordered_sum',
       'user_product_reordered_ratio', 'prod_add_to_card_mean', 'prod_orders',
       'prod_reorders', 'reorder_ration', 'reorder_prob', 'user_orders',
       'user_order_starts_at', 'user_mean_days_since_prior_order',
       'user_median_days_since_prior', 'user_total_products',
       'user_reorder_ratio', 'user_distinct_products', 'user_average_basket',
       'up_order_rate', 'up_orders_since_last_order',
       'up_order_rate_since_first_order'],
      dtype='object')


### Create train and test datasets

In [11]:
print('creating order_train')
order_train = pd.merge(order_train, products, on='product_id')
order_train = pd.merge(order_train, orders, on='order_id')
order_train = pd.merge(order_train, user_dep_stat, on=['user_id', 'department_id'])
order_train = pd.merge(order_train, user_aisle_stat, on=['user_id', 'aisle_id'])

order_train = pd.merge(order_train, prod_usr, on='product_id')
order_train = pd.merge(order_train, prod_usr_reordered, on='product_id', how='left')
order_train.prod_users_unq_reordered.fillna(0, inplace=True)

order_train = pd.merge(order_train, data, on=['product_id', 'user_id'])
order_train = pd.merge(order_train, data_dow, on=['product_id', 'user_id', 'order_dow'], how='left')

order_train['aisle_reordered_ratio'] = order_train.aisle_reordered / order_train.user_orders
order_train['dep_reordered_ratio'] = order_train.dep_reordered / order_train.user_orders

order_train = pd.merge(order_train, product_periods, on=['user_id', 'product_id'])
order_train = pd.merge(order_train, product_embeddings, on=['product_id'])
print(data.columns)
print('data is joined')

unique_orders = np.unique(order_train.order_id)
orders_train, orders_test = train_test_split(unique_orders, test_size=0.25, random_state=RANDOM_STATE)

order_test = order_train.loc[np.in1d(order_train.order_id, orders_test)]
order_train = order_train.loc[np.in1d(order_train.order_id, orders_train)]

creating order_train
Index(['user_id', 'product_id', 'up_orders', 'up_first_order', 'up_last_order',
       'up_mean_cart_position', 'up_median_cart_position',
       'days_since_prior_order_mean', 'days_since_prior_order_median',
       'order_dow_mean', 'order_dow_median', 'order_hour_of_day_mean',
       'order_hour_of_day_median', 'add_to_cart_order_inverted_mean',
       'add_to_cart_order_inverted_median', 'add_to_cart_order_relative_mean',
       'add_to_cart_order_relative_median', 'reordered_sum',
       'user_product_reordered_ratio', 'prod_add_to_card_mean', 'prod_orders',
       'prod_reorders', 'reorder_ration', 'reorder_prob', 'user_orders',
       'user_order_starts_at', 'user_mean_days_since_prior_order',
       'user_median_days_since_prior', 'user_total_products',
       'user_reorder_ratio', 'user_distinct_products', 'user_average_basket',
       'up_order_rate', 'up_orders_since_last_order',
       'up_order_rate_since_first_order'],
      dtype='object')
data is jo

### Select features and finalize train / validation sets

In [12]:
features = [
    # 'reordered_dow_ration', 'reordered_dow', 'reordered_dow_size',
    # 'reordered_prev', 'add_to_cart_order_prev', 'order_dow_prev', 'order_hour_of_day_prev',
    'user_product_reordered_ratio', 'reordered_sum',
    'add_to_cart_order_inverted_mean', 'add_to_cart_order_relative_mean',
    'reorder_prob',
    'last', 'prev1', 'prev2', 'median', 'mean',
    'dep_reordered_ratio', 'aisle_reordered_ratio',
    'aisle_products',
    'aisle_reordered',
    'dep_products',
    'dep_reordered',
    'prod_users_unq', 'prod_users_unq_reordered',
    'order_number', 'prod_add_to_card_mean',
    'days_since_prior_order',
    'order_dow', 'order_hour_of_day',
    'reorder_ration',
    'user_orders', 'user_order_starts_at', 'user_mean_days_since_prior_order',
    # 'user_median_days_since_prior',
    'user_average_basket', 'user_distinct_products', 'user_reorder_ratio', 'user_total_products',
    'prod_orders', 'prod_reorders',
    'up_order_rate', 'up_orders_since_last_order', 'up_order_rate_since_first_order',
    'up_orders', 'up_first_order', 'up_last_order', 'up_mean_cart_position',
    # 'up_median_cart_position',
    'days_since_prior_order_mean',
    # 'days_since_prior_order_median',
    'order_dow_mean',
    # 'order_dow_median',
    #                      'order_hour_of_day_mean',
    # 'order_hour_of_day_median'
]
categories = ['product_id', 'aisle_id', 'department_id', 'user_id', 'order_id']
features.extend(embedings)
cat_features = [len(features) + i for i, col in enumerate(categories)]
cat_features_array_str = list(map(lambda x: str(x), cat_features))
cat_features_str = ','.join([str(x) for x in cat_features])
features.extend(categories)

print('categories: ', categories)
print('cat features:', list(zip(cat_features, categories)))
print('cat str features:', cat_features_str)
print('not included:', set(order_train.columns.tolist()) - set(features))

data = order_train[features]
data.columns = list(map(lambda x: str(x), data.columns))
labels = order_train[['reordered']].values.astype(np.float32).flatten()

data_val = order_test[features]
data.columns = list(map(lambda x: str(x), data.columns))
labels_val = order_test[['reordered']].values.astype(np.float32).flatten()

features = list(map(lambda x: str(x), features))
data.to_csv(DATA_DIR + 'data_train_n.csv')
pkl.dump(labels, open(DATA_DIR + 'labels_train_n.csv', 'wb'))
pkl.dump(labels_val, open(DATA_DIR + 'labels_validation.csv', 'wb'))
data_val.to_csv(DATA_DIR + 'data_validation.csv')

categories:  ['product_id', 'aisle_id', 'department_id', 'user_id', 'order_id']
cat features: [(74, 'product_id'), (75, 'aisle_id'), (76, 'department_id'), (77, 'user_id'), (78, 'order_id')]
cat str features: 74,75,76,77,78
not included: {'product_name', 'add_to_cart_order', 'order_hour_of_day_median', 'user_median_days_since_prior', 'up_median_cart_position', 'reordered_dow_ration', 'add_to_cart_order_relative_median', 'days_since_prior_order_median', 'reordered', 'eval_set', 'reordered_dow_size', 'add_to_cart_order_inverted_median', 'order_dow_median', 'order_hour_of_day_mean', 'reordered_dow'}


### Train LGBM

In [13]:
print('features: {}, length: {}\n'.format(data.columns, len(data.columns)))
print('categorical features: {}\n'.format(categories))
lgb_train = lgb.Dataset(data, labels, feature_name=features, categorical_feature=categories)
lgb_eval = lgb.Dataset(data_val, labels_val, reference=lgb_train, feature_name=features, categorical_feature=categories)

params = {
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': ['binary_logloss', 'auc'],
    'num_leaves': 256,
    'min_sum_hessian_in_leaf': 20,
    'max_depth': 12,
    'learning_rate': 0.05,
    'feature_fraction': 0.6,
    'verbose': 1,
}

print('Start Training')
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=2000,
                valid_sets=lgb_eval,
                early_stopping_rounds=30)
print('Feature names:', gbm.feature_name(), '\n')

print('Calculating feature importance')
df = pd.DataFrame({'feature': gbm.feature_name(),
                   'importances': gbm.feature_importance()})
print(df.sort_values('importances'))

print('Saving model')
df.to_csv(open(MODELS_DIR + 'lgb_feature_importance.csv', 'w'))
gbm.save_model(MODELS_DIR + 'lgb.model', num_iteration=-1)
gbm.dump_model(MODELS_DIR + 'lgb_json.model', num_iteration=-1)


features: Index(['user_product_reordered_ratio', 'reordered_sum',
       'add_to_cart_order_inverted_mean', 'add_to_cart_order_relative_mean',
       'reorder_prob', 'last', 'prev1', 'prev2', 'median', 'mean',
       'dep_reordered_ratio', 'aisle_reordered_ratio', 'aisle_products',
       'aisle_reordered', 'dep_products', 'dep_reordered', 'prod_users_unq',
       'prod_users_unq_reordered', 'order_number', 'prod_add_to_card_mean',
       'days_since_prior_order', 'order_dow', 'order_hour_of_day',
       'reorder_ration', 'user_orders', 'user_order_starts_at',
       'user_mean_days_since_prior_order', 'user_average_basket',
       'user_distinct_products', 'user_reorder_ratio', 'user_total_products',
       'prod_orders', 'prod_reorders', 'up_order_rate',
       'up_orders_since_last_order', 'up_order_rate_since_first_order',
       'up_orders', 'up_first_order', 'up_last_order', 'up_mean_cart_position',
       'days_since_prior_order_mean', 'order_dow_mean', '0', '1', '2', '3',
     



[1]	valid_0's auc: 1	valid_0's binary_logloss: 1.11022e-15
Training until validation scores don't improve for 30 rounds.
[2]	valid_0's auc: 1	valid_0's binary_logloss: 0
[3]	valid_0's auc: 1	valid_0's binary_logloss: 0
[4]	valid_0's auc: 1	valid_0's binary_logloss: 0
[5]	valid_0's auc: 1	valid_0's binary_logloss: 0
[6]	valid_0's auc: 1	valid_0's binary_logloss: 0
[7]	valid_0's auc: 1	valid_0's binary_logloss: 0
[8]	valid_0's auc: 1	valid_0's binary_logloss: 0
[9]	valid_0's auc: 1	valid_0's binary_logloss: 0
[10]	valid_0's auc: 1	valid_0's binary_logloss: 0
[11]	valid_0's auc: 1	valid_0's binary_logloss: 0
[12]	valid_0's auc: 1	valid_0's binary_logloss: 0
[13]	valid_0's auc: 1	valid_0's binary_logloss: 0
[14]	valid_0's auc: 1	valid_0's binary_logloss: 0
[15]	valid_0's auc: 1	valid_0's binary_logloss: 0
[16]	valid_0's auc: 1	valid_0's binary_logloss: 0
[17]	valid_0's auc: 1	valid_0's binary_logloss: 0
[18]	valid_0's auc: 1	valid_0's binary_logloss: 0
[19]	valid_0's auc: 1	valid_0's binar

TypeError: dump_model() got multiple values for argument 'num_iteration'

In [19]:
model = lgb.Booster(model_file=os.path.join(MODELS_DIR,'lgb.model'))