In [1]:
import os
import pandas as pd
import numpy as np

%load_ext autoreload
%autoreload 2

In [2]:
import data

In [3]:
from lightgbm import LGBMClassifier

In [4]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import roc_auc_score, accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

# 0. Initialize model inputs

In [5]:
# Only need to run this if you want to 
# recalculate model inputs after changing
# the inputs in the 'raw' or 'engineered' folders 
data.update_model_inputs('raw_group2')

initializing raw data
initializing engineered data
calculating model inputs
saving model inputs


In [6]:
(
    X_prior,
    X_train,
    X_test,
    y_prior,
    y_train
) = data.initialize_model_inputs(output=True)

In [15]:
data.order_products_prior.head()

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered
0,6,40462,1,0
1,6,15873,2,0
2,6,41897,3,0
3,10,24852,1,1
4,10,4796,2,1


In [None]:
#getting average basket as afeature by using total items and number of orders
df['average_basket'] = (df.total_items /df.nb_orders).astype(np.float32)
df.head()
df['average_basket'] = df['average_basket'].replace(np.nan, 0)

In [None]:
#Getting average days between orders as a feature by using days_since_prior_order
df['average_days_between_orders'] = orders_new.groupby('user_id')['days_since_prior_order'].mean().astype(np.float32)
df['average_days_between_orders'] = df['average_days_between_orders'].replace(np.nan, 0)

# 1. Implement a model


In [8]:
# Try using the LightGBM model to make predictions 
# on our train and test datasets
# Use the code from the example below to get started

# Installation:
    # use "conda install -c conda-forge lightgbm"
    # or look here:
# Background Information: https://towardsdatascience.com/lightgbm-vs-xgboost-which-algorithm-win-the-race-1ff7dd4917d
# Simple Example: https://github.com/microsoft/LightGBM/blob/master/examples/python-guide/simple_example.py

In [7]:
relevant_cols = ['order_number','order_dow','order_hour_of_day','days_since_prior_order',
                 'aisle_id','department_id','unique_products','num_users','num_times_ordered','organic','reorder_ratio']

In [9]:
model = LGBMClassifier(
    n_estimators=30,
    learning_rate=0.05,
    min_samples_leaf=50,
)

In [10]:
model.fit(X_prior[relevant_cols], y_prior['reordered'])

KeyboardInterrupt: 

In [8]:
prob_train = model.predict_proba(X_train[relevant_cols])[:,0]

NameError: name 'model' is not defined

In [30]:
cutoff = 0.59

In [31]:
pred_train = (prob_train<=cutoff)*1

In [37]:
pred_train

array([0, 1, 1, ..., 1, 1, 1])

In [32]:
f1_score(y_train['reordered'], pred_train)

0.7652194179282753

In [33]:
feature_importance = pd.DataFrame({
    'columns':relevant_cols,
    'feature_importance': model.feature_importances_ / sum(model.feature_importances_)
}).sort_values('feature_importance',ascending=False)

In [34]:
feature_importance

Unnamed: 0,columns,feature_importance
10,reorder_ratio,0.432222
0,order_number,0.403333
3,days_since_prior_order,0.136667
6,unique_products,0.022222
8,num_times_ordered,0.005556
1,order_dow,0.0
2,order_hour_of_day,0.0
4,aisle_id,0.0
5,department_id,0.0
7,num_users,0.0


In [21]:
len(model.feature_importances_)

5

# 2. Yasser's Light GBM Code

In [None]:
# f1, precision, accuracy, recall
# https://towardsdatascience.com/accuracy-precision-recall-or-f1-331fb37c5cb9

In [None]:
# Grid Search

In [11]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier

In [12]:
# Create a pipeline
pipe = Pipeline([('classifier', RandomForestClassifier())])

# Create space of candidate learning algorithms and their hyperparameters
search_space = [{'classifier': [RandomForestClassifier()],
                 'classifier__n_estimators': [10, 20]}]
                # 'learning_rate':[0.01, 0.05, 0.1],
                # 'subsample':[0.5, 0.7, 0.9],
                 #'colsample_bytree':[.8],
                 #'num_leaves':[3, 5, 8, 10],
                # 'boosting':['gbdt', 'dart', 'goss']}]

In [89]:
clf_best = GridSearchCV(pipe, search_space, cv=5, verbose=0)

In [76]:
X_prior.head()

Unnamed: 0.1,order_id,product_id,user_id,eval_set,order_number,order_dow,order_hour_of_day,days_since_prior_order,product_name,aisle_id,department_id,aisle,department,unique_products,Unnamed: 0,num_users,num_times_ordered,organic,reorder_ratio
0,6,15873,22352,prior,4,1,12,30.0,Dryer Sheets Geranium Scent,75,17,laundry,household,98,15870,63,76,0,0.171053
1,6,40462,22352,prior,4,1,12,30.0,Cleanse,31,7,refrigerated,beverages,98,40459,179,322,0,0.42236
2,6,41897,22352,prior,4,1,12,30.0,Clean Day Lavender Scent Room Freshener Spray,101,17,air fresheners candles,household,98,41894,48,55,0,0.090909
3,10,1529,135442,prior,4,6,8,8.0,"Parsley, Italian (Flat), New England Grown",16,4,fresh herbs,produce,79,1528,5094,9118,0,0.420926
4,10,3464,135442,prior,4,6,8,8.0,"Organic Butterhead (Boston, Butter, Bibb) Lettuce",83,4,fresh vegetables,produce,79,3463,6204,12565,1,0.489853


In [78]:
X_prior_drop = pd.concat([X_prior[relevant_cols],y_prior['reordered']],axis=1)

In [79]:
X_prior_drop.head()

Unnamed: 0,order_number,order_dow,order_hour_of_day,days_since_prior_order,aisle_id,department_id,unique_products,num_users,num_times_ordered,organic,reorder_ratio,reordered
0,4,1,12,30.0,75,17,98,63,76,0,0.171053,0
1,4,1,12,30.0,31,7,98,179,322,0,0.42236,0
2,4,1,12,30.0,101,17,98,48,55,0,0.090909,0
3,4,6,8,8.0,16,4,79,5094,9118,0,0.420926,0
4,4,6,8,8.0,83,4,79,6204,12565,1,0.489853,0


In [80]:
X_prior_drop = X_prior_drop.dropna(subset=['days_since_prior_order'])

In [81]:
X_prior_drop.head()

Unnamed: 0,order_number,order_dow,order_hour_of_day,days_since_prior_order,aisle_id,department_id,unique_products,num_users,num_times_ordered,organic,reorder_ratio,reordered
0,4,1,12,30.0,75,17,98,63,76,0,0.171053,0
1,4,1,12,30.0,31,7,98,179,322,0,0.42236,0
2,4,1,12,30.0,101,17,98,48,55,0,0.090909,0
3,4,6,8,8.0,16,4,79,5094,9118,0,0.420926,0
4,4,6,8,8.0,83,4,79,6204,12565,1,0.489853,0


In [82]:
X_prior_drop.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6063141 entries, 0 to 6476314
Data columns (total 12 columns):
order_number              int64
order_dow                 int64
order_hour_of_day         int64
days_since_prior_order    float64
aisle_id                  int64
department_id             int64
unique_products           int64
num_users                 int64
num_times_ordered         int64
organic                   int64
reorder_ratio             float64
reordered                 int64
dtypes: float64(2), int64(10)
memory usage: 601.4 MB


In [83]:
y_prior_drop = X_prior_drop.reordered

In [84]:
X_prior_drop = X_prior_drop.drop('reordered',axis=1)

In [85]:
X_prior_drop.head()

Unnamed: 0,order_number,order_dow,order_hour_of_day,days_since_prior_order,aisle_id,department_id,unique_products,num_users,num_times_ordered,organic,reorder_ratio
0,4,1,12,30.0,75,17,98,63,76,0,0.171053
1,4,1,12,30.0,31,7,98,179,322,0,0.42236
2,4,1,12,30.0,101,17,98,48,55,0,0.090909
3,4,6,8,8.0,16,4,79,5094,9118,0,0.420926
4,4,6,8,8.0,83,4,79,6204,12565,1,0.489853


In [90]:
best_model = clf_best.fit(X_prior_drop, y_prior_drop)

MemoryError: 

In [42]:
## Set up the hyper parameters spaces to test during grid search
list_n_estimators = [100, 200, 300, 400]
list_learning_rate = [0.01, 0.05, 0.1]
list_subsample = [0.5, 0.7, 0.9]
list_colsample_bytree = [0.8]
list_num_leaves = [3, 5, 8, 10]
list_boosting = ['gbdt', 'dart', 'goss']

index = pd.MultiIndex.from_product([list_n_estimators,
                                    list_learning_rate,
                                    list_subsample,
                                    list_colsample_bytree,
                                    list_num_leaves,
                                    list_boosting,
                                   ],
                                   names = ['n_estimators',
                                            'learning_rate',
                                            'subsample',
                                            'colsample_bytree',
                                            'num_leaves',
                                            'boosting',
                                           ]
                                  )

grid_search_result = pd.MultiIndex.from_product([list_n_estimators,
                                    list_learning_rate,
                                    list_subsample,
                                    list_colsample_bytree,
                                    list_num_leaves,
                                    list_boosting,
                                   ],
                                   names = ['n_estimators',
                                            'learning_rate',
                                            'subsample',
                                            'colsample_bytree',
                                            'num_leaves',
                                            'boosting',
                                           ]
                                  )

print(f'Number of models to try during grid search is: {index.shape[0]}')

Number of models to try during grid search is: 432


In [43]:
from tqdm import tqdm

model_score = 'model_score'

list_prior_auc = []
list_train_auc = []

list_prior_ks = []
list_train_ks = []

list_prior_20perc_lift = []
list_train_20perc_lift = []


for i in tqdm(range(grid_search_result.shape[0])):

    ## Train the model
    LGBM_model = LGBMClassifier(min_samples_leaf=50,
                                random_state=42,
                                n_estimators=grid_search_result.n_estimators[i],
                                learning_rate=grid_search_result.learning_rate[i],
                                subsample=grid_search_result.subsample[i],
                                colsample_bytree=grid_search_result.colsample_bytree[i],
                                num_leaves=grid_search_result.num_leaves[i],
                                boosting=grid_search_result.boosting[i],
                               )

    LGBM_model.fit(X_prior, y_prior)


    ## Score the dataset with the trained model
    y_prior_scores = LGBM_model.predict_proba(X_prior)[:,1]
    y_train_scores = LGBM_model.predict_proba(X_train)[:,1]
   

    ## Append scores and target variable
    prior_scored = pd.DataFrame({target_variable: y_prior
                                , model_score: y_prior_scores})
    train_scored = pd.DataFrame({target_variable: y_train
                                , model_score: y_train_scores})
  

    ## Generate gains table
    prior_gains_table = gainstable(prior_scored, target_variable, model_score)
    train_gains_table = gainstable(train_scored, target_variable, model_score)

    ## Calculate different performance metrics
    prior_auc = roc_auc_score(y_prior, y_prior_scores)
    list_prior_auc.append(prior_auc)
    
    train_auc = roc_auc_score(y_train, y_train_scores)
    list_train_auc.append(train_auc)
    
    prior_ks = max(prior_gains_table['KS'])
    list_prior_ks.append(prior_ks)
    
    train_ks = max(train_gains_table['KS'])
    list_train_ks.append(train_ks)

grid_search_result['prior_auc'] = list_prior_auc
grid_search_result['train_auc'] = list_train_auc

grid_search_result['prior_ks'] = list_prior_ks
grid_search_result['train_ks'] = list_train_ks


grid_search_result = grid_search_result.sort_values(by='train_auc', ascending=False)


  0%|                                                                                          | 0/432 [00:00<?, ?it/s]


AttributeError: 'MultiIndex' object has no attribute 'n_estimators'

In [18]:
X_prior.head()

NameError: name 'X_prior' is not defined

# Ignore: Code for zipping the code base

In [20]:
import os
import zipfile


In [21]:

def zipdir(path, ziph):
    # ziph is zipfile handle
    for root, dirs, files in os.walk(path):
        for file in files:
            ziph.write(os.path.join(root, file))



In [22]:

zipf = zipfile.ZipFile('Python.zip', 'w', zipfile.ZIP_DEFLATED)
zipdir('master/', zipf)
zipf.close()