In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import GroupKFold
from sklearn.utils import shuffle

In [2]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline

In [3]:
X_train = pd.read_hdf('X_train_w32.h5')
y_train = pd.read_hdf('y_train_w32.h5')
X_test = pd.read_hdf('X_test_w32.h5')
X_test.drop('reordered', axis=1, inplace=True)

In [4]:
y_train = y_train.astype('int64')

In [5]:
y_train[:10]

0    1
1    1
2    0
3    0
4    1
5    0
6    0
7    0
8    1
9    1
Name: reordered, dtype: int64

In [6]:
y_train.unique()

array([1, 0], dtype=int64)

In [7]:
X = X_train
y = y_train

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [9]:
X_train.head()

Unnamed: 0,user_id,product_id,_up_order_count,_up_first_order_number,_up_last_order_number,_up_order_past_appears_mean,_up_order_past_appears_std,_up_average_cart_position,_up_std_cart_position,aisle_id,...,_up_order_since_last_order_normalize,_up_order_past_appears_mean_normalize,_up_order_past_appears_std_normalize,_up_order_rate_since_first_order,_up_order_past_appears_interval_mean,_up_order_past_appears_interval_std,_up_order_past_appears_interval_mean_normalize,_up_order_past_appears_interval_std_normalize,_up_order_expect_days_to_order,_up_order_expect_days_to_order_normalize
6980471,6363,13269,2,13,18,15.5,3.535534,9.5,4.949747,120,...,0.660377,0.861111,0.196419,0.04878,5.0,0.0,0.09434,0.0,-30.0,-0.566038
8228503,59675,12002,1,34,34,34.0,0.0,16.0,0.0,128,...,0.227273,1.0,0.0,0.090909,44.0,0.0,1.0,0.0,34.0,0.772727
12236205,112228,15867,1,2,2,2.0,0.0,12.0,0.0,4,...,0.6,1.0,0.0,0.25,5.0,0.0,1.0,0.0,2.0,0.4
8239885,78485,36878,1,5,5,5.0,0.0,18.0,0.0,1,...,0.375,1.0,0.0,0.25,8.0,0.0,1.0,0.0,5.0,0.625
12401690,161890,45842,1,2,2,2.0,0.0,12.0,0.0,108,...,0.333333,1.0,0.0,0.5,3.0,0.0,1.0,0.0,2.0,0.666667


In [10]:
X_train.shape

(6779728, 60)

In [11]:
X_test.shape

(1694933, 60)

In [12]:
len(y_train)

6779728

In [13]:
len(y_test)

1694933

In [14]:
K = 5
groups = X_train.user_id
gkf = GroupKFold(n_splits=K).split(X=X_train, y=y_train, groups=groups)

In [15]:
gridParams = {'bagging_fraction': [0.8, 0.7, 0.6, 0.5],
              'reg_alpha': [0.1, 0.3, 1, 3, 10, 30],
              'reg_lambda': [0.3, 1, 3, 10, 30]}

In [16]:
lgb_est = lgb.LGBMClassifier(task='train',
                             boosting_type='gbdt',
                             learning_rate=0.1,
                             objective='binary',
                             metric=['binary_logloss', 'auc'],
                             num_leaves=128,
                             max_depth=12,
                             num_threads=2,
                             feature_fraction=0.75,
                             bagging_freq=5,
                             bagging_seed=1969)#,
                             #bagging_fraction=0.8,
                             #reg_alpha=0.3,
                             #reg_lambda=1)

In [17]:
model = GridSearchCV(estimator=lgb_est, param_grid=gridParams, cv=gkf, scoring='accuracy')

In [18]:
model.fit(X=X_train, y=y_train)

GridSearchCV(cv=<generator object _BaseKFold.split at 0x000002389D12B830>,
       error_score='raise',
       estimator=LGBMClassifier(bagging_freq=5, bagging_seed=1969, boosting_type='gbdt',
        class_weight=None, colsample_bytree=1.0, feature_fraction=0.75,
        learning_rate=0.1, max_depth=12, metric=['binary_logloss', 'auc'],
        min_child_samples=20, min_child_weight=0.001, min_split_gain=0....a=0.0, silent=True, subsample=1.0,
        subsample_for_bin=200000, subsample_freq=1, task='train'),
       fit_params=None, iid=True, n_jobs=1,
       param_grid={'bagging_fraction': [0.8, 0.7, 0.6, 0.5], 'reg_alpha': [0.1, 0.3, 1, 3, 10, 30], 'reg_lambda': [0.3, 1, 3, 10, 30]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring='accuracy', verbose=0)

In [20]:
model.best_params_

{'bagging_fraction': 0.8, 'reg_alpha': 10, 'reg_lambda': 10}

In [21]:
lgbclass_params = model.best_params_

In [22]:
import pickle

model_pickle_out = open('lgbclass_params.pickle', 'wb')
pickle.dump(lgbclass_params, model_pickle_out)
model_pickle_out.close()

In [23]:
# Prediction
y_pred = model.predict(X_test)

In [24]:
# Confusion Matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)

In [25]:
print(cm)

[[1510814   17989]
 [ 133831   32299]]


In [26]:
# Accuracy Score
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_pred, y_test)

In [27]:
print(accuracy)

0.910427137828
