In [1]:
import pandas as pd
from matplotlib import pyplot as plt
from sklearn import tree
from sklearn.model_selection import cross_val_score
import lightgbm as lgb
import numpy as np

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [2]:
train_df = pd.read_csv("resources/train.csv")
test_df = pd.read_csv("resources/test.csv")
submit_sample = pd.read_csv("resources/sample_submission.csv")

In [3]:
X_train = train_df.iloc[:,2:].values
y_train = train_df.iloc[:,1].values
X_test = test_df.iloc[:,1:].values

In [4]:
from sklearn.cluster import KMeans
from sklearn.neighbors import DistanceMetric

kmeans = KMeans(init='k-means++', n_clusters= 20, n_init=10)
kmeans.fit(X_train)

dist = DistanceMetric.get_metric('euclidean')
ax_tr = dist.pairwise(X_train, kmeans.cluster_centers_)
ax_te = dist.pairwise(X_test, kmeans.cluster_centers_)

ax_tr = pd.DataFrame(ax_tr)
ax_te = pd.DataFrame(ax_te)
xcols =  ['dist' + str(f) for f in range(0, ax_tr.shape[1])]

ax_tr.columns = xcols
ax_te.columns = xcols

m1 = X_train.max(axis = 1)
m2 = X_train.min(axis = 1)
m3 = train_df.median(axis = 1)
m4 = 1/X_train.std(axis = 1)

train_df['xmax'] = m1
train_df['xmin'] = m2
train_df['xmed'] = m3
train_df['xstd'] = m4

m1 = X_test.max(axis = 1)
m2 = X_test.min(axis = 1)
m3 = test_df.median(axis = 1)
m4 = 1/X_test.std(axis = 1)

test_df['xmax'] = m1
test_df['xmin'] = m2
test_df['xmed'] = m3
test_df['xstd'] = m4

In [5]:
train_df_add = pd.concat([train_df, ax_tr], axis = 1)
test_df_add = pd.concat([test_df, ax_te], axis = 1)

X_train = train_df_add.iloc[:,2:].values
y_train = train_df_add.iloc[:,1].values
X_test = test_df_add.iloc[:,1:].values

In [6]:
#parameter tuning
from bayes_opt import BayesianOptimization

def lgb_evaluate(numLeaves, maxDepth, scaleWeight, minChildWeight, subsample, colSam):   
    clf = lgb.LGBMClassifier(
        class_weight = 'balanced',
        objective = 'binary',
        metric= 'auc',
        eval_metric= 'auc',
        n_estimators=1000,
        num_leaves= int(numLeaves),
        max_depth= int(maxDepth),
        scale_pos_weight= scaleWeight,
        min_child_weight= minChildWeight,
        subsample= subsample,
        colsample_bytree= colSam,
        verbose =-1
    )
    scores = cross_val_score(clf, X_train, y_train, cv=5, scoring='roc_auc')
    print(np.mean(scores))
    return np.mean(scores)


def bayesOpt(X_train, y_train):
    lgbBO = BayesianOptimization(lgb_evaluate, {'numLeaves':  (10, 50),
                                                'maxDepth': (2, 63),
                                                'scaleWeight': (1, 100),
                                                'minChildWeight': (0.01, 70),
                                                'subsample': (0.4, 1),                                                
                                                'colSam': (0.4, 1)
                                            })

    lgbBO.maximize(init_points=5, n_iter=5)
    print(lgbBO.res['max'])
    
bayesOpt(X_train, y_train)

|   iter    |  target   |  colSam   | maxDepth  | minChi... | numLeaves | scaleW... | subsample |
-------------------------------------------------------------------------------------------------
0.8822628486358797
| [0m 1       [0m | [0m 0.8823  [0m | [0m 0.7929  [0m | [0m 49.34   [0m | [0m 60.63   [0m | [0m 32.51   [0m | [0m 89.51   [0m | [0m 0.7266  [0m |
0.8842407313596586
| [95m 2       [0m | [95m 0.8842  [0m | [95m 0.743   [0m | [95m 37.81   [0m | [95m 63.87   [0m | [95m 42.96   [0m | [95m 26.87   [0m | [95m 0.5667  [0m |
0.8838531531384721
| [0m 3       [0m | [0m 0.8839  [0m | [0m 0.651   [0m | [0m 32.74   [0m | [0m 57.74   [0m | [0m 46.59   [0m | [0m 21.3    [0m | [0m 0.4893  [0m |
0.8830322982740864
| [0m 4       [0m | [0m 0.883   [0m | [0m 0.7965  [0m | [0m 31.02   [0m | [0m 54.88   [0m | [0m 39.99   [0m | [0m 30.77   [0m | [0m 0.7916  [0m |
0.8685330586531862
| [0m 5       [0m | [0m 0.8685  [0m | [0m 0.7329

TypeError: list indices must be integers or slices, not str

In [7]:
model = lgb.LGBMClassifier(class_weight = 'balanced',
        objective = 'binary',
        metric= 'auc',
        eval_metric= 'auc',
        n_estimators=1000,
        num_leaves= 11,
        max_depth= 12,
        scale_pos_weight= 1.476,
        min_child_weight= 69.29,
        subsample= 0.866,
        colsample_bytree= 0.7374,
)
model.fit(X_train, y_train)

LGBMClassifier(boosting_type='gbdt', class_weight='balanced',
        colsample_bytree=0.7374, eval_metric='auc',
        importance_type='split', learning_rate=0.1, max_depth=12,
        metric='auc', min_child_samples=20, min_child_weight=69.29,
        min_split_gain=0.0, n_estimators=1000, n_jobs=-1, num_leaves=11,
        objective='binary', random_state=None, reg_alpha=0.0,
        reg_lambda=0.0, scale_pos_weight=1.476, silent=True,
        subsample=0.866, subsample_for_bin=200000, subsample_freq=0)

In [8]:
predicted = model.predict(X_test)

In [9]:
submit_sample.drop('target', axis=1)
submit_sample['target'] = predicted
submit_sample.to_csv('lightgbm_kmeans20.csv', index = False)