In [1]:
import pandas as pd
import numpy as np

import gc
import os


In [22]:
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.tree import DecisionTreeClassifier, ExtraTreeClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, GradientBoostingClassifier, StackingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import log_loss, accuracy_score, 




In [9]:
import xgboost as xgb
import lightgbm as lgb
from bayes_opt import BayesianOptimization


In [70]:
tr = pd.read_csv('./data/train.csv')
te = pd.read_csv('./data/test.csv')

sub = pd.read_csv('./data/sample_submission.csv')

In [71]:
tr_X = tr.drop(['id', 'type', 'fiberID'], axis=1)
te_X = te.drop(['id', 'fiberID'], axis=1)

target = tr['type']

In [72]:
target_lbe = LabelEncoder().fit(target)

t = target_lbe.transform(target)
# target_lbe.inverse_transform(t)

In [33]:
# train_X = (train_X - np.mean(train_X))/np.std(train_X)
# test_X = (test_X - np.mean(test_X))/np.std(test_X)

In [73]:
train_X, test_X, train_y, test_y = train_test_split(tr_X, t, test_size=0.15, random_state=42, shuffle=True)



In [60]:
def LGB_bayesian(
    #learning_rate,
    num_leaves, 
    bagging_fraction,
    feature_fraction,
    min_child_weight, 
    min_data_in_leaf,
    max_depth,
    reg_alpha,
    reg_lambda
     ):
    # LightGBM expects next three parameters need to be integer. 
    num_leaves = int(num_leaves)
    min_data_in_leaf = int(min_data_in_leaf)
    max_depth = int(max_depth)

    assert type(num_leaves) == int
    assert type(min_data_in_leaf) == int
    assert type(max_depth) == int
    

    params = {
              'num_leaves': num_leaves, 
              'min_data_in_leaf': min_data_in_leaf,
              'min_child_weight': min_child_weight,
              'bagging_fraction' : bagging_fraction,
              'feature_fraction' : feature_fraction,
#               'learning_rate' : 0.03,
              'max_depth': max_depth,
              'reg_alpha': reg_alpha,
              'reg_lambda': reg_lambda,
              'objective': 'binary',
              'save_binary': True,
              'seed': 12,
              'feature_fraction_seed': 12,
              'bagging_seed': 12,
              'drop_seed': 12,
              'data_random_seed': 12,
              'boosting': 'gbdt', ## some get better result using 'dart'
              'verbose': 1,
              'is_unbalance': False,
              'boost_from_average': True,
              'metric':'multi_logloss'}    
    
    ## set clf options
    clf = lgb.LGBMClassifier(**params).fit(train_X, train_y, early_stopping_rounds=100,eval_set=[(test_X, test_y)], eval_metric='multi_logloss', verbose=0)
    
    score = accuracy_score(test_y, clf.predict(test_X))

    return score

In [61]:
bounds_LGB = {
    'num_leaves': (300, 1000), 
    'min_data_in_leaf': (0, 150),
    'bagging_fraction' : (0.3, 0.7),
    'feature_fraction' : (0.6, 0.9),
#     'learning_rate': (0.01, 0.3),
    'min_child_weight': (0.001, 3),   
    'reg_alpha': (0.25, 0.7), 
    'reg_lambda': (0.25, 0.7),
    'max_depth':(10, 30),
}

In [62]:
optimizer = BayesianOptimization(LGB_bayesian, bounds_LGB, random_state=0)


In [63]:
init_points = 10
n_iter = 20

optimizer.maximize(init_points=init_points, n_iter=n_iter)


|   iter    |  target   | baggin... | featur... | max_depth | min_ch... | min_da... | num_le... | reg_alpha | reg_la... |
-------------------------------------------------------------------------------------------------------------------------
| [0m 1       [0m | [0m 0.8652  [0m | [0m 0.5195  [0m | [0m 0.8146  [0m | [0m 22.06   [0m | [0m 1.635   [0m | [0m 63.55   [0m | [0m 752.1   [0m | [0m 0.4469  [0m | [0m 0.6513  [0m |
| [95m 2       [0m | [95m 0.867   [0m | [95m 0.6855  [0m | [95m 0.715   [0m | [95m 25.83   [0m | [95m 1.587   [0m | [95m 85.21   [0m | [95m 947.9   [0m | [95m 0.282   [0m | [95m 0.2892  [0m |
| [0m 3       [0m | [0m 0.8667  [0m | [0m 0.3081  [0m | [0m 0.8498  [0m | [0m 25.56   [0m | [0m 2.61    [0m | [0m 146.8   [0m | [0m 859.4   [0m | [0m 0.4577  [0m | [0m 0.6012  [0m |
| [0m 4       [0m | [0m 0.8654  [0m | [0m 0.3473  [0m | [0m 0.792   [0m | [0m 12.87   [0m | [0m 2.834   [0m | [0m 78.28   [0m 

In [67]:
param_lgb = {
        'min_data_in_leaf': int(optimizer.max['params']['min_data_in_leaf']), 
        'num_leaves': int(optimizer.max['params']['num_leaves']), 
        #'learning_rate': LGB_BO.max['params']['learning_rate'],
        'min_child_weight': optimizer.max['params']['min_child_weight'],
        'bagging_fraction': optimizer.max['params']['bagging_fraction'], 
        'feature_fraction': optimizer.max['params']['feature_fraction'],
        'reg_lambda': optimizer.max['params']['reg_lambda'],
        'reg_alpha': optimizer.max['params']['reg_alpha'],
        'max_depth': int(optimizer.max['params']['max_depth']), 
        'objective': 'binary',
        'save_binary': True,
        'seed': 12,
        'feature_fraction_seed': 12,
        'bagging_seed': 12,
        'drop_seed': 12,
        'data_random_seed': 12,
        'boosting_type': 'gbdt',  # also consider 'dart'
        'verbose': 1,
        'is_unbalance': False,
        'boost_from_average': True,
        'metric':'multi_logloss'
    }

In [74]:
lgb_clf = lgb.LGBMClassifier(**param_lgb).fit(tr_X, t)

In [76]:
lgb_clf.predict_proba(te_X)

array([[2.96127082e-04, 7.38524172e-04, 7.85177405e-05, ...,
        9.79213142e-01, 3.70772356e-05, 2.09625781e-05],
       [2.96734526e-03, 7.95121355e-01, 4.34215122e-04, ...,
        5.11684060e-04, 5.75055934e-05, 1.59566277e-04],
       [9.97062155e-01, 5.48664965e-04, 7.43953537e-05, ...,
        3.34255900e-05, 7.16368246e-06, 2.66071800e-05],
       ...,
       [7.88512582e-04, 9.18208748e-01, 3.18070961e-04, ...,
        3.29144201e-04, 4.21876722e-05, 7.50984960e-04],
       [4.11340903e-05, 9.98234607e-01, 1.41618827e-05, ...,
        2.18915319e-05, 1.87500396e-06, 5.28648622e-06],
       [8.58325266e-05, 5.64392673e-04, 2.82138710e-05, ...,
        2.40096379e-05, 3.65288711e-06, 5.53549973e-05]])

In [81]:
sub[sub.columns[1:]] = lgb_clf.predict_proba(te_X)

In [84]:
sub.to_csv('./sub/lgb1.csv', index=False)

In [82]:
sub

Unnamed: 0,id,STAR_WHITE_DWARF,STAR_CATY_VAR,STAR_BROWN_DWARF,SERENDIPITY_RED,REDDEN_STD,STAR_BHB,GALAXY,SERENDIPITY_DISTANT,QSO,SKY,STAR_RED_DWARF,ROSAT_D,STAR_PN,SERENDIPITY_FIRST,STAR_CARBON,SPECTROPHOTO_STD,STAR_SUB_DWARF,SERENDIPITY_MANUAL,SERENDIPITY_BLUE
0,199991,0.000296,0.000739,0.000079,0.000779,0.000132,0.000134,0.000129,0.000261,0.017897,1.879041e-06,0.000072,0.000084,0.000036,0.000046,0.000042,1.062120e-06,0.979213,0.000037,0.000021
1,199992,0.002967,0.795121,0.000434,0.100209,0.000836,0.000557,0.097598,0.000029,0.000191,1.090124e-05,0.000427,0.000452,0.000051,0.000130,0.000249,8.620299e-06,0.000512,0.000058,0.000160
2,199993,0.997062,0.000549,0.000074,0.001658,0.000069,0.000027,0.000273,0.000013,0.000009,8.473814e-07,0.000033,0.000078,0.000003,0.000049,0.000032,2.143988e-06,0.000033,0.000007,0.000027
3,199994,0.000794,0.143896,0.000329,0.011549,0.829727,0.000870,0.010979,0.000053,0.000098,8.262108e-06,0.000324,0.000343,0.000027,0.000099,0.000267,1.438942e-04,0.000326,0.000039,0.000129
4,199995,0.000055,0.000139,0.000016,0.000388,0.000026,0.000010,0.000046,0.000007,0.000064,4.024000e-07,0.000016,0.000016,0.000004,0.000010,0.000009,3.370001e-07,0.999186,0.000002,0.000004
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10004,209995,0.000985,0.140896,0.000408,0.137204,0.671193,0.001151,0.027965,0.000026,0.000098,1.025094e-05,0.000402,0.001049,0.000031,0.000191,0.006199,2.488674e-05,0.000404,0.000048,0.011713
10005,209996,0.001270,0.244399,0.000486,0.020988,0.670162,0.003757,0.023301,0.000035,0.000117,1.220777e-05,0.000479,0.001148,0.000037,0.000185,0.008647,3.518461e-05,0.000482,0.000057,0.024402
10006,209997,0.000789,0.918209,0.000318,0.038750,0.018779,0.000587,0.019647,0.000013,0.000148,7.985367e-06,0.000313,0.000331,0.000029,0.000095,0.000827,3.554876e-05,0.000329,0.000042,0.000751
10007,209998,0.000041,0.998235,0.000014,0.000026,0.001542,0.000015,0.000040,0.000002,0.000007,6.236893e-07,0.000014,0.000015,0.000002,0.000004,0.000008,4.053142e-06,0.000022,0.000002,0.000005
