In [34]:
import lightgbm as lgb
import pandas as pd
import numpy as np
from sklearn.metrics import balanced_accuracy_score
from sklearn.model_selection import train_test_split

In [3]:
# Read the feature matrix and the labels; in both files the first CSV column
# is used as the frame index.
print('Load training data...')
df_x_train, df_y_train = (
    pd.read_csv(csv_path, header=0, index_col=0)
    for csv_path in ('X_train.csv', 'y_train.csv')
)

Load training data...


In [4]:
# Show the first rows of the feature frame as a quick sanity check of the load.
df_x_train.head()

Unnamed: 0_level_0,x0,x1,x2,x3,x4,x5,x6,x7,x8,x9,...,x990,x991,x992,x993,x994,x995,x996,x997,x998,x999
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,-4.52448,-0.718917,0.827537,-2.336564,1.905992,-1.424985,-5.622933,-0.739429,0.334367,-0.245799,...,-3.21435,-4.083583,-1.240234,1.581522,-3.147444,0.423618,2.387999,1.784247,-1.689361,-1.586569
1,-0.561814,-0.115757,-0.113303,-0.322508,-0.080855,0.042634,-0.31999,-0.066997,0.281196,-0.064463,...,-0.262083,-0.437542,0.300902,0.502415,-0.537463,0.455991,-0.3788,-0.53647,-0.810315,-0.021378
2,-0.547026,-0.045593,1.016072,-0.068002,-0.670472,-0.551299,-0.550926,0.393147,1.022467,-0.113551,...,0.064608,-0.361322,-0.440028,0.278972,-0.57096,-0.708099,-0.025025,0.552631,-1.365591,-0.584266
3,-1.939258,-0.284554,1.276007,-0.500731,1.088817,-0.897736,-1.53066,-0.952914,1.157809,0.149595,...,-2.341105,-1.66325,-1.224091,0.617387,-0.964099,-0.034949,0.157197,0.137123,-0.16507,-0.740363
4,-0.386835,-0.143997,0.506509,-0.648928,-0.614121,0.211504,0.0256,-0.272372,-0.215564,0.070822,...,0.083148,0.568495,0.05098,0.317415,-0.163551,-0.24009,-0.27002,-0.296239,-0.722527,0.986404


In [5]:
# Show the first rows of the label frame (single column 'y').
df_y_train.head()

Unnamed: 0_level_0,y
id,Unnamed: 1_level_1
0,1.0
1,0.0
2,1.0
3,1.0
4,1.0


In [45]:
print('Splitting into training and validation dataset')
# Fixed seed so the split -- and every score computed on it -- is reproducible
# after Restart & Run All.
SPLIT_SEED = 19960503
X = df_x_train.values
y = df_y_train['y'].values
# 80/20 split, stratified on the label so the training and validation parts
# keep the same class proportions (the classes are not balanced).
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=SPLIT_SEED, stratify=y)
# create dataset for lightgbm
lgb_train = lgb.Dataset(X_train, y_train)
# The validation Dataset is tied to the training Dataset through `reference`.
lgb_eval = lgb.Dataset(X_val, y_val, reference=lgb_train)

Splitting into training and validation dataset


In [46]:
# LightGBM configuration, handed to lgb.train() as a plain dict.
# No 'metric' entry is given, so the built-in metric reported during training
# is whatever LightGBM picks by default for this objective.
params = dict(
    # what to train
    task='train',
    boosting_type='gbdt',
    objective='multiclassova',
    is_unbalance=True,
    num_class=3,
    # tree shape and step size
    num_leaves=31,
    learning_rate=0.05,
    # feature / row subsampling
    feature_fraction=0.5,
    bagging_fraction=0.8,
    bagging_freq=5,
    # logging
    verbose=0,
)

def custom_accuracy(preds, train_data):
    """Custom LightGBM evaluation metric: balanced multi-class accuracy.

    Parameters
    ----------
    preds : array-like
        Per-class scores for every sample of ``train_data``. Two layouts are
        accepted:

        * 1-D, class-major: the scores of class ``c`` for sample ``i`` sit at
          ``preds[c * n_samples + i]`` (the layout the original loop indexed);
        * 2-D of shape ``(n_samples, n_classes)``.

        The number of classes is derived from the array, not hard-coded.
    train_data : object exposing ``get_label()``
        The dataset being evaluated; only its labels are read.

    Returns
    -------
    tuple
        ``('BMAC', score, True)`` -- metric name, balanced accuracy of the
        arg-max class against the labels, and a flag saying higher is better.
    """
    labels = train_data.get_label()
    scores = np.asarray(preds)
    if scores.ndim == 1:
        # Flat class-major vector -> (n_classes, n_samples) -> (n_samples, n_classes).
        scores = scores.reshape(-1, len(labels)).T
    # One vectorised arg-max per row instead of a Python loop over samples;
    # ties resolve to the lowest class index, exactly like np.argmax on a list.
    predicted = scores.argmax(axis=1)
    return 'BMAC', balanced_accuracy_score(labels, predicted), True

print('Start training...')
# train
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=500,
                feval=custom_accuracy,
                # An ordered list, not a set: a set iterates in arbitrary
                # order, so which dataset ends up as "valid_0"/"valid_1"
                # could change from run to run.
                valid_sets=[lgb_train, lgb_eval],
                # Stop once the validation scores have not improved for 20
                # rounds. The callback form is accepted by both older and
                # current LightGBM releases, unlike the `early_stopping_rounds`
                # keyword of lgb.train().
                callbacks=[lgb.early_stopping(stopping_rounds=20)])

Start training...
[1]	training's multi_logloss: 0.665169	training's BMAC: 0.759616	valid_1's multi_logloss: 0.673457	valid_1's BMAC: 0.57915
Training until validation scores don't improve for 20 rounds.
[2]	training's multi_logloss: 0.639849	training's BMAC: 0.808987	valid_1's multi_logloss: 0.654788	valid_1's BMAC: 0.601451
[3]	training's multi_logloss: 0.615877	training's BMAC: 0.839199	valid_1's multi_logloss: 0.637204	valid_1's BMAC: 0.604237
[4]	training's multi_logloss: 0.593564	training's BMAC: 0.851697	valid_1's multi_logloss: 0.62191	valid_1's BMAC: 0.619788
[5]	training's multi_logloss: 0.573463	training's BMAC: 0.857855	valid_1's multi_logloss: 0.605925	valid_1's BMAC: 0.63482
[6]	training's multi_logloss: 0.554301	training's BMAC: 0.867036	valid_1's multi_logloss: 0.592816	valid_1's BMAC: 0.639809
[7]	training's multi_logloss: 0.536118	training's BMAC: 0.874134	valid_1's multi_logloss: 0.581444	valid_1's BMAC: 0.645526
[8]	training's multi_logloss: 0.519481	training's BMAC:

In [48]:
print('Load testing data...')
df_x_test = pd.read_csv('X_test.csv', header=0, index_col = 0)
X_test = df_x_test.values
# Per-class scores for every test row, using the best early-stopped iteration.
y_pred = gbm.predict(X_test, num_iteration=gbm.best_iteration)
# Predicted class = column with the highest score in each row.
predicted_labels = np.argmax(y_pred, axis=1)
# `with` guarantees the file is flushed and closed even if a write fails.
with open("submission.csv", "w") as submission_file:
    submission_file.write("id,y\n")
    # Take the ids from the test frame's own index instead of a running
    # counter, so the submission stays correct if the ids are not 0..n-1.
    # NOTE(review): identical output when the test ids are 0..n-1 -- confirm
    # that X_test.csv's first column is the integer sample id.
    for sample_id, label in zip(df_x_test.index, predicted_labels):
        submission_file.write("{},{}\n".format(sample_id, label))

Load testing data...


In [52]:
# Predicted class for every test row (arg-max over the per-class scores).
a = np.argmax(y_pred, axis=1)
# Count the rows assigned to each class in a single pass. `minlength` keeps a
# line for every class, including classes the model never predicts.
for class_label, row_count in enumerate(np.bincount(a, minlength=y_pred.shape[1])):
    print("{}: {}".format(class_label, row_count))

0: 571
1: 2876
2: 653
