In [45]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn import metrics
import matplotlib.pyplot as plt
import seaborn as sns

In [46]:
train = pd.read_csv('./train_featureV1.csv')
test = pd.read_csv('./test_featureV1.csv')

In [47]:
dtrain = lgb.Dataset(train.drop(['uid','label'],axis=1),label=train.label)
dtest = lgb.Dataset(test.drop(['uid'],axis=1))

In [48]:
lgb_params =  {
    'boosting_type': 'gbdt',
    'objective': 'binary',
#    'metric': ('multi_logloss', 'multi_error'),
    #'metric_freq': 100,
    'is_training_metric': False,
    'min_data_in_leaf': 12,
    'num_leaves': 64,
    'learning_rate': 0.08,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'verbosity':-1,
#    'gpu_device_id':2,
#    'device':'gpu'
#    'lambda_l1': 0.001,
#    'skip_drop': 0.95,
#    'max_drop' : 10
    #'lambda_l2': 0.005
    #'num_threads': 18
}

In [49]:
def evalMetric(preds,dtrain):
    
    label = dtrain.get_label()
    
    
    pre = pd.DataFrame({'preds':preds,'label':label})
    pre= pre.sort_values(by='preds',ascending=False)
    
    auc = metrics.roc_auc_score(pre.label,pre.preds)

    pre.preds=pre.preds.map(lambda x: 1 if x>=0.5 else 0)

    f1 = metrics.f1_score(pre.label,pre.preds)
    
    
    res = 0.6*auc +0.4*f1
    
    return 'res',res,True

In [50]:
lgb.cv(lgb_params,dtrain,feval=evalMetric,early_stopping_rounds=100,verbose_eval=5,num_boost_round=10000,nfold=3,metrics=['evalMetric'])

[5]	cv_agg's res: 0.741806 + 0.00679538
[10]	cv_agg's res: 0.748348 + 0.00685021
[15]	cv_agg's res: 0.754806 + 0.00563483
[20]	cv_agg's res: 0.755131 + 0.00360902
[25]	cv_agg's res: 0.760378 + 0.00475988
[30]	cv_agg's res: 0.75986 + 0.00420126
[35]	cv_agg's res: 0.758595 + 0.00599234
[40]	cv_agg's res: 0.75619 + 0.00899295
[45]	cv_agg's res: 0.762618 + 0.00765451
[50]	cv_agg's res: 0.763165 + 0.0087598
[55]	cv_agg's res: 0.759689 + 0.00833843
[60]	cv_agg's res: 0.76224 + 0.0100473
[65]	cv_agg's res: 0.760896 + 0.0113191
[70]	cv_agg's res: 0.764909 + 0.00925973
[75]	cv_agg's res: 0.765394 + 0.00738822
[80]	cv_agg's res: 0.765962 + 0.00968689
[85]	cv_agg's res: 0.767004 + 0.01047
[90]	cv_agg's res: 0.77126 + 0.0117527
[95]	cv_agg's res: 0.767277 + 0.0113129
[100]	cv_agg's res: 0.76695 + 0.0125316
[105]	cv_agg's res: 0.766514 + 0.0104282
[110]	cv_agg's res: 0.766242 + 0.00986623
[115]	cv_agg's res: 0.764822 + 0.0101962
[120]	cv_agg's res: 0.766464 + 0.0089527
[125]	cv_agg's res: 0.765428 

{'res-mean': [0.6938136775243966,
  0.7310789525618637,
  0.7300932104639197,
  0.7347711149216898,
  0.7418063707368746,
  0.7434092262796589,
  0.7434091060829183,
  0.7477662707909604,
  0.7465820971048415,
  0.7483476954729681,
  0.7465995515664533,
  0.7459173758944456,
  0.7474448017169658,
  0.7496147809018594,
  0.7548062361939918,
  0.7528494909762347,
  0.7544966224161134,
  0.7522741066067099,
  0.7524143657825567,
  0.7551313321952776,
  0.7563926485119201,
  0.7574005116084921,
  0.7589049971014606,
  0.7551082598809424,
  0.7603784939153919,
  0.7595587690295998,
  0.7630088897844415,
  0.7623217470971723,
  0.7609311697502061,
  0.7598600250709232,
  0.7616512258879191,
  0.7615218666714411,
  0.7610358202620392,
  0.7610984144128995,
  0.7585945303128335,
  0.7594649020648557,
  0.7593577149429339,
  0.757097598560344,
  0.7572200426396677,
  0.7561901401396004,
  0.758226376311839,
  0.758308288866881,
  0.76016260292537,
  0.7606998195103412,
  0.762618181754144,
  0.

In [51]:
model =lgb.train(lgb_params,dtrain,feval=evalMetric,verbose_eval=5,num_boost_round=300,valid_sets=[dtrain])

[5]	training's res: 0.884562
[10]	training's res: 0.907723
[15]	training's res: 0.925765
[20]	training's res: 0.937738
[25]	training's res: 0.948606
[30]	training's res: 0.959457
[35]	training's res: 0.971119
[40]	training's res: 0.979164
[45]	training's res: 0.986162
[50]	training's res: 0.988839
[55]	training's res: 0.992688
[60]	training's res: 0.995035
[65]	training's res: 0.997313
[70]	training's res: 0.998661
[75]	training's res: 0.999555
[80]	training's res: 1
[85]	training's res: 1
[90]	training's res: 1
[95]	training's res: 1
[100]	training's res: 1
[105]	training's res: 1
[110]	training's res: 1
[115]	training's res: 1
[120]	training's res: 1
[125]	training's res: 1
[130]	training's res: 1
[135]	training's res: 1
[140]	training's res: 1
[145]	training's res: 1
[150]	training's res: 1
[155]	training's res: 1
[160]	training's res: 1
[165]	training's res: 1
[170]	training's res: 1
[175]	training's res: 1
[180]	training's res: 1
[185]	training's res: 1
[190]	training's res: 1
[19

In [52]:
pred=model.predict(test.drop(['uid'],axis=1))

In [53]:
res =pd.DataFrame({'uid':test.uid,'label':pred})

In [54]:
res=res.sort_values(by='label',ascending=False)
res.label=res.label.map(lambda x: 1 if x>=0.5 else 0)
res.label = res.label.map(lambda x: int(x))

In [55]:
res.to_csv('./lgb-baseline.csv',index=False,header=False,sep=',',columns=['uid','label'])