In [3]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn import metrics
import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
train = pd.read_csv('../data/train_featureV1.csv')
test = pd.read_csv('../data/test_featureV1.csv')

In [5]:
dtrain = lgb.Dataset(train.drop(['uid','label'],axis=1),label=train.label)
dtest = lgb.Dataset(test.drop(['uid'],axis=1))

In [6]:
lgb_params =  {
    'boosting_type': 'gbdt',
    'objective': 'binary',
#    'metric': ('multi_logloss', 'multi_error'),
    #'metric_freq': 100,
    'is_training_metric': False,
    'min_data_in_leaf': 12,
    'num_leaves': 64,
    'learning_rate': 0.08,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'verbosity':-1,
#    'gpu_device_id':2,
#    'device':'gpu'
#    'lambda_l1': 0.001,
#    'skip_drop': 0.95,
#    'max_drop' : 10
    #'lambda_l2': 0.005
    #'num_threads': 18
}    

In [7]:
def evalMetric(preds,dtrain):
    
    label = dtrain.get_label()
    
    
    pre = pd.DataFrame({'preds':preds,'label':label})
    pre = pre.sort_values(by='preds',ascending=False)
    
    auc = metrics.roc_auc_score(pre.label,pre.preds)

    pre.preds=pre.preds.map(lambda x: 1 if x>=0.5 else 0)

    f1 = metrics.f1_score(pre.label,pre.preds)
    
    
    res = 0.6*auc +0.4*f1
    
    return 'res',res,True

### 本地CV

In [8]:
lgb.cv(lgb_params,dtrain,feval=evalMetric,early_stopping_rounds=100,verbose_eval=5,num_boost_round=10000,nfold=3,metrics=['evalMetric'])

[5]	cv_agg's res: 0.729792 + 0.0105826
[10]	cv_agg's res: 0.747329 + 0.00636164
[15]	cv_agg's res: 0.752422 + 0.00804488
[20]	cv_agg's res: 0.754108 + 0.0130397
[25]	cv_agg's res: 0.757501 + 0.014463
[30]	cv_agg's res: 0.764827 + 0.014463
[35]	cv_agg's res: 0.762266 + 0.0140381
[40]	cv_agg's res: 0.764277 + 0.0145081
[45]	cv_agg's res: 0.76503 + 0.0167553
[50]	cv_agg's res: 0.767282 + 0.0140822
[55]	cv_agg's res: 0.762904 + 0.0137758
[60]	cv_agg's res: 0.763019 + 0.0173136
[65]	cv_agg's res: 0.765083 + 0.016851
[70]	cv_agg's res: 0.764442 + 0.0154388
[75]	cv_agg's res: 0.766058 + 0.0165169
[80]	cv_agg's res: 0.766676 + 0.0138415
[85]	cv_agg's res: 0.765655 + 0.0122923
[90]	cv_agg's res: 0.766815 + 0.0149806
[95]	cv_agg's res: 0.767298 + 0.014781
[100]	cv_agg's res: 0.766745 + 0.0156456
[105]	cv_agg's res: 0.768242 + 0.0161285
[110]	cv_agg's res: 0.769164 + 0.0163135
[115]	cv_agg's res: 0.771244 + 0.015379
[120]	cv_agg's res: 0.774123 + 0.0133544
[125]	cv_agg's res: 0.772586 + 0.0147947

{'res-mean': [0.6667307594368429,
  0.7028541020405527,
  0.7237256661690851,
  0.7189180863132393,
  0.7297919346715803,
  0.7370826111284702,
  0.7423839703673902,
  0.7418128706032959,
  0.7453705633069552,
  0.747329146003508,
  0.7489136454436577,
  0.7486214749441285,
  0.7516675575512769,
  0.7500927540082039,
  0.7524221388403095,
  0.7500440477497635,
  0.7491946838392024,
  0.7522741980276223,
  0.7516149174953841,
  0.75410757848539,
  0.7558405640811389,
  0.7571313894897399,
  0.7587939952159003,
  0.7568635793485917,
  0.7575009595668961,
  0.7603536970822743,
  0.7590328367089924,
  0.7640861919308269,
  0.7645223662445494,
  0.764826602402812,
  0.7638764388672378,
  0.7660092548948367,
  0.7635194868142573,
  0.7628393180093025,
  0.7622660618622854,
  0.7651084190028753,
  0.7660067272962509,
  0.7649907480668872,
  0.7639673395964618,
  0.7642771043263031,
  0.7663236225870312,
  0.7655338128294528,
  0.7635258457261943,
  0.7640725209259717,
  0.7650304449448138,
  

## 训练

In [9]:
model =lgb.train(lgb_params,dtrain,feval=evalMetric,verbose_eval=5,num_boost_round=300,valid_sets=[dtrain])

[5]	training's res: 0.891253
[10]	training's res: 0.904689
[15]	training's res: 0.915139
[20]	training's res: 0.928225
[25]	training's res: 0.940162
[30]	training's res: 0.95265
[35]	training's res: 0.959136
[40]	training's res: 0.971477
[45]	training's res: 0.977544
[50]	training's res: 0.983351
[55]	training's res: 0.988803
[60]	training's res: 0.991074
[65]	training's res: 0.996158
[70]	training's res: 0.998205
[75]	training's res: 0.999331
[80]	training's res: 0.999555
[85]	training's res: 0.999777
[90]	training's res: 0.999778
[95]	training's res: 0.999778
[100]	training's res: 0.999778
[105]	training's res: 0.999778
[110]	training's res: 1
[115]	training's res: 1
[120]	training's res: 1
[125]	training's res: 1
[130]	training's res: 1
[135]	training's res: 1
[140]	training's res: 1
[145]	training's res: 1
[150]	training's res: 1
[155]	training's res: 1
[160]	training's res: 1
[165]	training's res: 1
[170]	training's res: 1
[175]	training's res: 1
[180]	training's res: 1
[185]	trai

### 预测

In [10]:
pred=model.predict(test.drop(['uid'],axis=1))

In [11]:
res =pd.DataFrame({'uid':test.uid,'label':pred})


In [13]:
res=res.sort_values(by='label',ascending=False)
res.label=res.label.map(lambda x: 1 if x>=0.5 else 0)
res.label = res.label.map(lambda x: int(x))

In [15]:
res.to_csv('../result/lgb-baseline.csv',index=False,header=False,sep=',',columns=['uid','label'])