In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn import metrics
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
train = pd.read_csv('../data/train_featureV1.csv')
test = pd.read_csv('../data/test_featureV2.csv')

In [3]:
train = train.sample(frac=1).reset_index(drop=True)
train.head()

Unnamed: 0,uid,label,voice_talk_time_count,voice_talk_time_std,voice_talk_time_max,voice_talk_time_min,voice_talk_time_median,voice_talk_time_mean,voice_talk_time_sum,voice_talk_time_more_than_60,...,wa_day_visit_app_42,wa_day_visit_app_43,wa_day_visit_app_44,wa_day_visit_app_45,wa_day_visit_app_max,wa_day_visit_app_min,wa_day_visit_app_mean,wa_day_visit_app_median,wa_day_visit_app_std,wa_day_visit_app_sum
0,u1039,0,375.0,133.351273,1347.0,2.0,37.0,76.069333,28526.0,130.0,...,16.0,13.0,26.0,40.0,52.0,0.0,18.822222,17.0,13.061034,847.0
1,u1622,0,84.0,121.688406,595.0,2.0,45.5,86.833333,7294.0,33.0,...,69.0,50.0,42.0,45.0,71.0,1.0,43.844444,46.0,14.802335,1973.0
2,u2586,0,156.0,7212.485079,90175.0,2.0,64.5,715.801282,111665.0,86.0,...,0.0,0.0,0.0,0.0,6.0,0.0,0.133333,0.0,0.884433,6.0
3,u0633,0,83.0,70.440762,385.0,4.0,31.0,56.39759,4681.0,24.0,...,0.0,0.0,0.0,0.0,27.0,0.0,2.977778,0.0,6.977707,134.0
4,u3396,0,140.0,67.446991,568.0,6.0,45.5,58.064286,8129.0,40.0,...,32.0,37.0,36.0,25.0,51.0,3.0,28.2,28.0,9.705897,1269.0


In [4]:
dtrain = lgb.Dataset(train.drop(['uid','label'],axis=1),label=train.label)
dtest = lgb.Dataset(test.drop(['uid'],axis=1))

In [20]:
lgb_params =  {
    'boosting_type': 'gbdt',
    'objective': 'binary',
#    'metric': ('multi_logloss', 'multi_error'),
    #'metric_freq': 100,
    'is_training_metric': False,
    'min_data_in_leaf': 12,
    'num_leaves': 512,
    'learning_rate': 0.1,
    'feature_fraction': 0.8,
    'bagging_fraction': 0.8,
    'verbosity':-1,
#    'gpu_device_id':2,
#    'device':'gpu'
#    'lambda_l1': 0.001,
#    'skip_drop': 0.95,
#    'max_drop' : 10
    #'lambda_l2': 0.005
    #'num_threads': 18
}    

In [21]:
def evalMetric(preds,dtrain):
    
    label = dtrain.get_label()
    
    
    pre = pd.DataFrame({'preds':preds,'label':label})
    pre = pre.sort_values(by='preds',ascending=False)
    
    auc = metrics.roc_auc_score(pre.label,pre.preds)

    pre.preds=pre.preds.map(lambda x: 1 if x>=0.18 else 0)

    f1 = metrics.f1_score(pre.label,pre.preds)
    
    
    res = 0.6*auc +0.4*f1
    
    return 'res',res,True

### 本地CV

In [22]:
lgb.cv(lgb_params,dtrain,feval=evalMetric,early_stopping_rounds=100,verbose_eval=5,num_boost_round=10000,nfold=3,metrics=['evalMetric'])

[5]	cv_agg's res: 0.651237 + 0.00482366
[10]	cv_agg's res: 0.691149 + 0.00471904
[15]	cv_agg's res: 0.764912 + 0.00611667
[20]	cv_agg's res: 0.782871 + 0.00916835
[25]	cv_agg's res: 0.788782 + 0.00797644
[30]	cv_agg's res: 0.792697 + 0.00657508
[35]	cv_agg's res: 0.796434 + 0.00504437
[40]	cv_agg's res: 0.800787 + 0.00687248
[45]	cv_agg's res: 0.801624 + 0.00746372
[50]	cv_agg's res: 0.801568 + 0.00437073
[55]	cv_agg's res: 0.801346 + 0.0054694
[60]	cv_agg's res: 0.801689 + 0.00421042
[65]	cv_agg's res: 0.802513 + 0.00507893
[70]	cv_agg's res: 0.800884 + 0.00604461
[75]	cv_agg's res: 0.799489 + 0.00431406
[80]	cv_agg's res: 0.799641 + 0.0048531
[85]	cv_agg's res: 0.798873 + 0.00828273
[90]	cv_agg's res: 0.796947 + 0.00760557
[95]	cv_agg's res: 0.79618 + 0.00552813
[100]	cv_agg's res: 0.79544 + 0.00446027
[105]	cv_agg's res: 0.795847 + 0.00444487
[110]	cv_agg's res: 0.796074 + 0.00450408
[115]	cv_agg's res: 0.793334 + 0.00549694
[120]	cv_agg's res: 0.795554 + 0.00626476
[125]	cv_agg's r

{'res-mean': [0.5660320193901337,
  0.6150436717279022,
  0.6370179336209589,
  0.6457990676014224,
  0.6512373017856355,
  0.6551523451828999,
  0.6560129592958739,
  0.6564879793156313,
  0.6571971749801532,
  0.6911489532591227,
  0.719639431565203,
  0.7367771943267069,
  0.7493639328598395,
  0.7596712425933537,
  0.764911721052076,
  0.7695872551374752,
  0.7721079986759477,
  0.7766036262972883,
  0.7801885009148101,
  0.7828711753598937,
  0.782402164141903,
  0.7850709398150327,
  0.7866172797758609,
  0.7871261716992155,
  0.7887820822358075,
  0.7899299851659407,
  0.7907420345407717,
  0.7908452052882611,
  0.7920435869469659,
  0.7926972776051101,
  0.7948187312035011,
  0.794956164885182,
  0.7981184331074423,
  0.7976666103659316,
  0.7964335866579133,
  0.7969397686008213,
  0.7978258896453113,
  0.7993893384478228,
  0.8006560730720498,
  0.8007868611677619,
  0.8009657594368402,
  0.8010391847519501,
  0.7999466541304349,
  0.8017690982069835,
  0.8016235368858103,
  

## 训练

In [23]:
model =lgb.train(lgb_params,dtrain,feval=evalMetric,verbose_eval=5,num_boost_round=300,valid_sets=[dtrain])

[5]	training's res: 0.719201
[10]	training's res: 0.759907
[15]	training's res: 0.879368
[20]	training's res: 0.930846
[25]	training's res: 0.96511
[30]	training's res: 0.984
[35]	training's res: 0.992585
[40]	training's res: 0.998892
[45]	training's res: 1
[50]	training's res: 1
[55]	training's res: 1
[60]	training's res: 1
[65]	training's res: 1
[70]	training's res: 1
[75]	training's res: 1
[80]	training's res: 1
[85]	training's res: 1
[90]	training's res: 1
[95]	training's res: 1
[100]	training's res: 1
[105]	training's res: 1
[110]	training's res: 1
[115]	training's res: 1
[120]	training's res: 1
[125]	training's res: 1
[130]	training's res: 1
[135]	training's res: 1
[140]	training's res: 1
[145]	training's res: 1
[150]	training's res: 1
[155]	training's res: 1
[160]	training's res: 1
[165]	training's res: 1
[170]	training's res: 1
[175]	training's res: 1
[180]	training's res: 1
[185]	training's res: 1
[190]	training's res: 1
[195]	training's res: 1
[200]	training's res: 1
[205]	tr

### 预测

In [24]:
pred=model.predict(test.drop(['uid'],axis=1))

In [25]:
res =pd.DataFrame({'uid':test.uid,'label':pred})


In [26]:
res=res.sort_values(by='label',ascending=False)
res.label=res.label.map(lambda x: 1 if x>=0.18 else 0)
res.label = res.label.map(lambda x: int(x))

In [27]:
res.to_csv('../resultB/lgb0_18_2.csv',index=False,header=False,sep=',',columns=['uid','label'])