In [1]:
import numpy as np
import pandas as pd
import scipy as sp
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, f1_score

import xgboost as xgb

## wa features
1. 访问网站的中的个数， wa_all_cnt
2. 不同网站的次数， wa_all_unique_cnt
3. visit_cnt的，sum, mean, std, max, min, skew
4. visit_dura
5. up_flow
6. down_flow
7. down_flow - up_flow = down_up_flow_diff
8. wa_type, 分布统计
9. date的分布个数，以及对应的不同次数，up_flow, down_flow, visit_dura, visit_cnt的统计值

In [2]:
df_train_wa = pd.read_csv('../data/train/wa_train.txt',sep='\t',low_memory=False)
df_train_label = pd.read_csv('../data/train/uid_train.txt',sep='\t',low_memory=False)

df_testA_wa = pd.read_csv('../data/testA/wa_test_a.txt',sep='\t',low_memory=False)

df_testB_wa = pd.read_csv('../data/testB/wa_test_b.txt',sep='\t',low_memory=False)

In [3]:
df_train_wa.info()
df_train_wa.head()




<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4808343 entries, 0 to 4808342
Data columns (total 8 columns):
uid           object
wa_name       object
visit_cnt     float64
visit_dura    float64
up_flow       float64
down_flow     float64
wa_type       float64
date          float64
dtypes: float64(6), object(2)
memory usage: 293.5+ MB


Unnamed: 0,uid,wa_name,visit_cnt,visit_dura,up_flow,down_flow,wa_type,date
0,u0001,155导航,5.0,207.0,313.0,457.0,0.0,14.0
1,u0001,155导航,7.0,396.0,547.0,659.0,0.0,4.0
2,u0001,155导航,10.0,3212.0,781.0,941.0,0.0,12.0
3,u0001,155导航,14.0,723.0,1094.0,1318.0,0.0,10.0
4,u0001,155导航,18.0,990.0,1406.0,1694.0,0.0,11.0


In [4]:
wa_name_risk = list(set(df_train_wa[df_train_wa['uid'].isin(df_train_label[df_train_label['label']==1]['uid'])]['wa_name']))


In [16]:
def get_wa_feature_plus(df_train_wa, target='train', Type=None):
    if target == 'train':
        # 复制lable的数据，作为所有的特征的标示
        df_train = df_train_label.copy()
    else:
        if Type == 'A':
            df_train = pd.DataFrame(data={'uid':['u'+str(id) for id in range(5000, 7000)]})
        else:
            df_train = pd.DataFrame(data={'uid':['u'+str(id) for id in range(7000, 10000)]})
    
    df_train_wa['date'].fillna(0,inplace=True)
    df_train_wa['date'] = df_train_wa['date'].astype(int)
    
    df_train_wa['down_up_flow_diff'] = df_train_wa['down_flow'] - df_train_wa['up_flow']
    
    df_train_wa['visit_dura_div_visit_cnt'] = df_train_wa['visit_dura'] / df_train_wa['visit_cnt']
    df_train_wa['up_flow_div_visit_cnt'] = df_train_wa['up_flow'] / df_train_wa['visit_cnt']
    df_train_wa['down_flow_div_visit_cnt'] = df_train_wa['down_flow'] / df_train_wa['visit_cnt']
    
    # 总的通话次数
    df_tmp = pd.DataFrame(df_train_wa.groupby('uid',as_index=True)['wa_name'].count())
    df_tmp.columns = ['wa_all_cnt']
    df_tmp['uid'] = df_tmp.index
    df_train = pd.merge(df_train, df_tmp, on='uid', how='left')
    
    # 联系最多和最少的次数的wa_name
    tmp = df_train_wa.groupby('uid',as_index=True)['wa_name']
    wa_all_wa_name_many_wite = []
    wa_all_wa_name_little_wite = []
    uids = []
    for uid, values in tmp:
        uids.append(uid)
        if len(values.value_counts()) == 0:
            wa_all_wa_name_many_wite.append(np.nan)
            wa_all_wa_name_little_wite.append(np.nan)
            continue
        wa_all_wa_name_many_wite.append(values.value_counts().index[0])
        wa_all_wa_name_little_wite.append(values.value_counts().index[-1])
    
    df_tmp = pd.DataFrame(data={'uid':uids, 'wa_all_wa_name_many_wite':wa_all_wa_name_many_wite, 
                                'wa_all_wa_name_little_wite':wa_all_wa_name_little_wite})
    df_train = pd.merge(df_train, df_tmp, on='uid', how='left')
    
    # 总的通话的对端的不重复的个数
    tmp = df_train_wa.groupby('uid',as_index=True)['wa_name'].unique()
    uids = tmp.index
    opp_nums = []
    for opp_num in tmp:
        opp_nums.append(len(opp_num))
    df_tmp = pd.DataFrame(data={'uid':uids, 'wa_all_unique_cnt':opp_nums})
    df_train = pd.merge(df_train, df_tmp, on='uid', how='left')
    
    # 通话次数 / 人数的比例，每个人通话的次数， wa_all_per_opp_rate
    df_train['wa_all_per_opp_rate'] = df_train['wa_all_cnt'] / df_train['wa_all_unique_cnt']
            
    # wa_type 分布
    df_tmp = df_train_wa.groupby('uid',as_index=True)['wa_type'].value_counts().unstack()
    df_tmp.columns = ['wa_all_in_out_'+str(i) for i in range(2)]
    df_tmp['uid'] = df_tmp.index
    df_tmp.fillna(0,inplace=True)
    df_train = pd.merge(df_train, df_tmp, on='uid', how='left')
    
    # in_out 的比例
    for feat in ['wa_all_in_out_'+str(i) for i in range(2)]:
        df_train[feat+'_rate'] = df_train[feat] / df_train['wa_all_cnt']

    # day的分布，和比例，注意天的粒度很大，只需使用start end一种即可
    df_tmp = df_train_wa.groupby('uid',as_index=True)['date'].value_counts().unstack()[[i for i in range(1,46)]]
    df_tmp.columns = ['wa_all_date_day_'+str(i) for i in range(1,46)]
    df_tmp['uid'] = df_tmp.index
    df_tmp.fillna(0,inplace=True)
    df_train = pd.merge(df_train, df_tmp, on='uid', how='left')
    for feat in ['wa_all_date_day_'+str(i) for i in range(1,46)]:
        df_train[feat+'_rate'] = df_train[feat] / df_train['wa_all_cnt']
    
    
    # visit_cnt, visit_dura, up_flow, down_flow, down_up_flow_diff, stats_values
    for feat in ['visit_cnt', 'visit_dura', 'up_flow', 'down_flow', 'down_up_flow_diff','visit_dura_div_visit_cnt','up_flow_div_visit_cnt','down_flow_div_visit_cnt']:
        # sum 
        df_tmp = pd.DataFrame(df_train_wa.groupby('uid',as_index=True)[feat].sum())
        df_tmp.columns = ['wa_all_'+feat+'_sum']
        df_tmp['uid'] = df_tmp.index
        df_train = pd.merge(df_train, df_tmp, on='uid', how='left')

        # meam
        df_tmp = pd.DataFrame(df_train_wa.groupby('uid',as_index=True)[feat].mean())
        df_tmp.columns = ['wa_all_'+feat+'_avg']
        df_tmp['uid'] = df_tmp.index
        df_train = pd.merge(df_train, df_tmp, on='uid', how='left')

        # max
        df_tmp = pd.DataFrame(df_train_wa.groupby('uid',as_index=True)[feat].max())
        df_tmp.columns = ['wa_all_'+feat+'_max']
        df_tmp['uid'] = df_tmp.index
        df_train = pd.merge(df_train, df_tmp, on='uid', how='left')
        # min
        df_tmp = pd.DataFrame(df_train_wa.groupby('uid',as_index=True)[feat].min())
        df_tmp.columns = ['wa_all_'+feat+'_min']
        df_tmp['uid'] = df_tmp.index
        df_train = pd.merge(df_train, df_tmp, on='uid', how='left')
        # std
        df_tmp = pd.DataFrame(df_train_wa.groupby('uid',as_index=True)[feat].std())
        df_tmp.columns = ['wa_all_'+feat+'_std']
        df_tmp['uid'] = df_tmp.index
        df_train = pd.merge(df_train, df_tmp, on='uid', how='left')
        # skew 
        df_tmp = pd.DataFrame(df_train_wa.groupby('uid',as_index=True)[feat].skew())
        df_tmp.columns = ['wa_all_'+feat+'_skew']
        df_tmp['uid'] = df_tmp.index
        df_train = pd.merge(df_train, df_tmp, on='uid', how='left')
        df_train['wa_all_'+feat+'_jc'] = df_train['wa_all_'+feat+'_max'] - df_train['wa_all_'+feat+'_min']
        df_train['wa_all_'+feat+'_fd'] = df_train['wa_all_'+feat+'_std'] / df_train['wa_all_'+feat+'_avg']
    
    # 交叉特征
    for feat in ['sum','avg','max','min','std','skew']:
#         print feat
        df_train['wa_all_visit_dura_cnt_%s_rate' % feat] = df_train['wa_all_visit_dura_%s' % feat] / df_train['wa_all_visit_cnt_%s' % feat]
        df_train['wa_all_up_flow_cnt_%s_rate' % feat] = df_train['wa_all_up_flow_%s' % feat] / df_train['wa_all_visit_cnt_%s' % feat]
        df_train['wa_all_down_flow_cnt_%s_rate' % feat] = df_train['wa_all_down_flow_%s' % feat] / df_train['wa_all_visit_cnt_%s' % feat]
        df_train['wa_all_down_up_flow_diff_cnt_%s_rate' % feat] = df_train['wa_all_down_up_flow_diff_%s' % feat] / df_train['wa_all_visit_cnt_%s' % feat]

        df_train['wa_all_up_flow_dura_%s_rate' % feat] = df_train['wa_all_up_flow_%s' % feat] / df_train['wa_all_visit_dura_%s' % feat]
        df_train['wa_all_down_flow_dura_%s_rate' % feat] = df_train['wa_all_down_flow_%s' % feat] / df_train['wa_all_visit_dura_%s' % feat]
        df_train['wa_all_down_up_flow_diff_dura_%s_rate' % feat] = df_train['wa_all_down_up_flow_diff_%s' % feat] / df_train['wa_all_visit_dura_%s' % feat]
        
        df_train['wa_all_visit_cnt_all_cnt_%s_rate' % feat] = df_train['wa_all_visit_cnt_%s' % feat] / df_train['wa_all_cnt']
        df_train['wa_all_visit_dura_all_cnt_%s_rate' % feat] = df_train['wa_all_visit_dura_%s' % feat] / df_train['wa_all_cnt']
        df_train['wa_all_up_flow_all_cnt_%s_rate' % feat] = df_train['wa_all_up_flow_%s' % feat] / df_train['wa_all_cnt']
        df_train['wa_all_down_flow_all_cnt_%s_rate' % feat] = df_train['wa_all_down_flow_%s' % feat] / df_train['wa_all_cnt']
        df_train['wa_all_down_up_flow_diff_all_cnt_%s_rate' % feat] = df_train['wa_all_down_up_flow_diff_%s' % feat] / df_train['wa_all_cnt']

    
    
    
    # wa_type in visit_cnt, visit_dura, up_flow, down_flow, down_up_flow_diff, --> stats_values
    # prob 其对应的没有分类的比例
    for feat in ['visit_cnt', 'visit_dura', 'up_flow', 'down_flow', 'down_up_flow_diff']:
        # 计算所有的比例
        # sum
        df_tmp = df_train_wa.groupby(['uid','wa_type'])[feat].sum().unstack() 
        df_tmp.columns = ['wa_all_'+feat+'_wa_type_' + str(i) + '_sum' for i in range(2)]
        df_tmp['uid'] = df_tmp.index
        df_train = pd.merge(df_train, df_tmp, on='uid', how='left')

        # mean
        df_tmp = df_train_wa.groupby(['uid','wa_type'])[feat].mean().unstack() 
        df_tmp.columns = ['wa_all_'+feat+'_wa_type_' + str(i) + '_avg' for i in range(2)]
        df_tmp['uid'] = df_tmp.index
        df_train = pd.merge(df_train, df_tmp, on='uid', how='left')

        # max
        df_tmp = df_train_wa.groupby(['uid','wa_type'])[feat].max().unstack() 
        df_tmp.columns = ['wa_all_'+feat+'_wa_type_' + str(i) + '_max' for i in range(2)]
        df_tmp['uid'] = df_tmp.index
        df_train = pd.merge(df_train, df_tmp, on='uid', how='left')
        # min
        df_tmp = df_train_wa.groupby(['uid','wa_type'])[feat].min().unstack() 
        df_tmp.columns = ['wa_all_'+feat+'_wa_type_' + str(i) + '_min' for i in range(2)]
        df_tmp['uid'] = df_tmp.index
        df_train = pd.merge(df_train, df_tmp, on='uid', how='left')
#         std
        df_tmp = df_train_wa.groupby(['uid','wa_type'])[feat].std().unstack() 
        df_tmp.columns = ['wa_all_'+feat+'_wa_type_' + str(i) + '_std' for i in range(2)]
        df_tmp['uid'] = df_tmp.index
        df_train = pd.merge(df_train, df_tmp, on='uid', how='left')
        # skew
        df_tmp = df_train_wa.groupby(['uid','wa_type'])[feat].skew().unstack() 
        df_tmp.columns = ['wa_all_'+feat+'_wa_type_' + str(i) + '_skew' for i in range(2)]
        df_tmp['uid'] = df_tmp.index
        df_train = pd.merge(df_train, df_tmp, on='uid', how='left')
        
        # wa_type的分来的总数 / 所有的sum，
        df_train['wa_all_'+feat+'_wa_type_' + "_all_multify"] = [1 for i in range(len(df_train))]
        
        for f in ['wa_all_'+feat+'_wa_type_' + str(i) + '_sum' for i in range(2)]:
            df_train[f+'_rate'] = df_train[f] / df_train['wa_all_'+feat+'_sum']  
            df_train['wa_all_'+feat+'_wa_type_' +  "_all_multify"] = df_train['wa_all_'+feat+'_wa_type_' + "_all_multify"] * df_train[f]
        
        # wa_type的分来的avg / 所有的avg，
        for f in ['wa_all_'+feat+'_wa_type_' + str(i) + '_avg' for i in range(2)]:
            df_train[f+'_rate'] = df_train[f] / df_train['wa_all_'+feat+'_avg']  
            df_train['wa_all_'+feat+'_wa_type_' + "_all_multify"] = df_train['wa_all_'+feat+'_wa_type_' + "_all_multify"] * df_train[f]

        # wa_type的分来的std / 所有的std，
        for f in ['wa_all_'+feat+'_wa_type_' + str(i) + '_std' for i in range(2)]:
            df_train[f+'_rate'] = df_train[f] / df_train['wa_all_'+feat+'_std']  
            df_train['wa_all_'+feat+'_wa_type_' + "_all_multify"] = df_train['wa_all_'+feat+'_wa_type_' + "_all_multify"] * df_train[f]
        
        # wa_type的分来的skew / 所有的skew，
        for f in ['wa_all_'+feat+'_wa_type_' + str(i) + '_skew' for i in range(2)]:
            df_train[f+'_rate'] = df_train[f] / df_train['wa_all_'+feat+'_skew']  
            df_train['wa_all_'+feat+'_wa_type_' +  "_all_multify"] = df_train['wa_all_'+feat+'_wa_type_' + "_all_multify"] * df_train[f]
    
    
    
    
    # 根据date进行分类的统计值，以及比例
    # count
    df_tmp = df_train_wa.groupby(['uid','date'])['date'].count().unstack()[[i for i in range(1,46)]]
    df_tmp.columns = ['wa_all_date_' + str(i) + '_count' for i in range(1,46)]
    df_tmp['uid'] = df_tmp.index
    
    # 横行分布的统计值
    df_tmp['wa_all_date_count_avg'] = df_tmp[['wa_all_date_' + str(i) + '_count' for i in range(1,46)]].mean(axis=1)
    df_tmp['wa_all_date_count_std'] = df_tmp[['wa_all_date_' + str(i) + '_count' for i in range(1,46)]].std(axis=1)
    df_tmp['wa_all_date_count_skew'] = df_tmp[['wa_all_date_' + str(i) + '_count' for i in range(1,46)]].skew(axis=1)
    df_tmp['wa_all_date_count_max'] = df_tmp[['wa_all_date_' + str(i) + '_count' for i in range(1,46)]].max(axis=1)
    df_tmp['wa_all_date_count_min'] = df_tmp[['wa_all_date_' + str(i) + '_count' for i in range(1,46)]].min(axis=1)
    
    df_tmp['wa_all_date_count_jc'] = df_tmp['wa_all_date_count_max'] - df_tmp['wa_all_date_count_min']
    df_tmp['wa_all_date_count_fb'] = df_tmp['wa_all_date_count_std'] - df_tmp['wa_all_date_count_avg']
    
    
    df_train = pd.merge(df_train, df_tmp, on='uid', how='left')

    
    
    # date in visit_cnt, visit_dura, up_flow, down_flow, down_up_flow_diff, --> stats_values
    # prob 其对应的没有分类的比例
    for feat in ['visit_cnt', 'visit_dura', 'up_flow', 'down_flow', 'down_up_flow_diff']:
        # 计算所有的比例
        
        # sum
        df_tmp = df_train_wa.groupby(['uid','date'])[feat].sum().unstack()[[i for i in range(1,46)]]
        df_tmp.columns = ['wa_all_'+feat+'_date_' + str(i) + '_sum' for i in range(1,46)]
        df_tmp['uid'] = df_tmp.index
        
        
        # 横行分布的统计值
        df_tmp['wa_all_'+feat+'_date_sum_avg'] = df_tmp[['wa_all_'+feat+'_date_' + str(i) + '_sum' for i in range(1,46)]].mean(axis=1)
        df_tmp['wa_all_'+feat+'_date_sum_std'] = df_tmp[['wa_all_'+feat+'_date_' + str(i) + '_sum' for i in range(1,46)]].std(axis=1)
        df_tmp['wa_all_'+feat+'_date_sum_skew'] = df_tmp[['wa_all_'+feat+'_date_' + str(i) + '_sum' for i in range(1,46)]].skew(axis=1)
        df_tmp['wa_all_'+feat+'_date_sum_max'] = df_tmp[['wa_all_'+feat+'_date_' + str(i) + '_sum' for i in range(1,46)]].max(axis=1)
        df_tmp['wa_all_'+feat+'_date_sum_min'] = df_tmp[['wa_all_'+feat+'_date_' + str(i) + '_sum' for i in range(1,46)]].min(axis=1)

        df_tmp['wa_all_'+feat+'_date_sum_jc'] = df_tmp['wa_all_'+feat+'_date_sum_max'] - df_tmp['wa_all_'+feat+'_date_sum_min']
        df_tmp['wa_all_'+feat+'_date_sum_fb'] = df_tmp['wa_all_'+feat+'_date_sum_std'] - df_tmp['wa_all_'+feat+'_date_sum_avg']

        
        df_train = pd.merge(df_train, df_tmp, on='uid', how='left')
        
        

#         # mean
#         df_tmp = df_train_wa.groupby(['uid','date'])[feat].mean().unstack()[[i for i in range(1,46)]]
#         df_tmp.columns = ['wa_all_'+feat+'_date_' + str(i) + '_avg' for i in range(1,46)]
#         df_tmp['uid'] = df_tmp.index
#         df_train = pd.merge(df_train, df_tmp, on='uid', how='left')
#         # std
#         df_tmp = df_train_wa.groupby(['uid','date'])[feat].std().unstack()[[i for i in range(1,46)]] 
#         df_tmp.columns = ['wa_all_'+feat+'_date_' + str(i) + '_std' for i in range(1,46)]
#         df_tmp['uid'] = df_tmp.index
#         df_train = pd.merge(df_train, df_tmp, on='uid', how='left')
#         # skew
#         df_tmp = df_train_wa.groupby(['uid','date'])[feat].skew().unstack()[[i for i in range(1,46)]] 
#         df_tmp.columns = ['wa_all_'+feat+'_date_' + str(i) + '_skew' for i in range(1,46)]
#         df_tmp['uid'] = df_tmp.index
#         df_train = pd.merge(df_train, df_tmp, on='uid', how='left')
        
        # date的分来的总数 / 所有的sum，
        for f in ['wa_all_'+feat+'_date_' + str(i) + '_sum' for i in range(1,46)]:
            df_train[f+'_rate'] = df_train[f] / df_train['wa_all_'+feat+'_sum']  
        
#         # date的分来的avg / 所有的avg，
#         for f in ['wa_all_'+feat+'_date_' + str(i) + '_avg' for i in range(1,46)]:
#             df_train[f+'_rate'] = df_train[f] / df_train['wa_all_'+feat+'_avg']  

#         # date的分来的std / 所有的std，
#         for f in ['wa_all_'+feat+'_date_' + str(i) + '_std' for i in range(1,46)]:
#             df_train[f+'_rate'] = df_train[f] / df_train['wa_all_'+feat+'_std']  
        
#         # date的分来的skew / 所有的skew，
#         for f in ['wa_all_'+feat+'_date_' + str(i) + '_skew' for i in range(1,46)]:
#             df_train[f+'_rate'] = df_train[f] / df_train['wa_all_'+feat+'_skew']  

    
    return df_train
    


In [15]:
df_test = get_wa_feature_plus(df_testB_wa, target='test', Type='B')
df_testA = get_wa_feature_plus(df_testA_wa, target='test', Type='A')

df_train = get_wa_feature_plus(df_train_wa)

# ID
# df_test['id'] = df_test['uid'].apply(lambda x: int(x[1:]))
# df_train['id'] = df_train['uid'].apply(lambda x: int(x[1:]))

df_train['wa_all_wa_name_little_wite_risk'] = df_train['wa_all_wa_name_little_wite'].isin(wa_name_risk)
df_train['wa_all_wa_name_many_wite_risk'] = df_train['wa_all_wa_name_many_wite'].isin(wa_name_risk)

df_test['wa_all_wa_name_little_wite_risk'] = df_test['wa_all_wa_name_little_wite'].isin(wa_name_risk)
df_test['wa_all_wa_name_many_wite_risk'] = df_test['wa_all_wa_name_many_wite'].isin(wa_name_risk)

df_train.fillna(0,inplace=True)
df_test.fillna(0,inplace=True)


df_train.to_csv('../xdata/df_train_wa_feat.csv',index=False)
df_test.to_csv('../xdata/df_testB_wa_feat.csv',index=False)
df_testA.to_csv('../xdata/df_testA_wa_feat.csv',index=False)


df_train.info()
df_test.info()

df_test.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4999 entries, 0 to 4998
Columns: 881 entries, uid to wa_all_wa_name_many_wite_risk
dtypes: bool(2), float64(873), int64(3), object(3)
memory usage: 33.6+ MB
<class 'pandas.core.frame.DataFrame'>
Int64Index: 3000 entries, 0 to 2999
Columns: 880 entries, uid to wa_all_wa_name_many_wite_risk
dtypes: bool(2), float64(873), int64(2), object(3)
memory usage: 20.1+ MB


Unnamed: 0,uid,wa_all_cnt,wa_all_wa_name_little_wite,wa_all_wa_name_many_wite,wa_all_unique_cnt,wa_all_per_opp_rate,wa_all_in_out_0,wa_all_in_out_1,wa_all_in_out_0_rate,wa_all_in_out_1_rate,...,wa_all_down_up_flow_diff_date_38_sum_rate,wa_all_down_up_flow_diff_date_39_sum_rate,wa_all_down_up_flow_diff_date_40_sum_rate,wa_all_down_up_flow_diff_date_41_sum_rate,wa_all_down_up_flow_diff_date_42_sum_rate,wa_all_down_up_flow_diff_date_43_sum_rate,wa_all_down_up_flow_diff_date_44_sum_rate,wa_all_down_up_flow_diff_date_45_sum_rate,wa_all_wa_name_little_wite_risk,wa_all_wa_name_many_wite_risk
0,u7000,1817,amazon云服务,百度地图,220,8.259091,1483.0,334.0,0.816181,0.183819,...,0.004838,0.003467,0.035108,0.020812,0.038114,0.002371,0.043478,0.038492,True,True
1,u7001,186,腾讯手机管家,DNS查询流量,46,4.043478,161.0,25.0,0.865591,0.134409,...,0.0,0.003857,0.429139,0.018449,0.010784,0.001612,0.202429,0.211472,True,True
2,u7002,396,腾讯云平台,微信,98,4.040816,316.0,80.0,0.79798,0.20202,...,0.000956,0.00011,0.014388,0.002861,0.001792,0.003969,0.000141,0.001405,True,True
3,u7003,1115,Mob官网,支付宝,145,7.689655,943.0,172.0,0.84574,0.15426,...,0.008828,0.000713,0.025455,0.010085,0.093552,0.005682,0.000103,0.002463,True,True
4,u7004,15,WAP上网（get）,WAP上网连接流量,2,7.5,15.0,0.0,1.0,0.0,...,0.0,0.0,-1.228319,0.646018,-0.219469,1.362832,0.0,0.0,True,True


In [8]:
seed = 71

np.random.seed(seed)
valid_size = 0.2
LOOP = 1
ESR = 50
# XGB param
nround = 3000
#nround = 10

param = {'max_depth':5, # 基准是5 
         'eta':0.05,
         'gamma ':0.1,
         'colsample_bytree':0.85, # old 0.8
         'subsample':0.85,
         'silent':1,
         'eval_metric':'auc',
         'objective':'binary:logistic',
#          'scale_pos_weight':5,
         'seed': seed
        }

# 设置特征数据，去除id数据，不能进行预测
features = df_test.columns
features = list(features)
features.remove('uid')
features.remove('wa_all_wa_name_little_wite')
features.remove('wa_all_wa_name_many_wite')

label = 'label'

print len(features)

877


In [9]:
def split_train_valid(df_train,test_size=0.2):
    '''
    k-fold交叉验证,默认k=10
    df_train:训练数据
    '''
    X_train, X_vali, y_train, y_vali = train_test_split(df_train[features], df_train[label], test_size=test_size, random_state=40000)
    #added some parameters
    
#     dtrain = df_train.iloc[train_list]
#     dvali =  df_train.iloc[vali_list]
    
    dtrain = xgb.DMatrix(X_train,label=y_train)
    dvalid = xgb.DMatrix(X_vali,label=y_vali)
    watchlist = [(dtrain, 'train'),(dvalid, 'valid')]
    
    return dtrain, dvalid, watchlist

In [10]:
models = []
seeds = [71,73,91]
for i in range(LOOP):
    print('LOOP',i)
#     dbuild, dvalid, watchlist = split_build_valid(df_train)
    dbuild, dvalid, watchlist = split_train_valid(df_train,test_size=0.2)
    param['seed'] = seeds[i]
    model = xgb.train(param, dbuild, nround, watchlist,early_stopping_rounds=ESR,verbose_eval=20)
    models.append(model)
#     model.save_model('./model1'+ str(i) + '.model')
    # VALID
    valid_yhat = model.predict(dvalid,ntree_limit=model.best_iteration)
    print('Valid Mean:---------------------->', np.mean(valid_yhat))
    del dbuild, dvalid, watchlist


('LOOP', 0)
[0]	train-auc:0.716621	valid-auc:0.626092
Multiple eval metrics have been passed: 'valid-auc' will be used for early stopping.

Will train until valid-auc hasn't improved in 50 rounds.
[20]	train-auc:0.890585	valid-auc:0.664315
[40]	train-auc:0.93267	valid-auc:0.673614
[60]	train-auc:0.963007	valid-auc:0.678996
[80]	train-auc:0.978136	valid-auc:0.682833
[100]	train-auc:0.987793	valid-auc:0.680211
[120]	train-auc:0.992289	valid-auc:0.67936
Stopping. Best iteration:
[83]	train-auc:0.980271	valid-auc:0.685435

('Valid Mean:---------------------->', 0.17722413)


In [11]:
# 计算特征重要程度
import operator
importance = model.get_fscore()
importance = sorted(importance.items(), key=operator.itemgetter(1),reverse=True)

print importance

[('wa_all_in_out_0_rate', 38), ('wa_all_visit_cnt_all_cnt_std_rate', 27), ('wa_all_visit_dura_div_visit_cnt_avg', 22), ('wa_all_date_count_std', 21), ('wa_all_up_flow_wa_type_0_skew_rate', 21), ('wa_all_visit_cnt_date_sum_skew', 21), ('wa_all_up_flow_date_42_sum_rate', 20), ('wa_all_date_day_35_rate', 17), ('wa_all_visit_dura_div_visit_cnt_skew', 16), ('wa_all_visit_dura_cnt_sum_rate', 16), ('wa_all_visit_dura_cnt_std_rate', 15), ('wa_all_visit_cnt_wa_type_0_skew_rate', 14), ('wa_all_visit_dura_date_sum_skew', 14), ('wa_all_visit_dura_cnt_max_rate', 14), ('wa_all_down_up_flow_diff_date_39_sum', 14), ('wa_all_down_flow_all_cnt_avg_rate', 13), ('wa_all_visit_dura_date_44_sum_rate', 13), ('wa_all_date_day_16_rate', 13), ('wa_all_date_day_43_rate', 13), ('wa_all_down_flow_div_visit_cnt_fd', 12), ('wa_all_down_flow_wa_type_0_skew_rate', 12), ('wa_all_up_flow_div_visit_cnt_avg', 12), ('wa_all_visit_dura_cnt_skew_rate', 12), ('wa_all_visit_dura_div_visit_cnt_fd', 12), ('wa_all_down_flow_all_c