In [1]:
import numpy as np
import pandas as pd
import scipy as sp
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, f1_score

import xgboost as xgb

## voice features
#### 先对所有的时间进行特征提取
1. 用户电话的总的通话次数 opp
2. 通话的人数，voice_all_unique_cnt
3. 通话次数 / 人数的比例， voice_all_cnt_all_unique_cnt_rate
4. 对端电话的前n位的个数，所有的不同号码的个数。 opp_head
5. 对端号码长度的分布个数   opp_len
6. 通话最大时长，平均时长，最小时长，极差时长等统计的信息  start_time, end_time
7. 通话类型的分布个数或者比例 call_type
8. 通话类型的分布个数和比例   in_out


## 两种思路
1. 自己原来的思路
2. 使用新的思路

In [2]:
df_train_voice = pd.read_csv('../data/train/voice_train.txt',sep='\t',low_memory=False)
df_train_label = pd.read_csv('../data/train/uid_train.txt',sep='\t',low_memory=False)


In [3]:
def get_voice_feature_plus(df_train_voice, target='train', Type=None):
    if target == 'train':
        # 复制lable的数据，作为所有的特征的标示
        df_train = df_train_label.copy()
    else:
        if Type == 'A':
            df_train = pd.DataFrame(data={'uid':['u'+str(id) for id in range(5000, 7000)]})
        else:
            df_train = pd.DataFrame(data={'uid':['u'+str(id) for id in range(7000, 10000)]})
        
    
    
    df_train_voice['start_time_day'] = df_train_voice['start_time'].apply(lambda x: x / 1000000)
    df_train_voice['start_time_hour'] = df_train_voice['start_time'].apply(lambda x: x / 10000 % 100)
    df_train_voice['start_time_minute'] = df_train_voice['start_time'].apply(lambda x: x / 100 % 100)
    df_train_voice['start_time_second'] = df_train_voice['start_time'].apply(lambda x: x % 100)

    df_train_voice['end_time_day'] = df_train_voice['end_time'].apply(lambda x: x / 1000000)
    df_train_voice['end_time_hour'] = df_train_voice['end_time'].apply(lambda x: x / 10000 % 100)
    df_train_voice['end_time_minute'] = df_train_voice['end_time'].apply(lambda x: x / 100 % 100)
    df_train_voice['end_time_second'] = df_train_voice['end_time'].apply(lambda x: x % 100)


    df_train_voice['diff_time'] = get_diff_time(df_train_voice[['start_time','end_time']].values)
    
    
    # 总的通话次数
    df_tmp = pd.DataFrame(df_train_voice.groupby('uid',as_index=True)['opp_num'].count())
    df_tmp.columns = ['voice_all_cnt']
    df_tmp['uid'] = df_tmp.index
    df_train = pd.merge(df_train, df_tmp, on='uid', how='left')
    
    # 总的通话的对端的不重复的个数
    tmp = df_train_voice.groupby('uid',as_index=True)['opp_num'].unique()
    uids = tmp.index
    opp_nums = []
    for opp_num in tmp:
        opp_nums.append(len(opp_num))
    df_tmp = pd.DataFrame(data={'uid':uids, 'voice_all_unique_cnt':opp_nums})
    df_train = pd.merge(df_train, df_tmp, on='uid', how='left')
    
    # 通话次数 / 人数的比例，每个人通话的次数， voice_all_per_opp_rate
    df_train['voice_all_per_opp_rate'] = df_train['voice_all_cnt'] / df_train['voice_all_unique_cnt']
    
    
    # 4. 对端电话的前n位的个数，所有的不同号码的个数以及其所有的分布个数和比例(部分特征待定)。 opp_head_cnt_{k}, opp_head_rate_{k}
    # 全部的不同开头的次数,唯一的标示
    tmp = df_train_voice.groupby('uid',as_index=True)['opp_head'].unique()
    uids = tmp.index
    opp_nums = []
    for opp_num in tmp:
        opp_nums.append(len(opp_num))
    df_tmp = pd.DataFrame(data={'uid':uids, 'voice_all_opp_head_unique_cnt':opp_nums})
    df_train = pd.merge(df_train, df_tmp, on='uid', how='left')
    
    # 联系最多和最少的次数的opp_head
    tmp = df_train_voice.groupby('uid',as_index=True)['opp_head']
    voice_all_opp_head_many_head = []
    voice_all_opp_head_many_head_cnt = []
    voice_all_opp_head_little_head = []
#     voice_all_opp_head_little_head_cnt = []
    
    uids = []
    for uid, values in tmp:
        uids.append(uid)
        voice_all_opp_head_many_head.append(values.value_counts().index[0])
        voice_all_opp_head_little_head.append(values.value_counts().index[-1])

        voice_all_opp_head_many_head_cnt.append(values.value_counts().values[0])
#         voice_all_opp_head_little_head_cnt.append(values.value_counts().values[-1])
        

    df_tmp = pd.DataFrame(data={'uid':uids, 'voice_all_opp_head_many_head':voice_all_opp_head_many_head, 
                                'voice_all_opp_head_little_head':voice_all_opp_head_little_head,
                                'voice_all_opp_head_many_head_cnt':voice_all_opp_head_many_head_cnt})
    
    df_train = pd.merge(df_train, df_tmp, on='uid', how='left')
    df_train['voice_all_opp_head_many_head_cnt_rate'] = df_train['voice_all_opp_head_many_head_cnt'] / df_train['voice_all_cnt']
    df_train['voice_all_opp_head_many_head_cnt_rate_unique'] = df_train['voice_all_opp_head_many_head_cnt'] / df_train['voice_all_unique_cnt']
    
    # 通话最多的head的个数
    df_tmp = pd.DataFrame(df_train_voice.groupby('uid',as_index=True)['opp_len'].value_counts().unstack())[[3,5,6,7,8,9,10,11,12,13,14,15,16,17,19,20]]
    df_tmp.columns = ['voice_all_opp_len_'+str(k) for k in [3,5,6,7,8,9,10,11,12,13,14,15,16,17,19,20]]
    df_tmp['uid'] = df_tmp.index
    df_tmp.fillna(0,inplace=True)
    df_train = pd.merge(df_train, df_tmp, on='uid', how='left')
    df_train['voice_all_opp_len_11_rate'] = df_train['voice_all_opp_len_11'] / df_train['voice_all_cnt']
    
    # 最近一次通话的号码的长度
    tmp = df_train_voice.groupby('uid',as_index=True)['opp_len']
    voice_all_opp_len_many_head = []
    voice_all_opp_len_little_head = []
    
    uids = []
    for uid, values in tmp:
        uids.append(uid)
        voice_all_opp_len_many_head.append(values.value_counts().index[0])
        voice_all_opp_len_little_head.append(values.value_counts().index[-1])        
    
    df_tmp = pd.DataFrame(data={'uid':uids, 'voice_all_opp_len_many_head':voice_all_opp_len_many_head, 
                                'voice_all_opp_len_little_head':voice_all_opp_len_little_head})
    df_train = pd.merge(df_train, df_tmp, on='uid', how='left')
    
    # call_type 分布
    df_tmp = df_train_voice.groupby('uid',as_index=True)['call_type'].value_counts().unstack()[[1,2,3,5]]
    df_tmp.columns = ['voice_all_call_type_'+str(i) for i in range(1,5)]
    df_tmp['uid'] = df_tmp.index
    df_tmp.fillna(0,inplace=True)
    df_train = pd.merge(df_train, df_tmp, on='uid', how='left')

    # call_type 的比例
    for feat in ['voice_all_call_type_'+str(i) for i in range(1,5)]:
        df_train[feat+'_rate'] = df_train[feat] / df_train['voice_all_cnt']
    
    # in_out 分布
    df_tmp = df_train_voice.groupby('uid',as_index=True)['in_out'].value_counts().unstack()
    df_tmp.columns = ['voice_all_in_out_'+str(i) for i in range(2)]
    df_tmp['uid'] = df_tmp.index
    df_tmp.fillna(0,inplace=True)
    df_train = pd.merge(df_train, df_tmp, on='uid', how='left')
    
    # in_out 的比例
    for feat in ['voice_all_in_out_'+str(i) for i in range(2)]:
        df_train[feat+'_rate'] = df_train[feat] / df_train['voice_all_cnt']

    
    # 处理是假相关的特征，目标100维
    # 6. 通话最大时长，平均时长，最小时长，极差时长等统计的信息  start_time, end_time, diff_time
    
    # day的分布，和比例，注意天的粒度很大，只需使用start end一种即可
    df_tmp = df_train_voice.groupby('uid',as_index=True)['start_time_day'].value_counts().unstack()[[i for i in range(1,46)]]
    df_tmp.columns = ['voice_all_start_end_time_day_'+str(i) for i in range(1,46)]
    df_tmp['uid'] = df_tmp.index
    df_tmp.fillna(0,inplace=True)
    df_train = pd.merge(df_train, df_tmp, on='uid', how='left')
    for feat in ['voice_all_start_end_time_day_'+str(i) for i in range(1,46)]:
        df_train[feat+'_rate'] = df_train[feat] / df_train['voice_all_cnt']
    
    # hour分布， 我们认为电话时间超过一小时可能是有问题的，所以使用两种
    # start_time_hour
    df_tmp = df_train_voice.groupby('uid',as_index=True)['start_time_hour'].value_counts().unstack()[[i for i in range(0,24)]]
    df_tmp.columns = ['voice_all_start_time_hour_'+str(i) for i in range(24)]
    df_tmp['uid'] = df_tmp.index
    df_tmp.fillna(0,inplace=True)
    df_train = pd.merge(df_train, df_tmp, on='uid', how='left')
    for feat in ['voice_all_start_time_hour_'+str(i) for i in range(0,24)]:
        df_train[feat+'_rate'] = df_train[feat] / df_train['voice_all_cnt']
    
    # end_time_hour
    df_tmp = df_train_voice.groupby('uid',as_index=True)['end_time_hour'].value_counts().unstack()[[i for i in range(0,24)]]
    df_tmp.columns = ['voice_all_end_time_hour_'+str(i) for i in range(24)]
    df_tmp['uid'] = df_tmp.index
    df_tmp.fillna(0,inplace=True)
    df_train = pd.merge(df_train, df_tmp, on='uid', how='left')
    for feat in ['voice_all_end_time_hour_'+str(i) for i in range(0,24)]:
        df_train[feat+'_rate'] = df_train[feat] / df_train['voice_all_cnt']
    
    
    # minute 分布
    # start_time_minute
    df_tmp = df_train_voice.groupby('uid',as_index=True)['start_time_minute'].value_counts().unstack()[[i for i in range(0,60)]]
    df_tmp.columns = ['voice_all_start_time_minute_'+str(i) for i in range(60)]
    df_tmp['uid'] = df_tmp.index
    df_tmp.fillna(0,inplace=True)
    df_train = pd.merge(df_train, df_tmp, on='uid', how='left')
    for feat in ['voice_all_start_time_minute_'+str(i) for i in range(60)]:
        df_train[feat+'_rate'] = df_train[feat] / df_train['voice_all_cnt']

    # end_time_minute
    df_tmp = df_train_voice.groupby('uid',as_index=True)['end_time_minute'].value_counts().unstack()[[i for i in range(0,60)]]
    df_tmp.columns = ['voice_all_end_time_minute_'+str(i) for i in range(60)]
    df_tmp['uid'] = df_tmp.index
    df_tmp.fillna(0,inplace=True)
    df_train = pd.merge(df_train, df_tmp, on='uid', how='left')
    for feat in ['voice_all_end_time_minute_'+str(i) for i in range(60)]:
        df_train[feat+'_rate'] = df_train[feat] / df_train['voice_all_cnt']


    # second 分布
    # start_time_second
    df_tmp = df_train_voice.groupby('uid',as_index=True)['start_time_second'].value_counts().unstack()[[i for i in range(0,60)]]
    df_tmp.columns = ['voice_all_start_time_second_'+str(i) for i in range(60)]
    df_tmp['uid'] = df_tmp.index
    df_tmp.fillna(0,inplace=True)
    df_train = pd.merge(df_train, df_tmp, on='uid', how='left')
    for feat in ['voice_all_start_time_second_'+str(i) for i in range(60)]:
        df_train[feat+'_rate'] = df_train[feat] / df_train['voice_all_cnt']

    # end_time_minute
    df_tmp = df_train_voice.groupby('uid',as_index=True)['end_time_second'].value_counts().unstack()[[i for i in range(0,60)]]
    df_tmp.columns = ['voice_all_end_time_second_'+str(i) for i in range(60)]
    df_tmp['uid'] = df_tmp.index
    df_tmp.fillna(0,inplace=True)
    df_train = pd.merge(df_train, df_tmp, on='uid', how='left')
    for feat in ['voice_all_end_time_second_'+str(i) for i in range(60)]:
        df_train[feat+'_rate'] = df_train[feat] / df_train['voice_all_cnt']
    
    
    # diff_time
    # sum 
    df_tmp = pd.DataFrame(df_train_voice.groupby('uid',as_index=True)['diff_time'].sum())
    df_tmp.columns = ['voice_all_diff_time_sum']
    df_tmp['uid'] = df_tmp.index
    df_train = pd.merge(df_train, df_tmp, on='uid', how='left')

    # meam
    df_tmp = pd.DataFrame(df_train_voice.groupby('uid',as_index=True)['diff_time'].mean())
    df_tmp.columns = ['voice_all_diff_time_avg']
    df_tmp['uid'] = df_tmp.index
    df_train = pd.merge(df_train, df_tmp, on='uid', how='left')
    
    # max
    df_tmp = pd.DataFrame(df_train_voice.groupby('uid',as_index=True)['diff_time'].max())
    df_tmp.columns = ['voice_all_diff_time_max']
    df_tmp['uid'] = df_tmp.index
    df_train = pd.merge(df_train, df_tmp, on='uid', how='left')
    # min
    df_tmp = pd.DataFrame(df_train_voice.groupby('uid',as_index=True)['diff_time'].min())
    df_tmp.columns = ['voice_all_diff_time_min']
    df_tmp['uid'] = df_tmp.index
    df_train = pd.merge(df_train, df_tmp, on='uid', how='left')
    # std
    df_tmp = pd.DataFrame(df_train_voice.groupby('uid',as_index=True)['diff_time'].std())
    df_tmp.columns = ['voice_all_diff_time_std']
    df_tmp['uid'] = df_tmp.index
    df_train = pd.merge(df_train, df_tmp, on='uid', how='left')
    # skew 
    df_tmp = pd.DataFrame(df_train_voice.groupby('uid',as_index=True)['diff_time'].skew())
    df_tmp.columns = ['voice_all_diff_time_skew']
    df_tmp['uid'] = df_tmp.index
    df_train = pd.merge(df_train, df_tmp, on='uid', how='left')
    df_train['voice_all_diff_time_jc'] = df_train['voice_all_diff_time_max'] - df_train['voice_all_diff_time_min']
    df_train['voice_all_diff_time_fd'] = df_train['voice_all_diff_time_std'] / df_train['voice_all_diff_time_avg']

    # start_time_first, start_time_last, end_time_first, end_time_last, 以及对应的差值， 以及差值 / 总的次数
    df_tmp = pd.DataFrame(df_train_voice.groupby('uid',as_index=True)['start_time'].max())
    df_tmp.columns = ['voice_all_start_time_last']
    df_tmp['uid'] = df_tmp.index
    df_train = pd.merge(df_train, df_tmp, on='uid', how='left')

    df_tmp = pd.DataFrame(df_train_voice.groupby('uid',as_index=True)['start_time'].min())
    df_tmp.columns = ['voice_all_start_time_first']
    df_tmp['uid'] = df_tmp.index
    df_train = pd.merge(df_train, df_tmp, on='uid', how='left')
    
    df_tmp = pd.DataFrame(df_train_voice.groupby('uid',as_index=True)['end_time'].max())
    df_tmp.columns = ['voice_all_end_time_last']
    df_tmp['uid'] = df_tmp.index
    df_train = pd.merge(df_train, df_tmp, on='uid', how='left')

    df_tmp = pd.DataFrame(df_train_voice.groupby('uid',as_index=True)['end_time'].min())
    df_tmp.columns = ['voice_all_end_time_first']
    df_tmp['uid'] = df_tmp.index
    df_train = pd.merge(df_train, df_tmp, on='uid', how='left')
    
    # start_time_last - start_time_first,
    # end_time_last - end_time_first,    
    # end_time_first - start_time_first,
    # end_time_last - start_time_last,
    df_train['voice_all_start_time_last_start_time_first_diff'] = get_diff_time(df_train[['voice_all_start_time_first','voice_all_start_time_last']].values)
    df_train['voice_all_end_time_last_end_time_first_diff'] = get_diff_time(df_train[['voice_all_end_time_first','voice_all_end_time_last']].values)
    df_train['voice_all_end_time_first_start_time_first_diff'] = get_diff_time(df_train[['voice_all_end_time_first','voice_all_start_time_first']].values)
    df_train['voice_all_end_time_last_start_time_last_diff'] = get_diff_time(df_train[['voice_all_end_time_last','voice_all_start_time_last']].values)
    
    # 平均多久打一次电话
    df_train['voice_all_end_time_first_start_time_first_diff_rate'] = df_train['voice_all_end_time_first_start_time_first_diff'] / df_train['voice_all_cnt']
    df_train['voice_all_end_time_last_start_time_last_diff_rate'] = df_train['voice_all_end_time_last_start_time_last_diff'] / df_train['voice_all_cnt']
    
    # 待定对所有的start_time, end_time进行统计信息
    # start_time
    # sum 
    df_tmp = pd.DataFrame(df_train_voice.groupby('uid',as_index=True)['start_time'].sum())
    df_tmp.columns = ['voice_all_start_time_sum']
    df_tmp['uid'] = df_tmp.index
    df_train = pd.merge(df_train, df_tmp, on='uid', how='left')

    # meam
    df_tmp = pd.DataFrame(df_train_voice.groupby('uid',as_index=True)['start_time'].mean())
    df_tmp.columns = ['voice_all_start_time_avg']
    df_tmp['uid'] = df_tmp.index
    df_train = pd.merge(df_train, df_tmp, on='uid', how='left')
    
    # max
    df_tmp = pd.DataFrame(df_train_voice.groupby('uid',as_index=True)['start_time'].max())
    df_tmp.columns = ['voice_all_start_time_max']
    df_tmp['uid'] = df_tmp.index
    df_train = pd.merge(df_train, df_tmp, on='uid', how='left')
    # min
    df_tmp = pd.DataFrame(df_train_voice.groupby('uid',as_index=True)['start_time'].min())
    df_tmp.columns = ['voice_all_start_time_min']
    df_tmp['uid'] = df_tmp.index
    df_train = pd.merge(df_train, df_tmp, on='uid', how='left')
    # std
    df_tmp = pd.DataFrame(df_train_voice.groupby('uid',as_index=True)['start_time'].std())
    df_tmp.columns = ['voice_all_start_time_std']
    df_tmp['uid'] = df_tmp.index
    df_train = pd.merge(df_train, df_tmp, on='uid', how='left')
    # skew 
    df_tmp = pd.DataFrame(df_train_voice.groupby('uid',as_index=True)['start_time'].skew())
    df_tmp.columns = ['voice_all_start_time_skew']
    df_tmp['uid'] = df_tmp.index
    df_train = pd.merge(df_train, df_tmp, on='uid', how='left')

    df_train['voice_all_start_time_jc'] = df_train['voice_all_start_time_max'] - df_train['voice_all_start_time_min']
    df_train['voice_all_start_time_fd'] = df_train['voice_all_start_time_std'] / df_train['voice_all_start_time_avg']

    # end_timne
    # sum 
    df_tmp = pd.DataFrame(df_train_voice.groupby('uid',as_index=True)['end_time'].sum())
    df_tmp.columns = ['voice_all_end_time_sum']
    df_tmp['uid'] = df_tmp.index
    df_train = pd.merge(df_train, df_tmp, on='uid', how='left')

    # meam
    df_tmp = pd.DataFrame(df_train_voice.groupby('uid',as_index=True)['end_time'].mean())
    df_tmp.columns = ['voice_all_end_time_avg']
    df_tmp['uid'] = df_tmp.index
    df_train = pd.merge(df_train, df_tmp, on='uid', how='left')
    
    # max
    df_tmp = pd.DataFrame(df_train_voice.groupby('uid',as_index=True)['end_time'].max())
    df_tmp.columns = ['voice_all_end_time_max']
    df_tmp['uid'] = df_tmp.index
    df_train = pd.merge(df_train, df_tmp, on='uid', how='left')
    # min
    df_tmp = pd.DataFrame(df_train_voice.groupby('uid',as_index=True)['end_time'].min())
    df_tmp.columns = ['voice_all_end_time_min']
    df_tmp['uid'] = df_tmp.index
    df_train = pd.merge(df_train, df_tmp, on='uid', how='left')
    # std
    df_tmp = pd.DataFrame(df_train_voice.groupby('uid',as_index=True)['end_time'].std())
    df_tmp.columns = ['voice_all_end_time_std']
    df_tmp['uid'] = df_tmp.index
    df_train = pd.merge(df_train, df_tmp, on='uid', how='left')
    # skew 
    df_tmp = pd.DataFrame(df_train_voice.groupby('uid',as_index=True)['end_time'].skew())
    df_tmp.columns = ['voice_all_end_time_skew']
    df_tmp['uid'] = df_tmp.index
    df_train = pd.merge(df_train, df_tmp, on='uid', how='left')

    df_train['voice_all_end_time_jc'] = df_train['voice_all_end_time_max'] - df_train['voice_all_end_time_min']
    df_train['voice_all_end_time_fd'] = df_train['voice_all_end_time_std'] / df_train['voice_all_end_time_avg']


                      
    return df_train
    
def diff_time(a,b):
    a_day, a_hour, a_minute, a_second = (a / 1000000, a / 10000 % 100, a / 100 % 100, a % 100)
    b_day, b_hour, b_minute, b_second = (b / 1000000, b / 10000 % 100, b / 100 % 100, b % 100)
    
    d_day = b_day - a_day
    d_hour = b_hour - a_hour
    d_minute = b_minute - a_minute
    d_second = b_second - a_second
    
    diff = d_day * 24 * 60 * 60 + d_hour * 60 * 60 + d_minute * 60 + d_second
    return diff

def get_diff_time(x):
    diff_t = []
    for d in x:
#         print d
        diff_t.append(diff_time(d[0],d[1]))
    return diff_t

In [4]:
df_testA_voice = pd.read_csv('../data/testA/voice_test_a.txt',sep='\t',low_memory=False)
df_testB_voice = pd.read_csv('../data/testB/voice_test_b.txt',sep='\t',low_memory=False)

df_testB_voice.head()

Unnamed: 0,uid,opp_num,opp_head,opp_len,start_time,end_time,call_type,in_out
0,u8925,F5FCD87EA7AF344FF408D4A7842CAEF8,187,11,12122145,12122302,1,1
1,u8925,06AE2E5A890E9407AD6C0486406B1FD8,159,11,12114240,12114323,1,0
2,u8884,E97935F68B5C09034E1259A6A4729D93,183,11,12150612,12150649,1,0
3,u8884,CF8091C4559FEE3D78B5EC86617085EF,183,11,12144742,12144953,1,1
4,u8884,CADA2EEDD787DF9F882BA75D840FA5F6,136,11,12144405,12144519,1,1


In [5]:
df_test = get_voice_feature_plus(df_testB_voice, target='test', Type='B')
df_testA = get_voice_feature_plus(df_testA_voice, target='test', Type='A')


df_train = get_voice_feature_plus(df_train_voice)
df_train.fillna(0,inplace=True)
df_test.fillna(0,inplace=True)

df_test['voice_all_opp_head_unique_cnt'] = df_test['voice_all_opp_head_unique_cnt'].astype(float)
df_test['voice_all_opp_head_many_head'] = df_test['voice_all_opp_head_many_head'].astype(float)
df_train['voice_all_opp_head_little_head'] = df_train['voice_all_opp_head_little_head'].astype(int)
df_train['voice_all_opp_head_many_head'] = df_train['voice_all_opp_head_many_head'].astype(int)

df_train.to_csv('../xdata/df_train_voice_feat.csv',index=False)
df_test.to_csv('../xdata/df_testB_voice_feat.csv',index=False)

df_testA.to_csv('../xdata/df_testA_voice_feat.csv',index=False)


In [6]:
df_test.head()

Unnamed: 0,uid,voice_all_cnt,voice_all_unique_cnt,voice_all_per_opp_rate,voice_all_opp_head_unique_cnt,voice_all_opp_head_little_head,voice_all_opp_head_many_head,voice_all_opp_head_many_head_cnt,voice_all_opp_head_many_head_cnt_rate,voice_all_opp_head_many_head_cnt_rate_unique,...,voice_all_start_time_jc,voice_all_start_time_fd,voice_all_end_time_sum,voice_all_end_time_avg,voice_all_end_time_max,voice_all_end_time_min,voice_all_end_time_std,voice_all_end_time_skew,voice_all_end_time_jc,voice_all_end_time_fd
0,u7000,4.0,4.0,1.0,4.0,177.0,25.0,1.0,0.25,0.25,...,21933827.0,0.464699,88435370.0,22108840.0,34104204.0,12170229.0,10273970.0,0.319884,21933975.0,0.4647
1,u7001,3.0,2.0,1.5,2.0,182.0,1.0,2.0,0.666667,1.0,...,18010221.0,0.371604,76352800.0,25450930.0,36124831.0,18114634.0,9457614.0,1.390278,18010197.0,0.371602
2,u7002,65.0,16.0,4.0625,9.0,131.0,138.0,31.0,0.476923,1.9375,...,39940011.0,0.443107,1647565000.0,25347150.0,44093507.0,4153001.0,11231430.0,-0.268917,39940506.0,0.443104
3,u7003,90.0,22.0,4.090909,16.0,1.0,150.0,45.0,0.5,2.045455,...,41121793.0,0.549884,1904467000.0,21160740.0,42204340.0,1081352.0,11635790.0,0.14345,41122988.0,0.549876
4,u7004,28.0,7.0,4.0,6.0,1.0,182.0,9.0,0.321429,1.285714,...,37006044.0,0.778069,464562400.0,16591510.0,38121840.0,1115719.0,12909030.0,0.401156,37006121.0,0.77805


In [7]:
seed = 71

np.random.seed(seed)
valid_size = 0.2
LOOP = 1
ESR = 50
# XGB param
nround = 3000
#nround = 10

param = {'max_depth':5, # 基准是5 
         'eta':0.05,
         'gamma ':0.1,
         'colsample_bytree':0.8, # old 0.8
         'subsample':0.8,
         'silent':1,
         'eval_metric':'auc',
         'objective':'binary:logistic',
#          'scale_pos_weight':5,
         'seed': seed
        }

# 设置特征数据，去除id数据，不能进行预测
features = df_test.columns
features = list(features)
features.remove('uid')

label = 'label'

print len(features)

740


In [9]:
def split_train_valid(df_train,test_size=0.2):
    '''
    k-fold交叉验证,默认k=10
    df_train:训练数据
    '''
    X_train, X_vali, y_train, y_vali = train_test_split(df_train[features], df_train[label], test_size=test_size, random_state=40000)
    #added some parameters
    
#     dtrain = df_train.iloc[train_list]
#     dvali =  df_train.iloc[vali_list]
    
    dtrain = xgb.DMatrix(X_train,label=y_train)
    dvalid = xgb.DMatrix(X_vali,label=y_vali)
    watchlist = [(dtrain, 'train'),(dvalid, 'valid')]
    
    return dtrain, dvalid, watchlist

In [10]:
models = []
seeds = [71,73,91]
for i in range(LOOP):
    print('LOOP',i)
#     dbuild, dvalid, watchlist = split_build_valid(df_train)
    dbuild, dvalid, watchlist = split_train_valid(df_train,test_size=0.2)
    param['seed'] = seeds[i]
    model = xgb.train(param, dbuild, nround, watchlist,early_stopping_rounds=ESR,verbose_eval=20)
    models.append(model)
#     model.save_model('./model1'+ str(i) + '.model')
    # VALID
    valid_yhat = model.predict(dvalid,ntree_limit=model.best_iteration)
    print('Valid Mean:---------------------->', np.mean(valid_yhat))
    del dbuild, dvalid, watchlist


('LOOP', 0)
[0]	train-auc:0.785477	valid-auc:0.775497
Multiple eval metrics have been passed: 'valid-auc' will be used for early stopping.

Will train until valid-auc hasn't improved in 50 rounds.
[20]	train-auc:0.911333	valid-auc:0.844485
[40]	train-auc:0.947661	valid-auc:0.845299
[60]	train-auc:0.96862	valid-auc:0.851223
[80]	train-auc:0.983797	valid-auc:0.853983
[100]	train-auc:0.990765	valid-auc:0.854971
[120]	train-auc:0.994958	valid-auc:0.85458
Stopping. Best iteration:
[86]	train-auc:0.986727	valid-auc:0.856262

('Valid Mean:---------------------->', 0.17001845)


In [11]:
dtest  = xgb.DMatrix(df_test[features])
proba_test = pd.DataFrame()
proba_test['uid'] = df_test['uid']
proba_test['score'] = [0 for i in range(len(df_test))]
for model in models:
    proba_test['score'] += model.predict(dtest)
proba_test['score'] /= LOOP

proba_test = proba_test.sort_values('score',ascending=False)
proba_test['label'] = [0 for i in range(len(proba_test))]

proba_test.loc[proba_test['score']>0.28, 'label'] = 1

proba_test[['uid','label']].to_csv('../result/result2.csv',index=False,header=False)

In [12]:
# 计算特征重要程度
import operator
importance = model.get_fscore()
importance = sorted(importance.items(), key=operator.itemgetter(1),reverse=True)

print importance

[('voice_all_opp_len_12', 79), ('voice_all_opp_len_15', 49), ('voice_all_in_out_0_rate', 47), ('voice_all_opp_head_many_head', 44), ('voice_all_opp_head_many_head_cnt_rate', 36), ('voice_all_start_time_hour_7', 35), ('voice_all_end_time_hour_7_rate', 34), ('voice_all_start_time_first', 34), ('voice_all_start_time_hour_8_rate', 26), ('voice_all_diff_time_max', 26), ('voice_all_opp_len_5', 26), ('voice_all_opp_head_many_head_cnt_rate_unique', 25), ('voice_all_start_time_hour_7_rate', 25), ('voice_all_diff_time_avg', 24), ('voice_all_unique_cnt', 22), ('voice_all_per_opp_rate', 22), ('voice_all_call_type_1_rate', 21), ('voice_all_start_time_minute_29_rate', 20), ('voice_all_opp_head_little_head', 20), ('voice_all_call_type_3_rate', 19), ('voice_all_start_time_last', 19), ('voice_all_start_time_hour_11_rate', 19), ('voice_all_start_end_time_day_39_rate', 19), ('voice_all_opp_len_8', 18), ('voice_all_start_time_hour_21_rate', 18), ('voice_all_start_time_minute_51_rate', 18), ('voice_all_end