In [1]:
import numpy as np
import pandas as pd
import scipy as sp
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, f1_score

import xgboost as xgb

## voice features
#### 先对所有的时间进行特征提取
1. 用户电话的总的通话次数 opp
2. 通话的人数，voice_all_unique_cnt
3. 通话次数 / 人数的比例， voice_all_cnt_all_unique_cnt_rate
4. 对端电话的前n位的个数，所有的不同号码的个数。 opp_head
5. 对端号码长度的分布个数   opp_len
6. 通话最大时长，平均时长，最小时长，极差时长等统计的信息  start_time, end_time
7. 通话类型的分布个数或者比例 call_type
8. 通话类型的分布个数和比例   in_out


In [2]:
df_train_voice = pd.read_csv('../data/train/voice_train.txt',sep='\t',low_memory=False)
df_train_label = pd.read_csv('../data/train/uid_train.txt',sep='\t',low_memory=False)


In [63]:
df_train_voice.info()
df_train_voice.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1150778 entries, 0 to 1150777
Data columns (total 17 columns):
uid                  1150778 non-null object
opp_num              1150778 non-null object
opp_head             1150778 non-null object
opp_len              1150778 non-null int64
start_time           1150778 non-null int64
end_time             1150778 non-null int64
call_type            1150778 non-null int64
in_out               1150778 non-null int64
start_time_day       1150778 non-null int64
start_time_hour      1150778 non-null int64
start_time_minute    1150778 non-null int64
start_time_second    1150778 non-null int64
end_time_day         1150778 non-null int64
end_time_hour        1150778 non-null int64
end_time_minute      1150778 non-null int64
end_time_second      1150778 non-null int64
diff_time            1150778 non-null int64
dtypes: int64(14), object(3)
memory usage: 149.3+ MB


Unnamed: 0,uid,opp_num,opp_head,opp_len,start_time,end_time,call_type,in_out,start_time_day,start_time_hour,start_time_minute,start_time_second,end_time_day,end_time_hour,end_time_minute,end_time_second,diff_time
0,u0113,38D54642A237A11BB18455FC1E505292,132,11,26115956,26120033,1,1,26,11,59,56,26,12,0,33,37
1,u0113,38D54642A237A11BB18455FC1E505292,132,11,26115623,26115707,1,1,26,11,56,23,26,11,57,7,44
2,u0113,38D54642A237A11BB18455FC1E505292,132,11,26174233,26174321,1,1,26,17,42,33,26,17,43,21,48
3,u0113,38D54642A237A11BB18455FC1E505292,132,11,26070423,26070512,1,0,26,7,4,23,26,7,5,12,49
4,u3340,010A66F2AD42C48C44897A3DEC96A2A1,139,11,26201745,26201825,1,1,26,20,17,45,26,20,18,25,40


In [38]:
def get_voice_feature_plus(df_train_voice, target='train'):
    if target == 'train':
        # 复制lable的数据，作为所有的特征的标示
        df_train = df_train_label.copy()
    else:
        df_train = pd.DataFrame(data={'uid':['u'+str(id) for id in range(5000, 7000)]})
    
    
    df_train_voice['start_time_day'] = df_train_voice['start_time'].apply(lambda x: x / 1000000)
    df_train_voice['start_time_hour'] = df_train_voice['start_time'].apply(lambda x: x / 10000 % 100)
    df_train_voice['start_time_minute'] = df_train_voice['start_time'].apply(lambda x: x / 100 % 100)
    df_train_voice['start_time_second'] = df_train_voice['start_time'].apply(lambda x: x % 100)

    df_train_voice['end_time_day'] = df_train_voice['end_time'].apply(lambda x: x / 1000000)
    df_train_voice['end_time_hour'] = df_train_voice['end_time'].apply(lambda x: x / 10000 % 100)
    df_train_voice['end_time_minute'] = df_train_voice['end_time'].apply(lambda x: x / 100 % 100)
    df_train_voice['end_time_second'] = df_train_voice['end_time'].apply(lambda x: x % 100)


    df_train_voice['diff_time'] = get_diff_time(df_train_voice[['start_time','end_time']].values)
    
    
    # 总的通话次数
    df_tmp = pd.DataFrame(df_train_voice.groupby('uid',as_index=True)['opp_num'].count())
    df_tmp.columns = ['voice_all_cnt']
    df_tmp['uid'] = df_tmp.index
    df_train = pd.merge(df_train, df_tmp, on='uid', how='left')
    
    # 总的通话的对端的不重复的个数
    tmp = df_train_voice.groupby('uid',as_index=True)['opp_num'].unique()
    uids = tmp.index
    opp_nums = []
    for opp_num in tmp:
        opp_nums.append(len(opp_num))
    df_tmp = pd.DataFrame(data={'uid':uids, 'voice_all_unique_cnt':opp_nums})
    df_train = pd.merge(df_train, df_tmp, on='uid', how='left')
    
    # 通话次数 / 人数的比例，每个人通话的次数， voice_all_per_opp_rate
    df_train['voice_all_per_opp_rate'] = df_train['voice_all_cnt'] / df_train['voice_all_unique_cnt']
    
    
    # 4. 对端电话的前n位的个数，所有的不同号码的个数以及其所有的分布个数和比例(部分特征待定)。 opp_head_cnt_{k}, opp_head_rate_{k}
    # 全部的不同开头的次数,唯一的标示
    tmp = df_train_voice.groupby('uid',as_index=True)['opp_head'].unique()
    uids = tmp.index
    opp_nums = []
    for opp_num in tmp:
        opp_nums.append(len(opp_num))
    df_tmp = pd.DataFrame(data={'uid':uids, 'voice_all_opp_head_unique_cnt':opp_nums})
    df_train = pd.merge(df_train, df_tmp, on='uid', how='left')
    
    # 联系最多和最少的次数的opp_head
    tmp = df_train_voice.groupby('uid',as_index=True)['opp_head']
    voice_all_opp_head_many_head = []
    voice_all_opp_head_many_head_cnt = []
    voice_all_opp_head_little_head = []
#     voice_all_opp_head_little_head_cnt = []
    
    uids = []
    for uid, values in tmp:
        uids.append(uid)
        voice_all_opp_head_many_head.append(values.value_counts().index[0])
        voice_all_opp_head_little_head.append(values.value_counts().index[-1])

        voice_all_opp_head_many_head_cnt.append(values.value_counts().values[0])
#         voice_all_opp_head_little_head_cnt.append(values.value_counts().values[-1])
        

    df_tmp = pd.DataFrame(data={'uid':uids, 'voice_all_opp_head_many_head':voice_all_opp_head_many_head, 
                                'voice_all_opp_head_little_head':voice_all_opp_head_little_head,
                                'voice_all_opp_head_many_head_cnt':voice_all_opp_head_many_head_cnt})
    
    df_train = pd.merge(df_train, df_tmp, on='uid', how='left')
    df_train['voice_all_opp_head_many_head_cnt_rate'] = df_train['voice_all_opp_head_many_head_cnt'] / df_train['voice_all_cnt']
    df_train['voice_all_opp_head_many_head_cnt_rate_unique'] = df_train['voice_all_opp_head_many_head_cnt'] / df_train['voice_all_unique_cnt']
    
    # 通话最多的head的个数
    df_tmp = pd.DataFrame(df_train_voice.groupby('uid',as_index=True)['opp_len'].value_counts().unstack())[[3,5,6,7,8,9,10,11,12,13,14,15,16,17,19,20]]
    df_tmp.columns = ['voice_all_opp_len_'+str(k) for k in [3,5,6,7,8,9,10,11,12,13,14,15,16,17,19,20]]
    df_tmp['uid'] = df_tmp.index
    df_tmp.fillna(0,inplace=True)
    df_train = pd.merge(df_train, df_tmp, on='uid', how='left')
    df_train['voice_all_opp_len_11_rate'] = df_train['voice_all_opp_len_11'] / df_train['voice_all_cnt']
    
    # 最近一次通话的号码的长度
    tmp = df_train_voice.groupby('uid',as_index=True)['opp_len']
    voice_all_opp_len_many_head = []
    voice_all_opp_len_little_head = []
    
    uids = []
    for uid, values in tmp:
        uids.append(uid)
        voice_all_opp_len_many_head.append(values.value_counts().index[0])
        voice_all_opp_len_little_head.append(values.value_counts().index[-1])        
    
    df_tmp = pd.DataFrame(data={'uid':uids, 'voice_all_opp_len_many_head':voice_all_opp_len_many_head, 
                                'voice_all_opp_len_little_head':voice_all_opp_len_little_head})
    df_train = pd.merge(df_train, df_tmp, on='uid', how='left')
    
    # call_type 分布
    df_tmp = df_train_voice.groupby('uid',as_index=True)['call_type'].value_counts().unstack()
    df_tmp.columns = ['voice_all_call_type_'+str(i) for i in range(1,6)]
    df_tmp['uid'] = df_tmp.index
    df_tmp.fillna(0,inplace=True)
    df_train = pd.merge(df_train, df_tmp, on='uid', how='left')

    # call_type 的比例
    for feat in ['voice_all_call_type_'+str(i) for i in range(1,6)]:
        df_train[feat+'_rate'] = df_train[feat] / df_train['voice_all_cnt']
    
    # in_out 分布
    df_tmp = df_train_voice.groupby('uid',as_index=True)['in_out'].value_counts().unstack()
    df_tmp.columns = ['voice_all_in_out_'+str(i) for i in range(2)]
    df_tmp['uid'] = df_tmp.index
    df_tmp.fillna(0,inplace=True)
    df_train = pd.merge(df_train, df_tmp, on='uid', how='left')
    
    # in_out 的比例
    for feat in ['voice_all_in_out_'+str(i) for i in range(2)]:
        df_train[feat+'_rate'] = df_train[feat] / df_train['voice_all_cnt']

    
    # 处理是假相关的特征，目标100维
    # 6. 通话最大时长，平均时长，最小时长，极差时长等统计的信息  start_time, end_time, diff_time
    
    # day的分布，和比例，注意天的粒度很大，只需使用start end一种即可
    df_tmp = df_train_voice.groupby('uid',as_index=True)['start_time_day'].value_counts().unstack()[[i for i in range(1,46)]]
    df_tmp.columns = ['voice_all_start_end_time_day_'+str(i) for i in range(1,46)]
    df_tmp['uid'] = df_tmp.index
    df_tmp.fillna(0,inplace=True)
    df_train = pd.merge(df_train, df_tmp, on='uid', how='left')
    for feat in ['voice_all_start_end_time_day_'+str(i) for i in range(1,46)]:
        df_train[feat+'_rate'] = df_train[feat] / df_train['voice_all_cnt']
    
    # hour分布， 我们认为电话时间超过一小时可能是有问题的，所以使用两种
    # start_time_hour
    df_tmp = df_train_voice.groupby('uid',as_index=True)['start_time_hour'].value_counts().unstack()[[i for i in range(0,24)]]
    df_tmp.columns = ['voice_all_start_time_hour_'+str(i) for i in range(24)]
    df_tmp['uid'] = df_tmp.index
    df_tmp.fillna(0,inplace=True)
    df_train = pd.merge(df_train, df_tmp, on='uid', how='left')
    for feat in ['voice_all_start_time_hour_'+str(i) for i in range(0,24)]:
        df_train[feat+'_rate'] = df_train[feat] / df_train['voice_all_cnt']
    
    # end_time_hour
    df_tmp = df_train_voice.groupby('uid',as_index=True)['end_time_hour'].value_counts().unstack()[[i for i in range(0,24)]]
    df_tmp.columns = ['voice_all_end_time_hour_'+str(i) for i in range(24)]
    df_tmp['uid'] = df_tmp.index
    df_tmp.fillna(0,inplace=True)
    df_train = pd.merge(df_train, df_tmp, on='uid', how='left')
    for feat in ['voice_all_end_time_hour_'+str(i) for i in range(0,24)]:
        df_train[feat+'_rate'] = df_train[feat] / df_train['voice_all_cnt']
    
    
    # minute 分布
    # start_time_minute
    df_tmp = df_train_voice.groupby('uid',as_index=True)['start_time_minute'].value_counts().unstack()[[i for i in range(0,60)]]
    df_tmp.columns = ['voice_all_start_time_minute_'+str(i) for i in range(60)]
    df_tmp['uid'] = df_tmp.index
    df_tmp.fillna(0,inplace=True)
    df_train = pd.merge(df_train, df_tmp, on='uid', how='left')
    for feat in ['voice_all_start_time_minute_'+str(i) for i in range(60)]:
        df_train[feat+'_rate'] = df_train[feat] / df_train['voice_all_cnt']

    # end_time_minute
    df_tmp = df_train_voice.groupby('uid',as_index=True)['end_time_minute'].value_counts().unstack()[[i for i in range(0,60)]]
    df_tmp.columns = ['voice_all_end_time_minute_'+str(i) for i in range(60)]
    df_tmp['uid'] = df_tmp.index
    df_tmp.fillna(0,inplace=True)
    df_train = pd.merge(df_train, df_tmp, on='uid', how='left')
    for feat in ['voice_all_end_time_minute_'+str(i) for i in range(60)]:
        df_train[feat+'_rate'] = df_train[feat] / df_train['voice_all_cnt']


    # second 分布
    # start_time_second
    df_tmp = df_train_voice.groupby('uid',as_index=True)['start_time_second'].value_counts().unstack()[[i for i in range(0,60)]]
    df_tmp.columns = ['voice_all_start_time_second_'+str(i) for i in range(60)]
    df_tmp['uid'] = df_tmp.index
    df_tmp.fillna(0,inplace=True)
    df_train = pd.merge(df_train, df_tmp, on='uid', how='left')
    for feat in ['voice_all_start_time_second_'+str(i) for i in range(60)]:
        df_train[feat+'_rate'] = df_train[feat] / df_train['voice_all_cnt']

    # end_time_minute
    df_tmp = df_train_voice.groupby('uid',as_index=True)['end_time_second'].value_counts().unstack()[[i for i in range(0,60)]]
    df_tmp.columns = ['voice_all_end_time_second_'+str(i) for i in range(60)]
    df_tmp['uid'] = df_tmp.index
    df_tmp.fillna(0,inplace=True)
    df_train = pd.merge(df_train, df_tmp, on='uid', how='left')
    for feat in ['voice_all_end_time_second_'+str(i) for i in range(60)]:
        df_train[feat+'_rate'] = df_train[feat] / df_train['voice_all_cnt']
    
    
    # diff_time
    # sum 
    df_tmp = pd.DataFrame(df_train_voice.groupby('uid',as_index=True)['diff_time'].sum())
    df_tmp.columns = ['voice_all_diff_time_sum']
    df_tmp['uid'] = df_tmp.index
    df_train = pd.merge(df_train, df_tmp, on='uid', how='left')

    # meam
    df_tmp = pd.DataFrame(df_train_voice.groupby('uid',as_index=True)['diff_time'].mean())
    df_tmp.columns = ['voice_all_diff_time_avg']
    df_tmp['uid'] = df_tmp.index
    df_train = pd.merge(df_train, df_tmp, on='uid', how='left')
    
    # max
    df_tmp = pd.DataFrame(df_train_voice.groupby('uid',as_index=True)['diff_time'].max())
    df_tmp.columns = ['voice_all_diff_time_max']
    df_tmp['uid'] = df_tmp.index
    df_train = pd.merge(df_train, df_tmp, on='uid', how='left')
    # min
    df_tmp = pd.DataFrame(df_train_voice.groupby('uid',as_index=True)['diff_time'].min())
    df_tmp.columns = ['voice_all_diff_time_min']
    df_tmp['uid'] = df_tmp.index
    df_train = pd.merge(df_train, df_tmp, on='uid', how='left')
    # std
    df_tmp = pd.DataFrame(df_train_voice.groupby('uid',as_index=True)['diff_time'].std())
    df_tmp.columns = ['voice_all_diff_time_std']
    df_tmp['uid'] = df_tmp.index
    df_train = pd.merge(df_train, df_tmp, on='uid', how='left')
    # skew 
    df_tmp = pd.DataFrame(df_train_voice.groupby('uid',as_index=True)['diff_time'].skew())
    df_tmp.columns = ['voice_all_diff_time_skew']
    df_tmp['uid'] = df_tmp.index
    df_train = pd.merge(df_train, df_tmp, on='uid', how='left')
    df_train['voice_all_diff_time_jc'] = df_train['voice_all_diff_time_max'] - df_train['voice_all_diff_time_min']
    df_train['voice_all_diff_time_fd'] = df_train['voice_all_diff_time_std'] / df_train['voice_all_diff_time_avg']

    # start_time_first, start_time_last, end_time_first, end_time_last, 以及对应的差值， 以及差值 / 总的次数
    df_tmp = pd.DataFrame(df_train_voice.groupby('uid',as_index=True)['start_time'].max())
    df_tmp.columns = ['voice_all_start_time_last']
    df_tmp['uid'] = df_tmp.index
    df_train = pd.merge(df_train, df_tmp, on='uid', how='left')

    df_tmp = pd.DataFrame(df_train_voice.groupby('uid',as_index=True)['start_time'].min())
    df_tmp.columns = ['voice_all_start_time_first']
    df_tmp['uid'] = df_tmp.index
    df_train = pd.merge(df_train, df_tmp, on='uid', how='left')
    
    df_tmp = pd.DataFrame(df_train_voice.groupby('uid',as_index=True)['end_time'].max())
    df_tmp.columns = ['voice_all_end_time_last']
    df_tmp['uid'] = df_tmp.index
    df_train = pd.merge(df_train, df_tmp, on='uid', how='left')

    df_tmp = pd.DataFrame(df_train_voice.groupby('uid',as_index=True)['end_time'].min())
    df_tmp.columns = ['voice_all_end_time_first']
    df_tmp['uid'] = df_tmp.index
    df_train = pd.merge(df_train, df_tmp, on='uid', how='left')
    
    # start_time_last - start_time_first,
    # end_time_last - end_time_first,    
    # end_time_first - start_time_first,
    # end_time_last - start_time_last,
    df_train['voice_all_start_time_last_start_time_first_diff'] = get_diff_time(df_train[['voice_all_start_time_first','voice_all_start_time_last']].values)
    df_train['voice_all_end_time_last_end_time_first_diff'] = get_diff_time(df_train[['voice_all_end_time_first','voice_all_end_time_last']].values)
    df_train['voice_all_end_time_first_start_time_first_diff'] = get_diff_time(df_train[['voice_all_end_time_first','voice_all_start_time_first']].values)
    df_train['voice_all_end_time_last_start_time_last_diff'] = get_diff_time(df_train[['voice_all_end_time_last','voice_all_start_time_last']].values)
    
    # 平均多久打一次电话
    df_train['voice_all_end_time_first_start_time_first_diff_rate'] = df_train['voice_all_end_time_first_start_time_first_diff'] / df_train['voice_all_cnt']
    df_train['voice_all_end_time_last_start_time_last_diff_rate'] = df_train['voice_all_end_time_last_start_time_last_diff'] / df_train['voice_all_cnt']
    
    # 待定对所有的start_time, end_time进行统计信息
    # start_time
    # sum 
    df_tmp = pd.DataFrame(df_train_voice.groupby('uid',as_index=True)['start_time'].sum())
    df_tmp.columns = ['voice_all_start_time_sum']
    df_tmp['uid'] = df_tmp.index
    df_train = pd.merge(df_train, df_tmp, on='uid', how='left')

    # meam
    df_tmp = pd.DataFrame(df_train_voice.groupby('uid',as_index=True)['start_time'].mean())
    df_tmp.columns = ['voice_all_start_time_avg']
    df_tmp['uid'] = df_tmp.index
    df_train = pd.merge(df_train, df_tmp, on='uid', how='left')
    
    # max
    df_tmp = pd.DataFrame(df_train_voice.groupby('uid',as_index=True)['start_time'].max())
    df_tmp.columns = ['voice_all_start_time_max']
    df_tmp['uid'] = df_tmp.index
    df_train = pd.merge(df_train, df_tmp, on='uid', how='left')
    # min
    df_tmp = pd.DataFrame(df_train_voice.groupby('uid',as_index=True)['start_time'].min())
    df_tmp.columns = ['voice_all_start_time_min']
    df_tmp['uid'] = df_tmp.index
    df_train = pd.merge(df_train, df_tmp, on='uid', how='left')
    # std
    df_tmp = pd.DataFrame(df_train_voice.groupby('uid',as_index=True)['start_time'].std())
    df_tmp.columns = ['voice_all_start_time_std']
    df_tmp['uid'] = df_tmp.index
    df_train = pd.merge(df_train, df_tmp, on='uid', how='left')
    # skew 
    df_tmp = pd.DataFrame(df_train_voice.groupby('uid',as_index=True)['start_time'].skew())
    df_tmp.columns = ['voice_all_start_time_skew']
    df_tmp['uid'] = df_tmp.index
    df_train = pd.merge(df_train, df_tmp, on='uid', how='left')

    df_train['voice_all_start_time_jc'] = df_train['voice_all_start_time_max'] - df_train['voice_all_start_time_min']
    df_train['voice_all_start_time_fd'] = df_train['voice_all_start_time_std'] / df_train['voice_all_start_time_avg']

    # end_timne
    # sum 
    df_tmp = pd.DataFrame(df_train_voice.groupby('uid',as_index=True)['end_time'].sum())
    df_tmp.columns = ['voice_all_end_time_sum']
    df_tmp['uid'] = df_tmp.index
    df_train = pd.merge(df_train, df_tmp, on='uid', how='left')

    # meam
    df_tmp = pd.DataFrame(df_train_voice.groupby('uid',as_index=True)['end_time'].mean())
    df_tmp.columns = ['voice_all_end_time_avg']
    df_tmp['uid'] = df_tmp.index
    df_train = pd.merge(df_train, df_tmp, on='uid', how='left')
    
    # max
    df_tmp = pd.DataFrame(df_train_voice.groupby('uid',as_index=True)['end_time'].max())
    df_tmp.columns = ['voice_all_end_time_max']
    df_tmp['uid'] = df_tmp.index
    df_train = pd.merge(df_train, df_tmp, on='uid', how='left')
    # min
    df_tmp = pd.DataFrame(df_train_voice.groupby('uid',as_index=True)['end_time'].min())
    df_tmp.columns = ['voice_all_end_time_min']
    df_tmp['uid'] = df_tmp.index
    df_train = pd.merge(df_train, df_tmp, on='uid', how='left')
    # std
    df_tmp = pd.DataFrame(df_train_voice.groupby('uid',as_index=True)['end_time'].std())
    df_tmp.columns = ['voice_all_end_time_std']
    df_tmp['uid'] = df_tmp.index
    df_train = pd.merge(df_train, df_tmp, on='uid', how='left')
    # skew 
    df_tmp = pd.DataFrame(df_train_voice.groupby('uid',as_index=True)['end_time'].skew())
    df_tmp.columns = ['voice_all_end_time_skew']
    df_tmp['uid'] = df_tmp.index
    df_train = pd.merge(df_train, df_tmp, on='uid', how='left')

    df_train['voice_all_end_time_jc'] = df_train['voice_all_end_time_max'] - df_train['voice_all_end_time_min']
    df_train['voice_all_end_time_fd'] = df_train['voice_all_end_time_std'] / df_train['voice_all_end_time_avg']
                      
    return df_train
    


In [39]:
def get_voice_feature(df_train_voice, target='train'):
    if target == 'train':
        # 复制lable的数据，作为所有的特征的标示
        df_train = df_train_label.copy()
    else:
        tmp = pd.DataFrame(df_train_voice.groupby('uid',as_index=True)['uid'].count())
        df_train = pd.DataFrame(data={'uid':tmp.index})
    
    tmp = df_train_voice.groupby('uid',as_index=True)['opp_num'].unique()
    uids = tmp.index
    opp_nums = []
    for opp_num in tmp:
        opp_nums.append(len(opp_num))

    df_tmp = pd.DataFrame(data={'uid':uids, 'voice_all_opp_num_unique_cnt':opp_nums})
    df_train = pd.merge(df_train, df_tmp, on='uid', how='left')

    df_tmp = pd.DataFrame(df_train_voice.groupby('uid',as_index=True)['opp_num'].count())
    df_tmp.columns = ['voice_all_opp_num_cnt']
    df_tmp['uid'] = df_tmp.index

    df_train = pd.merge(df_train, df_tmp, on='uid', how='left')

    # 通话次数 / 通话的人数
    df_train['voice_opp_num_all_cnt_unique_cnt_rate'] = df_train['voice_all_opp_num_cnt'] / df_train['voice_all_opp_num_unique_cnt']

    
    # 全部的不同开头的次数,唯一的标示
    tmp = df_train_voice.groupby('uid',as_index=True)['opp_head'].unique()
    uids = tmp.index
    opp_nums = []
    for opp_num in tmp:
        opp_nums.append(len(opp_num))

    df_tmp = pd.DataFrame(data={'uid':uids, 'voice_all_opp_head_unique_cnt':opp_nums})
    df_train = pd.merge(df_train, df_tmp, on='uid', how='left')

    # 通话次数 / opp_len的次数
    df_train['voice_opp_head_all_cnt_unique_cnt_rate'] = df_train['voice_all_opp_num_cnt'] / df_train['voice_all_opp_head_unique_cnt']

    # 通话最多的head的个数，
    df_tmp = pd.DataFrame(df_train_voice.groupby('uid',as_index=True)['opp_head'].value_counts().unstack().max(axis=1))
    df_tmp.columns = ['voice_all_opp_head_max']
    df_tmp['uid'] = df_tmp.index
    df_train = pd.merge(df_train, df_tmp, on='uid', how='left')


    # 通话最小的head的个数，
    df_tmp = pd.DataFrame(df_train_voice.groupby('uid',as_index=True)['opp_head'].value_counts().unstack().min(axis=1))
    df_tmp.columns = ['voice_all_opp_head_min']
    df_tmp['uid'] = df_tmp.index
    df_train = pd.merge(df_train, df_tmp, on='uid', how='left')

    # 极差，和占总个比例
    df_train['voice_all_opp_head_jc'] = df_train['voice_all_opp_head_max'] - df_train['voice_all_opp_head_min']
    df_train['voice_opp_head_all_max_rate'] = df_train['voice_all_opp_head_max'] / df_train['voice_all_opp_num_cnt']

    
    # call_type 分布
    df_tmp = df_train_voice.groupby('uid',as_index=True)['call_type'].value_counts().unstack()
    df_tmp.columns = ['voice_all_call_type_'+str(i) for i in range(1,6)]
    df_tmp['uid'] = df_tmp.index
    df_tmp.fillna(0,inplace=True)

    df_train = pd.merge(df_train, df_tmp, on='uid', how='left')

    # call_type 分布
    df_tmp = df_train_voice.groupby('uid',as_index=True)['in_out'].value_counts().unstack()

    df_tmp.columns = ['voice_all_in_out_'+str(i) for i in range(2)]
    df_tmp['uid'] = df_tmp.index
    df_tmp.fillna(0,inplace=True)

    df_train = pd.merge(df_train, df_tmp, on='uid', how='left')
    
    # diff_time
    df_train_voice['diff_time'] = df_train_voice['end_time'] - df_train_voice['start_time']

    # sum 
    df_tmp = pd.DataFrame(df_train_voice.groupby('uid',as_index=True)['diff_time'].sum())
    df_tmp.columns = ['voice_all_diff_time_sum']
    df_tmp['uid'] = df_tmp.index
    df_train = pd.merge(df_train, df_tmp, on='uid', how='left')

    # meam
    df_tmp = pd.DataFrame(df_train_voice.groupby('uid',as_index=True)['diff_time'].mean())
    df_tmp.columns = ['voice_all_diff_time_avg']
    df_tmp['uid'] = df_tmp.index
    df_train = pd.merge(df_train, df_tmp, on='uid', how='left')
    
    # max
    df_tmp = pd.DataFrame(df_train_voice.groupby('uid',as_index=True)['diff_time'].max())
    df_tmp.columns = ['voice_all_diff_time_max']
    df_tmp['uid'] = df_tmp.index
    df_train = pd.merge(df_train, df_tmp, on='uid', how='left')
    # min
    df_tmp = pd.DataFrame(df_train_voice.groupby('uid',as_index=True)['diff_time'].min())
    df_tmp.columns = ['voice_all_diff_time_min']
    df_tmp['uid'] = df_tmp.index
    df_train = pd.merge(df_train, df_tmp, on='uid', how='left')
    # std
    df_tmp = pd.DataFrame(df_train_voice.groupby('uid',as_index=True)['diff_time'].std())
    df_tmp.columns = ['voice_all_diff_time_std']
    df_tmp['uid'] = df_tmp.index
    df_train = pd.merge(df_train, df_tmp, on='uid', how='left')
    # skew 
    df_tmp = pd.DataFrame(df_train_voice.groupby('uid',as_index=True)['diff_time'].skew())
    df_tmp.columns = ['voice_all_diff_time_skew']
    df_tmp['uid'] = df_tmp.index
    df_train = pd.merge(df_train, df_tmp, on='uid', how='left')

    df_train['voice_all_diff_time_jc'] = df_train['voice_all_diff_time_max'] - df_train['voice_all_diff_time_min']
    df_train['voice_all_diff_time_fd'] = df_train['voice_all_diff_time_std'] / df_train['voice_all_diff_time_avg']

    # sum 
    df_tmp = pd.DataFrame(df_train_voice.groupby('uid',as_index=True)['start_time'].sum())
    df_tmp.columns = ['voice_all_start_time_sum']
    df_tmp['uid'] = df_tmp.index
    df_train = pd.merge(df_train, df_tmp, on='uid', how='left')

    # meam
    df_tmp = pd.DataFrame(df_train_voice.groupby('uid',as_index=True)['start_time'].mean())
    df_tmp.columns = ['voice_all_start_time_avg']
    df_tmp['uid'] = df_tmp.index
    df_train = pd.merge(df_train, df_tmp, on='uid', how='left')
    
    # max
    df_tmp = pd.DataFrame(df_train_voice.groupby('uid',as_index=True)['start_time'].max())
    df_tmp.columns = ['voice_all_start_time_max']
    df_tmp['uid'] = df_tmp.index
    df_train = pd.merge(df_train, df_tmp, on='uid', how='left')
    # min
    df_tmp = pd.DataFrame(df_train_voice.groupby('uid',as_index=True)['start_time'].min())
    df_tmp.columns = ['voice_all_start_time_min']
    df_tmp['uid'] = df_tmp.index
    df_train = pd.merge(df_train, df_tmp, on='uid', how='left')
    # std
    df_tmp = pd.DataFrame(df_train_voice.groupby('uid',as_index=True)['start_time'].std())
    df_tmp.columns = ['voice_all_start_time_std']
    df_tmp['uid'] = df_tmp.index
    df_train = pd.merge(df_train, df_tmp, on='uid', how='left')
    # skew 
    df_tmp = pd.DataFrame(df_train_voice.groupby('uid',as_index=True)['start_time'].skew())
    df_tmp.columns = ['voice_all_start_time_skew']
    df_tmp['uid'] = df_tmp.index
    df_train = pd.merge(df_train, df_tmp, on='uid', how='left')

    df_train['voice_all_start_time_jc'] = df_train['voice_all_start_time_max'] - df_train['voice_all_start_time_min']
    df_train['voice_all_start_time_fd'] = df_train['voice_all_start_time_std'] / df_train['voice_all_start_time_avg']

        # sum 
    df_tmp = pd.DataFrame(df_train_voice.groupby('uid',as_index=True)['end_time'].sum())
    df_tmp.columns = ['voice_all_end_time_sum']
    df_tmp['uid'] = df_tmp.index
    df_train = pd.merge(df_train, df_tmp, on='uid', how='left')

    # meam
    df_tmp = pd.DataFrame(df_train_voice.groupby('uid',as_index=True)['end_time'].mean())
    df_tmp.columns = ['voice_all_end_time_avg']
    df_tmp['uid'] = df_tmp.index
    df_train = pd.merge(df_train, df_tmp, on='uid', how='left')
    
    # max
    df_tmp = pd.DataFrame(df_train_voice.groupby('uid',as_index=True)['end_time'].max())
    df_tmp.columns = ['voice_all_end_time_max']
    df_tmp['uid'] = df_tmp.index
    df_train = pd.merge(df_train, df_tmp, on='uid', how='left')
    # min
    df_tmp = pd.DataFrame(df_train_voice.groupby('uid',as_index=True)['end_time'].min())
    df_tmp.columns = ['voice_all_end_time_min']
    df_tmp['uid'] = df_tmp.index
    df_train = pd.merge(df_train, df_tmp, on='uid', how='left')
    # std
    df_tmp = pd.DataFrame(df_train_voice.groupby('uid',as_index=True)['end_time'].std())
    df_tmp.columns = ['voice_all_end_time_std']
    df_tmp['uid'] = df_tmp.index
    df_train = pd.merge(df_train, df_tmp, on='uid', how='left')
    # skew 
    df_tmp = pd.DataFrame(df_train_voice.groupby('uid',as_index=True)['end_time'].skew())
    df_tmp.columns = ['voice_all_end_time_skew']
    df_tmp['uid'] = df_tmp.index
    df_train = pd.merge(df_train, df_tmp, on='uid', how='left')

    df_train['voice_all_end_time_jc'] = df_train['voice_all_end_time_max'] - df_train['voice_all_end_time_min']
    df_train['voice_all_end_time_fd'] = df_train['voice_all_end_time_std'] / df_train['voice_all_end_time_avg']

    
    return df_train
    

In [40]:
def diff_time(a,b):
    a_day, a_hour, a_minute, a_second = (a / 1000000, a / 10000 % 100, a / 100 % 100, a % 100)
    b_day, b_hour, b_minute, b_second = (b / 1000000, b / 10000 % 100, b / 100 % 100, b % 100)
    
    d_day = b_day - a_day
    d_hour = b_hour - a_hour
    d_minute = b_minute - a_minute
    d_second = b_second - a_second
    
    diff = d_day * 24 * 60 * 60 + d_hour * 60 * 60 + d_minute * 60 + d_second
    return diff

def get_diff_time(x):
    diff_t = []
    for d in x:
#         print d
        diff_t.append(diff_time(d[0],d[1]))
    return diff_t

In [41]:
df_test_voice = pd.read_csv('../data/test/voice_test_a.txt',sep='\t',low_memory=False)

In [42]:
# df_test_voice['opp_len'].value_counts()

In [43]:
df_test = get_voice_feature_plus(df_test_voice, target='test')
df_train = get_voice_feature_plus(df_train_voice)

df_test['voice_all_opp_head_unique_cnt'] = df_test['voice_all_opp_head_unique_cnt'].astype(float)
df_test['voice_all_opp_head_many_head'] = df_test['voice_all_opp_head_many_head'].astype(float)
df_train['voice_all_opp_head_little_head'] = df_train['voice_all_opp_head_little_head'].astype(int)
df_train['voice_all_opp_head_many_head'] = df_train['voice_all_opp_head_many_head'].astype(int)

df_train.fillna(0,inplace=True)
df_test.fillna(0,inplace=True)

In [44]:
# df_valid = df_train.sample(n=300, replace=True)


In [45]:

df_train['voice_all_opp_head_little_head'] = df_train['voice_all_opp_head_little_head'].astype(int)
df_train['voice_all_opp_head_many_head'] = df_train['voice_all_opp_head_many_head'].astype(int)
df_test.info()

df_train.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 2000 entries, 0 to 1999
Columns: 743 entries, uid to voice_all_end_time_fd
dtypes: float64(742), object(1)
memory usage: 11.4+ MB
<class 'pandas.core.frame.DataFrame'>
Int64Index: 4999 entries, 0 to 4998
Columns: 744 entries, uid to voice_all_end_time_fd
dtypes: float64(740), int64(3), object(1)
memory usage: 28.4+ MB


In [59]:
seed = 71

np.random.seed(seed)
valid_size = 0.2
LOOP = 1
ESR = 50
# XGB param
nround = 3000
#nround = 10

param = {'max_depth':5, # 基准是5 
         'eta':0.05,
         'gamma ':0.1,
         'colsample_bytree':0.8, # old 0.8
         'subsample':0.8,
         'silent':1,
         'eval_metric':'auc',
         'objective':'binary:logistic',
#          'scale_pos_weight':5,
         'seed': seed
        }

In [60]:
# 设置特征数据，去除id数据，不能进行预测
features = df_test.columns
features = list(features)
features.remove('uid')

label = 'label'

print len(features)

742


In [61]:
def split_train_valid(df_train,test_size=0.2):
    '''
    k-fold交叉验证,默认k=10
    df_train:训练数据
    '''
    X_train, X_vali, y_train, y_vali = train_test_split(df_train[features], df_train[label], test_size=test_size, random_state=40000)
    #added some parameters
    
#     dtrain = df_train.iloc[train_list]
#     dvali =  df_train.iloc[vali_list]
    
    dtrain = xgb.DMatrix(X_train,label=y_train)
    dvalid = xgb.DMatrix(X_vali,label=y_vali)
    watchlist = [(dtrain, 'train'),(dvalid, 'valid')]
    
    return dtrain, dvalid, watchlist

In [62]:
models = []
seeds = [71,73,91]
for i in range(LOOP):
    print('LOOP',i)
#     dbuild, dvalid, watchlist = split_build_valid(df_train)
    dbuild, dvalid, watchlist = split_train_valid(df_train,test_size=0.2)
    param['seed'] = seeds[i]
    model = xgb.train(param, dbuild, nround, watchlist,early_stopping_rounds=ESR,verbose_eval=20)
    models.append(model)
#     model.save_model('./model1'+ str(i) + '.model')
    # VALID
    valid_yhat = model.predict(dvalid,ntree_limit=model.best_iteration)
    print('Valid Mean:---------------------->', np.mean(valid_yhat))
    del dbuild, dvalid, watchlist


('LOOP', 0)
[0]	train-auc:0.824913	valid-auc:0.801899
Multiple eval metrics have been passed: 'valid-auc' will be used for early stopping.

Will train until valid-auc hasn't improved in 50 rounds.
[20]	train-auc:0.913341	valid-auc:0.838544
[40]	train-auc:0.946254	valid-auc:0.84396
[60]	train-auc:0.968482	valid-auc:0.849129
[80]	train-auc:0.982766	valid-auc:0.848944
[100]	train-auc:0.991236	valid-auc:0.851752
[120]	train-auc:0.994963	valid-auc:0.852864
[140]	train-auc:0.997341	valid-auc:0.853083
Stopping. Best iteration:
[107]	train-auc:0.992645	valid-auc:0.853907

('Valid Mean:---------------------->', 0.16489457)


In [50]:
def valid_score(pre_result, real_result):
    '''
    score = 0.6 * acu + 0.4 * F1
    :param pre_result:
    :param real_result:
    :return:
    '''
    print len(real_result['label'].values)
    print len(pre_result['score'].values)
    auc = roc_auc_score(real_result['label'].values, pre_result['score'].values)
    f1 = f1_score(real_result['label'], pre_result['label'])
    score = 0.6 * auc + 0.4 * f1
    print "auc = %f, f1 = %f, score = %f" % (auc, f1, score)
    return score

In [51]:
# valid_score(proba_test, df_valid)

In [52]:
dtest  = xgb.DMatrix(df_test[features])
proba_test = pd.DataFrame()
proba_test['uid'] = df_test['uid']
proba_test['score'] = [0 for i in range(len(df_test))]
for model in models:
    proba_test['score'] += model.predict(dtest)
proba_test['score'] /= LOOP



In [53]:
proba_test = proba_test.sort_values('score',ascending=False)
proba_test['label'] = [0 for i in range(len(proba_test))]

proba_test.loc[proba_test['score']>0.28, 'label'] = 1


In [54]:
proba_test['label'].value_counts()



0    1537
1     463
Name: label, dtype: int64

In [55]:
proba_test[['uid','label']].to_csv('../result/result2.csv',index=False,header=False)

In [56]:
set(['u'+str(i) for i in range(5000,7000)]) - set(proba_test.uid)

set()

In [57]:
# 计算特征重要程度
import operator
importance = model.get_fscore()
importance = sorted(importance.items(), key=operator.itemgetter(1),reverse=True)

print importance

[('voice_all_opp_len_12', 81), ('voice_all_opp_len_15', 48), ('voice_all_in_out_0_rate', 47), ('voice_all_opp_head_many_head_cnt_rate', 40), ('voice_all_start_time_first', 31), ('voice_all_start_time_hour_8_rate', 30), ('voice_all_per_opp_rate', 30), ('voice_all_start_time_hour_7', 28), ('voice_all_end_time_hour_7_rate', 27), ('voice_all_opp_head_many_head', 25), ('voice_all_opp_head_many_head_cnt_rate_unique', 24), ('voice_all_start_time_hour_7_rate', 24), ('voice_all_start_end_time_day_39_rate', 24), ('voice_all_start_time_minute_29_rate', 22), ('voice_all_call_type_3', 22), ('voice_all_unique_cnt', 21), ('voice_all_start_time_hour_21_rate', 21), ('voice_all_start_time_last', 20), ('voice_all_opp_len_8', 20), ('voice_all_diff_time_max', 19), ('voice_all_start_end_time_day_25_rate', 18), ('voice_all_call_type_1_rate', 18), ('voice_all_start_time_hour_11_rate', 17), ('voice_all_start_time_std', 17), ('voice_all_end_time_first', 17), ('voice_all_diff_time_avg', 16), ('voice_all_start_ti

In [3]:
# 复制lable的数据，作为所有的特征的标示
df_train = df_train_label.copy()

In [4]:
df_train_voice.head()
df_train_voice.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1150778 entries, 0 to 1150777
Data columns (total 8 columns):
uid           1150778 non-null object
opp_num       1150778 non-null object
opp_head      1150778 non-null object
opp_len       1150778 non-null int64
start_time    1150778 non-null int64
end_time      1150778 non-null int64
call_type     1150778 non-null int64
in_out        1150778 non-null int64
dtypes: int64(5), object(3)
memory usage: 70.2+ MB


In [25]:
tmp = df_train_voice.groupby('uid',as_index=True)['opp_num'].unique()
uids = tmp.index
opp_nums = []
for opp_num in tmp:
    opp_nums.append(len(opp_num))

df_tmp = pd.DataFrame(data={'uid':uids, 'voice_all_opp_num_unique_cnt':opp_nums})
df_train = pd.merge(df_train, df_tmp, on='uid', how='left')
    

In [26]:
df_tmp = pd.DataFrame(df_train_voice.groupby('uid',as_index=True)['opp_num'].count())
df_tmp.columns = ['voice_all_opp_num_cnt']
df_tmp['uid'] = df_tmp.index

df_train = pd.merge(df_train, df_tmp, on='uid', how='left')


In [8]:
# 通话次数 / 通话的人数
df_train['voice_opp_num_all_cnt_unique_cnt_rate'] = df_train['voice_all_opp_num_cnt'] / df_train['voice_all_opp_num_unique_cnt']




In [37]:
# 全部的不同开头的次数,唯一的标示
tmp = df_train_voice.groupby('uid',as_index=True)['opp_head'].unique()
uids = tmp.index
opp_nums = []
for opp_num in tmp:
    opp_nums.append(len(opp_num))

df_tmp = pd.DataFrame(data={'uid':uids, 'voice_all_opp_head_unique_cnt':opp_nums})
df_train = pd.merge(df_train, df_tmp, on='uid', how='left')
    
# 通话次数 / opp_len的次数
df_train['voice_opp_head_all_cnt_unique_cnt_rate'] = df_train['voice_all_opp_num_cnt'] / df_train['voice_all_opp_head_unique_cnt']


uid
u0001      79
u0002       2
u0003      21
u0004     254
u0005     401
u0006      44
u0007     101
u0008     234
u0009      96
u0010     130
u0011     227
u0012     302
u0013       2
u0014     151
u0015      81
u0016     291
u0017      43
u0018     178
u0019     647
u0020     315
u0021      19
u0022      22
u0023      32
u0024     653
u0025     723
u0026     521
u0027    1618
u0028     323
u0029      13
u0030      62
         ... 
u4970     415
u4971     277
u4972     468
u4973     485
u4974     180
u4975     288
u4976     262
u4977      76
u4978      97
u4979       8
u4980       3
u4981       3
u4982      35
u4983      30
u4984       4
u4985       3
u4986       1
u4987       5
u4988       1
u4989      93
u4990     324
u4991      60
u4992     263
u4993      11
u4994      25
u4995      18
u4996      30
u4997      54
u4998      18
u4999      19
Name: opp_head, Length: 4987, dtype: int64

In [65]:
# 通话最多的head的个数，
df_tmp = pd.DataFrame(df_train_voice.groupby('uid',as_index=True)['opp_head'].value_counts().unstack().max(axis=1))
df_tmp.columns = ['voice_all_opp_head_max']
df_tmp['uid'] = df_tmp.index
df_train = pd.merge(df_train, df_tmp, on='uid', how='left')


# 通话最小的head的个数，
df_tmp = pd.DataFrame(df_train_voice.groupby('uid',as_index=True)['opp_head'].value_counts().unstack().min(axis=1))
df_tmp.columns = ['voice_all_opp_head_min']
df_tmp['uid'] = df_tmp.index
df_train = pd.merge(df_train, df_tmp, on='uid', how='left')

# 极差，和占总个比例
df_train['voice_all_opp_head_jc'] = df_train['voice_all_opp_head_max'] - df_train['voice_all_opp_head_min']
df_train['voice_opp_head_all_max_rate'] = df_train['voice_all_opp_head_max'] / df_train['voice_all_opp_num_cnt']




In [68]:
df_train_voice['call_type'].value_counts()

1    1014721
3      78307
2      57595
5        125
4         30
Name: call_type, dtype: int64

In [74]:
# call_type 分布
df_tmp = df_train_voice.groupby('uid',as_index=True)['call_type'].value_counts().unstack()
df_tmp.columns = ['voice_all_call_type_'+str(i) for i in range(1,6)]
df_tmp['uid'] = df_tmp.index
df_tmp.fillna(0,inplace=True)

df_train = pd.merge(df_train, df_tmp, on='uid', how='left')


In [76]:
# call_type 分布
df_tmp = df_train_voice.groupby('uid',as_index=True)['in_out'].value_counts().unstack()

df_tmp.columns = ['voice_all_in_out_'+str(i) for i in range(2)]
df_tmp['uid'] = df_tmp.index
df_tmp.fillna(0,inplace=True)

df_train = pd.merge(df_train, df_tmp, on='uid', how='left')
