In [1]:
import numpy as np
import pandas as pd
import scipy as sp
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, f1_score

import xgboost as xgb

## sms features
#### 先对所有的时间进行特征提取
1. 用户电话的总的通话次数 opp_num
2. 通话的人数，sms_all_unique_cnt
3. 通话次数 / 人数的比例， sms_all_cnt_all_unique_cnt_rate
4. 对端电话的前n位的个数，所有的不同号码的个数。 opp_head
5. 对端号码长度的分布个数   opp_len
6. 通话最大时长，平均时长，最小时长，极差时长等统计的信息  start_time
7. 通话类型的分布个数或者比例 call_type
8. 通话类型的分布个数和比例   in_out


In [2]:
df_train_sms = pd.read_csv('../data/train/sms_train.txt',sep='\t',low_memory=False)
df_train_label = pd.read_csv('../data/train/uid_train.txt',sep='\t',low_memory=False)

df_testA_sms = pd.read_csv('../data/testA/sms_test_a.txt',sep='\t',low_memory=False)
df_testB_sms = pd.read_csv('../data/testB/sms_test_b.txt',sep='\t',low_memory=False)


In [3]:
df_testB_sms.info()

df_train_sms.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 159502 entries, 0 to 159501
Data columns (total 6 columns):
uid           159502 non-null object
opp_num       159502 non-null object
opp_head      159502 non-null int64
opp_len       159502 non-null int64
start_time    159502 non-null int64
in_out        159502 non-null int64
dtypes: int64(4), object(2)
memory usage: 7.3+ MB


Unnamed: 0,uid,opp_num,opp_head,opp_len,start_time,in_out
0,u4003,B378E065731B897E7295926B27CBA0D5,186,11,20174042,1
1,u4003,B378E065731B897E7295926B27CBA0D5,186,11,20174130,1
2,u4003,1B15607F3E6D167B44D46046D5993D87,189,11,20015746,0
3,u4003,1B15607F3E6D167B44D46046D5993D87,189,11,20015640,0
4,u4003,B378E065731B897E7295926B27CBA0D5,186,11,20045300,1


In [4]:
def get_sms_feature_plus(df_train_sms, target='train', Type=None):
    if target == 'train':
        # 复制lable的数据，作为所有的特征的标示
        df_train = df_train_label.copy()
    else:
        if Type=='A':
            df_train = pd.DataFrame(data={'uid':['u'+str(id) for id in range(5000, 7000)]})
        else:
            df_train = pd.DataFrame(data={'uid':['u'+str(id) for id in range(7000, 10000)]})
    
    df_train_sms['start_time_day'] = df_train_sms['start_time'].apply(lambda x: x / 1000000)
    df_train_sms['start_time_hour'] = df_train_sms['start_time'].apply(lambda x: x / 10000 % 100)
    df_train_sms['start_time_minute'] = df_train_sms['start_time'].apply(lambda x: x / 100 % 100)
    df_train_sms['start_time_second'] = df_train_sms['start_time'].apply(lambda x: x % 100)
    print len(df_train_sms['start_time_day'].value_counts())

    
    # 总的通话次数
    df_tmp = pd.DataFrame(df_train_sms.groupby('uid',as_index=True)['opp_num'].count())
    df_tmp.columns = ['sms_all_cnt']
    df_tmp['uid'] = df_tmp.index
    df_train = pd.merge(df_train, df_tmp, on='uid', how='left')
    
    # 总的通话的对端的不重复的个数
    tmp = df_train_sms.groupby('uid',as_index=True)['opp_num'].unique()
    uids = tmp.index
    opp_nums = []
    for opp_num in tmp:
        opp_nums.append(len(opp_num))
    df_tmp = pd.DataFrame(data={'uid':uids, 'sms_all_unique_cnt':opp_nums})
    df_train = pd.merge(df_train, df_tmp, on='uid', how='left')
    
    # 通话次数 / 人数的比例，每个人通话的次数， sms_all_per_opp_rate
    df_train['sms_all_per_opp_rate'] = df_train['sms_all_cnt'] / df_train['sms_all_unique_cnt']
    
    
    # 4. 对端电话的前n位的个数，所有的不同号码的个数以及其所有的分布个数和比例(部分特征待定)。 opp_head_cnt_{k}, opp_head_rate_{k}
    # 全部的不同开头的次数,唯一的标示
    tmp = df_train_sms.groupby('uid',as_index=True)['opp_head'].unique()
    uids = tmp.index
    opp_nums = []
    for opp_num in tmp:
        opp_nums.append(len(opp_num))
    df_tmp = pd.DataFrame(data={'uid':uids, 'sms_all_opp_head_unique_cnt':opp_nums})
    df_train = pd.merge(df_train, df_tmp, on='uid', how='left')
    
    # 联系最多和最少的次数的opp_head
    tmp = df_train_sms.groupby('uid',as_index=True)['opp_head']
    sms_all_opp_head_many_head = []
    sms_all_opp_head_many_head_cnt = []
    sms_all_opp_head_little_head = []
#     sms_all_opp_head_little_head_cnt = []
    
    uids = []
    for uid, values in tmp:
        uids.append(uid)
        sms_all_opp_head_many_head.append(values.value_counts().index[0])
        sms_all_opp_head_little_head.append(values.value_counts().index[-1])

        sms_all_opp_head_many_head_cnt.append(values.value_counts().values[0])
#         sms_all_opp_head_little_head_cnt.append(values.value_counts().values[-1])
        

    df_tmp = pd.DataFrame(data={'uid':uids, 'sms_all_opp_head_many_head':sms_all_opp_head_many_head, 
                                'sms_all_opp_head_little_head':sms_all_opp_head_little_head,
                                'sms_all_opp_head_many_head_cnt':sms_all_opp_head_many_head_cnt})
    
    df_train = pd.merge(df_train, df_tmp, on='uid', how='left')
    df_train['sms_all_opp_head_many_head_cnt_rate'] = df_train['sms_all_opp_head_many_head_cnt'] / df_train['sms_all_cnt']
    df_train['sms_all_opp_head_many_head_cnt_rate_unique'] = df_train['sms_all_opp_head_many_head_cnt'] / df_train['sms_all_unique_cnt']
    
    # 通话最多的head的个数
    df_tmp = pd.DataFrame(df_train_sms.groupby('uid',as_index=True)['opp_len'].value_counts().unstack())[[5,7,8,9,10,11,12,13,14,15,16,17,19,20]]
    df_tmp.columns = ['sms_all_opp_len_'+str(k) for k in [5,7,8,9,10,11,12,13,14,15,16,17,19,20]]
    df_tmp['uid'] = df_tmp.index
    df_tmp.fillna(0,inplace=True)
    df_train = pd.merge(df_train, df_tmp, on='uid', how='left')
    df_train['sms_all_opp_len_11_rate'] = df_train['sms_all_opp_len_11'] / df_train['sms_all_cnt']
    
    # 最近一次通话的号码的长度
    tmp = df_train_sms.groupby('uid',as_index=True)['opp_len']
    sms_all_opp_len_many_head = []
    sms_all_opp_len_little_head = []
    
    uids = []
    for uid, values in tmp:
        uids.append(uid)
        sms_all_opp_len_many_head.append(values.value_counts().index[0])
        sms_all_opp_len_little_head.append(values.value_counts().index[-1])        
    
    df_tmp = pd.DataFrame(data={'uid':uids, 'sms_all_opp_len_many_head':sms_all_opp_len_many_head, 
                                'sms_all_opp_len_little_head':sms_all_opp_len_little_head})
    df_train = pd.merge(df_train, df_tmp, on='uid', how='left')
        
    # in_out 分布
    df_tmp = df_train_sms.groupby('uid',as_index=True)['in_out'].value_counts().unstack()
    df_tmp.columns = ['sms_all_in_out_'+str(i) for i in range(2)]
    df_tmp['uid'] = df_tmp.index
    df_tmp.fillna(0,inplace=True)
    df_train = pd.merge(df_train, df_tmp, on='uid', how='left')
    
    # in_out 的比例
    for feat in ['sms_all_in_out_'+str(i) for i in range(2)]:
        df_train[feat+'_rate'] = df_train[feat] / df_train['sms_all_cnt']

    
    # 处理是假相关的特征，目标100维
    # 6. 通话最大时长，平均时长，最小时长，极差时长等统计的信息  start_time, end_time, diff_time
    
    # day的分布，和比例，注意天的粒度很大，只需使用start end一种即可
    df_tmp = df_train_sms.groupby('uid',as_index=True)['start_time_day'].value_counts().unstack()[[i for i in range(1,46)]]
    df_tmp.columns = ['sms_all_start_time_day_'+str(i) for i in range(1,46)]
    df_tmp['uid'] = df_tmp.index
    df_tmp.fillna(0,inplace=True)
    df_train = pd.merge(df_train, df_tmp, on='uid', how='left')
    for feat in ['sms_all_start_time_day_'+str(i) for i in range(1,46)]:
        df_train[feat+'_rate'] = df_train[feat] / df_train['sms_all_cnt']
    
    # hour分布， 我们认为电话时间超过一小时可能是有问题的，所以使用两种
    # start_time_hour
    df_tmp = df_train_sms.groupby('uid',as_index=True)['start_time_hour'].value_counts().unstack()[[i for i in range(0,24)]]
    df_tmp.columns = ['sms_all_start_time_hour_'+str(i) for i in range(24)]
    df_tmp['uid'] = df_tmp.index
    df_tmp.fillna(0,inplace=True)
    df_train = pd.merge(df_train, df_tmp, on='uid', how='left')
    for feat in ['sms_all_start_time_hour_'+str(i) for i in range(0,24)]:
        df_train[feat+'_rate'] = df_train[feat] / df_train['sms_all_cnt']
    
    # minute 分布
    # start_time_minute
    df_tmp = df_train_sms.groupby('uid',as_index=True)['start_time_minute'].value_counts().unstack()[[i for i in range(0,60)]]
    df_tmp.columns = ['sms_all_start_time_minute_'+str(i) for i in range(60)]
    df_tmp['uid'] = df_tmp.index
    df_tmp.fillna(0,inplace=True)
    df_train = pd.merge(df_train, df_tmp, on='uid', how='left')
    for feat in ['sms_all_start_time_minute_'+str(i) for i in range(60)]:
        df_train[feat+'_rate'] = df_train[feat] / df_train['sms_all_cnt']


    # second 分布
    # start_time_second
    df_tmp = df_train_sms.groupby('uid',as_index=True)['start_time_second'].value_counts().unstack()[[i for i in range(0,60)]]
    df_tmp.columns = ['sms_all_start_time_second_'+str(i) for i in range(60)]
    df_tmp['uid'] = df_tmp.index
    df_tmp.fillna(0,inplace=True)
    df_train = pd.merge(df_train, df_tmp, on='uid', how='left')
    for feat in ['sms_all_start_time_second_'+str(i) for i in range(60)]:
        df_train[feat+'_rate'] = df_train[feat] / df_train['sms_all_cnt']
    
    # start_time_first, start_time_last, end_time_first, end_time_last, 以及对应的差值， 以及差值 / 总的次数
    df_tmp = pd.DataFrame(df_train_sms.groupby('uid',as_index=True)['start_time'].max())
    df_tmp.columns = ['sms_all_start_time_last']
    df_tmp['uid'] = df_tmp.index
    df_train = pd.merge(df_train, df_tmp, on='uid', how='left')

    df_tmp = pd.DataFrame(df_train_sms.groupby('uid',as_index=True)['start_time'].min())
    df_tmp.columns = ['sms_all_start_time_first']
    df_tmp['uid'] = df_tmp.index
    df_train = pd.merge(df_train, df_tmp, on='uid', how='left')
    
    # start_time_last - start_time_first,
    df_train['sms_all_start_time_last_start_time_first_diff'] = get_diff_time(df_train[['sms_all_start_time_first','sms_all_start_time_last']].values)
    
    # 平均多久打一次电话
    df_train['sms_all_start_time_last_start_time_first_diff_rate'] = df_train['sms_all_start_time_last_start_time_first_diff'] / df_train['sms_all_cnt']
    
    # 待定对所有的start_time, end_time进行统计信息
    # start_time
    # sum 
    df_tmp = pd.DataFrame(df_train_sms.groupby('uid',as_index=True)['start_time'].sum())
    df_tmp.columns = ['sms_all_start_time_sum']
    df_tmp['uid'] = df_tmp.index
    df_train = pd.merge(df_train, df_tmp, on='uid', how='left')

    # meam
    df_tmp = pd.DataFrame(df_train_sms.groupby('uid',as_index=True)['start_time'].mean())
    df_tmp.columns = ['sms_all_start_time_avg']
    df_tmp['uid'] = df_tmp.index
    df_train = pd.merge(df_train, df_tmp, on='uid', how='left')
    
    # max
    df_tmp = pd.DataFrame(df_train_sms.groupby('uid',as_index=True)['start_time'].max())
    df_tmp.columns = ['sms_all_start_time_max']
    df_tmp['uid'] = df_tmp.index
    df_train = pd.merge(df_train, df_tmp, on='uid', how='left')
    # min
    df_tmp = pd.DataFrame(df_train_sms.groupby('uid',as_index=True)['start_time'].min())
    df_tmp.columns = ['sms_all_start_time_min']
    df_tmp['uid'] = df_tmp.index
    df_train = pd.merge(df_train, df_tmp, on='uid', how='left')
    # std
    df_tmp = pd.DataFrame(df_train_sms.groupby('uid',as_index=True)['start_time'].std())
    df_tmp.columns = ['sms_all_start_time_std']
    df_tmp['uid'] = df_tmp.index
    df_train = pd.merge(df_train, df_tmp, on='uid', how='left')
    # skew 
    df_tmp = pd.DataFrame(df_train_sms.groupby('uid',as_index=True)['start_time'].skew())
    df_tmp.columns = ['sms_all_start_time_skew']
    df_tmp['uid'] = df_tmp.index
    df_train = pd.merge(df_train, df_tmp, on='uid', how='left')

    df_train['sms_all_start_time_jc'] = df_train['sms_all_start_time_max'] - df_train['sms_all_start_time_min']
    df_train['sms_all_start_time_fd'] = df_train['sms_all_start_time_std'] / df_train['sms_all_start_time_avg']
                      
    return df_train
    


In [5]:
def diff_time(a,b):
    a_day, a_hour, a_minute, a_second = (a / 1000000, a / 10000 % 100, a / 100 % 100, a % 100)
    b_day, b_hour, b_minute, b_second = (b / 1000000, b / 10000 % 100, b / 100 % 100, b % 100)
    
    d_day = b_day - a_day
    d_hour = b_hour - a_hour
    d_minute = b_minute - a_minute
    d_second = b_second - a_second
    
    diff = d_day * 24 * 60 * 60 + d_hour * 60 * 60 + d_minute * 60 + d_second
    return diff

def get_diff_time(x):
    diff_t = []
    for d in x:
#         print d
        diff_t.append(diff_time(d[0],d[1]))
    return diff_t

In [6]:
df_test = get_sms_feature_plus(df_testB_sms, target='test')
df_testA = get_sms_feature_plus(df_testA_sms, target='test')

df_train = get_sms_feature_plus(df_train_sms)

df_train.fillna(0,inplace=True)
df_test.fillna(0,inplace=True)

df_train.to_csv('../xdata/df_train_sms_feat.csv',index=False)
df_test.to_csv('../xdata/df_testB_sms_feat.csv',index=False)
df_testA.to_csv('../xdata/df_testA_sms_feat.csv',index=False)



45
45
45


In [23]:
df_train.info()
df_test.info()
df_test.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4999 entries, 0 to 4998
Columns: 422 entries, uid to sms_all_start_time_fd
dtypes: float64(420), int64(1), object(1)
memory usage: 16.1+ MB
<class 'pandas.core.frame.DataFrame'>
Int64Index: 3000 entries, 0 to 2999
Columns: 421 entries, uid to sms_all_start_time_fd
dtypes: float64(420), object(1)
memory usage: 9.7+ MB


Unnamed: 0,uid,sms_all_cnt,sms_all_unique_cnt,sms_all_per_opp_rate,sms_all_opp_head_unique_cnt,sms_all_opp_head_little_head,sms_all_opp_head_many_head,sms_all_opp_head_many_head_cnt,sms_all_opp_head_many_head_cnt_rate,sms_all_opp_head_many_head_cnt_rate_unique,...,sms_all_start_time_last_start_time_first_diff,sms_all_start_time_last_start_time_first_diff_rate,sms_all_start_time_sum,sms_all_start_time_avg,sms_all_start_time_max,sms_all_start_time_min,sms_all_start_time_std,sms_all_start_time_skew,sms_all_start_time_jc,sms_all_start_time_fd
0,u7000,10.0,2.0,5.0,2.0,153.0,138.0,6.0,0.6,3.0,...,954337.4,95433.74384,88712250.0,8871225.0,12184727.0,1180946.0,5306787.0,-1.035059,11003781.0,0.598202
1,u7001,3.0,3.0,1.0,3.0,0.0,130.0,1.0,0.333333,0.333333,...,1971923.0,657307.7216,38542370.0,12847460.0,24155021.0,1192539.0,11485180.0,-0.135973,22962482.0,0.893965
2,u7002,46.0,7.0,6.571429,3.0,106.0,0.0,33.0,0.717391,4.714286,...,3214847.0,69887.97527,833403700.0,18117470.0,42224023.0,5183832.0,10119020.0,0.744371,37040191.0,0.558523
3,u7003,82.0,3.0,27.333333,3.0,9.0,0.0,59.0,0.719512,19.666667,...,3752284.0,45759.560273,1589361000.0,19382450.0,45030618.0,1135102.0,13945310.0,0.516474,43895516.0,0.719481
4,u7004,7.0,2.0,3.5,2.0,106.0,130.0,4.0,0.571429,2.0,...,2120800.0,302971.4416,192977000.0,27568140.0,42031300.0,17114617.0,8776459.0,0.318479,24916683.0,0.318355


In [24]:
seed = 71

np.random.seed(seed)
valid_size = 0.2
LOOP = 1
ESR = 50
# XGB param
nround = 3000
#nround = 10

param = {'max_depth':5, # 基准是5 
         'eta':0.05,
         'gamma ':0.1,
         'colsample_bytree':0.8, # old 0.8
         'subsample':0.8,
         'silent':1,
         'eval_metric':'auc',
         'objective':'binary:logistic',
#          'scale_pos_weight':5,
         'seed': seed
        }

In [25]:
# 设置特征数据，去除id数据，不能进行预测
features = df_test.columns
features = list(features)
features.remove('uid')

label = 'label'

print len(features)

420


In [26]:
def split_train_valid(df_train,test_size=0.2):
    '''
    k-fold交叉验证,默认k=10
    df_train:训练数据
    '''
    X_train, X_vali, y_train, y_vali = train_test_split(df_train[features], df_train[label], test_size=test_size, random_state=40000)
    #added some parameters
    
#     dtrain = df_train.iloc[train_list]
#     dvali =  df_train.iloc[vali_list]
    
    dtrain = xgb.DMatrix(X_train,label=y_train)
    dvalid = xgb.DMatrix(X_vali,label=y_vali)
    watchlist = [(dtrain, 'train'),(dvalid, 'valid')]
    
    return dtrain, dvalid, watchlist

In [27]:
models = []
seeds = [71,73,91]
for i in range(LOOP):
    print('LOOP',i)
#     dbuild, dvalid, watchlist = split_build_valid(df_train)
    dbuild, dvalid, watchlist = split_train_valid(df_train,test_size=0.2)
    param['seed'] = seeds[i]
    model = xgb.train(param, dbuild, nround, watchlist,early_stopping_rounds=ESR,verbose_eval=20)
    models.append(model)
#     model.save_model('./model1'+ str(i) + '.model')
    # VALID
    valid_yhat = model.predict(dvalid,ntree_limit=model.best_iteration)
    print('Valid Mean:---------------------->', np.mean(valid_yhat))
    del dbuild, dvalid, watchlist


('LOOP', 0)
[0]	train-auc:0.858353	valid-auc:0.830481
Multiple eval metrics have been passed: 'valid-auc' will be used for early stopping.

Will train until valid-auc hasn't improved in 50 rounds.
[20]	train-auc:0.922553	valid-auc:0.87352
[40]	train-auc:0.940525	valid-auc:0.882712
[60]	train-auc:0.955078	valid-auc:0.885186
[80]	train-auc:0.966466	valid-auc:0.888619
[100]	train-auc:0.975888	valid-auc:0.890857
[120]	train-auc:0.982056	valid-auc:0.891578
[140]	train-auc:0.986686	valid-auc:0.891722
[160]	train-auc:0.989649	valid-auc:0.892546
[180]	train-auc:0.991849	valid-auc:0.892113
[200]	train-auc:0.993188	valid-auc:0.890877
Stopping. Best iteration:
[158]	train-auc:0.989478	valid-auc:0.892765

('Valid Mean:---------------------->', 0.1646983)


In [28]:
# 计算特征重要程度
import operator
importance = model.get_fscore()
importance = sorted(importance.items(), key=operator.itemgetter(1),reverse=True)

print importance

[('sms_all_opp_head_many_head', 152), ('sms_all_opp_len_13', 79), ('sms_all_opp_head_little_head', 66), ('sms_all_in_out_0_rate', 65), ('sms_all_start_time_avg', 59), ('sms_all_start_time_first', 58), ('sms_all_opp_len_9', 54), ('sms_all_start_time_last', 54), ('sms_all_opp_len_11_rate', 53), ('sms_all_start_time_skew', 49), ('sms_all_per_opp_rate', 43), ('sms_all_start_time_last_start_time_first_diff_rate', 40), ('sms_all_opp_head_many_head_cnt_rate', 40), ('sms_all_in_out_1', 37), ('sms_all_unique_cnt', 36), ('sms_all_start_time_day_33_rate', 36), ('sms_all_start_time_sum', 35), ('sms_all_in_out_1_rate', 34), ('sms_all_start_time_hour_19_rate', 33), ('sms_all_start_time_hour_17_rate', 33), ('sms_all_start_time_day_6_rate', 32), ('sms_all_start_time_hour_20_rate', 30), ('sms_all_opp_head_many_head_cnt_rate_unique', 30), ('sms_all_start_time_std', 30), ('sms_all_start_time_fd', 30), ('sms_all_start_time_hour_23_rate', 29), ('sms_all_opp_len_11', 28), ('sms_all_start_time_hour_10_rate',