In [1]:
import numpy as np
import pandas as pd
import scipy as sp
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, f1_score

import xgboost as xgb

In [2]:
def diff_time(a,b):
    a_day, a_hour, a_minute, a_second = (a / 1000000, a / 10000 % 100, a / 100 % 100, a % 100)
    b_day, b_hour, b_minute, b_second = (b / 1000000, b / 10000 % 100, b / 100 % 100, b % 100)
    
    d_day = b_day - a_day
    d_hour = b_hour - a_hour
    d_minute = b_minute - a_minute
    d_second = b_second - a_second
    
    diff = d_day * 24 * 60 * 60 + d_hour * 60 * 60 + d_minute * 60 + d_second
    return diff

def get_diff_time(x):
    diff_t = []
    for d in x:
#         print d
        diff_t.append(diff_time(d[0],d[1]))
    return diff_t

In [38]:
def get_data(add_data=False):
    df_train_voice_feat = pd.read_csv('../xdata/df_train_voice_feat.csv')
    df_test_voice_feat = pd.read_csv('../xdata/df_testB_voice_feat.csv')
    if add_data==True:
        df_trainA = pd.read_csv('../xdata/df_testA_label.csv')
        df_trainA_voice_feat = pd.read_csv('../xdata/df_testA_voice_feat.csv')
        
    
    df_train_sms_feat = pd.read_csv('../xdata/df_train_sms_feat.csv')
    df_test_sms_feat = pd.read_csv('../xdata/df_testB_sms_feat.csv')
    if add_data==True:
        df_trainA_sms_feat = pd.read_csv('../xdata/df_testA_sms_feat.csv')
    
    df_train_sms_feat.drop('label',axis=1,inplace=True)
    
    df_train_wa_feat = pd.read_csv('../xdata/df_train_wa_feat.csv')
    df_test_wa_feat = pd.read_csv('../xdata/df_testB_wa_feat.csv')
    if add_data==True:
        df_trainA_wa_feat = pd.read_csv('../xdata/df_testA_wa_feat.csv')
    df_train_wa_feat.drop('label',axis=1,inplace=True)
    
#     df_train_voice_sms_feat = pd.read_csv('../xdata/df_train_voice_sms_feat.csv')
#     df_test_voice_sms_feat = pd.read_csv('../xdata/df_testB_voice_sms_feat.csv')
#     df_train_voice_sms_feat.drop('label',axis=1,inplace=True)
    
    
    df_train = pd.merge(df_train_voice_feat, df_train_sms_feat, on='uid', how='left')
    df_test = pd.merge(df_test_voice_feat, df_test_sms_feat, on='uid', how='left')
    
    df_train = pd.merge(df_train, df_train_wa_feat, on='uid', how='left')
    df_test = pd.merge(df_test, df_test_wa_feat, on='uid', how='left')
    
#     df_train = pd.merge(df_train, df_train_voice_sms_feat, on='uid', how='left')
#     df_test = pd.merge(df_test, df_test_voice_sms_feat, on='uid', how='left')
    
    
    if add_data==True:
        df_trainA = pd.merge(df_trainA, df_trainA_voice_feat, on='uid', how='left')
        df_trainA = pd.merge(df_trainA, df_trainA_sms_feat, on='uid', how='left')
        df_trainA = pd.merge(df_trainA, df_trainA_wa_feat, on='uid', how='left')
#         df_trainA = df_trainA[df_trainA['label']==1]
        df_trainA = df_trainA[:100]
        
        
        df_train = df_train.append(df_trainA)
    
        
    df_train.replace([np.inf,-np.inf], 0, inplace=True)
    df_test.replace([np.inf,-np.inf], 0, inplace=True)

    df_train.fillna(0,inplace=True)
    df_test.fillna(0,inplace=True)

    
    
    # 组合特征
    # voice_all_start_time_first
    # voice_all_start_time_last 
    # voice_all_end_time_first
    # voice_all_end_time_last
    
    # sms_all_start_time_last
    # sms_all_start_time_first
    for sms_time in ['sms_all_start_time_first', 'sms_all_start_time_last']:
        df_train[sms_time] = df_train[sms_time].astype(int)
        df_test[sms_time] = df_test[sms_time].astype(int)
        
        for voice_time in ['voice_all_start_time_first','voice_all_start_time_last','voice_all_end_time_first','voice_all_end_time_last']:
            df_train[voice_time] = df_train[voice_time].astype(int)
            df_test[voice_time] = df_test[voice_time].astype(int)
        
            df_train[sms_time+'_'+voice_time] = get_diff_time(df_train[[sms_time,voice_time]].values)
            df_test[sms_time+'_'+voice_time] = get_diff_time(df_test[[sms_time,voice_time]].values)
            
            # 除以对应的不同的次数，
            df_train[sms_time+'_'+voice_time+'_voice_rate'] = df_train[sms_time+'_'+voice_time] / df_train['voice_all_cnt']
            df_test[sms_time+'_'+voice_time+'_voice_rate'] = df_test[sms_time+'_'+voice_time] / df_test['voice_all_cnt']
    
            df_train[sms_time+'_'+voice_time+'_sms_rate'] = df_train[sms_time+'_'+voice_time] / df_train['sms_all_cnt']
            df_test[sms_time+'_'+voice_time+'_sms_rate'] = df_test[sms_time+'_'+voice_time] / df_test['sms_all_cnt']
    
    df_train['wa_all_wa_name_little_wite_risk'] = df_train['wa_all_wa_name_little_wite_risk'].astype(int)
    df_train['wa_all_wa_name_many_wite_risk'] = df_train['wa_all_wa_name_many_wite_risk'].astype(int)
    
    df_test['wa_all_wa_name_little_wite_risk'] = df_test['wa_all_wa_name_little_wite_risk'].astype(int)
    df_test['wa_all_wa_name_many_wite_risk'] = df_test['wa_all_wa_name_many_wite_risk'].astype(int)

    
    return df_train, df_test

    
    
def split_train_valid(df_train,test_size=0.2):
    '''
    k-fold交叉验证,默认k=10
    df_train:训练数据
    '''
    X_train, X_vali, y_train, y_vali = train_test_split(df_train[features], df_train[label], test_size=test_size, random_state=40000)
    #added some parameters
    
#     dtrain = df_train.iloc[train_list]
#     dvali =  df_train.iloc[vali_list]
    
    dtrain = xgb.DMatrix(X_train,label=y_train)
    dvalid = xgb.DMatrix(X_vali,label=y_vali)
    watchlist = [(dtrain, 'train'),(dvalid, 'valid')]
    
    return dtrain, dvalid, watchlist

In [39]:
# 设置特征数据，去除id数据，不能进行预测
df_train, df_test = get_data(add_data=True)

df_train.info()
df_test.info()

features = df_test.columns
features = list(features)
features.remove('uid')
features.remove('wa_all_wa_name_little_wite')
features.remove('wa_all_wa_name_many_wite')

label = 'label'

print len(features)

features_ = open('features.txt','r').readlines()
features_ = [feat.strip() for feat in features_]

# features = list(set(features) - set(features_))
print len(features)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5099 entries, 0 to 99
Columns: 2065 entries, label to sms_all_start_time_last_voice_all_end_time_last_sms_rate
dtypes: float64(2043), int64(19), object(3)
memory usage: 80.4+ MB
<class 'pandas.core.frame.DataFrame'>
Int64Index: 3000 entries, 0 to 2999
Columns: 2064 entries, uid to sms_all_start_time_last_voice_all_end_time_last_sms_rate
dtypes: float64(2043), int64(18), object(3)
memory usage: 47.3+ MB
2061
2061


In [40]:
df_train['label'].value_counts()

0    4099
1    1000
Name: label, dtype: int64

## 组合特征
1. 组合时间差值，组合比例计算，voice,sms,wa等三种情况


In [41]:
xgb17_first_step_models = []
rf7_first_step_models = []
gbdt_first_step_models = []
lr_model = None

def get_xgb17_first_step_model(df_train):
    '''
    stacking的第一层，使用17个不同的seed得到15维特征，返回15个模型
    
    '''
    seed = 71
    np.random.seed(seed)

    param = {'max_depth':3, # 基准是5 
         'eta':0.05,
         'gamma ':0.1,
         'colsample_bytree':0.8, # old 0.8
         'subsample':0.8,
         'silent':1,
         'eval_metric':'auc',
         'objective':'binary:logistic',
#          'scale_pos_weight':5,
         'seed': seed
        }

    nround = 150 # 参数求的
    models = []
    seeds = [71,73,91,101,2017,2018,2019,2020,10003,100007,100009,20003,200005,12345,123456]
    for i in range(len(seeds)):
        print('LOOP',i)
        dbuild, dvalid, watchlist = split_train_valid(df_train,test_size=0.002)
        param['seed'] = seeds[i]
        model = xgb.train(param, dbuild, nround, watchlist,verbose_eval=20)
        models.append(model)
        
        # VALID
#         valid_yhat = model.predict(dvalid,ntree_limit=model.best_iteration)
#         print('Valid Mean:---------------------->', np.mean(valid_yhat))
        del dbuild, dvalid, watchlist
    return models


def get_lr_first_step_model(df_train):
    '''
    得到3个lr model
    '''
    pass
    


In [44]:
seed = 71
np.random.seed(seed)
param = {'max_depth':3, # 基准是5,过拟合 
         'eta':0.05,
         'gamma ':0.1,
         'colsample_bytree':0.8, # old 0.8
         'subsample':0.8,
         'silent':1,
         'eval_metric':'auc',
         'objective':'binary:logistic',
#          'scale_pos_weight':5,
         'seed': seed
        }

ESR = 50
nround = 1000

In [55]:
LOOP = 3
models = []
seeds = [71,73,91]
for i in range(LOOP):
    print('LOOP',i)
#     dbuild, dvalid, watchlist = split_build_valid(df_train)
    dbuild, dvalid, watchlist = split_train_valid(df_train,test_size=0.2)
    param['seed'] = seeds[i]
    model = xgb.train(param, dbuild, nround, watchlist,early_stopping_rounds=ESR,verbose_eval=20)
    models.append(model)
#     model.save_model('./model1'+ str(i) + '.model')
    # VALID
    valid_yhat = model.predict(dvalid,ntree_limit=model.best_iteration)
    print('Valid Mean:---------------------->', np.mean(valid_yhat))
    del dbuild, dvalid, watchlist


('LOOP', 0)
[0]	train-auc:0.855825	valid-auc:0.806281
Multiple eval metrics have been passed: 'valid-auc' will be used for early stopping.

Will train until valid-auc hasn't improved in 50 rounds.
[20]	train-auc:0.944514	valid-auc:0.901794
[40]	train-auc:0.959609	valid-auc:0.915949
[60]	train-auc:0.97087	valid-auc:0.921351
[80]	train-auc:0.979391	valid-auc:0.923564
[100]	train-auc:0.985777	valid-auc:0.923678
[120]	train-auc:0.991162	valid-auc:0.922394
Stopping. Best iteration:
[88]	train-auc:0.98212	valid-auc:0.924505

('Valid Mean:---------------------->', 0.1892371)
('LOOP', 1)
[0]	train-auc:0.854012	valid-auc:0.807433
Multiple eval metrics have been passed: 'valid-auc' will be used for early stopping.

Will train until valid-auc hasn't improved in 50 rounds.
[20]	train-auc:0.944019	valid-auc:0.898056
[40]	train-auc:0.959275	valid-auc:0.918065
[60]	train-auc:0.970564	valid-auc:0.921858
[80]	train-auc:0.979583	valid-auc:0.921954
[100]	train-auc:0.986048	valid-auc:0.922551
[120]	train-

In [47]:
dtest  = xgb.DMatrix(df_test[features])
proba_test = pd.DataFrame()
proba_test['uid'] = df_test['uid']
proba_test['score'] = [0 for i in range(len(df_test))]
for model in models:
    proba_test['score'] += model.predict(dtest)
proba_test['score'] /= LOOP

proba_test = proba_test.sort_values('score',ascending=False)
proba_test['label'] = [0 for i in range(len(proba_test))]



In [48]:
# proba_test[:500]

In [49]:
proba_test.loc[proba_test['score']>0.36, 'label'] = 1
proba_test[['uid','label']].to_csv('../result/xresultB_1.csv',index=False,header=False)

In [50]:

proba_test['label'].value_counts()

0    2481
1     519
Name: label, dtype: int64

In [51]:
# 计算特征重要程度
import operator
importance = model.get_fscore()
importance = sorted(importance.items(), key=operator.itemgetter(1),reverse=True)

print len(importance)




906


['voice_all_cnt',
 'voice_all_unique_cnt',
 'voice_all_per_opp_rate',
 'voice_all_opp_head_unique_cnt',
 'voice_all_opp_head_little_head',
 'voice_all_opp_head_many_head',
 'voice_all_opp_head_many_head_cnt',
 'voice_all_opp_head_many_head_cnt_rate',
 'voice_all_opp_head_many_head_cnt_rate_unique',
 'voice_all_opp_len_3',
 'voice_all_opp_len_5',
 'voice_all_opp_len_6',
 'voice_all_opp_len_7',
 'voice_all_opp_len_8',
 'voice_all_opp_len_9',
 'voice_all_opp_len_10',
 'voice_all_opp_len_11',
 'voice_all_opp_len_12',
 'voice_all_opp_len_13',
 'voice_all_opp_len_14',
 'voice_all_opp_len_15',
 'voice_all_opp_len_16',
 'voice_all_opp_len_17',
 'voice_all_opp_len_19',
 'voice_all_opp_len_20',
 'voice_all_opp_len_11_rate',
 'voice_all_opp_len_little_head',
 'voice_all_opp_len_many_head',
 'voice_all_call_type_1',
 'voice_all_call_type_2',
 'voice_all_call_type_3',
 'voice_all_call_type_4',
 'voice_all_call_type_1_rate',
 'voice_all_call_type_2_rate',
 'voice_all_call_type_3_rate',
 'voice_all_c

In [52]:
output = open('xfeatures906.txt','w')

for feat,k in importance:
    output.write("%s\n" % feat)

output.close()


In [54]:
used_feat = [feat for feat,k in importance]
no_userd_feat = set(features) - set(used_feat)

output = open('no_xfeatures906.txt','w')
for feat in no_userd_feat:
    output.write("%s\n" % feat)

output.close()

