In [1]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, f1_score

import xgboost as xgb

In [2]:
def diff_time(a,b):
    a_day, a_hour, a_minute, a_second = (a / 1000000, a / 10000 % 100, a / 100 % 100, a % 100)
    b_day, b_hour, b_minute, b_second = (b / 1000000, b / 10000 % 100, b / 100 % 100, b % 100)
    
    d_day = b_day - a_day
    d_hour = b_hour - a_hour
    d_minute = b_minute - a_minute
    d_second = b_second - a_second
    
    diff = d_day * 24 * 60 * 60 + d_hour * 60 * 60 + d_minute * 60 + d_second
    return diff

def get_diff_time(x):
    diff_t = []
    for d in x:
#         print d
        diff_t.append(diff_time(d[0],d[1]))
    return diff_t

In [3]:
def get_data(add_data=False):
    df_train_voice_feat = pd.read_csv('../xdata/df_train_voice_feat.csv')
    df_test_voice_feat = pd.read_csv('../xdata/df_testB_voice_feat.csv')
    if add_data==True:
        df_trainA = pd.read_csv('../xdata/df_testA_label.csv')
        df_trainA_voice_feat = pd.read_csv('../xdata/df_testA_voice_feat.csv')
        
    
    df_train_sms_feat = pd.read_csv('../xdata/df_train_sms_feat.csv')
    df_test_sms_feat = pd.read_csv('../xdata/df_testB_sms_feat.csv')
    if add_data==True:
        df_trainA_sms_feat = pd.read_csv('../xdata/df_testA_sms_feat.csv')
    
    df_train_sms_feat.drop('label',axis=1,inplace=True)
    
    df_train_wa_feat = pd.read_csv('../xdata/df_train_wa_feat.csv')
    df_test_wa_feat = pd.read_csv('../xdata/df_testB_wa_feat.csv')
    if add_data==True:
        df_trainA_wa_feat = pd.read_csv('../xdata/df_testA_wa_feat.csv')
    df_train_wa_feat.drop('label',axis=1,inplace=True)
    
#     df_train_voice_sms_feat = pd.read_csv('../xdata/df_train_voice_sms_feat.csv')
#     df_test_voice_sms_feat = pd.read_csv('../xdata/df_testB_voice_sms_feat.csv')
#     df_train_voice_sms_feat.drop('label',axis=1,inplace=True)
    
    
    df_train = pd.merge(df_train_voice_feat, df_train_sms_feat, on='uid', how='left')
    df_test = pd.merge(df_test_voice_feat, df_test_sms_feat, on='uid', how='left')
    
    df_train = pd.merge(df_train, df_train_wa_feat, on='uid', how='left')
    df_test = pd.merge(df_test, df_test_wa_feat, on='uid', how='left')
    
#     df_train = pd.merge(df_train, df_train_voice_sms_feat, on='uid', how='left')
#     df_test = pd.merge(df_test, df_test_voice_sms_feat, on='uid', how='left')
    
    
    if add_data==True:
        df_trainA = pd.merge(df_trainA, df_trainA_voice_feat, on='uid', how='left')
        df_trainA = pd.merge(df_trainA, df_trainA_sms_feat, on='uid', how='left')
        df_trainA = pd.merge(df_trainA, df_trainA_wa_feat, on='uid', how='left')
#         df_trainA = df_trainA[df_trainA['label']==1]
        df_trainA = df_trainA[:100]
        
        
        df_train = df_train.append(df_trainA)
    
        
    df_train.replace([np.inf,-np.inf], 0, inplace=True)
    df_test.replace([np.inf,-np.inf], 0, inplace=True)

    df_train.fillna(0,inplace=True)
    df_test.fillna(0,inplace=True)

    
    
    # 组合特征
    # voice_all_start_time_first
    # voice_all_start_time_last 
    # voice_all_end_time_first
    # voice_all_end_time_last
    
    # sms_all_start_time_last
    # sms_all_start_time_first
    for sms_time in ['sms_all_start_time_first', 'sms_all_start_time_last']:
        df_train[sms_time] = df_train[sms_time].astype(int)
        df_test[sms_time] = df_test[sms_time].astype(int)
        
        for voice_time in ['voice_all_start_time_first','voice_all_start_time_last','voice_all_end_time_first','voice_all_end_time_last']:
            df_train[voice_time] = df_train[voice_time].astype(int)
            df_test[voice_time] = df_test[voice_time].astype(int)
        
            df_train[sms_time+'_'+voice_time] = get_diff_time(df_train[[sms_time,voice_time]].values)
            df_test[sms_time+'_'+voice_time] = get_diff_time(df_test[[sms_time,voice_time]].values)
            
            # 除以对应的不同的次数，
            df_train[sms_time+'_'+voice_time+'_voice_rate'] = df_train[sms_time+'_'+voice_time] / df_train['voice_all_cnt']
            df_test[sms_time+'_'+voice_time+'_voice_rate'] = df_test[sms_time+'_'+voice_time] / df_test['voice_all_cnt']
    
            df_train[sms_time+'_'+voice_time+'_sms_rate'] = df_train[sms_time+'_'+voice_time] / df_train['sms_all_cnt']
            df_test[sms_time+'_'+voice_time+'_sms_rate'] = df_test[sms_time+'_'+voice_time] / df_test['sms_all_cnt']
    
    df_train['wa_all_wa_name_little_wite_risk'] = df_train['wa_all_wa_name_little_wite_risk'].astype(int)
    df_train['wa_all_wa_name_many_wite_risk'] = df_train['wa_all_wa_name_many_wite_risk'].astype(int)
    
    df_test['wa_all_wa_name_little_wite_risk'] = df_test['wa_all_wa_name_little_wite_risk'].astype(int)
    df_test['wa_all_wa_name_many_wite_risk'] = df_test['wa_all_wa_name_many_wite_risk'].astype(int)

    df_train.replace([np.inf,-np.inf], 0, inplace=True)
    df_test.replace([np.inf,-np.inf], 0, inplace=True)

    df_train.fillna(0,inplace=True)
    df_test.fillna(0,inplace=True)

    
    return df_train, df_test

    
    
def split_train_valid(df_train,test_size=0.2):
    '''
    k-fold交叉验证,默认k=10
    df_train:训练数据
    '''
    X_train, X_vali, y_train, y_vali = train_test_split(df_train[features], df_train[label], test_size=test_size, random_state=40000)
    #added some parameters
    
#     dtrain = df_train.iloc[train_list]
#     dvali =  df_train.iloc[vali_list]
    
    dtrain = xgb.DMatrix(X_train,label=y_train)
    dvalid = xgb.DMatrix(X_vali,label=y_vali)
    watchlist = [(dtrain, 'train'),(dvalid, 'valid')]
    
    return dtrain, dvalid, watchlist

In [4]:
# 设置特征数据，去除id数据，不能进行预测
df_train, df_test = get_data(add_data=True)

df_train.info()
df_test.info()

features = df_test.columns
features = list(features)
features.remove('uid')
features.remove('wa_all_wa_name_little_wite')
features.remove('wa_all_wa_name_many_wite')

label = 'label'

print len(features)

features_ = open('no_xfeatures906.txt','r').readlines()
features_ = [feat.strip() for feat in features_]

print len(features_)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5099 entries, 0 to 99
Columns: 2065 entries, label to sms_all_start_time_last_voice_all_end_time_last_sms_rate
dtypes: float64(2043), int64(19), object(3)
memory usage: 80.4+ MB
<class 'pandas.core.frame.DataFrame'>
Int64Index: 3000 entries, 0 to 2999
Columns: 2064 entries, uid to sms_all_start_time_last_voice_all_end_time_last_sms_rate
dtypes: float64(2043), int64(18), object(3)
memory usage: 47.3+ MB
2061
1155


In [27]:
pca = PCA(n_components=100)
train_pac = pca.fit_transform(df_train[features_])

test_pac = pca.fit_transform(df_test[features_])


In [28]:
pca_feat = []
for i in range(train_pac.shape[1]):
    df_train['PCA_'+ str(i)] =  train_pac[:,1]
    df_test['PCA_' + str(i)] = test_pac[:,1]
    pca_feat.append('PCA_' + str(i))
    

In [29]:
features = open('xfeatures906.txt','r').readlines()
features = [feat.strip() for feat in features_]

# features = features + pca_feat


features = pca_feat

In [30]:
features[-100:]

['PCA_0',
 'PCA_1',
 'PCA_2',
 'PCA_3',
 'PCA_4',
 'PCA_5',
 'PCA_6',
 'PCA_7',
 'PCA_8',
 'PCA_9',
 'PCA_10',
 'PCA_11',
 'PCA_12',
 'PCA_13',
 'PCA_14',
 'PCA_15',
 'PCA_16',
 'PCA_17',
 'PCA_18',
 'PCA_19',
 'PCA_20',
 'PCA_21',
 'PCA_22',
 'PCA_23',
 'PCA_24',
 'PCA_25',
 'PCA_26',
 'PCA_27',
 'PCA_28',
 'PCA_29',
 'PCA_30',
 'PCA_31',
 'PCA_32',
 'PCA_33',
 'PCA_34',
 'PCA_35',
 'PCA_36',
 'PCA_37',
 'PCA_38',
 'PCA_39',
 'PCA_40',
 'PCA_41',
 'PCA_42',
 'PCA_43',
 'PCA_44',
 'PCA_45',
 'PCA_46',
 'PCA_47',
 'PCA_48',
 'PCA_49',
 'PCA_50',
 'PCA_51',
 'PCA_52',
 'PCA_53',
 'PCA_54',
 'PCA_55',
 'PCA_56',
 'PCA_57',
 'PCA_58',
 'PCA_59',
 'PCA_60',
 'PCA_61',
 'PCA_62',
 'PCA_63',
 'PCA_64',
 'PCA_65',
 'PCA_66',
 'PCA_67',
 'PCA_68',
 'PCA_69',
 'PCA_70',
 'PCA_71',
 'PCA_72',
 'PCA_73',
 'PCA_74',
 'PCA_75',
 'PCA_76',
 'PCA_77',
 'PCA_78',
 'PCA_79',
 'PCA_80',
 'PCA_81',
 'PCA_82',
 'PCA_83',
 'PCA_84',
 'PCA_85',
 'PCA_86',
 'PCA_87',
 'PCA_88',
 'PCA_89',
 'PCA_90',
 'PCA_91'

In [31]:
len(features)
seed = 71
np.random.seed(seed)
param = {'max_depth':3, # 基准是5,过拟合 
         'eta':0.05,
         'gamma ':0.1,
         'colsample_bytree':0.8, # old 0.8
         'subsample':0.8,
         'silent':1,
         'eval_metric':'auc',
         'objective':'binary:logistic',
#          'scale_pos_weight':5,
         'seed': seed
        }

ESR = 50
nround = 1000

In [32]:
LOOP = 3
models = []
seeds = [71,73,91]
for i in range(LOOP):
    print('LOOP',i)
#     dbuild, dvalid, watchlist = split_build_valid(df_train)
    dbuild, dvalid, watchlist = split_train_valid(df_train,test_size=0.2)
    param['seed'] = seeds[i]
    model = xgb.train(param, dbuild, nround, watchlist,early_stopping_rounds=ESR,verbose_eval=20)
    models.append(model)
#     model.save_model('./model1'+ str(i) + '.model')
    # VALID
    valid_yhat = model.predict(dvalid,ntree_limit=model.best_iteration)
    print('Valid Mean:---------------------->', np.mean(valid_yhat))
    del dbuild, dvalid, watchlist


('LOOP', 0)
[0]	train-auc:0.5	valid-auc:0.5
Multiple eval metrics have been passed: 'valid-auc' will be used for early stopping.

Will train until valid-auc hasn't improved in 50 rounds.
[20]	train-auc:0.5	valid-auc:0.5
[40]	train-auc:0.5	valid-auc:0.5
Stopping. Best iteration:
[0]	train-auc:0.5	valid-auc:0.5

('Valid Mean:---------------------->', 0.21824902)
('LOOP', 1)
[0]	train-auc:0.5	valid-auc:0.5
Multiple eval metrics have been passed: 'valid-auc' will be used for early stopping.

Will train until valid-auc hasn't improved in 50 rounds.
[20]	train-auc:0.5	valid-auc:0.5
[40]	train-auc:0.5	valid-auc:0.5
Stopping. Best iteration:
[0]	train-auc:0.5	valid-auc:0.5

('Valid Mean:---------------------->', 0.21743053)
('LOOP', 2)
[0]	train-auc:0.5	valid-auc:0.5
Multiple eval metrics have been passed: 'valid-auc' will be used for early stopping.

Will train until valid-auc hasn't improved in 50 rounds.
[20]	train-auc:0.5	valid-auc:0.5
[40]	train-auc:0.5	valid-auc:0.5
Stopping. Best iterat

In [18]:
# 计算特征重要程度
import operator
importance = model.get_fscore()
importance = sorted(importance.items(), key=operator.itemgetter(1),reverse=True)

print importance




[('voice_all_end_time_min', 34), ('sms_all_start_time_last_voice_all_end_time_last_sms_rate', 25), ('sms_all_start_time_day_33', 22), ('sms_all_start_time_last_voice_all_end_time_first_voice_rate', 22), ('sms_all_start_time_first_voice_all_end_time_first', 17), ('voice_all_diff_time_jc', 17), ('sms_all_start_time_hour_22', 17), ('voice_all_opp_len_9', 13), ('sms_all_start_time_hour_15_rate', 12), ('voice_all_in_out_1', 11), ('wa_all_visit_cnt_wa_type_0_std', 11), ('wa_all_down_flow_date_11_sum_rate', 10), ('sms_all_start_time_hour_9_rate', 10), ('voice_all_end_time_hour_22', 10), ('sms_all_opp_len_10', 9), ('wa_all_visit_dura_fd', 9), ('sms_all_start_time_first_voice_all_end_time_last', 9), ('sms_all_start_time_hour_11', 9), ('sms_all_start_time_minute_52_rate', 9), ('voice_all_start_end_time_day_37', 9), ('voice_all_start_time_hour_5', 9), ('sms_all_start_time_minute_0_rate', 9), ('voice_all_end_time_hour_11', 9), ('sms_all_start_time_first_voice_all_end_time_last_sms_rate', 9), ('sms