In [1]:
import numpy as np
import pandas as pd
import scipy as sp
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, f1_score

import xgboost as xgb

### 使用数据集A增加训练数据

1. 使用前200个作为label 1增加训练数据的征集
2. 因为训练的时候出现了过拟合问题
3. 使用小的深度和小的迭代次数，避免过拟合

In [2]:
def diff_time(a,b):
    a_day, a_hour, a_minute, a_second = (a / 1000000, a / 10000 % 100, a / 100 % 100, a % 100)
    b_day, b_hour, b_minute, b_second = (b / 1000000, b / 10000 % 100, b / 100 % 100, b % 100)
    
    d_day = b_day - a_day
    d_hour = b_hour - a_hour
    d_minute = b_minute - a_minute
    d_second = b_second - a_second
    
    diff = d_day * 24 * 60 * 60 + d_hour * 60 * 60 + d_minute * 60 + d_second
    return diff

def get_diff_time(x):
    diff_t = []
    for d in x:
#         print d
        diff_t.append(diff_time(d[0],d[1]))
    return diff_t

In [9]:
def get_data(Type='B'):
    df_train_voice_feat = pd.read_csv('../xdata/df_train_voice_feat.csv')
    if Type=='B':
        df_test_voice_feat = pd.read_csv('../xdata/df_testB_voice_feat.csv')
    else:
        df_test_voice_feat = pd.read_csv('../xdata/df_testA_voice_feat.csv')
        
    df_train_sms_feat = pd.read_csv('../xdata/df_train_sms_feat.csv')
    if Type=='B':
        df_test_sms_feat = pd.read_csv('../xdata/df_testB_sms_feat.csv')
    else:
        df_test_sms_feat = pd.read_csv('../xdata/df_testA_sms_feat.csv')
        
    df_train_sms_feat.drop('label',axis=1,inplace=True)
    
    df_train_wa_feat = pd.read_csv('../xdata/df_train_wa_feat.csv')
    if Type=='B':
        df_test_wa_feat = pd.read_csv('../xdata/df_testB_wa_feat.csv')
    else:
        df_test_wa_feat = pd.read_csv('../xdata/df_testA_wa_feat.csv')
        
    df_train_wa_feat.drop('label',axis=1,inplace=True)
    
#     df_train_voice_sms_feat = pd.read_csv('../xdata/df_train_voice_sms_feat.csv')
#     df_test_voice_sms_feat = pd.read_csv('../xdata/df_testB_voice_sms_feat.csv')
#     df_train_voice_sms_feat.drop('label',axis=1,inplace=True)
    
    
    df_train = pd.merge(df_train_voice_feat, df_train_sms_feat, on='uid', how='left')
    df_test = pd.merge(df_test_voice_feat, df_test_sms_feat, on='uid', how='left')
    
    df_train = pd.merge(df_train, df_train_wa_feat, on='uid', how='left')
    df_test = pd.merge(df_test, df_test_wa_feat, on='uid', how='left')
    
#     df_train = pd.merge(df_train, df_train_voice_sms_feat, on='uid', how='left')
#     df_test = pd.merge(df_test, df_test_voice_sms_feat, on='uid', how='left')
    
    
    df_train.replace([np.inf,-np.inf], 0, inplace=True)
    df_test.replace([np.inf,-np.inf], 0, inplace=True)

    df_train.fillna(0,inplace=True)
    df_test.fillna(0,inplace=True)

    
    # 组合特征
    # voice_all_start_time_first
    # voice_all_start_time_last 
    # voice_all_end_time_first
    # voice_all_end_time_last
    
    # sms_all_start_time_last
    # sms_all_start_time_first
    for sms_time in ['sms_all_start_time_first', 'sms_all_start_time_last']:
        df_train[sms_time] = df_train[sms_time].astype(int)
        df_test[sms_time] = df_test[sms_time].astype(int)
        
        for voice_time in ['voice_all_start_time_first','voice_all_start_time_last','voice_all_end_time_first','voice_all_end_time_last']:
            df_train[voice_time] = df_train[voice_time].astype(int)
            df_test[voice_time] = df_test[voice_time].astype(int)
        
            df_train[sms_time+'_'+voice_time] = get_diff_time(df_train[[sms_time,voice_time]].values)
            df_test[sms_time+'_'+voice_time] = get_diff_time(df_test[[sms_time,voice_time]].values)
            
            # 除以对应的不同的次数，
            df_train[sms_time+'_'+voice_time+'_voice_rate'] = df_train[sms_time+'_'+voice_time] / df_train['voice_all_cnt']
            df_test[sms_time+'_'+voice_time+'_voice_rate'] = df_test[sms_time+'_'+voice_time] / df_test['voice_all_cnt']
    
            df_train[sms_time+'_'+voice_time+'_sms_rate'] = df_train[sms_time+'_'+voice_time] / df_train['sms_all_cnt']
            df_test[sms_time+'_'+voice_time+'_sms_rate'] = df_test[sms_time+'_'+voice_time] / df_test['sms_all_cnt']
    
    return df_train, df_test
    
def split_train_valid(df_train,test_size=0.2):
    '''
    k-fold交叉验证,默认k=10
    df_train:训练数据
    '''
    X_train, X_vali, y_train, y_vali = train_test_split(df_train[features], df_train[label], test_size=test_size, random_state=40000)
    #added some parameters
    
#     dtrain = df_train.iloc[train_list]
#     dvali =  df_train.iloc[vali_list]
    
    dtrain = xgb.DMatrix(X_train,label=y_train)
    dvalid = xgb.DMatrix(X_vali,label=y_vali)
    watchlist = [(dtrain, 'train'),(dvalid, 'valid')]
    
    return dtrain, dvalid, watchlist

In [10]:
# 设置特征数据，去除id数据，不能进行预测
df_train, df_test = get_data(Type='A')

df_test.info()

features = df_test.columns
features = list(features)
features.remove('uid')
features.remove('wa_all_wa_name_little_wite')
features.remove('wa_all_wa_name_many_wite')

label = 'label'

print len(features)

features_ = open('features.txt','r').readlines()
features_ = [feat.strip() for feat in features_]

# features = list(set(features) - set(features_))
print len(features)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2000 entries, 0 to 1999
Columns: 2062 entries, uid to sms_all_start_time_last_voice_all_end_time_last_sms_rate
dtypes: float64(2043), int64(16), object(3)
memory usage: 31.5+ MB
2059
2059


In [12]:
seed = 71
np.random.seed(seed)
param = {'max_depth':5, # 基准是5 
         'eta':0.05,
         'gamma ':0.1,
         'colsample_bytree':0.8, # old 0.8
         'subsample':0.8,
         'silent':1,
         'eval_metric':'auc',
         'objective':'binary:logistic',
#          'scale_pos_weight':5,
         'seed': seed
        }

ESR = 50
nround = 1000

In [14]:
LOOP = 3
models = []
seeds = [71,73,91]
for i in range(LOOP):
    print('LOOP',i)
#     dbuild, dvalid, watchlist = split_build_valid(df_train)
    dbuild, dvalid, watchlist = split_train_valid(df_train,test_size=0.002)
    param['seed'] = seeds[i]
    nround = 120
    model = xgb.train(param, dbuild, nround, watchlist,verbose_eval=20)
    models.append(model)
#     model.save_model('./model1'+ str(i) + '.model')
    # VALID
    valid_yhat = model.predict(dvalid,ntree_limit=model.best_iteration)
    print('Valid Mean:---------------------->', np.mean(valid_yhat))
    del dbuild, dvalid, watchlist


('LOOP', 0)
[0]	train-auc:0.882435	valid-auc:0.388889
[20]	train-auc:0.958623	valid-auc:0.444444
[40]	train-auc:0.973491	valid-auc:0.444444
[60]	train-auc:0.982756	valid-auc:0.555556
[80]	train-auc:0.990181	valid-auc:0.555556
[100]	train-auc:0.995221	valid-auc:0.555556
[119]	train-auc:0.997773	valid-auc:0.666667
('Valid Mean:---------------------->', 0.06434512)
('LOOP', 1)
[0]	train-auc:0.893475	valid-auc:0.444444
[20]	train-auc:0.960427	valid-auc:0.555556
[40]	train-auc:0.97352	valid-auc:0.333333
[60]	train-auc:0.982788	valid-auc:0.555556
[80]	train-auc:0.990651	valid-auc:0.555556
[100]	train-auc:0.994992	valid-auc:0.555556
[119]	train-auc:0.997536	valid-auc:0.555556
('Valid Mean:---------------------->', 0.06812062)
('LOOP', 2)
[0]	train-auc:0.88957	valid-auc:0.388889
[20]	train-auc:0.959817	valid-auc:0.111111
[40]	train-auc:0.975102	valid-auc:0.555556
[60]	train-auc:0.984706	valid-auc:0.555556
[80]	train-auc:0.991476	valid-auc:0.555556
[100]	train-auc:0.995489	valid-auc:0.555556
[1

In [38]:
dtest  = xgb.DMatrix(df_test[features])
proba_test = pd.DataFrame()
proba_test['uid'] = df_test['uid']
proba_test['score'] = [0 for i in range(len(df_test))]
for model in models:
    proba_test['score'] += model.predict(dtest)
proba_test['score'] /= LOOP

proba_test = proba_test.sort_values('score',ascending=False)
proba_test['label'] = [0 for i in range(len(proba_test))]

# proba_test[:200]

In [39]:
proba_test.loc[proba_test['score']>0.4,'label'] = 1


In [40]:
proba_test['label'].value_counts()

0    1792
1     208
Name: label, dtype: int64

In [41]:

proba_test[['uid','label']].to_csv('../xdata/df_testA_label.csv',index=False)