In [6]:
import pandas as pd
import numpy as np
import scipy as sp
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, f1_score

from sklearn.utils import shuffle  


## ensemble model
### model
1. RF x 7
2. GBDT x 11
3. XGB x 17

### feature
1. all features
2. used features
3. no features

### data
1. 3 + 2
2. second layer --> LR x 1 or [XGB x 3 (AVG)]

In [7]:
# 得到模型的只，
def get_x_rf(df_train, feats, label):
    '''
    使用9种不同的参数的models
    '''
    seed = 17
    np.random.seed(seed)

    x_n_estimators = [20,100,500]
    max_depth = 5
    min_samples_split = 3
    verbose = 10
    x_random_states = [71, 91, 101]
    
    rf_models = []
    data_res = {}
    for i in range(len(x_n_estimators)):
        for j in range(len(x_random_states)):
            print i,j
            clf = RandomForestClassifier(max_depth=max_depth, random_state=x_random_states[j],
                                         min_samples_split=min_samples_split, 
                                         n_estimators=x_n_estimators[i],verbose=10)    
            clf.fit(df_train[feats],df_train[label])
            rf_model.append(clf)
#             prob = clf.predict_proba(df_test[feats])
#             data_rea['RF_%d_%d_PROB' % (i,j)] = prob
    
    return rf_models
    

    
    

In [8]:
def split_train_valid(df_train,features,label,test_size=0.2):
    '''
    k-fold交叉验证,默认k=10
    df_train:训练数据
    '''
    X_train, X_vali, y_train, y_vali = train_test_split(df_train[features], df_train[label], test_size=test_size, random_state=40000)
    #added some parameters
        
    dtrain = xgb.DMatrix(X_train,label=y_train)
    dvalid = xgb.DMatrix(X_vali,label=y_vali)
    watchlist = [(dtrain, 'train'),(dvalid, 'valid')]
    
    return dtrain, dvalid, watchlist

# 得到模型的只，
def get_x_gbdt(df_train, feats, label):
    '''
    使用7种不同的参数
    '''
    seed = 17
    np.random.seed(seed)

    learning_rate = 0.05
    n_estimators = 180
    max_depth = 3
    min_samples_split = 2
    subsample = 0.85
    verbose = 50
    x_random_states = [71,91,101,2018,2019,1007,2020]
    
    gbdt_models = []
    data_res = {}
    for j in range(len(x_random_states)):
        print j
        clf = GradientBoostingClassifier(max_depth=max_depth, random_state=x_random_states[j],
                                         min_samples_split=min_samples_split,learning_rate=learning_rate,
                                         n_estimators=n_estimators,verbose=verbose)    
        clf.fit(df_train[feats],df_train[label])
        gbdt_models.append(clf)
#         prob = clf.predict_proba(df_test[feats])
#         data_rea['GBDT_%d_PROB' % (j)] = prob
    
    return gbdt_models
    

    

In [9]:
def get_x_xgb(df_train, feats, label):
    '''
    stacking的第一层，使用15个不同的seed得到15维特征，返回15个模型
    
    '''
    seed = 17
    np.random.seed(seed)
    param = {'max_depth':3, # 基准是5 
         'eta':0.05,
         'gamma ':0.1,
         'colsample_bytree':0.85, # old 0.8
         'subsample':0.85,
         'silent':1,
         'eval_metric':'auc',
         'objective':'binary:logistic',
        }

    nround = 150 # 参数求的
    xgb_models = []
    seeds = [71,73,91,101,2017,2018,2019,2020,10003,100007,100009,20003,200005,12345,123456]
    for i in range(len(seeds)):
        print('LOOP',i)
        dbuild, dvalid, watchlist = split_train_valid(df_train, feats, test_size=0.002)
        param['seed'] = seeds[i]
        model = xgb.train(param, dbuild, nround, watchlist,verbose_eval=20)
        xgb_models.append(model)
        del dbuild, dvalid, watchlist
        
    return xgb_models



In [10]:
def get_x_xgb_second(df_train, feats, label):
    '''
    stacking的第一层，使用15个不同的seed得到15维特征，返回15个模型
    
    '''
    seed = 17
    np.random.seed(seed)
    param = {'max_depth':3, # 基准是5 
         'eta':0.05,
         'gamma ':0.1,
         'colsample_bytree':0.85, # old 0.8
         'subsample':0.85,
         'silent':1,
         'eval_metric':'auc',
         'objective':'binary:logistic',
        }
    
    nround = 100 # 参数求的
    xgb_models = []
    seeds = [71,73,91]
    for i in range(len(seeds)):
        print('LOOP',i)
        dbuild, dvalid, watchlist = split_train_valid(df_train,feats,label=label, test_size=0.002)
        param['seed'] = seeds[i]
        model = xgb.train(param, dbuild, nround, watchlist,verbose_eval=20)
        xgb_models.append(model)
        del dbuild, dvalid, watchlist
        
    return xgb_models



In [11]:
def get_all_models(df_train, feats, label='label'):
    rf_models = get_x_rf(df_train, feats, label)
    gbdt_models = get_x_gbdt(df_train, feats, label)
    
    rf_models = []
    gbdt_models = []
    
    
    xgb_models = get_x_xgb(df_train, feats, label)
    
#     return rf_models, gbdt_models, xgb_models  
    return rf_models, gbdt_models, xgb_models

def one_models_predict(df_data, feats, models, model_type):
    data = {'uid':df_data['uid']}
    for i in range(len(models)):
        if model_type == 'rf':
            prob = models[i].predeict(df_data[feats])
            data['rf_%d_prob' % i] = prob
        elif model_type == 'gbdt':
            prob = models[i].predeict(df_data[feats])
            data['gbdt_%d_prob' % i] = prob
        else:
            prob = models[i].predict(xgb.DMatrix(df_data[feats]))
            data['xgb_%d_prob' % i] = prob
    
    return pd.DataFrame(data=data)
    

def models_predict(rf_models, gbdt_models, xgb_models, df_data, feats):
    '''
    注意参数的顺序
    '''
    df_data = one_models_predict(df_data, feats, rf_models, 'xgb')
#     df_data = pd.merge(df_data, one_models_predict(df_data, feats, gbdt_models, 'gbdt'))
#     df_data = pd.merge(df_data, one_models_predict(df_data, feats, xgb_models, 'xgb'))
    
    return df_data

def models_predict_x(xgb_models, df_data, feats):
    '''
    注意参数的顺序
    '''
    df_data = one_models_predict(df_data, feats, xgb_models, 'xgb')
#     df_data = pd.merge(df_data, one_models_predict(df_data, feats, gbdt_models, 'gbdt'))
#     df_data = pd.merge(df_data, one_models_predict(df_data, feats, xgb_models, 'xgb'))
    
    return df_data


def get_finall_model(df_train_A, df_train_B, feats, label='label'):
    
#     rf_models, gbdt_models, xgb_models = get_all_models(df_train_A, feats, label)
    xgb_models = get_x_xgb(df_train_A, feats, label)
    
    print len(xgb_models)
#     df_train_b = models_predict(rf_models, gbdt_models, xgb_models, df_train_B, feats)
    df_train_b = one_models_predict(df_train_B, feats, xgb_models, 'xgb')
    df_train_b['label'] = df_train_B['label']
    
    print df_train_b.info()
    
    feats_b = list(df_train_b.columns)
    if 'uid' in feats_b:
        feats_b.remove('uid')
    
    print "xgb_model final %d" % len(xgb_models)
    xgb_second_models = get_x_xgb_second(df_train_b, feats_b, label)
    
    return xgb_models, xgb_second_models


def get_finall_score(xgb_second_models, df_test, feats, threold=0.3):
    LOOP = len(xgb_second_models)
    dtest  = xgb.DMatrix(df_test[feats])
    proba_test = pd.DataFrame()
    proba_test['uid'] = df_test['uid']
    proba_test['score'] = [0 for i in range(len(df_test))]
    for model in xgb_second_models:
        proba_test['score'] += model.predict(dtest)
    proba_test['score'] /= LOOP

    proba_test = proba_test.sort_values('score',ascending=False)
    proba_test['label'] = [0 for i in range(len(proba_test))]

    proba_test.loc[proba_test['score']>threold, 'label'] = 1
#     proba_test[['uid','label']].to_csv('../result/xresult_finall_1.csv',index=False,header=False)
    
    return proba_test

def predict_test(df_train_A, df_train_B, df_test, feats, label='label'):
    xgb_models, xgb_second_models = get_finall_model(df_train_A, df_train_B, feats, label)
    
#     print "rf_models length = %d " % len(rf_models)
#     print "gbdt_models length = %d " % len(gbdt_models)
    print "xgb_models length = %d " % len(xgb_models)
    
    df_test_b = models_predict_x(xgb_models, df_test, feats)
#     df_test_b['lable'] = df_train_B['label']
    
    print df_test_b.info()
    
    feats_b = df_test_b.columns
    
    proba_test = get_finall_score(xgb_second_models, df_data, feats_b)
    
    return proba_test


In [16]:
def split_data(df_train, label='label'):
    '''
    将数据集分成两部分A和B，A训练第一层model，B训练第二层模型, 2 : 3
    写入文件，保证能够重现。
    '''
    df_train_one = df_train[df_train[label] == 1]
    df_train_zero = df_train[df_train[label] == 0]
    
    df_train_one_A = df_train_one.sample(int(len(df_train_one)*0.6))
    df_train_one_B = df_train_one[~df_train_one['uid'].isin(df_train_one_A['uid'])]
    
    df_train_zero_A = df_train_zero.sample(int(len(df_train_zero)*0.6))
    df_train_zero_B = df_train_zero[~df_train_zero['uid'].isin(df_train_zero_A['uid'])]
    
    df_train_A = df_train_one_A.append(df_train_zero_A)
    df_train_B = df_train_one_B.append(df_train_zero_B)
    
    df_train_A = shuffle(df_train_A)
    df_train_B = shuffle(df_train_B)
    
    df_train_A.to_csv('../sdata/df_train_A.csv',index=False)
    df_train_B.to_csv('../sdata/df_train_B.csv',index=False)
    
    return df_train_A, df_train_B

def get_data(add_data=False):
    df_train_voice_feat = pd.read_csv('../xdata/df_train_voice_feat.csv')
    df_test_voice_feat = pd.read_csv('../xdata/df_testB_voice_feat.csv')
    if add_data==True:
        df_trainA = pd.read_csv('../xdata/df_testA_label.csv')
        df_trainA_voice_feat = pd.read_csv('../xdata/df_testA_voice_feat.csv')
        
    
    df_train_sms_feat = pd.read_csv('../xdata/df_train_sms_feat.csv')
    df_test_sms_feat = pd.read_csv('../xdata/df_testB_sms_feat.csv')
    if add_data==True:
        df_trainA_sms_feat = pd.read_csv('../xdata/df_testA_sms_feat.csv')
    
    df_train_sms_feat.drop('label',axis=1,inplace=True)
    
    df_train_wa_feat = pd.read_csv('../xdata/df_train_wa_feat.csv')
    df_test_wa_feat = pd.read_csv('../xdata/df_testB_wa_feat.csv')
    if add_data==True:
        df_trainA_wa_feat = pd.read_csv('../xdata/df_testA_wa_feat.csv')

    df_train_wa_feat.drop('label',axis=1,inplace=True)
    
    df_train_voice_sms_wa_feat = pd.read_csv('../xdata/df_train_voice_sms_wa_feat.csv')
    df_test_voice_sms_wa_feat = pd.read_csv('../xdata/df_testB_voice_sms_wa_feat.csv')
    if add_data==True:
        df_trainA_voice_sms_wa_feat = pd.read_csv('../xdata/df_testA_voice_sms_wa_feat.csv')
    
#     df_train_voice_sms_wa_feat.drop('label',axis=1,inplace=True)
    
    
    df_train = pd.merge(df_train_voice_feat, df_train_sms_feat, on='uid', how='left')
    df_test = pd.merge(df_test_voice_feat, df_test_sms_feat, on='uid', how='left')
    
    df_train = pd.merge(df_train, df_train_wa_feat, on='uid', how='left')
    df_test = pd.merge(df_test, df_test_wa_feat, on='uid', how='left')
    
    df_train = pd.merge(df_train, df_train_voice_sms_wa_feat, on='uid', how='left')
    df_test = pd.merge(df_test, df_test_voice_sms_wa_feat, on='uid', how='left')
    
    
    if add_data==True:
        df_trainA = pd.merge(df_trainA, df_trainA_voice_feat, on='uid', how='left')
        df_trainA = pd.merge(df_trainA, df_trainA_sms_feat, on='uid', how='left')
        df_trainA = pd.merge(df_trainA, df_trainA_wa_feat, on='uid', how='left')
        df_trainA = pd.merge(df_trainA, df_trainA_voice_sms_wa_feat, on='uid', how='left')
        
#         df_trainA = df_trainA[df_trainA['label']==1]
        df_trainA = df_trainA[:66]
        df_train = df_train.append(df_trainA)

    df_train.replace([np.inf,-np.inf], 0, inplace=True)
    df_test.replace([np.inf,-np.inf], 0, inplace=True)

    df_train.fillna(0,inplace=True)
    df_test.fillna(0,inplace=True)
    
    df_train.drop('wa_all_wa_name_little_wite',axis=1,inplace=True)
    df_train.drop('wa_all_wa_name_many_wite',axis=1,inplace=True)
    df_train.drop('wa_all_wa_name_little_wite_risk',axis=1,inplace=True)
    df_train.drop('wa_all_wa_name_many_wite_risk',axis=1,inplace=True)

    df_test.drop('wa_all_wa_name_little_wite',axis=1,inplace=True)
    df_test.drop('wa_all_wa_name_many_wite',axis=1,inplace=True)
    df_test.drop('wa_all_wa_name_little_wite_risk',axis=1,inplace=True)
    df_test.drop('wa_all_wa_name_many_wite_risk',axis=1,inplace=True)
    
    
    df_train.replace([np.inf,-np.inf], 0, inplace=True)
    df_test.replace([np.inf,-np.inf], 0, inplace=True)

    df_train.fillna(0,inplace=True)
    df_test.fillna(0,inplace=True)
    
    # 数据分成A和B的两部分
    split_data(df_train)
    
    df_train.to_csv('../sdata/df_train_plus.csv',index=False)
    df_test.to_csv('../sdata/df_test_plus.csv',index=False)
    
    return df_train, df_test

    
    
def split_train_valid(df_train, feats, label='label', test_size=0.2):
    '''
    k-fold交叉验证,默认k=10
    df_train:训练数据
    '''
    X_train, X_vali, y_train, y_vali = train_test_split(df_train[feats], df_train[label], test_size=test_size, random_state=40000)
    #added some parameters
    
#     dtrain = df_train.iloc[train_list]
#     dvali =  df_train.iloc[vali_list]
    
    dtrain = xgb.DMatrix(X_train,label=y_train)
    dvalid = xgb.DMatrix(X_vali,label=y_vali)
    watchlist = [(dtrain, 'train'),(dvalid, 'valid')]
    
    return dtrain, dvalid, watchlist



In [17]:
df_train, df_test = get_data(add_data=True) 

In [20]:
def run():
    df_train_A = pd.read_csv('../sdata/df_train_A.csv')
    df_train_B = pd.read_csv('../sdata/df_train_B.csv')
    df_test = pd.read_csv('../sdata/df_test.csv')
    
    df_train_A.replace([np.inf,-np.inf], 0, inplace=True)
    df_train_B.replace([np.inf,-np.inf], 0, inplace=True)
    df_test.replace([np.inf,-np.inf], 0, inplace=True)

    df_train_A.fillna(0,inplace=True)
    df_train_B.fillna(0,inplace=True) 
    df_test.fillna(0,inplace=True)
        
    feats = list(df_test.columns)
    feats.remove('uid')
    
    df_ans = predict_test(df_train_A, df_train_B, df_test, feats, label='label')
    
    
def run_x():
    df_train = pd.read_csv('../sdata/df_train_plus.csv')
    df_test = pd.read_csv('../sdata/df_test_plus.csv')
    
    df_train.replace([np.inf,-np.inf], 0, inplace=True)
    df_test.replace([np.inf,-np.inf], 0, inplace=True)

    df_train.fillna(0,inplace=True)
    df_test.fillna(0,inplace=True)
        
    feats = list(df_test.columns)
    feats.remove('uid')
    
    print len(feats)
    
    xgb_models = get_x_xgb(df_train,feats,label='label')
    
    proba_test = get_finall_score(xgb_models, df_test, feats, threold=0.27)
    
    proba_test[['uid','label']].to_csv('../result/xresult_finall_x_1.csv',index=False,header=False)
    
    

In [21]:
run_x()


3336
('LOOP', 0)
[0]	train-auc:0.841473	valid-auc:1
[20]	train-auc:0.916286	valid-auc:1
[40]	train-auc:0.932813	valid-auc:1
[60]	train-auc:0.944752	valid-auc:1


KeyboardInterrupt: 

In [5]:
# np.finfo(np.float32).min

NameError: name 'np' is not defined