In [2]:
import time
import datetime
import numpy as np
import pandas as pd
import lightgbm as lgb
import xgboost as xgb
from dateutil.parser import parse
import matplotlib.pyplot as plt
from lightgbm import Booster as lgbm_Booster

from sklearn.cross_validation import KFold
from sklearn.svm import SVC
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
import sklearn.model_selection
import sklearn.metrics
#import autosklearn.classification

data_path = './data/tc/'

train1 = pd.read_csv(data_path + 'f_train_20180204.csv', encoding='gb2312')
train2 = pd.read_csv(data_path + 'f_train2.csv', encoding='utf-8')
train = pd.concat([train1, train2])

# test = pd.read_csv(data_path + 'f_test_a_20180204.csv', encoding='gb2312')
test = pd.read_csv(data_path + 'f_test_b_20180305.csv', encoding='gb2312')

In [3]:
def make_feat(train, test):
    merge = pd.concat([train, test])
    n_train = len(train)
    train_y = merge['label']
    merge = merge.drop(['label','id'], axis=1)
    merge.loc['Row_sum'] = merge.isnull().apply(lambda x: x.sum())  ##计算每列有多少缺失值
    merge.loc[:, merge.loc['Row_sum'] > 600].columns  ##缺失值大于600的columns
    merge = merge.drop(merge.loc[:, merge.loc['Row_sum'] > 600].columns, axis=1)
    merge = merge.drop(['Row_sum'],axis=0)
    
    #基因
    merge['SNP34+SNP37'] = merge['SNP34'] + merge['SNP37']
    merge['SNP34+SNP37'] =  merge['SNP34+SNP37'].fillna(0)   ##0 means NaN
    d =  pd.get_dummies(merge['SNP34+SNP37'], prefix='SNP34+SNP37')
    merge = pd.concat([merge,d],axis=1)
    merge = merge.drop(['SNP34+SNP37'], axis=1)
    
    for i in merge.columns[12:62]:   ##for all snp
        merge[i] =  merge[i].fillna(0)   ##0 means NaN
        d =  pd.get_dummies(merge[i], prefix=i)
        merge = pd.concat([merge,d],axis=1)
        merge = merge.drop([i], axis=1)

    for i in ['孕次', '产次', 'DM家族史','BMI分类']:  #众数填充
        a = merge[i].mode()[0]
        merge[i] = merge[i].fillna(a)  

    merge.fillna(merge.median(axis=0), inplace=True)
    #################### 是否是多肉动物 #######################################
#     var = merge['VAR00007']
#     merge['VAR_log1'] = pd.Series(np.log(var, dtype=float))
#     var = merge['VAR00007']
#     merge['VAR_log1'] = pd.Series(pow(var, 3))
    
    # 'ApoA1/ApoB
    def Ap_level(line):
        a1 = line['ApoA1'] / line['ApoB']
        if a1 < 1:
            return "Ap_1-"
        elif a1 >= 1 and a1 < 3.7:
            return "Ap_1-3.7"
        elif a1 >= 3.7 and a1 < 15:
            return "Ap_3.7-15"
        else:
            return "Ap_na"
    merge['Ap_LEVEL'] = merge.apply(Ap_level, axis=1)
    d_Ap = pd.get_dummies(merge['Ap_LEVEL'])
    merge = pd.concat([d_Ap, merge], axis=1)
    merge = merge.drop(['Ap_LEVEL'], axis=1)

    
    # 收缩压
    def sys_level(line):
        a1 = line['收缩压']
        if a1 < 90:
            return "sys_90-"
        elif a1 >= 90 and a1 < 130:
            return "sys_90-130"
        elif a1 >= 130 and a1 < 180:
            return "sys_130-180"
        else:
            return "sys_na"

    merge['sys_LEVEL'] = merge.apply(sys_level, axis=1)
    d_sys = pd.get_dummies(merge['sys_LEVEL'])
    merge = pd.concat([d_sys, merge], axis=1)
    merge = merge.drop(['sys_LEVEL'], axis=1)

    # 收缩压+舒张压
    def bld_level(line):
        a1 = line['收缩压'] + line['舒张压']
        if a1 < 150:
            return "bloodT_150-"
        elif a1 >= 150 and a1 < 200:
            return "bloodT_150-200"
        elif a1 >= 200 and a1 < 260:
            return "bloodT_200-260"
        else:
            return "bloodT_na"
    
    # wbc
    def wbc_level(line):
        a1 = line['wbc']
        if a1 < 8:
            return "wbc_8-"
        elif a1 >= 8 and a1 < 14:
            return "wbc_8-14"
        elif a1 >= 14 and a1 < 21:
            return "wbc_14-21"
        else:
            return "wbc_na"

    merge['wbc_LEVEL'] = merge.apply(wbc_level, axis=1)
    d_wbc = pd.get_dummies(merge['wbc_LEVEL'])
    merge = pd.concat([d_wbc, merge], axis=1)
    merge = merge.drop(['wbc_LEVEL'], axis=1)
    
    ##################################################
    
    ##血压和BMI
    merge['bp*BMI'] = (merge['舒张压']+merge['收缩压'])*(merge['BMI分类']+1)/2

    ##孕产
    merge['孕产数'] = merge['孕次'] + merge['产次']
    merge['孕产差'] = merge['孕次'] - merge['产次']

    ##血生化
    #/肝脏类
    merge['肝脏']  = (merge['AST']+merge['ALT'])/2
    #/肾脏类别
    merge['肾脏'] = (merge['BUN'] + merge['Cr'])/2
    #/心血管类别
    merge['心血管'] = (merge['HDLC'] + merge['LDLC']+merge['CHO'] + merge['TG'])/4
    #/炎症
    merge['炎症'] = (np.log(merge['wbc'])+merge['hsCRP'])/2


    # 年龄
    def age_level(line):
        age = line['年龄']
        if age < 25:
            return "age_0_25"
        elif age >= 25 and age < 30:
            return "age_25_30"
        elif age >= 30 and age < 40:
            return "age_30_40"
        else:
            return "age_40_"

    merge['年龄_LEVEL'] = merge.apply(age_level, axis=1)
    d_age = pd.get_dummies(merge['年龄_LEVEL'], prefix="年龄")
    merge = pd.concat([d_age, merge], axis=1)
    merge = merge.drop(['年龄_LEVEL'], axis=1)

    def pg_level(line):
        pg = line['孕次']
        if pg < 3:
            return "pg_0_3"
        else:
            return "pg_3_"

    merge['pg_LEVEL'] = merge.apply(pg_level, axis=1)
    d_pg = pd.get_dummies(merge['pg_LEVEL'], prefix='孕次')
    merge = pd.concat([d_pg, merge], axis=1)
    merge = merge.drop(['pg_LEVEL'], axis=1)

#     merge.columns = [np.arange(285)]

    X, y = merge[:n_train], train_y[:n_train]
    test_X = merge[n_train:]

    return X, y, test_X  #0.6561

X, y, test_X = make_feat(train, test)

In [4]:
X.shape

(1200, 251)

In [9]:
X.describe()

Unnamed: 0,孕次_pg_0_3,孕次_pg_3_,年龄_age_0_25,年龄_age_25_30,年龄_age_30_40,年龄_age_40_,wbc_14-21,wbc_8-,wbc_8-14,sys_130-180,...,SNP9_0.0,SNP9_1.0,SNP9_2.0,bp*BMI,孕产数,孕产差,肝脏,肾脏,心血管,炎症
count,1200.0,1200.0,1200.0,1200.0,1200.0,1200.0,1200.0,1200.0,1200.0,1200.0,...,1200.0,1200.0,1200.0,1200.0,1200.0,1200.0,1200.0,1200.0,1200.0,1200.0
mean,0.829167,0.170833,0.01,0.24,0.725833,0.024167,0.031667,0.238333,0.73,0.066667,...,0.004167,0.3775,0.618333,100.26375,2.700833,0.6225,31.696458,32.33245,3.535952,3.138284
std,0.37652,0.37652,0.09954,0.427261,0.446279,0.15363,0.175184,0.426242,0.444145,0.249548,...,0.064442,0.484964,0.485998,34.834899,1.107108,1.001247,14.157181,3.076335,0.836755,10.28246
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,64.5,2.0,-1.0,8.0,19.485,2.065,0.94186
25%,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,89.0,2.0,0.0,26.5,30.635,3.1325,1.874099
50%,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,90.5,2.0,0.0,29.0,31.87,3.47625,2.45133
75%,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,1.0,1.0,95.0,3.0,1.0,33.5,33.88625,3.8275,3.467384
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,349.5,9.0,6.0,248.5,48.255,20.8925,356.207957


## 20170227 模型测试

In [5]:
# 三模型融合后0.7321
# 只用一个xgb,线上0.734
clf_lgb_params1 = {
    'learning_rate': 0.01,
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'mse',
    'sub_feature': 0.7,
    'num_leaves': 60,
    'colsample_bytree': 0.7,
    'feature_fraction': 0.2,
    'min_data': 50,
    'min_hessian': 1,
    'verbose': 200,
}

clf_lgb_params2 = {
    'learning_rate': 0.005,
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'mse',
    'sub_feature': 0.5,
    'num_leaves': 70,
    'colsample_bytree': 0.3,
    'feature_fraction': 0.1,
    'min_data': 20,
    'verbose': 200,
}

# xgboost
clf_xgb_params1 = {
    'booster': 'gbtree',
    'eval_metric': 'auc',
    'objective': 'binary:logistic',
    'max_depth': 5,
    'eta': 0.02,
    'min_child_weight': 4,
    'colsample': 0.8,
    'gamma': 2,
    'silent': 1
}

# K折交叉验证
print('开始CV 5折训练...')
t0 = time.time()
lgb1_preds = np.zeros([test_X.shape[0], 5])
lgb2_preds = np.zeros([test_X.shape[0], 5])
xgb_preds = np.zeros([test_X.shape[0], 5])


kf = KFold(len(X), n_folds=5, shuffle=True,random_state=520)

lgb1_err = []
lgb2_err = []
xgb_err = []
valid_err = []
# 0.35 0.7130454671993266

thredthold = 0.378
for i, (train_index, valid_index) in enumerate(kf):
    print('第{}次训练...'.format(i))
    
    train_X, train_y = X.iloc[train_index], y.iloc[train_index]
    valid_X, valid_y = X.iloc[valid_index], y.iloc[valid_index]
    
    #lightgbm
    lgb_train = lgb.Dataset(train_X, train_y)
    lgb_valid = lgb.Dataset(valid_X, valid_y)   
    
    clf_lgb_model1 = lgb.train(clf_lgb_params1, lgb_train,
                               valid_sets = [lgb_valid], num_boost_round = 3000,
                               verbose_eval=False,
                               early_stopping_rounds = 100)
    lgb1_valid_pred = clf_lgb_model1.predict(valid_X)#np.where(clf_lgb_model1.predict(valid_X) > 0.5, 1, 0)
    lgb1_preds[:, i] = clf_lgb_model1.predict(test_X)
        
    
    clf_lgb_model2 = lgb.train(clf_lgb_params2, lgb_train,
                               valid_sets = [lgb_valid], num_boost_round = 3000,
                               verbose_eval=False,
                               early_stopping_rounds = 50)
    lgb2_valid_pred = clf_lgb_model2.predict(valid_X) #np.where(clf_lgb_model2.predict(valid_X) > 0.5, 1, 0)
    lgb2_preds[:, i] = clf_lgb_model2.predict(test_X)
    
    
    #xgboost
    xgb_train = xgb.DMatrix(train_X, train_y)
    xgb_valid = xgb.DMatrix(valid_X, valid_y)
    
    watchlist = [(xgb_train,'train')]
    clf_xgb_model1 = xgb.train(clf_xgb_params1, xgb_train, num_boost_round=2000,
                               verbose_eval=False, evals=watchlist, early_stopping_rounds=50)
    xgb_valid_pred = clf_xgb_model1.predict(xgb.DMatrix(valid_X),
                                                ntree_limit=clf_xgb_model1.best_ntree_limit + 20)
    xgb_preds[:, i] = clf_xgb_model1.predict(xgb.DMatrix(test_X),
                                                ntree_limit=clf_xgb_model1.best_ntree_limit + 20)
    
        
    # np.where(clf_xgb_model1.predict(xgb.DMatrix(valid_X),
    #                                         ntree_limit=clf_xgb_model1.best_ntree_limit + 20)>0.5,1,0)
    # 
    
    valid_pred = lgb1_valid_pred * 0.35 + lgb2_valid_pred * 0.3 + xgb_valid_pred * 0.35

    
    lgb1_err.append(f1_score(valid_y, np.where(lgb1_valid_pred > thredthold, 1, 0)))
    lgb2_err.append(f1_score(valid_y, np.where(lgb2_valid_pred > thredthold, 1, 0)))
    xgb_err.append(f1_score(valid_y, np.where(xgb_valid_pred > thredthold, 1, 0)))
    valid_err.append(f1_score(valid_y, np.where(valid_pred > thredthold, 1, 0)))

        
    # valid_X_pred = 0.333 * xgb1_pred + 0.333 * xgb1_pred + 0.333 * lgb2_pred
    # valid_X_pred = np.where(np.array(valid_X_pred) > 0.5, 1, 0)
    # print('融合后第{}次得分:{}'.format(i, f1_score(valid_y, valid_X_pred)))
    # X_preds[valid_index] += valid_X_pred

print("lgb1 err {}".format(np.mean(lgb1_err)))
print("lgb2 err {}".format(np.mean(lgb2_err)))
print("xgb err {}".format(np.mean(xgb_err)))
print("valid err {}".format(np.mean(valid_err)))


lgb1_pred = np.mean(lgb1_preds, axis=1)
lgb2_pred = np.mean(lgb2_preds, axis=1)
xgb_pred = np.mean(xgb_preds, axis=1)
    
#分类预测结果
#print('线下得分：{}'.format(f1_score(y, X_preds)))

final_pred = lgb1_pred * 0.35 + lgb2_pred * 0.3 + xgb_pred * 0.35
final_pred = np.where(final_pred > thredthold, 1, 0)

print('CV训练用时{}秒'.format(time.time() - t0))

开始CV 5折训练...
第0次训练...
第1次训练...
第2次训练...
第3次训练...
第4次训练...
lgb1 err 0.697894778994196
lgb2 err 0.7067067718604712
xgb err 0.7079534698630956
valid err 0.7105241499267791
CV训练用时184.59697484970093秒


In [6]:
(final_pred == 1).sum()

109

In [7]:
final_pred = pd.DataFrame({'final_pred' : final_pred})
final_pred.to_csv(data_path + r'sub{}.csv'.format(datetime.datetime.now().strftime('%Y%m%d_%H%M%S')), header=None,
                  index=False, float_format='%.4f')

In [23]:
result = pd.DataFrame({'lgb1':lgb1_pred, 'lgb2':lgb2_pred, 'xgb':xgb_pred})
result.describe()

Unnamed: 0,lgb1,lgb2,xgb
count,200.0,200.0,200.0
mean,0.469092,0.468065,0.458962
std,0.25434,0.19499,0.281146
min,0.063184,0.110813,0.045658
25%,0.250086,0.304721,0.231155
50%,0.449048,0.457441,0.402915
75%,0.672027,0.596207,0.678678
max,0.980736,0.914799,0.983213


In [256]:
pd.Series(clf_xgb_model1.get_fscore()).sort_values(ascending = False).head(40)

VAR00007     293
年龄           178
孕前BMI        127
SNP37_2.0    112
炎症           108
TG            96
wbc           95
SNP34_1.0     94
hsCRP         83
LDLC          82
心血管           78
AST           67
SNP34_2.0     66
SNP20_0.0     66
肾脏            65
Cr            65
ApoA1         53
HDLC          53
BUN           52
身高            48
SNP46_0.0     48
CHO           48
孕前体重          47
SNP37_3.0     46
SNP37_1.0     42
SNP53_1.0     38
SNP53_0.0     37
SNP48_2.0     36
SNP40_2.0     35
SNP24_0.0     34
Lpa           33
SNP8_2.0      30
糖筛孕周          30
SNP5_0.0      29
SNP6_1.0      29
SNP28_2.0     28
SNP46_2.0     28
SNP38_2.0     27
孕产差           27
SNP49_2.0     26
dtype: int64

In [47]:
sub0305_117 = pd.read_csv(data_path + 'sub20180307_114352.csv', encoding='gb2312', header=None)
sub0306_117 = pd.read_csv(data_path + 'f_answer_a_20180306.csv', encoding='gb2312', header=None)
sub_117 = pd.concat([sub0305_117, sub0306_117], axis=1)
sub_117.columns = ['0305', '0306']

In [48]:
files_iter = sub_117.iterrows() # 用df.iterrows() 迭代获取每一行，每一行由(Index, series)元组构成
counter = 0
for index, file_series in files_iter:
    if file_series['0305'] != file_series['0306']:
        print(index)
        counter += 1
print(counter)

4
10
12
25
27
29
36
42
49
58
60
67
68
71
78
83
85
86
90
99
103
106
107
108
110
113
114
120
121
123
129
131
132
134
135
140
149
151
153
157
162
163
168
174
178
181
182
190
194
195
197
198
199
53
