In [1]:
import time
import datetime
import numpy as np
import pandas as pd
import lightgbm as lgb
import xgboost as xgb
from dateutil.parser import parse
from sklearn.cross_validation import KFold
from sklearn.metrics import mean_squared_error
from sklearn.cross_validation import train_test_split
from xgboost.sklearn import XGBClassifier
import matplotlib.pyplot as plt
from sklearn import svm
import math
import matplotlib.pyplot as plt
from lightgbm import Booster as lgbm_Booster
from xgboost import Booster as xgb_Booster



In [4]:
data_path = './data/tc/'

train = pd.read_csv(data_path + 'd_train_20180102.csv', encoding='gb2312')
test = pd.read_csv(data_path + 'd_test_A_20180102.csv', encoding='gb2312')

def make_feat(train, test):
    merge = pd.concat([train, test])
    n_train = len(train)
    train_y = merge['血糖']
    merge = merge.drop(['血糖'], axis=1)
    merge = merge.drop(['id', '乙肝表面抗原', '乙肝表面抗体', '乙肝e抗原', '乙肝e抗体', '乙肝核心抗体'], axis=1)
    merge.fillna(merge.median(axis=0), inplace=True)

    # 性别
    d_sex = pd.get_dummies(merge['性别'])
    merge = pd.concat([d_sex, merge], axis=1)
    merge = merge.drop(['性别'], axis=1)

    # 季节
    merge['季节'] = pd.to_datetime(merge['体检日期']).dt.month
    d_season = pd.get_dummies(merge['季节'], prefix="季节")
    merge = pd.concat([d_season, merge], axis=1)
    merge = merge.drop(['季节'], axis=1)
    merge['体检日期'] = (pd.to_datetime(merge['体检日期']) - parse('2017-10-09')).dt.days

    # 年龄
    def age_level(line):
        age = line['年龄']
        if age < 30:
            return "age_0_30"
        elif age >= 30 and age < 45:
            return "age_30_45"
        elif age >= 45 and age < 60:
            return "age_45_60"
        else:
            return "age_60_100"

    merge['年龄_LEVEL'] = merge.apply(age_level, axis=1)
    d_age = pd.get_dummies(merge['年龄_LEVEL'], prefix="年龄")
    merge = pd.concat([d_age, merge], axis=1)
    merge = merge.drop(['年龄_LEVEL'], axis=1)

    X, y = merge[:n_train], train_y[:n_train]
    test_X = merge[n_train:]

    return X, y, test_X

X, y, test_X = make_feat(train, test)

In [4]:
topk = 1000 #用于分类 取血糖最高1000 和 血糖最低1000
threshold = 10 #定义高血糖的阈值
other = 20 #用于低血糖回归训练集 排序血糖最高的20个， 高血糖回归训练集使用topk

sort_train = train.sort_values(by="血糖", ascending=False) #按血糖排序
high_index = sort_train[0:topk].index 
low_index = sort_train[-topk:].index

other_index = sort_train[20:].index #用于低血糖回归训练集

高低血糖分类模型

In [5]:
high_low_index = np.concatenate((high_index, low_index))
clf_X, clf_y =  X.loc[high_low_index], np.where(y.loc[high_low_index] > threshold, 1, 0)
clf_xgb_params = {
    'booster':'gbtree',
    'objective':'binary:logistic',
    'max_depth': 8,
    'eta':0.01,
    'min_child_weight':3,
    'colsample':0.8,
    'scale_pos_weight':1,
    'gamma':1,
    'n_thread':4,
    'silent':1
 }

clf_xgb_train = xgb.DMatrix(clf_X, clf_y)
watchlist = [(clf_xgb_train,'train')]
clf_xgb_model = xgb.train(clf_xgb_params, clf_xgb_train, num_boost_round=1000,
                          verbose_eval=200, evals=watchlist)

In [7]:
test_pred_high = clf_xgb_model.predict(xgb.DMatrix(test_X)) #分类预测结果

血糖回归模型

In [13]:
#固定参数设置
xgb_params1 = {
    'objective':"reg:linear",
    'max_depth':5,
    'eta':0.01,
    'gamma':0.02,
    'subsample':0.5,
    'silent':1
}

lgb_params1 = {
    'learning_rate': 0.01,
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': 'mse',
    'sub_feature': 0.7,
    'num_leaves': 60,
    'colsample_bytree': 0.7,
    'feature_fraction': 0.7,
    'min_data': 100,
    'min_hessian': 1,
    'verbose': -1,
}

lgb_params2 = {
    'learning_rate': 0.05,
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': 'mse',
    'sub_feature': 0.2,
    'num_leaves': 120,
    'colsample_bytree': 0.2,
    'feature_fraction': 0.5,
    'min_data': 50,
    'min_hessian': 2,
    'verbose': -1,
}

lgb_params3 = {
    'learning_rate': 0.01,
    'boosting_type': 'gbdt',
    'objective': 'regression',
    'metric': 'mse',
    'sub_feature': 0.5,
    'num_leaves': 30,
    'colsample_bytree': 0.4,
    'feature_fraction': 0.6,
    'min_data': 3,
    'min_hessian': 3,
    'verbose': -1,
}
lgb_params1['is_unbalance']='true'
lgb_params2['is_unbalance']='true'
lgb_params3['is_unbalance']='true'

def evalerror(pred, df):
    label = df.get_label().values.copy()
    score = mean_squared_error(label, pred) * 0.5
    return ('0.5mse', score, False)

def blending(models, weights, X, n_y):
    y = np.zeros(n_y)

    for i, model in enumerate(models):
        if isinstance(model, lgbm_Booster):
            #y_i = model.predict(X, num_iteration=model.best_iteration + 20)
            y_i = model.predict(X)
        else:
            #y_i = model.predict(xgb.DMatrix(X), ntree_limit=model.best_ntree_limit + 20)
            y_i = model.predict(xgb.DMatrix(X))

        y += y_i * weights[i]
    return y

In [14]:
# K折交叉验证
def k_fold_predict(X, y, test_X, k=5):
    print('start K Fold train...')
    t0 = time.time()
    kf = KFold(len(X), n_folds=k, shuffle=True)
    X_preds = np.zeros(X.shape[0])
    test_preds = np.zeros((test_X.shape[0], 5))
    for i, (train_index, valid_index) in enumerate(kf):
        train_X, train_y = X.iloc[train_index], y.iloc[train_index]
        valid_X, valid_y = X.iloc[valid_index], y.iloc[valid_index]
        xgb_train = xgb.DMatrix(train_X, train_y)
        xgb_valid = xgb.DMatrix(valid_X, valid_y)
    
        xgb_model = xgb.train(xgb_params1,
                              xgb_train,
                              num_boost_round=3000,
                              evals=[(xgb_valid, 'eval'), (xgb_train, 'train')],
                              verbose_eval=500)
    
        lgb_train = lgb.Dataset(train_X, train_y)
        lgb_valid = lgb.Dataset(valid_X, valid_y)
        
        gbm_model1 = lgb.train(lgb_params1,
                               lgb_train,
                               num_boost_round=3000,
                               valid_sets=lgb_valid,
                               verbose_eval=500,
                               feval=evalerror,
                               early_stopping_rounds=100)
    
        gbm_model2 = lgb.train(lgb_params2,
                               lgb_train,
                               num_boost_round=4500,
                               valid_sets=lgb_valid,
                               verbose_eval=500,
                               feval=evalerror,
                               early_stopping_rounds=500)
    
        gbm_model3 = lgb.train(lgb_params3,
                               lgb_train,
                               num_boost_round=5000,
                               valid_sets=lgb_valid,
                               verbose_eval=500,
                               feval=evalerror,
                               early_stopping_rounds=500)
        W = [0.1,0.35,0.25,0.3]
        M = [xgb_model, gbm_model1, gbm_model2, gbm_model3]
        X_pred = blending(M, W, valid_X, len(valid_X))
        test_pred = blending(M, W, test_X, len(test_X))
        X_preds[valid_index] += X_pred
        test_preds[:, i] = test_pred
    print('train cv score：{}'.format(mean_squared_error(y, X_preds) * 0.5))
    print('cv cost time {}'.format(time.time() - t0))
    
    return test_preds.mean(axis=1)

In [15]:
#分别训练高低两个回归模型
X_high, y_high = X.loc[high_index], y.loc[high_index]
X_low, y_low = X.loc[other_index], y.loc[other_index]

mask = test_pred_high > 0.5
final_sub = pd.Series(np.zeros(test_X.shape[0]))
final_sub[mask] = k_fold_predict(X_high, y_high, test_X[mask])
final_sub[~mask] = k_fold_predict(X_low, y_low, test_X[~mask])

In [19]:
final_sub[~mask]

313    10.060669
601     8.746518
938    11.154725
dtype: float64