In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
# Show which files the Kaggle input directory provides.
_input_listing = os.listdir("../input")
print(_input_listing)

# Any results you write to the current directory are saved as output.

In [None]:
import sys
import pandas as pd
import numpy as np
# A feature-selection method (Boruta)
from boruta import BorutaPy
# Import scikit-learn models
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestClassifier
# Import scikit-learn's feature selection (RFECV: RFE repeatedly removes unimportant features, CV is cross-validation)
from sklearn.feature_selection import RFECV
# Preprocessing with RobustScaler
from sklearn.preprocessing import RobustScaler
# Dataset splitting strategies and grid search
from sklearn.model_selection import StratifiedShuffleSplit,GridSearchCV,KFold
# Evaluation metrics: mean squared error, R2 coefficient of determination, ROC AUC, and the make_scorer helper
from sklearn.metrics import mean_squared_error,r2_score,roc_auc_score,make_scorer

In [None]:
# Run configuration shared by the cells below.

# Number of stratified shuffle-split rounds (one candidate model per round)
sss_split=12
# Fraction of the training rows held out for validation in each round
test_split=0.2
# Seed used for NumPy and for the seeded splitter / feature selector
random_seed=12
# Minimum validation R2 a round's model must exceed to join the ensemble
# (the original literal was `0.1s`, a syntax error caused by a stray character)
r2_threshold=0.1
np.random.seed(random_seed)



In [None]:
# Load the labelled training set and split it into the target column and
# a plain NumPy feature matrix (the 'id' and 'target' columns are removed).
train = pd.read_csv('../input/train.csv')
train_y = train['target']
train_X = train.drop(columns=['id', 'target']).values

In [None]:
# Load the unlabelled test set as a NumPy feature matrix without the 'id' column.
test = pd.read_csv('../input/test.csv').drop(columns=['id']).values

In [None]:
# Scale train and test features together with RobustScaler (by default it
# centers on the median and scales by the inter-quartile range).
# NOTE(review): the scaler is fitted on train and test stacked together, so
# test-set statistics influence the training features - confirm this is intended.

# Remember where the training rows end instead of hard-coding the row count,
# so the split stays correct if the training set size changes.
n_train = train_X.shape[0]
data = RobustScaler().fit_transform(np.concatenate((train_X, test), axis=0))
train_X = data[:n_train]
test = data[n_train:]
# Add small Gaussian jitter (std 0.01) to the training features only.
# train_X is a slice of `data`, so the jitter is also written into the
# leading rows of `data`; `test` is a separate slice and is not affected.
train_X += np.random.normal(0, 0.01, train_X.shape)

In [None]:
# Hyper-parameter grid searched for the SVR model.
#   C   : penalty coefficient (author's note: keeps the SVM from over-fitting)
#   tol : residual tolerance at which the solver stops updating
param_grid = {
    'C': [0.005, 0.002, 0.01, 0.02, 0.05, 0.1, 0.2],
    # 0.002 used to appear twice in this list; the repeat only added
    # redundant fits to every grid search. The remaining values keep their
    # original order so tie-breaking between equal scores is unchanged.
    'tol': [0.005, 0.002, 0.001, 0.0001, 0.0002, 0.00005],
}
# Base estimator handed to the grid search: a linear-kernel support vector regressor.
model = SVR(
    kernel='linear',
    C=1.0,
    tol=0.001,
)

In [None]:
# ROC AUC scoring helper used by the grid search
def scoring_roc_auc(y, y_pred):
    """Return the ROC AUC of ``y_pred`` against ``y``, or 0.5 when it is undefined.

    Parameters
    ----------
    y : array-like
        True labels.
    y_pred : array-like
        Predicted scores.

    Returns
    -------
    float
        ``roc_auc_score(y, y_pred)``; 0.5 (the chance level) when the score
        cannot be computed, e.g. because ``y`` contains a single class.
    """
    try:
        score = roc_auc_score(y, y_pred)
    except ValueError:
        # Only the "score is undefined" failure is treated as chance level;
        # any other exception is a real problem and is allowed to propagate.
        return 0.5
    # NOTE(review): some scikit-learn releases appear to return NaN with a
    # warning instead of raising for single-class input - treat that the same way.
    if np.isnan(score):
        return 0.5
    return score


# Scorer object built from the helper above, for use as a `scoring=` argument.
robust_roc_auc = make_scorer(scoring_roc_auc)

In [None]:
# Build the ensemble: for every stratified shuffle split, select features with
# Boruta, tune the SVR by grid search, score it on the held-out part, and keep
# its test-set prediction only when the validation R2 exceeds r2_threshold.
predictions = pd.DataFrame()
# One single-column frame per accepted model; joined once after the loop
# instead of growing `predictions` with pd.concat on every iteration.
accepted_predictions = []
counter = 0

splitter = StratifiedShuffleSplit(n_splits=sss_split, test_size=test_split, random_state=random_seed)
for train_index, val_index in splitter.split(train_X, train_y):
    X, val_X = train_X[train_index], train_X[val_index]
    # The splitter yields positional indices, so index the target positionally.
    y, val_y = train_y.iloc[train_index], train_y.iloc[val_index]

    # Feature selection driven by a random-forest classifier
    # (author's note: Boruta cannot use an SVM as its base estimator).
    rf = RandomForestClassifier(n_jobs=-1, max_depth=5)
    feat_selector = BorutaPy(rf, n_estimators='auto', verbose=2, random_state=random_seed)
    feat_selector.fit(X, y)

    # Reduce every data set to the features selected on this split's training part.
    # NOTE(review): if Boruta confirms no feature at all these arrays would have
    # zero columns and the grid search below would likely fail - confirm whether
    # a guard is needed.
    X_feature = feat_selector.transform(X)
    val_X_feature = feat_selector.transform(val_X)
    test_feature = feat_selector.transform(test)

    # Grid search over param_grid, scored with the robust ROC AUC scorer.
    grid_search = GridSearchCV(model, param_grid=param_grid, verbose=0, n_jobs=-1, scoring=robust_roc_auc, cv=20)
    grid_search.fit(X_feature, y)

    # Predict the held-out rows with the best estimator found.
    val_y_pred = grid_search.best_estimator_.predict(val_X_feature)
    print(grid_search.best_params_)

    # Validation metrics: MSE, ROC AUC and R2.
    val_mse = mean_squared_error(val_y, val_y_pred)
    val_roc = roc_auc_score(val_y,val_y_pred)
    val_r2  = r2_score(val_y,val_y_pred)

    # Only models whose validation R2 beats the threshold join the ensemble.
    if val_r2 > r2_threshold:
        message = '<-- OK'
        prediction = grid_search.best_estimator_.predict(test_feature)
        accepted_predictions.append(pd.DataFrame(prediction))
    else:
        message = '<-- skipping'
    # `message` was previously computed but never shown; print it after the metrics.
    print("次数:{}   |mse系数:{}  |roc系数:{}  |r2系数:{}".format(counter,val_mse,val_roc,val_r2), message)
    counter+=1

# Column-wise join of all accepted predictions; stays an empty frame if none passed.
if accepted_predictions:
    predictions = pd.concat(accepted_predictions, axis=1)


In [None]:
# Report how many of the split models were accepted into the ensemble.
n_ensembled = len(predictions.columns)
print("{}/{} 个模型进行了模型集成".format(n_ensembled, sss_split))

# Average the accepted models' predictions row by row into a single 'target' column.
mean_pred = predictions.mean(axis=1).to_frame('target')
# Shift the row index by 250 before it is written out as the 'id' column
# (presumably the test ids start at 250 - verify against test.csv).
mean_pred.index = mean_pred.index + 250
mean_pred.to_csv('submission1.csv', index=True, index_label='id')