In [50]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
# Print the entries found in ../input so the available data files are visible in the cell output.
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

['test.csv', 'train.csv', 'sample_submission.csv']


### 1.第一步，导入python包

In [51]:
import sys
import pandas as pd
import numpy as np

# 导入sklearn的模型
from sklearn.linear_model import Lasso
# 导入sklearn的特征选择方法(RFE+CV:RFE是循环移除不重要特征,CV是交叉验证)
from sklearn.feature_selection import RFECV
# 使用RobustScaler的预处理方式
from sklearn.preprocessing import RobustScaler
# 分割数据集的方式
from sklearn.model_selection import StratifiedShuffleSplit,GridSearchCV
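# Evaluation metrics: mean squared error (MSE), R2 coefficient of determination, ROC AUC; make_scorer wraps a metric function as a scorer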
from sklearn.metrics import mean_squared_error,r2_score,roc_auc_score,make_scorer

### 2.设置参数

In [52]:
# --- Validation splits ---
# Fraction of the training data held out for validation in every shuffle split
# (i.e. 65% is used for fitting, 35% for validation).
sss_test_size = 0.35
# Number of reshuffled stratified splits to run.
sss_n_splits = 12

# --- RFECV (recursive feature elimination with cross-validation) ---
# Number of cross-validation folds used by RFECV.
rfe_cv = 20
# Passed as RFECV `step`: how many features are dropped per elimination round.
rfe_step = 15
# Lower bound on the number of features RFECV may keep.
rfe_min_features = 12

# --- Grid search ---
# Number of cross-validation folds for the hyper-parameter grid search.
grid_search_cv = 20

# --- Noise injection ---
# Standard deviation of the Gaussian noise added to the training features.
noise_std = 0.01

# --- Ensemble acceptance ---
# Minimum validation R2 a split's model must exceed to be included in the ensemble.
r2_thresh_hold = 0.185

# --- Reproducibility ---
# Seed shared by NumPy's global RNG and the scikit-learn objects below.
random_seed = 213
np.random.seed(random_seed)

### 3.数据初始化和预处理

In [53]:
# Load the labelled training set, then separate the feature matrix from the label column.
train = pd.read_csv('../input/train.csv')
# Features: every column except the row id and the label, as a NumPy array.
train_X = train.drop(columns=['id', 'target']).values
# Labels are kept as a pandas Series.
train_y = train['target']

In [54]:
# Read the unlabelled test set and keep only its feature columns (row id removed) as a NumPy array.
test = pd.read_csv('../input/test.csv').drop(columns=['id']).values

In [55]:
# Scale train and test features together with RobustScaler (by default it centres each
# feature on its median and divides by its interquartile range), so both sets share one scaling.
# NOTE: this cell overwrites `train_X` and `test`; re-running it without reloading the data
# would scale already-scaled values and add noise a second time.
n_train = train_X.shape[0]  # split point taken from the data instead of a hard-coded 250
data = RobustScaler().fit_transform(np.concatenate((train_X, test), axis=0))
train_X = data[:n_train]
test = data[n_train:]
# Perturb the training features only with zero-mean Gaussian noise of std `noise_std`.
train_X += np.random.normal(0, noise_std, train_X.shape)

### 4.ROC_AUC函数

In [56]:
def scoring_roc_auc(y, y_pred):
    """Return the ROC AUC of ``y_pred`` against ``y``, falling back to 0.5.

    Args:
        y: True binary labels.
        y_pred: Predicted scores for the positive class.

    Returns:
        The value of ``roc_auc_score(y, y_pred)``, or 0.5 (chance level) when
        the metric cannot be computed and ``roc_auc_score`` raises
        ``ValueError`` -- e.g. a small cross-validation fold that contains a
        single class.
    """
    try:
        return roc_auc_score(y, y_pred)
    except ValueError:
        # Only the "metric undefined" case is absorbed; anything else
        # (KeyboardInterrupt, programming errors, ...) must propagate.
        return 0.5


# Scorer object usable as the `scoring` argument of scikit-learn model-selection tools.
robust_roc_auc = make_scorer(scoring_roc_auc)

### 5.定义模型

In [57]:
# Base estimator: Lasso regression whose coordinate updates are picked at random,
# seeded with `random_seed` for reproducibility.
model = Lasso(
    selection='random',
    random_state=random_seed,
    tol=0.01,
    alpha=0.031,
)


### 6.网格搜索

In [58]:
# Hyper-parameter grid for the Lasso model:
#   alpha -- strength of the L1 penalty
#   tol   -- optimisation tolerance; updates stop once the improvement falls below it
param_grid = {
    'alpha': [0.022, 0.021, 0.02, 0.019, 0.023, 0.024, 0.025, 0.026, 0.027, 0.029, 0.031],
    'tol': [0.0013, 0.0014, 0.001, 0.0015, 0.0011, 0.0012, 0.0016, 0.0017],
}

# Recursive feature elimination with cross-validation around the base model,
# scored with the fallback-protected ROC AUC scorer.
feature_selector = RFECV(
    model,
    step=rfe_step,
    min_features_to_select=rfe_min_features,
    cv=rfe_cv,
    scoring=robust_roc_auc,
    verbose=0,
    n_jobs=-1,
)

### 7.开始循环寻找最优的参数

In [59]:
# Repeated stratified shuffle splits. For every split:
#   1. select features with RFECV on the training part,
#   2. tune the Lasso hyper-parameters on the selected features with a grid search,
#   3. score the tuned model on the held-out validation part,
#   4. keep its test-set predictions for the ensemble only if validation R2 beats the threshold.
kept_predictions = []  # one 1-D array of test predictions per accepted split

# split() yields positional indices, so index the labels positionally through a plain array.
labels = np.asarray(train_y)

splitter = StratifiedShuffleSplit(n_splits=sss_n_splits, test_size=sss_test_size, random_state=random_seed)
for counter, (train_index, val_index) in enumerate(splitter.split(train_X, labels)):
    X, val_X = train_X[train_index], train_X[val_index]
    y, val_y = labels[train_index], labels[val_index]

    # Fit the feature selector on this split's training part.
    feature_selector.fit(X, y)

    # Restrict every data set to the features selected for this split.
    X_important_features        = feature_selector.transform(X)
    val_X_important_features    = feature_selector.transform(val_X)
    test_important_features     = feature_selector.transform(test)

    # Grid search over `param_grid`; the fold count comes from the config value
    # `grid_search_cv` rather than a duplicated literal.
    grid_search = GridSearchCV(feature_selector.estimator_, param_grid=param_grid, verbose=0, n_jobs=-1, scoring=robust_roc_auc, cv=grid_search_cv)
    grid_search.fit(X_important_features, y)

    # Evaluate the best model found on the validation part.
    best_model = grid_search.best_estimator_
    val_y_pred = best_model.predict(val_X_important_features)

    # Validation metrics: MSE, ROC AUC and R2.
    val_mse = mean_squared_error(val_y, val_y_pred)
    val_roc = roc_auc_score(val_y, val_y_pred)
    val_r2  = r2_score(val_y, val_y_pred)

    # Only models whose validation R2 exceeds the threshold contribute to the ensemble.
    if val_r2 > r2_thresh_hold:
        message = '<-- OK'
        kept_predictions.append(best_model.predict(test_important_features))
    else:
        message = '<-- skipping'
    # The accept/skip marker is now part of the log line (it was computed but never shown).
    print("次数:{}   |mse系数:{}  |roc系数:{}  |r2系数:{}  {}".format(counter, val_mse, val_roc, val_r2, message))

# Build the prediction table once: one column per accepted split, one row per test sample.
# Stays an empty DataFrame when no split passed the threshold.
predictions = pd.DataFrame(np.column_stack(kept_predictions)) if kept_predictions else pd.DataFrame()


次数:0   |mse系数:0.19832549801725438  |roc系数:0.7321428571428572  |r2系数:0.14295052642543637
次数:1   |mse系数:0.1879359560119525  |roc系数:0.7494419642857143  |r2系数:0.18784819009120535
次数:2   |mse系数:0.19144470575029773  |roc系数:0.7650669642857143  |r2系数:0.17268537872192768
次数:3   |mse系数:0.19167289833468554  |roc系数:0.7706473214285715  |r2系数:0.17169926076796627
次数:4   |mse系数:0.1965605739428999  |roc系数:0.7561383928571428  |r2系数:0.15057751974675393
次数:5   |mse系数:0.18119591095452822  |roc系数:0.7845982142857144  |r2系数:0.2169748133750744
次数:6   |mse系数:0.21292272714305074  |roc系数:0.67578125  |r2系数:0.07986964341753078
次数:7   |mse系数:0.16027682581279468  |roc系数:0.8381696428571429  |r2系数:0.3073751455947086
次数:8   |mse系数:0.19859897971414622  |roc系数:0.7416294642857143  |r2系数:0.14176869480672505
次数:9   |mse系数:0.19183656529452248  |roc系数:0.7650669642857142  |r2系数:0.17099198569152785
次数:10   |mse系数:0.21996479065017233  |roc系数:0.7020089285714285  |r2系数:0.04943786897604119
次数:11   |mse系数:0.17932523052560728  |roc系数:

### 8.最终结果

In [60]:
# Report how many of the splits produced a model that was accepted into the ensemble.
n_models_used = predictions.shape[1]
print("{}/{} 个模型进行了模型集成".format(n_models_used, sss_n_splits))

# Ensemble by averaging the accepted models' predictions row-wise (one row per test sample).
mean_pred = predictions.mean(axis=1).to_frame()
mean_pred.columns = ['target']
# Shift the row index by 250 so it is written out as the `id` column.
# NOTE(review): presumably 250 is the first test id -- confirm against sample_submission.csv.
mean_pred.index = mean_pred.index + 250
mean_pred.to_csv('submission.csv', index=True, index_label='id')

4/12 个模型进行了模型集成
