In [11]:
import os
import numpy as np
import pandas as pd
from sklearn import linear_model
from sklearn.model_selection import GridSearchCV

In [12]:
# Load the training table; the 'y' column is the regression target.
data = pd.read_csv('./train.csv')
y = data['y']
# Build the feature matrix by dropping the target *by name* instead of slicing
# by position, so the target cannot end up in X if the column order changes.
X = data.drop(columns='y')

### Straightforward approach

In [13]:
# Candidate regularisation strengths to compare for the ridge model.
param = {'alpha': [0.1, 1, 10, 100, 200]}
ridge = linear_model.Ridge()

# Exhaustive search over `param` with 10-fold cross-validation; each fold is
# scored with the negated RMSE ('neg_root_mean_squared_error').
reg = GridSearchCV(
    ridge,
    param,
    scoring='neg_root_mean_squared_error',
    cv=10,
)
reg.fit(X, y)

GridSearchCV(cv=10, estimator=Ridge(),
             param_grid={'alpha': [0.1, 1, 10, 100, 200]},
             scoring='neg_root_mean_squared_error')

In [14]:
# List, in alphabetical order, the result fields recorded by the grid search.
sorted(reg.cv_results_)

['mean_fit_time',
 'mean_score_time',
 'mean_test_score',
 'param_alpha',
 'params',
 'rank_test_score',
 'split0_test_score',
 'split1_test_score',
 'split2_test_score',
 'split3_test_score',
 'split4_test_score',
 'split5_test_score',
 'split6_test_score',
 'split7_test_score',
 'split8_test_score',
 'split9_test_score',
 'std_fit_time',
 'std_score_time',
 'std_test_score']

In [15]:
# The search was scored with 'neg_root_mean_squared_error', so the stored mean
# test scores are negated RMSEs; flip the sign to report plain RMSE values.
scores = -reg.cv_results_['mean_test_score']
stds = reg.cv_results_['std_test_score']

line_fmt = '%0.6f (+/-%0.04f)'
# One line per parameter setting: mean RMSE and twice the std across the folds.
for idx in range(len(scores)):
    print(line_fmt % (scores[idx], 2 * stds[idx]))
print('')
# Summary line: mean and (un-doubled) std of the per-setting mean RMSEs.
print(line_fmt % (np.mean(scores), np.std(scores)))

5.501809 (+/-2.5425)
5.499839 (+/-2.6589)
5.483631 (+/-2.8876)
5.636642 (+/-3.3998)
5.721234 (+/-3.5577)

5.568631 (+/-0.0942)


In [16]:
# Write the RMSE values as a single-column CSV without header or index,
# one row per entry of `scores`.
filename = 'submission01.csv'
out_path = os.path.join('.', filename)
result = pd.DataFrame(scores, columns=['rmse'])
result.to_csv(out_path, header=False, index=False)

### Stratified k-fold cross-validation

In [17]:
from sklearn.model_selection import StratifiedKFold

In [18]:
# Bin the continuous target into 10 quantile-based classes (integer codes) so
# that StratifiedKFold has class labels to stratify on.
y_class = pd.qcut(y, q=10, labels=False)
skf = StratifiedKFold(n_splits=10)
# skf.split() returns a one-shot generator. Materialise it into a list of
# (train_indices, test_indices) pairs so the splits can be reused: a consumed
# generator would leave nothing to iterate if the cell that uses `kf` is
# re-run without re-running this one.
kf = list(skf.split(X, y_class))

In [19]:
# Repeat the grid search with the same estimator and parameter grid, but take
# the folds from the precomputed (train, test) splits in `kf` instead of an
# integer fold count. The model is still fitted on the continuous target `y`.
reg_strat = GridSearchCV(
    ridge,
    param,
    scoring='neg_root_mean_squared_error',
    cv=kf,
)
reg_strat.fit(X, y)

GridSearchCV(cv=<generator object _BaseKFold.split at 0x7f9085875970>,
             estimator=Ridge(), param_grid={'alpha': [0.1, 1, 10, 100, 200]},
             scoring='neg_root_mean_squared_error')

In [20]:
# Stored mean test scores are negated RMSEs (scoring was
# 'neg_root_mean_squared_error'); negate them to get plain RMSE values.
scores_strat = -reg_strat.cv_results_['mean_test_score']
stds_strat = reg_strat.cv_results_['std_test_score']

row_fmt = '%0.6f (+/-%0.04f)'
# Per parameter setting: mean RMSE and twice the std across the folds.
for pos, rmse in enumerate(scores_strat):
    print(row_fmt % (rmse, 2 * stds_strat[pos]))
print('')
# Summary line: mean and (un-doubled) std of the per-setting mean RMSEs.
print(row_fmt % (np.mean(scores_strat), np.std(scores_strat)))

5.395367 (+/-2.8389)
5.391044 (+/-2.9443)
5.388171 (+/-3.0129)
5.573697 (+/-3.1907)
5.658800 (+/-3.3142)

5.481416 (+/-0.1134)


In [21]:
# Write the stratified-CV RMSE values as a single-column CSV without header or
# index, one row per entry of `scores_strat`.
filename_strat = 'submission_strat_01.csv'
out_path_strat = os.path.join('.', filename_strat)
result_strat = pd.DataFrame(scores_strat, columns=['rmse'])
result_strat.to_csv(out_path_strat, header=False, index=False)