In [22]:
import os
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn import linear_model
from sklearn import model_selection

In [23]:
# load data
data = pd.read_csv('./train.csv', float_precision='high')
y = data['y']
raw_X = data.iloc[:, 2:]

# create the features that are required for the task:
quadratic = raw_X**2
exponential = np.exp(raw_X)
cosine = np.cos(raw_X)
constant = pd.Series(np.ones(raw_X.shape[0]))

# merge features to dataframe
X = pd.concat([raw_X, quadratic, exponential, cosine, constant], axis=1)

# rename columns
column_names = list()
for i in range(1, 22):
    column_names.append('x{}'.format(i))
X.columns = column_names

0     -6.822679
1     -6.326290
2     -9.302728
3     -7.371893
4     -6.027647
         ...   
695   -6.147394
696   -6.214899
697   -6.841341
698   -6.523371
699   -5.676683
Name: y, Length: 700, dtype: float64

In [9]:
# search for best alpha with 10-fold cross validation
regr = linear_model.Ridge()
rmse = metrics.make_scorer(metrics.mean_squared_error, squared=False,
                          greater_is_better=False, fit_intercept=False)
param = {'alpha': np.linspace(5.34, 5.342, 10)}
gs = model_selection.GridSearchCV(estimator=regr, cv=20, scoring=rmse,
                                  param_grid=param)
gs.fit(X, y)

GridSearchCV(cv=20, estimator=Ridge(),
             param_grid={'alpha': array([5.34      , 5.34022222, 5.34044444, 5.34066667, 5.34088889,
       5.34111111, 5.34133333, 5.34155556, 5.34177778, 5.342     ])},
             scoring=make_scorer(mean_squared_error, greater_is_better=False, squared=False))

In [24]:
# evaluate gridsearch
scores = gs.cv_results_['mean_test_score']
stds = gs.cv_results_['std_test_score']
parameters = gs.cv_results_['params']
for mean, std, par in zip(scores, stds, parameters):
    print('%0.6f (+/-%0.04f): %s)' % (mean, std * 2, par))
print('')
print('%0.6f: %s)' % (gs.best_score_, gs.best_params_))

-1.949639 (+/-0.4556): {'alpha': 5.34})
-1.949639 (+/-0.4556): {'alpha': 5.340222222222222})
-1.949639 (+/-0.4556): {'alpha': 5.3404444444444445})
-1.949639 (+/-0.4556): {'alpha': 5.3406666666666665})
-1.949639 (+/-0.4556): {'alpha': 5.340888888888888})
-1.949639 (+/-0.4556): {'alpha': 5.341111111111111})
-1.949639 (+/-0.4556): {'alpha': 5.341333333333333})
-1.949639 (+/-0.4556): {'alpha': 5.341555555555555})
-1.949639 (+/-0.4556): {'alpha': 5.341777777777778})
-1.949639 (+/-0.4556): {'alpha': 5.342})

-1.949639: {'alpha': 5.34})


In [25]:
# fit with best found parameter
final_regr = linear_model.Ridge(alpha=gs.best_params_['alpha'],
                               fit_intercept=False)
final_regr.fit(X, y)

# get weights
weights = final_regr.coef_

array([ 0.18774177, -0.44411429, -0.64918903,  0.22956104,  0.09340925,
       -0.27724755,  0.15670077,  0.13247267, -0.14640312,  0.07723636,
       -0.49686822, -0.90797117, -1.12983997, -0.38501133, -0.41647907,
       -0.40238602, -0.6180734 , -0.60591403, -0.46864331, -0.57730255,
       -0.54014733])

In [24]:
# create submission file
result = pd.DataFrame({'weights': weights})
filename = 'submission_ridge_tuned.csv'
result.to_csv(os.path.join('.', filename), index=False, header=False)

In [19]:
# cross validation with best found parameter and select best split
model = linear_model.Ridge(alpha=gs.best_params_['alpha'])
kf = model_selection.KFold(n_splits=20, shuffle=True, random_state=False)
low_loss = 10
for train_index, test_index in kf.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    loss = metrics.mean_squared_error(y_test, y_pred, squared=False)
    if loss < low_loss:
        weights = model.coef_
        low_loss = loss
        print(loss, '<--')
        continue
    print(loss)
print(weights)

1.6375180184495604 <--
1.8517809733189285
2.1663743995094067
1.8299631029490628
1.6625165615480793
2.142432424910959
2.758428367273095
2.101877806707221
1.798777469560628
1.808100255600022
2.230172021340251
1.648423154629421
1.918671892757678
2.2303658283943046
2.0229646894587154
1.6024325126326844 <--
2.0380985839759407
1.9545090445779354
1.9311432927260963
1.5885387944901856 <--
[-0.10529522 -0.70196259 -0.98611871 -0.08744262 -0.0110359  -0.25611092
  0.24979152  0.21424441  0.01995888  0.09584798 -0.24056155 -0.58413623
 -0.88898609 -0.08910956  0.02725983  0.12708403 -0.12406523 -0.10644776
 -0.0104891  -0.04627945  0.        ]
