In [1]:
import numpy as np
import pandas as pd
from sklearn import linear_model
from sklearn.metrics import mean_squared_error

from sklearn.model_selection import cross_validate, train_test_split

### Create feature vector

In [2]:
df_train = pd.read_csv('train.csv')
                       
df_y = df_train.loc[:,'y']

df_xi = df_train.iloc[:,2:]
df_quad = df_xi**2
df_exp = np.exp(df_xi)
df_cos = np.cos(df_xi)
df_const = df_xi.iloc[:,0]*0 +1
df_features = pd.concat([df_xi,df_quad,df_exp,df_cos,df_const],axis=1)
df_features

Unnamed: 0,x1,x2,x3,x4,x5,x1.1,x2.1,x3.1,x4.1,x5.1,...,x2.2,x3.2,x4.2,x5.2,x1.2,x2.3,x3.3,x4.3,x5.3,x1.3
0,0.02,0.05,-0.09,-0.43,-0.08,0.0004,0.0025,0.0081,0.1849,0.0064,...,1.051271,0.913931,0.650509,0.923116,0.999800,0.998750,0.995953,0.908966,0.996802,1.0
1,-0.13,0.11,-0.08,-0.29,-0.03,0.0169,0.0121,0.0064,0.0841,0.0009,...,1.116278,0.923116,0.748264,0.970446,0.991562,0.993956,0.996802,0.958244,0.999550,1.0
2,0.08,0.06,-0.07,-0.41,-0.03,0.0064,0.0036,0.0049,0.1681,0.0009,...,1.061837,0.932394,0.663650,0.970446,0.996802,0.998201,0.997551,0.917121,0.999550,1.0
3,0.02,-0.12,0.01,-0.43,-0.02,0.0004,0.0144,0.0001,0.1849,0.0004,...,0.886920,1.010050,0.650509,0.980199,0.999800,0.992809,0.999950,0.908966,0.999800,1.0
4,-0.14,-0.12,-0.08,-0.02,-0.08,0.0196,0.0144,0.0064,0.0004,0.0064,...,0.886920,0.923116,0.980199,0.923116,0.990216,0.992809,0.996802,0.999800,0.996802,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
695,0.31,-0.09,0.04,-0.09,0.03,0.0961,0.0081,0.0016,0.0081,0.0009,...,0.913931,1.040811,0.913931,1.030455,0.952334,0.995953,0.999200,0.995953,0.999550,1.0
696,-0.26,-0.01,0.02,-0.40,0.05,0.0676,0.0001,0.0004,0.1600,0.0025,...,0.990050,1.020201,0.670320,1.051271,0.966390,0.999950,0.999800,0.921061,0.998750,1.0
697,-0.27,-0.22,-0.01,-0.32,-0.05,0.0729,0.0484,0.0001,0.1024,0.0025,...,0.802519,0.990050,0.726149,0.951229,0.963771,0.975897,0.999950,0.949235,0.998750,1.0
698,0.19,0.11,-0.05,-0.27,-0.04,0.0361,0.0121,0.0025,0.0729,0.0016,...,1.116278,0.951229,0.763379,0.960789,0.982004,0.993956,0.998750,0.963771,0.999200,1.0


In [3]:
y_train,y_test = train_test_split(df_y, shuffle=False)
x_train,x_test = train_test_split(df_features, shuffle=False)

### Do cross validation

In [8]:
K = 5
# lasso needs smaller value of lambda
lambdas_rid = np.logspace(-1,2)
lambdas_las = np.logspace(-2,-1)
ridge = [linear_model.Ridge(alpha=lam,fit_intercept=False) for lam in lambdas_rid]
lasso = [linear_model.Lasso(alpha=lam,fit_intercept=False) for lam in lambdas_las]
models = [*ridge,*lasso]

validation_data = [cross_validate(model,x_train,y_train, cv=K, scoring='neg_root_mean_squared_error')
                   for model in models]
#convert score to error
RME = [validation['test_score']*(-1) for validation in validation_data]
average_error = np.mean(RME,axis=1)


### Choose best model and try on test set

In [5]:
lambdas = [*lambdas_rid , *lambdas_las]
min_index = np.argmin(average_error)
print('lambda = ', lambdas[min_index])
models[min_index].fit(x_train,y_train)
prediction = models[min_index].predict(x_test)
mean_squared_error(prediction, y_test)**0.5

lambda =  0.0655128556859551


2.009505366080599

In [6]:
model = models[min_index]
model.fit(df_features, df_y)
model.coef_

array([ 0.        , -0.        , -0.        ,  0.        ,  0.        ,
       -0.        , -0.        ,  0.        , -0.        ,  0.        ,
       -0.0507238 , -1.58443402, -2.02918799, -0.        , -0.        ,
       -0.80655235, -1.76935614, -0.04776205, -0.        , -0.16703664,
       -0.        ])

In [7]:
pd.DataFrame(model.coef_).to_csv('sample.csv', index=False, header=False)