In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings('ignore')

%matplotlib inline

# Load Data

In [2]:
# Load the cars data set. The filter below implies the raw file uses '?' as a
# placeholder for a missing horsepower reading; those rows are dropped.
# Filtering alone leaves the column holding strings (object dtype) whenever the
# placeholder was present, so it is converted to a numeric dtype explicitly
# before any modelling code sees it.
df = pd.read_csv('cars_multivariate.txt')
df = df[df.horsepower != '?']
df = df.assign(horsepower=pd.to_numeric(df.horsepower))
df.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model,origin,car_name
0,18.0,8,307.0,130.0,3504.0,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350.0,165.0,3693.0,11.5,70,1,buick skylark 320
2,18.0,8,318.0,150.0,3436.0,11.0,70,1,plymouth satellite
3,16.0,8,304.0,150.0,3433.0,12.0,70,1,amc rebel sst
4,17.0,8,302.0,140.0,3449.0,10.5,70,1,ford torino


In [3]:
# Target: fuel economy.
y = df.mpg

# Features: everything except the target and the free-text car name.
# The displayed frame shows an 'Unnamed: 0' column, which is the name pandas
# gives to a CSV column with an empty header (typically a saved row index).
# A row id says nothing about the car, so any such column is excluded as well;
# if no such column exists the list is empty and this is a no-op.
row_id_cols = [col for col in df.columns if str(col).startswith('Unnamed:')]
X = df.drop(columns=['mpg', 'car_name'] + row_id_cols)

# Split and Scale

In [4]:
# Hold out 30% of the rows for the final evaluation. The split is seeded so
# that Restart & Run All reproduces the same partition, and therefore the same
# scores. NOTE: outputs stored from an earlier, unseeded run will differ.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.30, random_state=42
)

# Learn the scaling statistics from the training rows only, then apply the
# same transform to both partitions so the test set does not influence the fit.
scalar = StandardScaler().fit(X_train)
X_train = scalar.transform(X_train)
X_test = scalar.transform(X_test)

# Grid Search

In [5]:
# Candidate regularisation strengths: 50 log-spaced values from 1e-4 to 1e5.
params = dict(alpha=np.logspace(start=-4, stop=5, num=50))

In [6]:
# Search every alpha in `params` with 10-fold cross-validation on the training
# data, running up to 5 jobs in parallel.
# Values listed in the grid are applied to the estimator for each candidate;
# constructor arguments that do not appear in the grid keep the value given
# in the constructor for the whole search.
ridge_search = GridSearchCV(estimator=Ridge(), param_grid=params, cv=10, n_jobs=5)
gs = ridge_search.fit(X_train, y_train)
gs

GridSearchCV(cv=10, error_score='raise-deprecating',
       estimator=Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001),
       fit_params=None, iid='warn', n_jobs=5,
       param_grid={'alpha': array([1.00000e-04, 1.52642e-04, 2.32995e-04, 3.55648e-04, 5.42868e-04,
       8.28643e-04, 1.26486e-03, 1.93070e-03, 2.94705e-03, 4.49843e-03,
       6.86649e-03, 1.04811e-02, 1.59986e-02, 2.44205e-02, 3.72759e-02,
       5.68987e-02, 8.68511e-02, 1.32571e-01, 2.02359e-01, 3.08... 7.90604e+03, 1.20679e+04,
       1.84207e+04, 2.81177e+04, 4.29193e+04, 6.55129e+04, 1.00000e+05])},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

# Results

In [7]:
# The Ridge model carrying the winning alpha (available because the search was
# run with refit=True, as shown in the GridSearchCV repr).
gs.best_estimator_

Ridge(alpha=3.906939937054621, copy_X=True, fit_intercept=True, max_iter=None,
   normalize=False, random_state=None, solver='auto', tol=0.001)

In [8]:
# The grid entry that achieved the best cross-validated score.
gs.best_params_

{'alpha': 3.906939937054621}

In [9]:
# Cross-validated score of the best candidate. The search was built with
# scoring=None, so this uses the estimator's own score method (R^2 for Ridge).
gs.best_score_

0.7891446024206112

In [10]:
# Every setting of the search object, including the wrapped estimator's own
# parameters under the 'estimator__' prefix.
gs.get_params()

{'cv': 10,
 'error_score': 'raise-deprecating',
 'estimator__alpha': 1.0,
 'estimator__copy_X': True,
 'estimator__fit_intercept': True,
 'estimator__max_iter': None,
 'estimator__normalize': False,
 'estimator__random_state': None,
 'estimator__solver': 'auto',
 'estimator__tol': 0.001,
 'estimator': Ridge(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=None,
    normalize=False, random_state=None, solver='auto', tol=0.001),
 'fit_params': None,
 'iid': 'warn',
 'n_jobs': 5,
 'param_grid': {'alpha': array([1.00000000e-04, 1.52641797e-04, 2.32995181e-04, 3.55648031e-04,
         5.42867544e-04, 8.28642773e-04, 1.26485522e-03, 1.93069773e-03,
         2.94705170e-03, 4.49843267e-03, 6.86648845e-03, 1.04811313e-02,
         1.59985872e-02, 2.44205309e-02, 3.72759372e-02, 5.68986603e-02,
         8.68511374e-02, 1.32571137e-01, 2.02358965e-01, 3.08884360e-01,
         4.71486636e-01, 7.19685673e-01, 1.09854114e+00, 1.67683294e+00,
         2.55954792e+00, 3.90693994e+00, 5.96362332e+

# Compare to Baseline

In [11]:
# Held-out R^2 of the tuned ridge model.
ridge_test_pred = gs.predict(X_test)
r2_score(y_test, ridge_test_pred)

0.8398097025208954

In [12]:
# Baseline: ordinary least squares fitted on the same scaled training data,
# scored on the same held-out rows.
baseline_model = LinearRegression().fit(X_train, y_train)
r2_score(y_test, baseline_model.predict(X_test))

0.8380360648144675