Tuning hyperparameters with grid search
---

In [None]:
import pandas as pd

# Load the house prices dataset from a CSV file located in the working directory
data_df = pd.read_csv('house-prices.csv')
# Preview the first rows of the loaded frame
data_df.head()

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt

# Histogram of the raw SalePrice column: 20 bins, no grid, x tick labels rotated by 45 degrees
data_df.hist(column='SalePrice', bins=20, grid=False, xrot=45)
plt.show()

In [None]:
import numpy as np

# Histogram of the same column after a log10 transform (the models below are fit on log10(SalePrice))
plt.hist(np.log10(data_df.SalePrice), bins=20)
plt.xlabel('log10(SalePrice)')
plt.show()

In [None]:
def preprocess(df):
    """One-hot encode a DataFrame and impute missing values with column medians.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data. It is copied first, so the caller's frame is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame in which the columns that ``pd.get_dummies`` treats as
        categorical are replaced by indicator columns (with an extra NaN
        indicator per encoded column, because ``dummy_na=True``), and every
        remaining NaN is replaced by the median of its column. Medians are
        computed over all rows of the frame passed in.
    """
    # Encode on a copy so the input frame stays untouched
    encoded = pd.get_dummies(df.copy(), dummy_na=True)

    # Impute column by column, each with its own median
    for name in encoded.columns:
        column = encoded[name]
        encoded[name] = column.fillna(column.median())

    return encoded

# Encode and impute the raw frame; data_df itself is left unchanged because preprocess works on a copy
preprocessed_df = preprocess(data_df)
# Preview the first rows of the encoded frame
preprocessed_df.head()

In [None]:
from sklearn.model_selection import train_test_split

# Create X, y
# Cast the feature matrix to float explicitly: when the frame mixes boolean
# indicator columns with numeric columns, `.values` can fall back to an
# object-dtype array. The cast guarantees a plain numeric matrix and is a
# no-op (apart from a copy) when the values are already float.
X = preprocessed_df.drop('SalePrice', axis=1).values.astype(float)
# The target is modeled on a log10 scale
y = np.log10(preprocessed_df.SalePrice).values

# Split into train/test sets: half of the rows each, fixed seed so the split is reproducible
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.5, random_state=0)

print('Train:', X_tr.shape, y_tr.shape)
print('Test:', X_te.shape, y_te.shape)

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardize features
# The scaler is fit on the training set only; the test set is transformed
# with the statistics learned from the training set.
scaler = StandardScaler()
X_tr_rescaled = scaler.fit_transform(X_tr)
X_te_rescaled = scaler.transform(X_te)

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error as MSE

# Baseline: plain linear regression on the standardized features
lr = LinearRegression().fit(X_tr_rescaled, y_tr)

# Score both splits (MSE is measured on the log10 target)
lr_train_mse = MSE(y_tr, lr.predict(X_tr_rescaled))
lr_test_mse = MSE(y_te, lr.predict(X_te_rescaled))

print('Train MSE: {:.4f}'.format(lr_train_mse))
print('Test MSE: {:.4f}'.format(lr_test_mse))

In [None]:
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_absolute_error as MAE

# One record (dict) per alpha value tried
gs_records = []

# Grid search over 100 alpha values, log-spaced from 1e-10 to 1e10
for alpha in np.logspace(-10, 10, num=100):
    # Create and fit ridge regression
    ridge = Ridge(alpha=alpha)
    ridge.fit(X_tr_rescaled, y_tr)

    # Predict each split once and reuse the predictions for both metrics
    y_tr_pred = ridge.predict(X_tr_rescaled)
    y_te_pred = ridge.predict(X_te_rescaled)

    # Save the performance on train/test sets:
    # MSE on the log10 target, MAE after undoing the log10 with 10**
    gs_records.append({
        'alpha': alpha,
        'train_mse': MSE(y_tr, y_tr_pred),
        'train_mae': MAE(10**y_tr, 10**y_tr_pred),
        'test_mse': MSE(y_te, y_te_pred),
        'test_mae': MAE(10**y_te, 10**y_te_pred),
    })

# Convert results to DataFrame
gs_results = pd.DataFrame(gs_records)
gs_results.head()

In [None]:
# Plot the validation curves
# semilogx draws alpha itself on a log-scaled x axis, so the axis is
# labelled with alpha rather than log10(alpha).
plt.semilogx(gs_results['alpha'], gs_results['train_mse'], label='train curve')
plt.semilogx(gs_results['alpha'], gs_results['test_mse'], label='test curve')
plt.xlabel('alpha')
plt.ylabel('MSE')
plt.legend()
plt.show()

In [None]:
# Fit a model with a very strong regularization
# (alpha = 1e10, the upper end of the grid searched above)
strong_ridge = Ridge(alpha=10**10)
strong_ridge.fit(X_tr_rescaled, y_tr)

# Show the largest coefficient magnitude and the fitted intercept
print('Largest coefficient: {:.1e}'.format(
    np.max(np.abs(strong_ridge.coef_))))
print('Intercept: {:.2f}'.format(
    strong_ridge.intercept_))

In [None]:
# Mean of the training target, for comparison with the intercept printed for the strongly regularized model
print('Mean target value: {:.2f}'.format(np.mean(y_tr)))

In [None]:
# Locate the grid-search row with the lowest MSE on the test set
best_idx = gs_results['test_mse'].idxmin()
best_result = gs_results.loc[best_idx]

# Report the selected alpha together with its test scores
print('Best alpha: {:.1e}'.format(best_result['alpha']))
print('Test MSE: {:.4f}'.format(best_result['test_mse']))
print('Test MAE: {:,.0f}$'.format(best_result['test_mae']))

In [None]:
# Fit/test N models: repeat the grid search on 10 different train/test splits
# NOTE: this cell rebinds X_tr, X_te, y_tr, y_te, X_tr_rescaled and
# X_te_rescaled and refits `scaler`, replacing the values produced by the
# single-split cells above.
gs_records = []
for run_idx in range(10):
    # Split into train/test sets, using the run index as the seed
    X_tr, X_te, y_tr, y_te = train_test_split(
        X, y, test_size=0.5, random_state=run_idx)

    # Standardize features (scaler refit on this run's training set)
    X_tr_rescaled = scaler.fit_transform(X_tr)
    X_te_rescaled = scaler.transform(X_te)

    # Grid search over 20 alpha values, log-spaced from 1e1 to 1e4
    for alpha in np.logspace(1, 4, num=20):
        # Create and fit ridge regression
        ridge = Ridge(alpha=alpha)
        ridge.fit(X_tr_rescaled, y_tr)

        # Predict each split once and reuse the predictions for both metrics
        y_tr_pred = ridge.predict(X_tr_rescaled)
        y_te_pred = ridge.predict(X_te_rescaled)

        # Save the performance on train/test sets:
        # MSE on the log10 target, MAE after undoing the log10 with 10**
        gs_records.append({
            'alpha': alpha,
            'run_idx': run_idx,
            'train_mse': MSE(y_tr, y_tr_pred),
            'train_mae': MAE(10**y_tr, 10**y_tr_pred),
            'test_mse': MSE(y_te, y_te_pred),
            'test_mae': MAE(10**y_te, 10**y_te_pred),
        })

# Convert results to DataFrame
gs_results = pd.DataFrame(gs_records)
gs_results.head()

In [None]:
# Collect the rows of all runs that share the same alpha
gb_alpha = gs_results.groupby('alpha')

# Per-alpha mean and standard deviation of the MSE across runs
mean_tr = gb_alpha['train_mse'].mean()
std_tr = gb_alpha['train_mse'].std()
mean_te = gb_alpha['test_mse'].mean()
std_te = gb_alpha['test_mse'].std()
alphas = mean_tr.index.values

# Alpha with the lowest mean test MSE, and the rows of every run for it
best_alpha = mean_te.idxmin()
best_result = gb_alpha.get_group(best_alpha)

# Report the selected alpha with its scores averaged over the runs
print('Best alpha: {:.1e}'.format(best_alpha))
print('Test MSE: {:.4f}'.format(best_result['test_mse'].mean()))
print('Test MAE: {:,.0f}$'.format(best_result['test_mae'].mean()))

In [None]:
# x coordinates shared by all curves: alpha on a log10 scale
log_alphas = np.log10(alphas)
# Number of distinct train/test splits behind the averages
n_runs = gs_results['run_idx'].nunique()

# Mean train/test curves
plt.plot(log_alphas, mean_tr, label='train')
plt.plot(log_alphas, mean_te, label='test')

# Shaded bands one standard deviation either side of each mean
plt.fill_between(log_alphas, mean_tr - std_tr, mean_tr + std_tr, alpha=0.2)
plt.fill_between(log_alphas, mean_te - std_te, mean_te + std_te, alpha=0.2)

# Red cross on the lowest mean test score, drawn above the curves
plt.scatter(np.log10(best_alpha), mean_te.min(), marker='x', c='red', zorder=10)

plt.title('Validation curves with {} runs'.format(n_runs))
plt.xlabel('$log_{10}(alpha)$')
plt.ylabel('MSE')
plt.legend()
plt.show()