Pipelines
---

In [None]:
import pandas as pd

# Read the bike-sharing CSV (relative path, resolved against the current working directory)
data_df = pd.read_csv('bike-sharing.csv')
# Preview the first rows as the cell output
data_df.head()

In [None]:
from sklearn.model_selection import train_test_split

# One-hot encode the categorical columns. dtype=float is requested explicitly:
# pandas >= 2.0 produces boolean dummy columns by default, and mixing those with
# numeric columns makes `.values` below fall back to an object-dtype array.
encoded_df = pd.get_dummies(data_df, dtype=float)

# Features: every encoded column except the target; target: the 'casual' column
X = encoded_df.drop('casual', axis=1).values
y = data_df.casual.values

# Hold out 30% of the rows as the test set (fixed seed so the split is reproducible)
X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.3, random_state=0)

In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the train features only
scaler = StandardScaler().fit(X_tr)

# Apply that same scaling to both splits
X_tr_rescaled = scaler.transform(X_tr)
X_te_rescaled = scaler.transform(X_te)

In [None]:
from sklearn.metrics import mean_absolute_error as MAE
from sklearn.linear_model import Ridge

# Train a ridge regression with default settings on the standardized train features
ridge = Ridge().fit(X_tr_rescaled, y_tr)

# NOTE(review): the predictions below use the raw X_te rather than X_te_rescaled,
# so the model is scored on features that were not scaled like its training data.
# A later cell scores ridge.predict(X_te_rescaled); this looks like a deliberate
# "what goes wrong without scaling the test set" step - confirm.
print('MAE: {:.0f}'.format(MAE(y_true=y_te, y_pred=ridge.predict(X_te))))

In [None]:
import numpy as np

# Baseline: predict the median of the train targets for every test row.
# np.full is used instead of np.full_like(y_te, ...): full_like casts the fill
# value to y_te's dtype, so with integer targets a fractional median (e.g. 16.5)
# would be silently truncated before the error is computed.
median_predictions = np.full(y_te.shape, np.median(y_tr))
print('Median baseline: {:.0f}'.format(MAE(y_te, median_predictions)))

In [None]:
# Score the ridge model on the standardized test features
print('MAE: {:.0f}'.format(MAE(y_te, ridge.predict(X_te_rescaled))))

In [None]:
from sklearn.pipeline import Pipeline

# Chain standardization and ridge regression into a single estimator
pipe = Pipeline(steps=[('scaler', StandardScaler()), ('ridge', Ridge())])

In [None]:
# Inspect the pipeline steps, keyed by the names given when the pipeline was built
pipe.named_steps

In [None]:
# Train every step of the pipeline on the raw (unscaled) train split
pipe.fit(X_tr, y=y_tr)

# Score the pipeline's predictions for the raw test split
print('MAE: {:.0f}'.format(MAE(y_true=y_te, y_pred=pipe.predict(X_te))))

In [None]:
def pipe_fit(X, y):
    """Fit the notebook-level ``scaler`` and ``ridge`` objects in sequence.

    The scaler is fitted on ``X`` and used to transform it; ridge is then fitted
    on the transformed features and ``y``. Both objects are refitted in place
    and nothing is returned.
    """
    # Step 1: fit the scaler and get the scaled features in one call
    X_scaled = scaler.fit_transform(X)

    # Step 2: train the estimator on the scaled features
    ridge.fit(X_scaled, y)

# Refit the scaler and the ridge model on the raw train split via the helper
pipe_fit(X_tr, y_tr)

In [None]:
def pipe_predict(X):
    """Return ``ridge`` predictions for ``X`` after passing it through ``scaler.transform``.

    Uses the notebook-level ``scaler`` and ``ridge`` objects; neither is refitted here.
    """
    # Scale the incoming features with the existing scaler
    X_scaled = scaler.transform(X)

    # Predict from the scaled features
    return ridge.predict(X_scaled)

# Score the helper's predictions for the raw test split (the helper scales it internally)
print('MAE: {:.0f}'.format(MAE(y_te, pipe_predict(X_te))))

In [None]:
# Pipeline with three transformations
# pipe = Pipeline([
#     ('transform1', ...),
#     ('transform2', ...),
#     ('transform3', ...),
#     ('estimator', ...)
# ])

In [None]:
# Example with three transformations
def pipe_fit(X, y):
    """Fit three transformers one after another, then fit the estimator.

    Each transformer is fitted on, and applied to, the output of the previous
    one; the estimator is fitted on the last output together with ``y``.
    Returns nothing.

    NOTE(review): ``transform1``..``transform3`` and ``estimator`` are looked up
    at notebook level and are not defined in the visible cells, and this
    definition replaces the two-step ``pipe_fit`` defined earlier in the notebook.
    """
    # Chain the fit_transform calls: each output is the next step's input
    out1 = transform1.fit_transform(X)
    out2 = transform2.fit_transform(out1)
    out3 = transform3.fit_transform(out2)

    # Train the final estimator on the fully transformed features
    estimator.fit(out3, y)

In [None]:
def pipe_predict(X, y=None):
    """Apply three transformers in sequence and return the estimator's predictions.

    Parameters
    ----------
    X : features to predict for; passed through ``transform1``, ``transform2``
        and ``transform3`` (``transform`` only, no refitting) in that order.
    y : ignored. Prediction does not use targets; the parameter is kept, now
        optional, so existing two-argument calls keep working and the function
        can be called as ``pipe_predict(X)`` like the two-step version earlier
        in the notebook.

    Returns
    -------
    Whatever ``estimator.predict`` returns for the transformed features.

    NOTE(review): ``transform1``..``transform3`` and ``estimator`` are looked up
    at notebook level and are not defined in the visible cells.
    """
    # Apply the transformations: each output is the next step's input
    X1 = transform1.transform(X)
    X2 = transform2.transform(X1)
    X3 = transform3.transform(X2)

    # Make predictions
    return estimator.predict(X3)

In [None]:
# Build the pipeline with the scaling step switched off (None disables the step)
pipe = Pipeline(steps=[
    ('scaler', None),
    ('ridge', Ridge()),
])

# Train on the raw train split
pipe.fit(X_tr, y=y_tr)

# Mean absolute error on the test split
print('MAE: {:.0f}'.format(MAE(y_true=y_te, y_pred=pipe.predict(X_te))))

In [None]:
import numpy as np

# Collects one record per alpha value
gs_results = []

# Sweep 100 alpha values spaced logarithmically between 1e-4 and 1e4
for alpha in np.logspace(start=-4, stop=4, num=100):
    # Build a fresh scaler + ridge pipeline for this alpha and train it
    pipe = Pipeline(steps=[
        ('scaler', StandardScaler()),
        ('ridge', Ridge(alpha=alpha)),
    ]).fit(X_tr, y_tr)

    # Record the train and test errors obtained with this alpha
    gs_results.append(dict(
        alpha=alpha,
        train_mae=MAE(y_tr, pipe.predict(X_tr)),
        test_mae=MAE(y_te, pipe.predict(X_te)),
    ))

# Turn the list of records into a DataFrame and preview its first rows
gs_results = pd.DataFrame(data=gs_results)
gs_results.head()

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt

# Train and test error against alpha, drawn on a logarithmic x-axis
plt.semilogx(gs_results.alpha, gs_results.train_mae, label='train curve')
plt.semilogx(gs_results.alpha, gs_results.test_mae, label='test curve')

# Axis titles and legend, then render the figure
plt.xlabel('$log_{10}(alpha)$')
plt.ylabel('MAE')
plt.legend()
plt.show()

In [None]:
# Rebuild the scaler + ridge pipeline with default settings
pipe = Pipeline(steps=[('scaler', StandardScaler()), ('ridge', Ridge())])

# List every tunable parameter, including the nested '<step>__<param>' entries
pipe.get_params(deep=True)

In [None]:
# Collects one record per alpha value
gs_results = []

# Sweep 100 alpha values spaced logarithmically between 1e-4 and 1e4
for alpha in np.logspace(start=-4, stop=4, num=100):
    # Reuse the existing pipeline: update the ridge step's alpha, then refit
    pipe.set_params(ridge__alpha=alpha).fit(X_tr, y_tr)

    # Record the train and test errors obtained with this alpha
    gs_results.append(dict(
        alpha=alpha,
        train_mae=MAE(y_tr, pipe.predict(X_tr)),
        test_mae=MAE(y_te, pipe.predict(X_te)),
    ))

# Turn the list of records into a DataFrame
gs_results = pd.DataFrame(data=gs_results)

# Train and test error against alpha, drawn on a logarithmic x-axis
plt.semilogx(gs_results.alpha, gs_results.train_mae, label='train curve')
plt.semilogx(gs_results.alpha, gs_results.test_mae, label='test curve')
plt.xlabel('$log_{10}(alpha)$')
plt.ylabel('MAE')
plt.legend()
plt.show()