In [2]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import openml

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer
from sklearn.compose import TransformedTargetRegressor
from sklearn.ensemble import RandomForestRegressor

# Whats Going on in this Notebook?
The model wwas buildt up, tuned on the train set using a grid search with 10-fold CV, and then tested on the test set. Several custom classes were created to enact the feature engineering steps in the sklearn pipeline. The final model chose did not necessarily have the best test score. The final model was chosen because it not only had a good test score (although not the best) but it also had a very low standard deviation, meaning it has a much higher probability of giving us a better test score on the test set.

# What was the results?
The model that was created was evaluated on a 33% holdout test set. It scored a mean absolute error of \$1171 (much much better than the models on OpenML might I add) which means that for each automobile in the test set, the average error in the model's prediction was 1171. Considering that this value is significantly lower than the standard deviation of the distribution of prices in the dataset, I would say that this model performed very excellently.

This link was INCREDIBLY helpful to the whole process:
https://adamnovotny.com/blog/custom-scikit-learn-pipeline.html

Some results in absolute standard error:
- linear regression: 1,850 std didnt check
- random forest: 1400 but std of 443
- tuned random forest: 1650 but std of 221

In [3]:
# get the dataset first.
datasets = openml.datasets.list_datasets(output_format='dataframe')
datasets[datasets['name'] == 'auto_price']
dataset = openml.datasets.get_dataset(195)
df = dataset.get_data(dataset_format='dataframe')[0]

In [28]:
# lets just use train test split to set up train and test. 
# using 10 fold cv in the grid search for the validation set
xtrain, xtest, ytrain, ytest = train_test_split(df.drop(columns=['price']), df['price'],
                                               test_size=.33, random_state=24)

First off feature engineer the features that we wanted to engineer. In order to use these in the pipeline we need to use custom classes.

In [5]:
# for the mean of the mpgs.
__version__ = "1.0"
class Select_Features(BaseEstimator, TransformerMixin):
    """Narrows down the model to only the given features.
    """
    def __init__(self, features):
        self.features=features
        pass
    def fit(self, x, y=None):
        return self
    def transform(self, x):
        return x.loc[:, self.features].copy(deep=True)

In [6]:
# for the mean of the mpgs.
__version__ = "1.0"
class Mean_MPG(BaseEstimator, TransformerMixin):
    """Finds the arithmetic mean of the city and highway mpgs for each car.
    """
    def __init__(self):
        pass
    def fit(self, x, y=None):
        return self
    def transform(self, x):
        x['mean-mpg'] = (x['city-mpg'] + x['highway-mpg'])/2
        return x

In [7]:
# for the mean of all of the 'size' features.
__version__ = "1.0"
class Size(BaseEstimator, TransformerMixin):
    """Finds the 'size' coefficient by computing the arithmetic mean of the wheel-base, curb-weight,
    width, length, engine-size, and horsepower.
    """
    def __init__(self):
        pass
    def fit(self, x, y=None):
        return self
    def transform(self, x):
        x['size'] = (x['wheel-base'] + x['curb-weight'] + x['width'] + x['length'] + \
                    x['engine-size'] + x['horsepower'])/6
        return x

In [8]:
column_trans = ColumnTransformer(transformers=[
    ('StandardScaler', StandardScaler(), ['mean-mpg', 'size']),
    ('OneHotEncoder', OneHotEncoder(), ['symboling'])
], remainder=StandardScaler())

In [9]:
pipeline = Pipeline(steps = [
    ("mean-mpg", Mean_MPG()),
    ("size", Size()),
    ("select", Select_Features(['mean-mpg', 'size', 'symboling'])),
    ("ct", column_trans),
    ("model", TransformedTargetRegressor(RandomForestRegressor(n_jobs=-1,
                                                              criterion='absolute_error',
                                                              ),
                                        transformer=StandardScaler()))
])

In [12]:
# because we are using the transformedtargetregressor on the model, access its parameters with:
# model__regressor__{param} since the model is now stored under the regressor parameter in
# transformedtargetregressor
param_grid = dict(
    model__regressor__max_depth=list(range(2,10)),
    model__regressor__ccp_alpha=list(np.logspace(-4,-1,4)),
    model__regressor__n_estimators=[100,300,500],
    model__regressor__min_samples_leaf=[1, 2, 5],# may provide smoothing in the model
    model__regressor__max_features=['auto', 'sqrt']
)

In [13]:
# the cv in here is the validation set.
grid_search = GridSearchCV(pipeline, param_grid=param_grid, cv=10,
                          scoring='neg_mean_absolute_error')
grid_search.fit(xtrain, ytrain)
res = pd.DataFrame(grid_search.cv_results_)

In [14]:
res[res['mean_test_score'] == max(res['mean_test_score'])]

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_model__regressor__ccp_alpha,param_model__regressor__max_depth,param_model__regressor__max_features,param_model__regressor__min_samples_leaf,param_model__regressor__n_estimators,params,...,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
37,0.257139,0.007373,0.045165,0.002633,0.0001,4,auto,1,300,"{'model__regressor__ccp_alpha': 0.0001, 'model...",...,-1718.912727,-1520.587727,-1958.268333,-1642.439333,-972.806,-1411.853667,-2085.615333,-1646.999236,314.216605,1


In [15]:
res[res['mean_test_score'] == max(res['mean_test_score'])].iloc[0]['params']

{'model__regressor__ccp_alpha': 0.0001,
 'model__regressor__max_depth': 4,
 'model__regressor__max_features': 'auto',
 'model__regressor__min_samples_leaf': 1,
 'model__regressor__n_estimators': 300}

In [16]:
res[res['mean_test_score'] == max(res['mean_test_score'])]

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_model__regressor__ccp_alpha,param_model__regressor__max_depth,param_model__regressor__max_features,param_model__regressor__min_samples_leaf,param_model__regressor__n_estimators,params,...,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
37,0.257139,0.007373,0.045165,0.002633,0.0001,4,auto,1,300,"{'model__regressor__ccp_alpha': 0.0001, 'model...",...,-1718.912727,-1520.587727,-1958.268333,-1642.439333,-972.806,-1411.853667,-2085.615333,-1646.999236,314.216605,1


In [17]:
res.sort_values('std_test_score', ascending=True).head(10)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_model__regressor__ccp_alpha,param_model__regressor__max_depth,param_model__regressor__max_features,param_model__regressor__min_samples_leaf,param_model__regressor__n_estimators,params,...,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
126,0.099634,0.005099,0.019892,0.002032,0.0001,9,auto,1,100,"{'model__regressor__ccp_alpha': 0.0001, 'model...",...,-1824.353636,-1653.696818,-1929.113636,-1710.4045,-1184.147,-1494.72,-1837.4275,-1677.025218,221.805181,21
272,0.301872,0.012496,0.061955,0.027817,0.001,9,auto,1,500,"{'model__regressor__ccp_alpha': 0.001, 'model_...",...,-1829.742273,-1618.113091,-1875.850545,-1672.8756,-1236.4128,-1484.7089,-1938.8905,-1699.838935,228.923595,52
252,0.099779,0.009783,0.021063,0.002437,0.001,8,auto,1,100,"{'model__regressor__ccp_alpha': 0.001, 'model_...",...,-1886.550455,-1599.548636,-1803.158182,-1684.1635,-1290.7555,-1460.257,-1928.8455,-1713.472195,229.049538,60
254,0.303629,0.014508,0.055574,0.004713,0.001,8,auto,1,500,"{'model__regressor__ccp_alpha': 0.001, 'model_...",...,-1858.106364,-1630.160545,-1897.010273,-1636.6181,-1271.9053,-1477.4533,-1971.439,-1702.562443,230.050729,55
109,0.259743,0.02344,0.041902,0.002477,0.0001,8,auto,1,300,"{'model__regressor__ccp_alpha': 0.0001, 'model...",...,-1814.792424,-1557.266667,-1900.706212,-1685.514333,-1210.289667,-1449.876,-1969.745167,-1690.693395,230.429682,43
128,0.309071,0.013748,0.056816,0.005069,0.0001,9,auto,1,500,"{'model__regressor__ccp_alpha': 0.0001, 'model...",...,-1830.439818,-1628.186364,-1863.411455,-1656.2075,-1263.7904,-1436.8214,-1956.7588,-1693.810301,233.397833,47
90,0.10306,0.012165,0.021731,0.002516,0.0001,7,auto,1,100,"{'model__regressor__ccp_alpha': 0.0001, 'model...",...,-1863.851364,-1671.33,-1903.394091,-1630.154,-1293.785,-1371.775,-1939.3095,-1726.919668,241.866015,68
271,0.243784,0.01112,0.03942,0.002581,0.001,9,auto,1,300,"{'model__regressor__ccp_alpha': 0.001, 'model_...",...,-1849.431667,-1572.228788,-1960.904091,-1707.338167,-1277.026833,-1515.531667,-1981.885,-1724.486545,247.136584,65
127,0.256963,0.037339,0.042318,0.002255,0.0001,9,auto,1,300,"{'model__regressor__ccp_alpha': 0.0001, 'model...",...,-1794.615152,-1645.002879,-1852.431061,-1703.616,-1204.513333,-1436.218,-1947.828833,-1706.899071,247.177984,56
236,0.310309,0.01093,0.053822,0.006342,0.001,7,auto,1,500,"{'model__regressor__ccp_alpha': 0.001, 'model_...",...,-1804.018273,-1568.777909,-1870.685273,-1613.4903,-1200.4584,-1479.1009,-2024.2723,-1685.822254,247.448619,35


parameters for result 126 have a really good balance of mean and std.

In [19]:
res.iloc[126]['params']

{'model__regressor__ccp_alpha': 0.0001,
 'model__regressor__max_depth': 9,
 'model__regressor__max_features': 'auto',
 'model__regressor__min_samples_leaf': 1,
 'model__regressor__n_estimators': 100}

Okay lets run this.

In [22]:
pipeline = Pipeline(steps = [
    ("size", Size()),
    ("mean-mpg", Mean_MPG()),
    ("select", Select_Features(['mean-mpg', 'size', 'symboling'])),
    ("ct", column_trans),
    ("model", TransformedTargetRegressor(RandomForestRegressor(n_jobs=-1,
            criterion='absolute_error',
            ccp_alpha=0.0001,
            max_depth=9,
            max_features='auto',
            min_samples_leaf=1,
            # grid search said use 100 estimators but going to use 1000.
            n_estimators=1000
    ), transformer=StandardScaler()))
])

In [24]:
pipeline.fit(xtrain, ytrain)

Pipeline(steps=[('size', Size()), ('mean-mpg', Mean_MPG()),
                ('select',
                 Select_Features(features=['mean-mpg', 'size', 'symboling'])),
                ('ct',
                 ColumnTransformer(remainder=StandardScaler(),
                                   transformers=[('StandardScaler',
                                                  StandardScaler(),
                                                  ['mean-mpg', 'size']),
                                                 ('OneHotEncoder',
                                                  OneHotEncoder(),
                                                  ['symboling'])])),
                ('model',
                 TransformedTargetRegressor(regressor=RandomForestRegressor(ccp_alpha=0.0001,
                                                                            criterion='absolute_error',
                                                                            max_depth=9,
                       

In [25]:
yhat = pipeline.predict(xtest)

In [26]:
from sklearn.metrics import mean_absolute_error as MAE

In [30]:
MAE(ytest, yhat)

1171.4394622641491

Pretty good :)