In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# Seed NumPy's legacy global RNG once, up front, so the random steps further
# down (row sampling, train/test split, forest training) repeat on a fresh
# Restart & Run All.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

In [2]:
# Load the car-sales table (it contains missing values) from the project's
# resources folder.
car_data = pd.read_csv(filepath_or_buffer='resources/car-sales-extended-missing-data.csv')
# Preview five random rows. No random_state is passed, so this draw advances
# the globally seeded NumPy RNG; keep it in place to reproduce later results.
car_data.sample(n=5)

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
521,Nissan,Red,196130.0,3.0,13707.0
737,Toyota,Blue,223875.0,4.0,12650.0
740,BMW,Black,11049.0,3.0,19500.0
660,BMW,Blue,209466.0,5.0,
411,BMW,Blue,72575.0,5.0,41294.0


In [3]:
# We will use sklearn Pipelines to pre-process our data
# This includes making everything a number, dealing with missing values
# and One Hot Encoding our categorical features
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer


# First, drop the rows whose label (the "Price" column) is missing: a row
# without a target cannot be used for fitting or scoring.
# The result is rebound to `car_data` instead of using inplace=True, which
# pandas discourages; later cells keep reading the same name.
car_data = car_data.dropna(subset=["Price"])

# Text-valued columns, and the pipeline that turns them into numbers.
categorical_features = ["Make", "Colour"]
# A Pipeline is given a list of (name, transformer) tuples and applies them
# in the order listed.
categorical_transformer = Pipeline([
    # Replace missing entries with the literal category "missing".
    ("inputer", SimpleImputer(fill_value="missing", strategy="constant")),
    # One-hot encode every column; handle_unknown="ignore" keeps categories
    # that were not seen during fit from raising an error at transform time.
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])

# The door count gets its own pipeline: missing values are filled with the
# constant 4 rather than a statistic computed from the column.
# (The variable name's spelling is kept because the preprocessor refers to it.)
doors_feature = ["Doors"]
doors_tranformer = Pipeline([
    ("inputer", SimpleImputer(fill_value=4, strategy="constant")),
])

# Continuous column(s): missing readings are replaced by the column mean.
# The step name "inputer" must stay as-is, because the grid-search key
# "preprocessing__numeric__inputer__strategy" addresses it by name.
numeric_features = ["Odometer (KM)"]
numeric_transformer = Pipeline([
    ("inputer", SimpleImputer(strategy="mean")),
])

# Bundle the three pipelines into a single preprocessor. Each entry is a
# (name, transformer, columns) triple: the transformer is applied only to the
# listed columns, and the entry name is what nested parameter keys refer to.
preprocessor = ColumnTransformer([
    ("categorical", categorical_transformer, categorical_features),
    ("doors", doors_tranformer, doors_feature),
    ("numeric", numeric_transformer, numeric_features),
])

In [4]:
# Now we will combine the preprocessing steps with the model training step by using pipelines again!
from sklearn.ensemble import RandomForestRegressor

# End-to-end estimator: the preprocessing step runs first and its output is
# fed to a random forest regressor created with default hyperparameters.
model = Pipeline([
    ("preprocessing", preprocessor),
    ("training", RandomForestRegressor()),
])

In [5]:
# We can now prepare our data so we can feed it to the pipeline
from sklearn.model_selection import train_test_split

# Features are every column except the label; the label is "Price".
x = car_data.drop(columns=["Price"])
y = car_data["Price"]

# Hold out 20% of the rows for evaluation. No random_state is passed, so the
# shuffle draws from NumPy's global RNG and depends on the seed set earlier.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

In [6]:
# Train the whole pipeline (preprocessing + forest) on the training split and
# score it on the held-out split. `fit` returns the fitted pipeline, so the
# calls are chained and the score is the value displayed by the cell.
model.fit(x_train, y_train).score(x_test, y_test)

0.2920060423100943

In [7]:
# Cool, but we are not doing any cross-validation or hyperparameter tuning in this pipeline yet
from sklearn.model_selection import GridSearchCV

# Hyperparameter grid for the ENTIRE pipeline.
# A key addresses a nested parameter as <step>__<sub-step>__...__<parameter>:
# a double underscore separates each level of the pipeline hierarchy, while
# underscores that belong to a parameter name itself stay single.
pipeline_grid = {
    # Strategy of the "inputer" step inside the "numeric" transformer of the
    # "preprocessing" step.
    "preprocessing__numeric__inputer__strategy": ["mean", "median"],
    # Parameters of the random forest held in the "training" step. Only one
    # level is navigated here, so there is a single "__"; the underscore in
    # "n_estimators" is part of the parameter name.
    "training__n_estimators": [100, 200, 500],
    "training__max_depth": [None, 5],
    # 1.0 means "consider all features at every split". That is what the
    # former "auto" option meant for RandomForestRegressor; "auto" was
    # deprecated in scikit-learn 1.1 and removed in 1.3, where it is rejected
    # as an invalid parameter value. The float 1.0 works on old and new
    # versions alike (note: the integer 1 would mean a single feature).
    "training__max_features": [1.0],
    "training__min_samples_split": [2, 4],
}

# Exhaustive search over every combination in `pipeline_grid`, each evaluated
# with 5-fold cross-validation on the training split; verbose=1 logs progress.
grid_cv = GridSearchCV(estimator=model, param_grid=pipeline_grid, cv=5, verbose=1)
grid_cv.fit(x_train, y_train)
# Score on the held-out test split; as the cell's last expression, the value
# is displayed by the notebook.
grid_cv.score(x_test, y_test)

Fitting 5 folds for each of 24 candidates, totalling 120 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 120 out of 120 | elapsed:   40.9s finished


0.38413246082945096