Steps we want to do (all in one cell):
1. Fill missing data
2. Convert data to numbers
3. Build a model on the data

In [19]:
# Pre-processing imports
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

# Modeling imports
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

# Setup random seed
import numpy as np
# Single seed shared by every stochastic step (the train/test split and the
# random forest) so the cell gives the same score on every fresh run.
RANDOM_SEED = 42

# Load the data and drop rows with no target price value, as we cannot train
# (or score) a model without a price. Chaining instead of `inplace=True`
# keeps the cell idempotent when it is re-run.
data = (
    pd.read_csv("../ztm-ml/data/car-sales-extended-missing-data.csv")
    .dropna(subset=["Price"])
)

# Categorical columns: fill gaps with the literal "missing", then one-hot
# encode. `handle_unknown="ignore"` lets transform accept categories that
# were not present when the encoder was fitted.
categorical_features = ["Make", "Colour"]
categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="constant", fill_value="missing")),
    ("onehot", OneHotEncoder(handle_unknown="ignore"))
])

# Door count: fill gaps with the constant 4.
door_features = ["Doors"]
door_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="constant", fill_value=4))
])

# Odometer reading: fill gaps with the column mean.
numerical_features = ["Odometer (KM)"]
numerical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="mean"))
])

# Setup preprocessing steps (fill missing values, then convert to numbers).
# The step names used here ("categorical", "door", "numerical") are the ones
# referenced when addressing nested parameters, e.g.
# "preprocessor__numerical__imputer__strategy".
preprocessor = ColumnTransformer(
    transformers=[
        ("categorical", categorical_transformer, categorical_features),
        ("door", door_transformer, door_features),
        ("numerical", numerical_transformer, numerical_features)
])

# Create a preprocessing and modeling pipeline
model = Pipeline(steps=[
    ("preprocessor", preprocessor),
    ("model", RandomForestRegressor(random_state=RANDOM_SEED))
])

# Split features from the target, then hold out 20% of the rows for testing.
x = data.drop("Price", axis=1)
y = data["Price"]
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=RANDOM_SEED,  # without this the split differs on every run
)

# Imputation and encoding are fitted on the training rows only because they
# live inside the pipeline; the last expression displays the test-set score.
model.fit(x_train, y_train)
model.score(x_test, y_test)

0.30965461004684325

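It's also possible to use `GridSearchCV` or `RandomizedSearchCV` with a `Pipeline`. The pipeline is passed to the search as the estimator, and each key of a plain hyperparameter grid like the one below must be prefixed with the name of its pipeline step and a double underscore (for example, `n_estimators` becomes `model__n_estimators`).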

```
grid = {
    'n_estimators': [100, 200, 500],
    'max_depth': [None, 5, 20],
    'max_features': ['log2', 'sqrt'],
    'min_samples_split': [2, 6],
    'min_samples_leaf': [1, 2]
}
```

In [21]:
# Use GridSearchCV or RandomizedSearchCV with our Pipeline
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

# Search space for the whole pipeline. Keys follow the nested-parameter
# convention "<step>__<parameter>": the first entry reaches into the
# "numerical" transformer of the "preprocessor" step, and every random-forest
# setting is prefixed with the "model" step name.
pipe_grid = {
    "preprocessor__numerical__imputer__strategy": ["mean", "median"],
    **{
        f"model__{param}": candidates
        for param, candidates in (
            ("n_estimators", [100, 200]),
            ("max_depth", [None, 5]),
            ("max_features", ["log2", "sqrt"]),
            ("min_samples_split", [2, 6]),
            ("min_samples_leaf", [1, 2]),
        )
    },
}

In [None]:
# Exhaustive search over every combination in `pipe_grid`, scoring each
# candidate with 5-fold cross-validation on the training data only.
gs_model = GridSearchCV(
    estimator=model,
    param_grid=pipe_grid,
    cv=5,
    n_jobs=-1,     # run the fits in parallel on all available cores
    verbose=True,  # print the "Fitting ... folds" progress summary
)
gs_model.fit(x_train, y_train)

Fitting 5 folds for each of 64 candidates, totalling 320 fits


In [23]:
# Show the parameter combination that achieved the best cross-validated score
# during the grid search.
gs_model.best_params_

{'model__max_depth': 5,
 'model__max_features': 'log2',
 'model__min_samples_leaf': 1,
 'model__min_samples_split': 2,
 'model__n_estimators': 200,
 'preprocessor__numerical__imputer__strategy': 'median'}

In [24]:
# Score the search's refitted best pipeline on the held-out test set. No
# `scoring` was passed to GridSearchCV, so this uses the estimator's default
# score (R^2 for a regressor).
gs_model.score(x_test, y_test)

0.34324874813056194

In [None]:
# # Fill missing values with Scikit-learn
# from sklearn.impute import SimpleImputer
# from sklearn.compose import ColumnTransformer

# # Fill categorical values with "missing" and numerical values with mean
# cat_imputer = SimpleImputer(strategy="constant", fill_value="missing")
# door_imputer = SimpleImputer(strategy="constant", fill_value=4)
# num_imputer = SimpleImputer(strategy="mean")

# # Define columns
# cat_features = ["Make", "Colour"]
# door_feature = ["Doors"]
# num_features = ["Odometer (KM)"]

# # Create an imputer (something that fills missing data)
# imputer = ColumnTransformer([
#     ("cat_imputer", cat_imputer, cat_features),
#     ("door_imputer", door_imputer, door_feature),
#     ("num_imputer", num_imputer, num_features)
# ])