# we are going to implement the model preprocessing, training and scoring using a pipeline

In [6]:
# imports
import pandas as pd
import numpy as np

from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestRegressor



In [3]:
# lets import the data
data = pd.read_csv("car-sales-extended-missing-data.csv")

# remove the columns with missing labels
data = data.dropna(subset=["Price"])
data.columns

Index(['Make', 'Colour', 'Odometer (KM)', 'Doors', 'Price'], dtype='object')

In [9]:
# Pipeline

# setup the preprocessing and imputation pipeline

categorical_features = ["Make","Colour"]
categorical_transformer = Pipeline(steps=[
    ("impute",SimpleImputer(strategy="constant",fill_value="missing")),
    ("onehot",OneHotEncoder(handle_unknown="ignore"))
])

door_feature=["Doors"]
door_transformer = Pipeline(steps=[
    ("impute",SimpleImputer(strategy="constant",fill_value=4))
])

numerical_feature = ["Odometer (KM)"]
numerical_transform = Pipeline(steps=[
    ("impute",SimpleImputer(strategy="constant",fill_value="missing"))    
])

# setup the preprocessor
preprocessor = ColumnTransformer([
    ("cat",categorical_transformer,categorical_features),
    ("door",door_transformer,door_feature),
    ("num",numerical_transform,numerical_feature)
])

# setup the preprocessor and modeling pipeline
model = Pipeline(steps=[
    ("preprocerssor",preprocessor),
    ("model",RandomForestRegressor())
])

# split the data into x and y
x = data.drop("Price",axis=1)
y = data["Price"]

# split the data into train and test split
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2)

# fit the model
model.fit(x_train,y_train)

# score the model
model.score(x_test,y_test)


ValueError: fill_value='missing' (of type <class 'str'>) cannot be cast to the input data that is dtype('float64'). Make sure that both dtypes are of the same kind.