In this notebook, we create a baseline model and use the RMSE metric to meassure its performance for future reference. 

In [1]:
import pandas as pd
import joblib

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression

In [2]:
data = pd.read_csv("../data/ford.csv")

In [3]:
data.head()

Unnamed: 0,model,year,price,transmission,mileage,fuelType,tax,mpg,engineSize
0,Fiesta,2017,12000,Automatic,15944,Petrol,150,57.7,1.0
1,Focus,2018,14000,Manual,9083,Petrol,150,57.7,1.0
2,Focus,2017,13000,Manual,12456,Petrol,150,57.7,1.0
3,Fiesta,2019,17500,Manual,10460,Petrol,145,40.3,1.5
4,Fiesta,2019,16500,Automatic,1482,Petrol,145,48.7,1.0


In [4]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17966 entries, 0 to 17965
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   model         17966 non-null  object 
 1   year          17966 non-null  int64  
 2   price         17966 non-null  int64  
 3   transmission  17966 non-null  object 
 4   mileage       17966 non-null  int64  
 5   fuelType      17966 non-null  object 
 6   tax           17966 non-null  int64  
 7   mpg           17966 non-null  float64
 8   engineSize    17966 non-null  float64
dtypes: float64(2), int64(4), object(3)
memory usage: 1.2+ MB


## Data preparation


In [5]:
# Remove outlier
data = data[data.year < 2060]

In [6]:
# Select features
num_features = ["year", "engineSize", "mileage"]
cat_features = ["fuelType"]

## Baseline model

In [7]:
X = data[num_features + cat_features].copy()
y = data.price

In [8]:
X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=0.8,
                                                      random_state=42)

In [9]:
cat_encoder = OneHotEncoder(handle_unknown="ignore")

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', cat_encoder, cat_features)
    ]
)

rf = RandomForestRegressor()

model = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('randomforest', rf)
    ]
)

model.fit(X_train, y_train)

y_predict = model.predict(X_valid)

In [10]:
print("RMSE: ", mean_squared_error(y_valid, y_predict, squared=False))

RMSE:  4626.269702792426


## Save baseline model

In [11]:
joblib.dump(model, "../model/model_rf.pkl")

['../model/model_rf.pkl']