In [189]:
# Import pandas library and the data set
import pandas as pd
import numpy as np
df = pd.read_csv('automobile_data.csv')

In [190]:
# Have a look at the first five rows
df.head()

Unnamed: 0,symboling,normalized-losses,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,...,engine-size,fuel-system,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price
0,3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,13495
1,3,?,alfa-romero,gas,std,two,convertible,rwd,front,88.6,...,130,mpfi,3.47,2.68,9.0,111,5000,21,27,16500
2,1,?,alfa-romero,gas,std,two,hatchback,rwd,front,94.5,...,152,mpfi,2.68,3.47,9.0,154,5000,19,26,16500
3,2,164,audi,gas,std,four,sedan,fwd,front,99.8,...,109,mpfi,3.19,3.4,10.0,102,5500,24,30,13950
4,2,164,audi,gas,std,four,sedan,4wd,front,99.4,...,136,mpfi,3.19,3.4,8.0,115,5500,18,22,17450


In [191]:
# Have a look at the data set's info
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205 entries, 0 to 204
Data columns (total 26 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   symboling          205 non-null    int64  
 1   normalized-losses  205 non-null    object 
 2   make               205 non-null    object 
 3   fuel-type          205 non-null    object 
 4   aspiration         205 non-null    object 
 5   num-of-doors       205 non-null    object 
 6   body-style         205 non-null    object 
 7   drive-wheels       205 non-null    object 
 8   engine-location    205 non-null    object 
 9   wheel-base         205 non-null    float64
 10  length             205 non-null    float64
 11  width              205 non-null    float64
 12  height             205 non-null    float64
 13  curb-weight        205 non-null    int64  
 14  engine-type        205 non-null    object 
 15  num-of-cylinders   205 non-null    object 
 16  engine-size        205 non

In [192]:
# Check for any missing data
df.isna().values.any()

False

In [193]:
# Drop rows with missing data in column price
to_remove = df[df.price == '?'].index
df = df.drop(to_remove)

In [194]:
# Retrieve X and y from the data set
X = df.drop('price', axis=1)
y = df['price']

In [195]:
# Get numeric and categorical columns from X
num_features = X._get_numeric_data().columns
cat_features = list(set(X.columns) - set(num_features))

In [196]:
# Have a look at categorical columns' values
for i in cat_features:
    print(i)
    print(X[i].unique())

stroke
['2.68' '3.47' '3.4' '2.8' '3.19' '3.39' '3.03' '3.11' '3.23' '3.46' '3.9'
 '3.41' '3.07' '3.58' '4.17' '2.76' '3.15' '?' '3.16' '3.64' '3.1' '3.35'
 '3.12' '3.86' '3.29' '3.27' '3.52' '2.19' '3.21' '2.9' '2.07' '2.36'
 '2.64' '3.08' '3.5' '3.54' '2.87']
normalized-losses
['?' '164' '158' '192' '188' '121' '98' '81' '118' '148' '110' '145' '137'
 '101' '78' '106' '85' '107' '104' '113' '150' '129' '115' '93' '142'
 '161' '153' '125' '128' '122' '103' '168' '108' '194' '231' '119' '154'
 '74' '186' '83' '102' '89' '87' '77' '91' '134' '65' '197' '90' '94'
 '256' '95']
horsepower
['111' '154' '102' '115' '110' '140' '101' '121' '182' '48' '70' '68' '88'
 '145' '58' '76' '60' '86' '100' '78' '90' '176' '262' '135' '84' '64'
 '120' '72' '123' '155' '184' '175' '116' '69' '55' '97' '152' '160' '200'
 '95' '142' '143' '207' '?' '73' '82' '94' '62' '56' '112' '92' '161'
 '156' '52' '85' '114' '162' '134' '106']
fuel-system
['mpfi' '2bbl' 'mfi' '1bbl' 'spfi' '4bbl' 'idi' 'spdi']
make
['

In [197]:
# Turn categorical columns to numeric
def cylinders(n):
    if n == '?': return np.nan
    elif n == 'two': return 2
    elif n == 'three': return 3
    elif n == 'four': return 4
    elif n == 'five': return 5
    elif n == 'six': return 6
    elif n == 'eight': return 8
    else : return 12   
X['num-of-cylinders'] = X['num-of-cylinders'].apply(cylinders)
X['num-of-doors'] = X['num-of-doors'].apply(cylinders)
for i in ['peak-rpm', 'bore', 'horsepower', 'normalized-losses', 'stroke']:
    X[i] = X[i].apply(pd.to_numeric, errors='coerce')

In [198]:
# Verify the categorical columns
for i in cat_features:
    print(i)
    print(X[i].unique())

stroke
[2.68 3.47 3.4  2.8  3.19 3.39 3.03 3.11 3.23 3.46 3.9  3.41 3.07 3.58
 4.17 2.76 3.15  nan 3.16 3.64 3.1  3.35 3.12 3.86 3.29 3.27 3.52 2.19
 3.21 2.9  2.07 2.36 2.64 3.08 3.5  3.54 2.87]
normalized-losses
[ nan 164. 158. 192. 188. 121.  98.  81. 118. 148. 110. 145. 137. 101.
  78. 106.  85. 107. 104. 113. 150. 129. 115.  93. 142. 161. 153. 125.
 128. 122. 103. 168. 108. 194. 231. 119. 154.  74. 186.  83. 102.  89.
  87.  77.  91. 134.  65. 197.  90.  94. 256.  95.]
horsepower
[111. 154. 102. 115. 110. 140. 101. 121. 182.  48.  70.  68.  88. 145.
  58.  76.  60.  86. 100.  78.  90. 176. 262. 135.  84.  64. 120.  72.
 123. 155. 184. 175. 116.  69.  55.  97. 152. 160. 200.  95. 142. 143.
 207.  nan  73.  82.  94.  62.  56. 112.  92. 161. 156.  52.  85. 114.
 162. 134. 106.]
fuel-system
['mpfi' '2bbl' 'mfi' '1bbl' 'spfi' '4bbl' 'idi' 'spdi']
make
['alfa-romero' 'audi' 'bmw' 'chevrolet' 'dodge' 'honda' 'isuzu' 'jaguar'
 'mazda' 'mercedes-benz' 'mercury' 'mitsubishi' 'nissan' 'peugo

In [199]:
# Get the newly_updated numeric and categorical columns
num_features = X._get_numeric_data().columns
cat_features = list(set(X.columns) - set(num_features))

In [200]:
# Imputer missing values in the newly-converted-to numeric columns
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
ii = IterativeImputer()
X[num_features] = ii.fit_transform(X[num_features])

In [201]:
# Encode categorical columns
X = pd.get_dummies(X, columns=cat_features, drop_first=True, dummy_na=True)

In [202]:
# Scale X
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_scaled = sc.fit_transform(X)
X = pd.DataFrame(data=X_scaled, columns=X.columns, index=X.index)

In [203]:
# Split data into train and test sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

In [204]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

# Create the parameter grid based on the results of random search 
parameters = {
    'bootstrap': [True],
    'max_depth': [80, 90, 100, 110],
    'max_features': [2, 3],
    'min_samples_leaf': [3, 4, 5],
    'min_samples_split': [8, 10, 12],
    'n_estimators': [100, 200, 300, 1000]
}

# Create a based model
rf = RandomForestRegressor()

# Instantiate the grid search model
grid_search = GridSearchCV(estimator = rf, param_grid = parameters, 
                          cv = 5, n_jobs = -1, verbose = 2)

# Try fitting training data sets with all parameters
grid_search.fit(X_train,y_train)

# Print the best parameters
print(grid_search.best_params_)

#Fit the training tests using the best parameters
best_grid = RandomForestRegressor(**grid_search.best_params_)
best_grid.fit(X_train,y_train)

# Get the predicted y
predictions = best_grid.predict(X_test)

# Print the mean square error of the predicted and the real charges values
from sklearn.metrics import mean_squared_error

mse = mean_squared_error(predictions, y_test)
print(mse)

Fitting 5 folds for each of 288 candidates, totalling 1440 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  58 tasks      | elapsed:    6.3s
[Parallel(n_jobs=-1)]: Done 182 tasks      | elapsed:   23.5s
[Parallel(n_jobs=-1)]: Done 385 tasks      | elapsed:   54.1s
[Parallel(n_jobs=-1)]: Done 668 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 1033 tasks      | elapsed:  2.4min
[Parallel(n_jobs=-1)]: Done 1440 out of 1440 | elapsed:  3.3min finished


{'bootstrap': True, 'max_depth': 90, 'max_features': 3, 'min_samples_leaf': 3, 'min_samples_split': 8, 'n_estimators': 200}
7913021.013539019
