In [None]:
# Import pandas library and the data set
import pandas as pd
import numpy as np
# Read the raw automobile records into a DataFrame
csv_path = 'automobile_data.csv'
df = pd.read_csv(csv_path)

In [None]:
# Preview the first five rows of the data set
df.head(n=5)

In [None]:
# Print a concise summary of the data set: column dtypes and non-null counts
df.info()

In [None]:
# Check for any missing data. Besides real NaN cells, also count the '?'
# placeholder, which the later cleaning cells treat as "missing";
# isna() alone cannot see it.
(df.isna() | df.eq('?')).values.any()

In [None]:
# Discard every row whose price is the '?' placeholder (target unknown)
to_remove = df.index[df['price'] == '?']
df = df.drop(index=to_remove)

In [None]:
# Retrieve X (features) and y (target) from the data set
X = df.drop(columns='price')
# The price column contained '?' entries, so it is held as text; convert it to
# real numbers so the regressor and the error metric work on numeric targets.
y = pd.to_numeric(df['price'])

In [None]:
# Get numeric and categorical columns from X.
# select_dtypes is the public replacement for the private _get_numeric_data();
# 'bool' is listed as well because _get_numeric_data() also kept boolean columns.
num_features = X.select_dtypes(include=['number', 'bool']).columns
# Keep the original column order: a set difference would yield an order that
# depends on string hashing and can change between kernel sessions.
cat_features = [col for col in X.columns if col not in num_features]

In [None]:
# Show the distinct values found in each categorical column
for col in cat_features:
    print(col, X[col].unique(), sep='\n')

In [None]:
# Turn categorical columns to numeric
def cylinders(n):
    """Convert a spelled-out count such as 'four' into its numeric value.

    Parameters
    ----------
    n : object
        A cell value: normally a word ('two' ... 'twelve') or the '?'
        placeholder, but numbers and NaN are tolerated too.

    Returns
    -------
    int or float
        The matching integer for a known word, the value itself when it is
        already numeric (NaN included), and np.nan for anything else
        ('?', unknown words, other types).
    """
    # Already-numeric values pass through unchanged, so applying the function a
    # second time (e.g. re-running the cell) does not corrupt the column.
    if isinstance(n, (int, float, np.integer, np.floating)):
        return n

    word_to_number = {
        'two': 2,
        'three': 3,
        'four': 4,
        'five': 5,
        'six': 6,
        'eight': 8,
        'twelve': 12,
    }
    if isinstance(n, str):
        # Unknown words (including '?') become NaN instead of silently turning into 12
        return word_to_number.get(n.strip().lower(), np.nan)
    return np.nan
# Columns that spell their counts out as words -> numbers
for col in ['num-of-cylinders', 'num-of-doors']:
    X[col] = X[col].apply(cylinders)
# Columns holding numbers as text: convert each whole column in one vectorised
# call; any entry that cannot be parsed becomes NaN.
for col in ['peak-rpm', 'bore', 'horsepower', 'normalized-losses', 'stroke']:
    X[col] = pd.to_numeric(X[col], errors='coerce')

In [None]:
# Re-inspect the formerly categorical columns to confirm the conversion
for col in cat_features:
    print(col, X[col].unique(), sep='\n')

In [None]:
# Get the newly updated numeric and categorical columns.
# select_dtypes is the public replacement for the private _get_numeric_data();
# 'bool' is listed as well because _get_numeric_data() also kept boolean columns.
num_features = X.select_dtypes(include=['number', 'bool']).columns
# Preserve the frame's column order so the one-hot encoded layout is the same on
# every run (a set difference is ordered by string hash, which varies per session).
cat_features = [col for col in X.columns if col not in num_features]

In [None]:
# Impute missing values in the numeric columns (including the newly converted ones).
# IterativeImputer is still experimental in scikit-learn, so the enabling
# import must run before the class itself can be imported.
# NOTE(review): the imputer is fitted on every row before the train/test split
# further down, so test rows influence the imputed values - consider fitting
# on the training rows only.
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
ii = IterativeImputer()
X[num_features] = ii.fit_transform(X[num_features])

In [None]:
# One-hot encode the remaining categorical columns: the first level of each
# column is dropped and an extra indicator column marks missing values
X = pd.get_dummies(
    X,
    columns=cat_features,
    dummy_na=True,
    drop_first=True,
)

In [None]:
# Standardise every feature column, then rebuild a DataFrame with the same
# labels because the scaler returns a bare array.
# NOTE(review): the scaler is fitted on all rows, ahead of the train/test split.
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_scaled = sc.fit_transform(X)
X = pd.DataFrame(X_scaled, index=X.index, columns=X.columns)

In [None]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split repeatable
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=1,
)

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV

# Parameter grid, chosen from the results of an earlier random search
parameters = {
    'bootstrap': [True],
    'max_depth': [80, 90, 100, 110],
    'max_features': [2, 3],
    'min_samples_leaf': [3, 4, 5],
    'min_samples_split': [8, 10, 12],
    'n_estimators': [100, 200, 300, 1000]
}

# Create the base model; seeded so the search and the final forest are
# reproducible from one run to the next
rf = RandomForestRegressor(random_state=1)

# Instantiate the grid search: 5-fold cross-validation over every combination
grid_search = GridSearchCV(estimator=rf, param_grid=parameters,
                           cv=5, n_jobs=-1, verbose=2)

# Fit the training data with every parameter combination
grid_search.fit(X_train, y_train)

# Print the best parameters
print(grid_search.best_params_)

# GridSearchCV refits the best configuration on the whole training set by
# default (refit=True), so reuse that model instead of training it again
best_grid = grid_search.best_estimator_

# Get the predicted y
predictions = best_grid.predict(X_test)

# Print the mean squared error between the real and the predicted prices
# (arguments in scikit-learn's documented order: y_true, y_pred)
mse = mean_squared_error(y_test, predictions)
print(mse)