In [56]:
# Import libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.tree import DecisionTreeRegressor, export_text
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

In [57]:
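# Download the dataset into the working directory (uncomment and run once if the CSV is not present):
#!wget https://raw.githubusercontent.com/alexeygrigorev/datasets/master/car_fuel_efficiency.csv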

In [58]:
# Read the car fuel-efficiency dataset from the CSV in the working directory
df = pd.read_csv('car_fuel_efficiency.csv')

# Show the first five rows as plain text
print(df.head(n=5))

   engine_displacement  num_cylinders  horsepower  vehicle_weight  \
0                  170            3.0       159.0     3413.433759   
1                  130            5.0        97.0     3149.664934   
2                  170            NaN        78.0     3079.038997   
3                  220            4.0         NaN     2542.392402   
4                  210            1.0       140.0     3460.870990   

   acceleration  model_year  origin fuel_type         drivetrain  num_doors  \
0          17.7        2003  Europe  Gasoline    All-wheel drive        0.0   
1          17.8        2007     USA  Gasoline  Front-wheel drive        0.0   
2          15.1        2018  Europe  Gasoline  Front-wheel drive        0.0   
3          20.2        2009     USA    Diesel    All-wheel drive        2.0   
4          14.4        2009  Europe  Gasoline    All-wheel drive        2.0   

   fuel_efficiency_mpg  
0            13.231729  
1            13.688217  
2            14.246341  
3         

In [59]:
# Count the missing values in each column
print(df.isna().sum())

engine_displacement      0
num_cylinders          482
horsepower             708
vehicle_weight           0
acceleration           930
model_year               0
origin                   0
fuel_type                0
drivetrain               0
num_doors              502
fuel_efficiency_mpg      0
dtype: int64


In [60]:
# Replace every missing value in the frame with 0
df = df.fillna(value=0)

# Print the per-column missing-value counts after filling
print(df.isna().sum())

engine_displacement    0
num_cylinders          0
horsepower             0
vehicle_weight         0
acceleration           0
model_year             0
origin                 0
fuel_type              0
drivetrain             0
num_doors              0
fuel_efficiency_mpg    0
dtype: int64


In [61]:
# Split the data 60% / 20% / 20% into train / validation / test, using random_state=1.
# Step 1: hold out 20% of all rows as the test set.
# Step 2: take 25% of the remaining 80% (= 20% of all rows) as the validation set.
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=1)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=1)

# Renumber the rows of each split from 0, discarding the shuffled original index
df_train, df_val, df_test = (
    part.reset_index(drop=True) for part in (df_train, df_val, df_test)
)

# Column the models will predict
target = 'fuel_efficiency_mpg'

# pop() removes the target column from each frame and returns it,
# so the feature frames no longer contain the target afterwards
y_train = df_train.pop(target).values
y_test = df_test.pop(target).values
y_val = df_val.pop(target).values

In [62]:
'''
Question 1
Let's train a decision tree regressor to predict the fuel_efficiency_mpg variable.

Use DictVectorizer(sparse=True) to turn the dataframes into matrices.

Train a model with max_depth=1.
Which feature is used for splitting the data?

'vehicle_weight'
'model_year'
'origin'
'fuel_type'
'''
# Turn every training row into a dict and fit the vectorizer on those dicts
dv = DictVectorizer(sparse=True)
train_dicts = df_train.to_dict(orient='records')
X_train = dv.fit_transform(train_dicts)

# Encode the validation rows with the vectorizer fitted on the training rows
val_dicts = df_val.to_dict(orient='records')
X_val = dv.transform(val_dicts)

# Fit a depth-1 tree: a single split, so the printout shows the one feature it chose
dt = DecisionTreeRegressor(max_depth=1).fit(X_train, y_train)

# Render the tree as text, labelling columns with the vectorizer's feature names
feature_names = dv.get_feature_names_out()
tree_text = export_text(dt, feature_names=list(feature_names))
print(tree_text)

|--- vehicle_weight <= 3022.11
|   |--- value: [16.88]
|--- vehicle_weight >  3022.11
|   |--- value: [12.94]



In [None]:
'''
Question 2
Train a random forest regressor with these parameters:

n_estimators=10
random_state=1
n_jobs=-1 (optional - to make training faster)
What's the RMSE of this model on the validation data?

0.045
0.45
4.5
45.0
'''
# Build and fit the forest in one expression (fit() returns the estimator itself)
rf = RandomForestRegressor(n_estimators=10, random_state=1, n_jobs=-1).fit(X_train, y_train)

# Score the validation rows
y_pred = rf.predict(X_val)

# RMSE is the square root of the mean squared error
mse = mean_squared_error(y_val, y_pred)
rmse = np.sqrt(mse)
print("RMSE:", rmse)

RMSE: 0.4595777223092726


In [None]:
'''
Question 3
Now let's experiment with the n_estimators parameter

Try different values of this parameter from 10 to 200 with step 10.
Set random_state to 1.
Evaluate the model on the validation dataset.
After which value of n_estimators does RMSE stop improving? Consider 3 decimal places for calculating the answer.
10
25
80
200

If it doesn't stop improving, use the latest iteration number in your answer.
'''

# Validation RMSE for each n_estimators value, in loop order (10, 20, ..., 200).
# This list needs its own name: the scalar `rmse` below is rebound on every
# iteration, and sharing one name for both would leave nothing to append to.
rmse_scores = []

for n in range(10, 201, 10):
    # Train a fresh forest with n trees; random_state=1 as the question requires
    rf = RandomForestRegressor(n_estimators=n, random_state=1)
    rf.fit(X_train, y_train)

    # Evaluate on the validation set
    y_pred = rf.predict(X_val)

    mse = mean_squared_error(y_val, y_pred)
    rmse = np.sqrt(mse)
    rmse_scores.append(rmse)

    # Three decimals, matching the precision the question asks to compare at
    print(f"n_estimators: {n}, RMSE: {rmse:.3f}")

n_estimators: 10, RMSE: 0.4595777223092726
n_estimators: 20, RMSE: 0.45359067251247054
n_estimators: 30, RMSE: 0.45168672575457125
n_estimators: 40, RMSE: 0.4487208301736997
n_estimators: 50, RMSE: 0.4466568972416094
n_estimators: 60, RMSE: 0.44545970260811213
n_estimators: 70, RMSE: 0.4451263244986996
n_estimators: 80, RMSE: 0.4449843119777284
n_estimators: 90, RMSE: 0.4448614906399875
n_estimators: 100, RMSE: 0.4446518680868042
n_estimators: 110, RMSE: 0.44357876439860233
n_estimators: 120, RMSE: 0.4439118681233817
n_estimators: 130, RMSE: 0.443702590396687
n_estimators: 140, RMSE: 0.4433549955101688
n_estimators: 150, RMSE: 0.44289761494219454
n_estimators: 160, RMSE: 0.4427612219659299
n_estimators: 170, RMSE: 0.44280146504730905
n_estimators: 180, RMSE: 0.44236195357041347
n_estimators: 190, RMSE: 0.4424939711220692
n_estimators: 200, RMSE: 0.4424785084688597
