In [16]:
# Import libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.tree import DecisionTreeRegressor, export_text
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import xgboost as xgb

In [2]:
#!wget https://raw.githubusercontent.com/alexeygrigorev/datasets/master/car_fuel_efficiency.csv

In [3]:
# Load the car fuel-efficiency dataset from the working directory
# and preview its first five rows as plain text.
df = pd.read_csv(filepath_or_buffer='car_fuel_efficiency.csv')
print(df.head(n=5))

   engine_displacement  num_cylinders  horsepower  vehicle_weight  \
0                  170            3.0       159.0     3413.433759   
1                  130            5.0        97.0     3149.664934   
2                  170            NaN        78.0     3079.038997   
3                  220            4.0         NaN     2542.392402   
4                  210            1.0       140.0     3460.870990   

   acceleration  model_year  origin fuel_type         drivetrain  num_doors  \
0          17.7        2003  Europe  Gasoline    All-wheel drive        0.0   
1          17.8        2007     USA  Gasoline  Front-wheel drive        0.0   
2          15.1        2018  Europe  Gasoline  Front-wheel drive        0.0   
3          20.2        2009     USA    Diesel    All-wheel drive        2.0   
4          14.4        2009  Europe  Gasoline    All-wheel drive        2.0   

   fuel_efficiency_mpg  
0            13.231729  
1            13.688217  
2            14.246341  
3         

In [4]:
# Count the missing values in each column (isna is the alias of isnull).
print(df.isna().sum())

engine_displacement      0
num_cylinders          482
horsepower             708
vehicle_weight           0
acceleration           930
model_year               0
origin                   0
fuel_type                0
drivetrain               0
num_doors              502
fuel_efficiency_mpg      0
dtype: int64


In [5]:
# Replace every missing value with 0 ...
df = df.fillna(value=0)

# ... and confirm that no column has nulls left.
print(df.isna().sum())

engine_displacement    0
num_cylinders          0
horsepower             0
vehicle_weight         0
acceleration           0
model_year             0
origin                 0
fuel_type              0
drivetrain             0
num_doors              0
fuel_efficiency_mpg    0
dtype: int64


In [6]:
# Split the data 60% / 20% / 20% into train / validation / test.
# Step 1 holds out 20% for test; step 2 takes 25% of the remaining 80%
# (= 20% of the whole) for validation. random_state=1 fixes both shuffles.
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=1)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=1)

# Discard the shuffled original row labels so each part is indexed 0..n-1
df_train, df_val, df_test = (
    part.reset_index(drop=True) for part in (df_train, df_val, df_test)
)

# Separate the target from the features: pop() returns the column and
# removes it from the frame in one step.
target = 'fuel_efficiency_mpg'
y_train = df_train.pop(target).values
y_test = df_test.pop(target).values
y_val = df_val.pop(target).values

In [7]:
'''
Question 1
Let's train a decision tree regressor to predict the fuel_efficiency_mpg variable.

Use DictVectorizer(sparse=True) to turn the dataframes into matrices.

Train a model with max_depth=1.
Which feature is used for splitting the data?

'vehicle_weight'
'model_year'
'origin'
'fuel_type'
'''
# Turn the training rows into a sparse feature matrix; the vectorizer is
# fitted on the training records only.
dv = DictVectorizer(sparse=True)
train_dicts = df_train.to_dict(orient='records')
X_train = dv.fit_transform(train_dicts)

# Encode the validation rows with the already-fitted vectorizer
# (X_val is reused by the later model-evaluation cells).
val_dicts = df_val.to_dict(orient='records')
X_val = dv.transform(val_dicts)

# A depth-1 tree makes exactly one split, so its printed structure
# names the feature used for splitting.
dt = DecisionTreeRegressor(max_depth=1).fit(X_train, y_train)

feature_names = dv.get_feature_names_out()
tree_text = export_text(dt, feature_names=feature_names.tolist())
print(tree_text)

|--- vehicle_weight <= 3022.11
|   |--- value: [16.88]
|--- vehicle_weight >  3022.11
|   |--- value: [12.94]



In [8]:
'''
Question 2
Train a random forest regressor with these parameters:

n_estimators=10
random_state=1
n_jobs=-1 (optional - to make training faster)
What's the RMSE of this model on the validation data?

0.045
0.45
4.5
45.0
'''
# Fit a seeded 10-tree random forest on the training matrix, using all cores
rf = RandomForestRegressor(n_estimators=10, random_state=1, n_jobs=-1).fit(X_train, y_train)

# Score it on the validation set: RMSE is the square root of the MSE
y_pred = rf.predict(X_val)
mse = mean_squared_error(y_val, y_pred)
rmse = np.sqrt(mse)
print("RMSE:", rmse)

RMSE: 0.4595777223092726


In [None]:
'''
Question 3
Now let's experiment with the n_estimators parameter

Try different values of this parameter from 10 to 200 with step 10.
Set random_state to 1.
Evaluate the model on the validation dataset.
After which value of n_estimators does RMSE stop improving? Consider 3 decimal places for calculating the answer.
10
25
80
200

If it doesn't stop improving, use the latest iteration number in your answer.
'''

# Sweep n_estimators over 10, 20, ..., 200 and record the validation RMSE
# of a seeded random forest for each setting.
n_values = list(range(10, 201, 10))
rmse_list = []

for n in n_values:
    # n_jobs=-1 trains the trees on all cores, matching the other forest
    # cells; the seed is still fixed by random_state.
    rf = RandomForestRegressor(n_estimators=n, random_state=1, n_jobs=-1)
    rf.fit(X_train, y_train)

    y_pred = rf.predict(X_val)

    mse = mean_squared_error(y_val, y_pred)
    rmse = np.sqrt(mse)
    rmse_list.append(rmse)

    print(f"n_estimators: {n}, RMSE: {rmse:.3f}")

# Derive the answer from the sweep instead of hard-coding it.
# At 3 decimal places, RMSE stops improving at the first n_estimators that
# reaches the lowest rounded RMSE (np.argmin returns the first occurrence).
# If RMSE improves all the way to the end, this is the last value tried.
rmse_rounded = np.round(rmse_list, 3)
best_n = n_values[int(np.argmin(rmse_rounded))]
print(f"n_estimators: {best_n}")

n_estimators: 10, RMSE: 0.460
n_estimators: 20, RMSE: 0.454
n_estimators: 30, RMSE: 0.452
n_estimators: 40, RMSE: 0.449
n_estimators: 50, RMSE: 0.447
n_estimators: 60, RMSE: 0.445
n_estimators: 70, RMSE: 0.445
n_estimators: 80, RMSE: 0.445
n_estimators: 90, RMSE: 0.445
n_estimators: 100, RMSE: 0.445
n_estimators: 110, RMSE: 0.444
n_estimators: 120, RMSE: 0.444
n_estimators: 130, RMSE: 0.444
n_estimators: 140, RMSE: 0.443
n_estimators: 150, RMSE: 0.443
n_estimators: 160, RMSE: 0.443
n_estimators: 170, RMSE: 0.443
n_estimators: 180, RMSE: 0.442
n_estimators: 190, RMSE: 0.442
n_estimators: 200, RMSE: 0.442


In [13]:
'''
Question 4
Let's select the best max_depth:

Try different values of max_depth: [10, 15, 20, 25]
For each of these values,
try different values of n_estimators from 10 till 200 (with step 10)
calculate the mean RMSE
Fix the random seed: random_state=1

What's the best max_depth, using the mean RMSE?
10
15
20
25
'''
# For each candidate max_depth, train seeded random forests with
# n_estimators = 10, 20, ..., 200 and average their validation RMSEs.
max_depths = [10, 15, 20, 25]

# Maps max_depth -> mean validation RMSE over the n_estimators sweep
results = {}

for depth in max_depths:
    rmse_list = []
    for n in range(10, 201, 10):
        rf = RandomForestRegressor(n_estimators=n, max_depth=depth, random_state=1, n_jobs=-1)
        rf.fit(X_train, y_train)

        y_pred = rf.predict(X_val)
        rmse = np.sqrt(mean_squared_error(y_val, y_pred))
        rmse_list.append(rmse)

    mean_rmse = float(np.mean(rmse_list))
    results[depth] = mean_rmse
    print(f"max_depth: {depth}, Mean RMSE over n_estimators: {mean_rmse:.3f}")

# Derive the answer from the results instead of hard-coding it:
# the best depth is the one with the lowest mean RMSE.
best_depth = min(results, key=results.get)
print(f"max_depth: {best_depth}")

        


max_depth: 10, Mean RMSE over n_estimators: 0.442
max_depth: 15, Mean RMSE over n_estimators: 0.445
max_depth: 20, Mean RMSE over n_estimators: 0.446
max_depth: 25, Mean RMSE over n_estimators: 0.446


In [14]:
'''
Question 5
We can extract feature importance information from tree-based models.

At each step of the decision tree learning algorithm, it finds the best split. When doing it, we can calculate "gain" - the reduction in impurity before and after the split. This gain is quite useful in understanding what are the important features for tree-based models.

In Scikit-Learn, tree-based models contain this information in the feature_importances_ field.

For this homework question, we'll find the most important feature:

Train the model with these parameters:
n_estimators=10,
max_depth=20,
random_state=1,
n_jobs=-1 (optional)
Get the feature importance information from this model
What's the most important feature (among these 4)?

vehicle_weight
horsepower
acceleration
engine_displacement
'''

# Train the forest with the parameters given in the question
rf = RandomForestRegressor(n_estimators=10, max_depth=20, random_state=1, n_jobs=-1)
rf.fit(X_train, y_train)

# Column names of the vectorized matrix, aligned with the importance scores
feature_names = dv.get_feature_names_out()
importances = rf.feature_importances_

# argsort is ascending, so reverse it to list the most important feature first
sorted_idx = np.argsort(importances)[::-1]
for name, score in zip(feature_names[sorted_idx], importances[sorted_idx]):
    print(f"{name}: {score:.3f}")



vehicle_weight: 0.959
horsepower: 0.016
acceleration: 0.011
engine_displacement: 0.003
model_year: 0.003
num_cylinders: 0.002
num_doors: 0.002
origin=USA: 0.001
origin=Europe: 0.001
origin=Asia: 0.000
fuel_type=Gasoline: 0.000
drivetrain=All-wheel drive: 0.000
drivetrain=Front-wheel drive: 0.000
fuel_type=Diesel: 0.000


In [17]:
'''
Question 6
Now let's train an XGBoost model! For this question, we'll tune the eta parameter:
Install XGBoost
Create DMatrix for train and validation
Create a watchlist

Train a model with these parameters for 100 rounds:
xgb_params = {
    'eta': 0.3, 
    'max_depth': 6,
    'min_child_weight': 1,
    'objective': 'reg:squarederror',
    'nthread': 8,
    'seed': 1,
    'verbosity': 1,
}

Now change eta from 0.3 to 0.1.

Which eta leads to the best RMSE score on the validation dataset?
0.3
0.1
Both give equal value
'''

# Wrap the train / validation matrices and their targets for XGBoost
dtrain = xgb.DMatrix(X_train, label=y_train)
dval = xgb.DMatrix(X_val, label=y_val)

# Datasets evaluated during training (verbose_eval=False is passed below)
watchlist = [(dtrain, 'train'), (dval, 'val')]

# Parameters given in the question, starting with eta = 0.3
xgb_params = dict(
    eta=0.3,
    max_depth=6,
    min_child_weight=1,
    objective='reg:squarederror',
    nthread=8,
    seed=1,
    verbosity=1,
)

# --- eta = 0.3: train for 100 rounds and score on validation ---
model_03 = xgb.train(
    xgb_params,
    dtrain,
    num_boost_round=100,
    evals=watchlist,
    verbose_eval=False,
)
y_pred_03 = model_03.predict(dval)
rmse_03 = np.sqrt(mean_squared_error(y_val, y_pred_03))
print(f'RMSE (eta=0.3): {rmse_03:.3f}')

# --- eta = 0.1: same setup, only the learning rate changes ---
xgb_params.update(eta=0.1)
model_01 = xgb.train(
    xgb_params,
    dtrain,
    num_boost_round=100,
    evals=watchlist,
    verbose_eval=False,
)
y_pred_01 = model_01.predict(dval)
rmse_01 = np.sqrt(mean_squared_error(y_val, y_pred_01))
print(f'RMSE (eta=0.1): {rmse_01:.3f}')


RMSE (eta=0.3): 0.450
RMSE (eta=0.1): 0.426
