In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

In [8]:
# Read the local copy of the fuel-efficiency dataset and preview its first five rows.
df = pd.read_csv("car_fuel_efficiency.csv")
print(df.head(n=5))

   engine_displacement  num_cylinders  horsepower  vehicle_weight  \
0                  170            3.0       159.0     3413.433759   
1                  130            5.0        97.0     3149.664934   
2                  170            NaN        78.0     3079.038997   
3                  220            4.0         NaN     2542.392402   
4                  210            1.0       140.0     3460.870990   

   acceleration  model_year  origin fuel_type         drivetrain  num_doors  \
0          17.7        2003  Europe  Gasoline    All-wheel drive        0.0   
1          17.8        2007     USA  Gasoline  Front-wheel drive        0.0   
2          15.1        2018  Europe  Gasoline  Front-wheel drive        0.0   
3          20.2        2009     USA    Diesel    All-wheel drive        2.0   
4          14.4        2009  Europe  Gasoline    All-wheel drive        2.0   

   fuel_efficiency_mpg  
0            13.231729  
1            13.688217  
2            14.246341  
3         

In [9]:
# Number of missing entries per column (isna is the canonical spelling of isnull).
df.isna().sum()

engine_displacement      0
num_cylinders          482
horsepower             708
vehicle_weight           0
acceleration           930
model_year               0
origin                   0
fuel_type                0
drivetrain               0
num_doors              502
fuel_efficiency_mpg      0
dtype: int64

In [10]:
# Numeric feature columns plus the target, selected for a quick preview.
columns = [
    'engine_displacement',
    'horsepower',
    'vehicle_weight',
    'model_year',
    'fuel_efficiency_mpg'
]

# Keep the subset under its own name instead of overwriting `df`.
# The modelling cell below re-reads the full dataset into `df`, so replacing
# `df` here never influenced a result; it only made the frame inspected by
# the earlier cells differ from what a reader sees after running this one.
df_subset = df[columns]

print(df_subset.head())

   engine_displacement  horsepower  vehicle_weight  model_year  \
0                  170       159.0     3413.433759        2003   
1                  130        97.0     3149.664934        2007   
2                  170        78.0     3079.038997        2018   
3                  220         NaN     2542.392402        2009   
4                  210       140.0     3460.870990        2009   

   fuel_efficiency_mpg  
0            13.231729  
1            13.688217  
2            14.246341  
3            16.912736  
4            12.488369  


#### Preparing the dataset
    Preparation:

        Fill missing values with zeros.
        Do train/validation/test split with 60%/20%/20% distribution.
        Use the train_test_split function and set the random_state parameter to 1.
        Use DictVectorizer(sparse=True) to turn the dataframes into matrices.

### Question 1
    Let's train a decision tree regressor to predict the fuel_efficiency_mpg variable.

    Train a model with max_depth=1.
    Which feature is used for splitting the data?

    'vehicle_weight'
    'model_year'
    'origin'
    'fuel_type'

In [11]:

# Fetch the dataset from the course repository and replace every missing value with 0
url = "https://raw.githubusercontent.com/alexeygrigorev/datasets/master/car_fuel_efficiency.csv"
df = pd.read_csv(url).fillna(0)

# 60/20/20 split: hold out 20% for test, then take 25% of the remaining 80%
# for validation (0.25 * 0.8 = 0.2 -> 60/20/20 overall)
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=1)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=1)

# Renumber the rows of each partition from zero
df_train, df_val, df_test = (
    part.reset_index(drop=True) for part in (df_train, df_val, df_test)
)

# Take the target out of each partition: pop() returns the column and
# removes it from the feature frame in one step
y_train = df_train.pop('fuel_efficiency_mpg').values
y_val = df_val.pop('fuel_efficiency_mpg').values
y_test = df_test.pop('fuel_efficiency_mpg').values

# Convert every partition to a list of row dicts for the vectorizer
train_dicts = df_train.to_dict(orient='records')
val_dicts = df_val.to_dict(orient='records')
test_dicts = df_test.to_dict(orient='records')

# The vectorizer is fitted on the training rows only and then reused as-is
# for validation and test
dv = DictVectorizer(sparse=True)
X_train = dv.fit_transform(train_dicts)
X_val = dv.transform(val_dicts)
X_test = dv.transform(test_dicts)

# A depth-1 tree makes exactly one split, so only the feature it splits on
# receives a non-zero importance
dt = DecisionTreeRegressor(max_depth=1, random_state=1).fit(X_train, y_train)

feature_importances = pd.Series(
    dt.feature_importances_,
    index=dv.get_feature_names_out(),
)
print("Feature importances:")
print(feature_importances.loc[feature_importances > 0])


Feature importances:
vehicle_weight    1.0
dtype: float64


### Question 2
    Train a random forest regressor with these parameters:

    n_estimators=10
    random_state=1
    n_jobs=-1 (optional - to make training faster)
    What's the RMSE of this model on the validation data?

    0.045
    0.45
    4.5
    45.0

In [12]:
# Fit a 10-tree random forest; the fixed random_state keeps the run reproducible.
# (mean_squared_error and numpy are already imported in the notebook's first
# cell, so they are not re-imported here.)
rf = RandomForestRegressor(
    n_estimators=10,
    random_state=1,
    n_jobs=-1
)
rf.fit(X_train, y_train)

# Predict on validation data
y_pred = rf.predict(X_val)

# RMSE is the square root of the MSE; taking the root manually avoids
# relying on the 'squared' kwarg of mean_squared_error
mse = mean_squared_error(y_val, y_pred)
rmse = np.sqrt(mse)
print("Validation RMSE:", round(rmse, 3))



Validation RMSE: 0.46


### Question 3
    Now let's experiment with the n_estimators parameter

    Try different values of this parameter from 10 to 200 with step 10.
    Set random_state to 1.
    Evaluate the model on the validation dataset.
    After which value of n_estimators does RMSE stop improving? Consider 3 decimal places for calculating the answer.

    10
    25
    80
    200
    If it doesn't stop improving, use the latest iteration number in your answer.

In [13]:
# Validation RMSE for forests of 10, 20, ..., 200 trees, same seed every time.
# Each entry of `scores` is an (n_estimators, rmse) pair.
scores = []

for n in range(10, 201, 10):
    rf = RandomForestRegressor(
        n_estimators=n,
        random_state=1,
        n_jobs=-1
    )
    rf.fit(X_train, y_train)
    y_pred = rf.predict(X_val)
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    scores.append((n, rmse))

# Display results
for n, s in scores:
    print(f"n_estimators={n:3d}  ->  RMSE={s:.3f}")

# Find where improvement stops. The question compares RMSE at 3 decimal
# places, so the comparison is made on rounded values: differences beyond
# the third decimal must not move the answer to a later n_estimators.
# min() returns the first of several equal minima, i.e. the smallest
# n_estimators that reaches the best rounded RMSE.
best_rmse = min(scores, key=lambda x: round(x[1], 3))
print("\nBest RMSE:", round(best_rmse[1], 3), "at n_estimators =", best_rmse[0])


n_estimators= 10  ->  RMSE=0.460
n_estimators= 20  ->  RMSE=0.454
n_estimators= 30  ->  RMSE=0.452
n_estimators= 40  ->  RMSE=0.449
n_estimators= 50  ->  RMSE=0.447
n_estimators= 60  ->  RMSE=0.445
n_estimators= 70  ->  RMSE=0.445
n_estimators= 80  ->  RMSE=0.445
n_estimators= 90  ->  RMSE=0.445
n_estimators=100  ->  RMSE=0.445
n_estimators=110  ->  RMSE=0.444
n_estimators=120  ->  RMSE=0.444
n_estimators=130  ->  RMSE=0.444
n_estimators=140  ->  RMSE=0.443
n_estimators=150  ->  RMSE=0.443
n_estimators=160  ->  RMSE=0.443
n_estimators=170  ->  RMSE=0.443
n_estimators=180  ->  RMSE=0.442
n_estimators=190  ->  RMSE=0.442
n_estimators=200  ->  RMSE=0.442

Best RMSE: 0.442 at n_estimators = 180


### Question 4
    Let's select the best max_depth:

    Try different values of max_depth: [10, 15, 20, 25]
    For each of these values,
    try different values of n_estimators from 10 to 200 (with step 10)
    calculate the mean RMSE
    Fix the random seed: random_state=1
    What's the best max_depth, using the mean RMSE?

    10
    15
    20
    25

In [14]:
# Candidate tree depths; for each one the validation RMSE is averaged over
# forests of 10, 20, ..., 200 trees.
depths = [10, 15, 20, 25]
results = []

for d in depths:
    # Validation RMSE of every forest size at this depth
    scores = []
    for n in range(10, 201, 10):
        rf = RandomForestRegressor(
            n_estimators=n, max_depth=d, random_state=1, n_jobs=-1
        ).fit(X_train, y_train)
        y_pred = rf.predict(X_val)
        rmse = np.sqrt(mean_squared_error(y_val, y_pred))
        scores += [rmse]

    mean_rmse = np.mean(scores)
    results += [(d, mean_rmse)]
    print(f"max_depth={d:2d} -> mean RMSE={mean_rmse:.3f}")

# The best depth is the one with the lowest mean RMSE
best = min(results, key=lambda pair: pair[1])
print("\nBest max_depth:", best[0], "with mean RMSE =", round(best[1], 3))


max_depth=10 -> mean RMSE=0.442
max_depth=15 -> mean RMSE=0.445
max_depth=20 -> mean RMSE=0.446
max_depth=25 -> mean RMSE=0.446

Best max_depth: 10 with mean RMSE = 0.442


### Question 5
        We can extract feature importance information from tree-based models.

        At each step of the decision tree learning algorithm, it finds the best split. When doing so, we can calculate the "gain" - the reduction in impurity achieved by the split (impurity before the split minus impurity after it). This gain is quite useful for understanding which features are important for tree-based models.

        In Scikit-Learn, tree-based models contain this information in the feature_importances_ field.

        For this homework question, we'll find the most important feature:

        Train the model with these parameters:
        n_estimators=10,
        max_depth=20,
        random_state=1,
        n_jobs=-1 (optional)
        Get the feature importance information from this model
        What's the most important feature (among these 4)?

vehicle_weight
horsepower
acceleration
engine_displacement

In [15]:
# Train Random Forest with the given parameters
rf = RandomForestRegressor(
    n_estimators=10,
    max_depth=20,
    random_state=1,
    n_jobs=-1
)
rf.fit(X_train, y_train)

# Get feature importances
importances = pd.Series(rf.feature_importances_, index=dv.get_feature_names_out())
importances_sorted = importances.sort_values(ascending=False)

# Display the top features
print(importances_sorted.head(10))


vehicle_weight         0.959150
horsepower             0.015998
acceleration           0.011480
engine_displacement    0.003273
model_year             0.003212
num_cylinders          0.002343
num_doors              0.001635
origin=USA             0.000540
origin=Europe          0.000519
origin=Asia            0.000462
dtype: float64


### Question 6
Now let's train an XGBoost model! For this question, we'll tune the eta parameter:

Install XGBoost
Create DMatrix for train and validation
Create a watchlist
Train a model with these parameters for 100 rounds:
xgb_params = {
    'eta': 0.3, 
    'max_depth': 6,
    'min_child_weight': 1,
    
    'objective': 'reg:squarederror',
    'nthread': 8,
    
    'seed': 1,
    'verbosity': 1,
}
Now change eta from 0.3 to 0.1.

Which eta leads to the best RMSE score on the validation dataset?

0.3
0.1
Both give equal value

In [16]:
import xgboost as xgb

In [17]:
# xgboost is imported as `xgb` in the preceding cell, so it is not
# re-imported here.

# 1) Create DMatrix for train and validation
dtrain = xgb.DMatrix(X_train, label=y_train)
dval   = xgb.DMatrix(X_val, label=y_val)

watchlist = [(dtrain, 'train'), (dval, 'val')]

# 2) Base parameters; only eta is varied between the two runs
xgb_params = {
    'eta': 0.3,
    'max_depth': 6,
    'min_child_weight': 1,
    'objective': 'reg:squarederror',
    'nthread': 8,
    'seed': 1,
    'verbosity': 1
}


def train_and_score_xgb(params, eta):
    """Train a booster for 100 rounds with the given eta and score it on validation.

    Parameters
    ----------
    params : dict
        Base XGBoost parameters. The dict is not modified; a copy with
        'eta' overridden is passed to the trainer.
    eta : float
        Learning rate to use for this run.

    Returns
    -------
    tuple
        (trained booster, validation predictions, validation RMSE).

    Reads `dtrain`, `dval`, `watchlist` and `y_val` from this cell's scope.
    """
    run_params = {**params, 'eta': eta}
    booster = xgb.train(
        run_params,
        dtrain,
        num_boost_round=100,
        evals=watchlist,
        verbose_eval=False,
    )
    preds = booster.predict(dval)
    return booster, preds, np.sqrt(mean_squared_error(y_val, preds))


# 3) Train and evaluate with eta=0.3
model_03, y_pred_03, rmse_03 = train_and_score_xgb(xgb_params, 0.3)
print("RMSE with eta=0.3:", round(rmse_03, 3))

# 4) Train and evaluate with eta=0.1 (xgb_params itself keeps eta=0.3)
model_01, y_pred_01, rmse_01 = train_and_score_xgb(xgb_params, 0.1)
print("RMSE with eta=0.1:", round(rmse_01, 3))


RMSE with eta=0.3: 0.45
RMSE with eta=0.1: 0.426
