In [None]:
!pip install xgboost



In [None]:
import pandas as pd
import numpy as np
import warnings

# ML libraries
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.tree import DecisionTreeRegressor, export_text
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

import xgboost as xgb

# Suppress warnings for cleaner output
warnings.filterwarnings('ignore')

print("### Setup and Data Preparation ###")

### Setup and Data Preparation ###


In [None]:
# --- 1. Data Loading and Preparation ---

# Remote location of the car fuel-efficiency CSV
data_url = 'https://raw.githubusercontent.com/alexeygrigorev/datasets/master/car_fuel_efficiency.csv'

# Read the CSV and replace every missing value with 0 in one chained expression
df = pd.read_csv(data_url).fillna(0)

# Preview the first rows as the cell's output
df.head()

Unnamed: 0,engine_displacement,num_cylinders,horsepower,vehicle_weight,acceleration,model_year,origin,fuel_type,drivetrain,num_doors,fuel_efficiency_mpg
0,170,3.0,159.0,3413.433759,17.7,2003,Europe,Gasoline,All-wheel drive,0.0,13.231729
1,130,5.0,97.0,3149.664934,17.8,2007,USA,Gasoline,Front-wheel drive,0.0,13.688217
2,170,0.0,78.0,3079.038997,15.1,2018,Europe,Gasoline,Front-wheel drive,0.0,14.246341
3,220,4.0,0.0,2542.392402,20.2,2009,USA,Diesel,All-wheel drive,2.0,16.912736
4,210,1.0,140.0,3460.87099,14.4,2009,Europe,Gasoline,All-wheel drive,2.0,12.488369


In [None]:
# --- 2. Data Splitting (60/20/20) ---

# Step 1: hold out 20% of the rows as the test set; the remaining 80% is df_full_train.
df_full_train, df_test = train_test_split(
    df,
    test_size=0.2,
    random_state=1,
)

# Step 2: validation must be 20% of the original data, which is
# 0.2 / 0.8 = 0.25 of df_full_train; the rest (60% overall) is the training set.
df_train, df_val = train_test_split(
    df_full_train,
    test_size=0.25,
    random_state=1,
)

# Report the size of each partition as a sanity check
print(f"Train size: {len(df_train)}, Val size: {len(df_val)}, Test size: {len(df_test)}")

Train size: 5822, Val size: 1941, Test size: 1941


In [None]:
# --- 3. Target Variable (y) Preparation ---

# pop() removes the target column (fuel_efficiency_mpg) from the dataframe in
# place and returns it, so each line both extracts y and leaves a feature-only
# frame behind.
y_train = df_train.pop('fuel_efficiency_mpg').values
y_val = df_val.pop('fuel_efficiency_mpg').values
y_test = df_test.pop('fuel_efficiency_mpg').values

In [None]:
# --- 4. Vectorization (DictVectorizer) ---

# Convert each split into a list of per-row dictionaries ({column: value})
train_dicts, val_dicts = (
    frame.to_dict(orient='records') for frame in (df_train, df_val)
)

# Fit the vectorizer on the training rows only, then apply the learned
# mapping to the validation rows (sparse=True output)
dv = DictVectorizer(sparse=True)
X_train, X_val = dv.fit_transform(train_dicts), dv.transform(val_dicts)

# Column names of the encoded matrix, needed later for Q1 and Q5
feature_names = dv.get_feature_names_out()

print("Data preparation complete.")

Data preparation complete.


In [None]:
# --- Question 1: Decision Tree (max_depth=1) ---
print("\n### Question 1 ###")

# Train a depth-1 tree (a single split). random_state is pinned so that a tie
# between equally good candidate splits is resolved the same way on every run.
dt = DecisionTreeRegressor(max_depth=1, random_state=1)
dt.fit(X_train, y_train)

# Export the tree rules
tree_rules = export_text(dt, feature_names=feature_names.tolist())
print("Decision Tree (max_depth=1) rules:")
print(tree_rules)

# Read the split feature off the fitted tree instead of hard-coding it, so the
# printed answer always agrees with the rules shown above. Node 0 is the root;
# a negative feature index there means the root is a leaf (no split was made).
root_feature_idx = dt.tree_.feature[0]
if root_feature_idx >= 0:
    split_feature = feature_names[root_feature_idx]
    print(f"Answer 1: The feature used for splitting is '{split_feature}'.")
else:
    print("Answer 1: The tree did not split on any feature.")


### Question 1 ###
Decision Tree (max_depth=1) rules:
|--- vehicle_weight <= 3022.11
|   |--- value: [16.88]
|--- vehicle_weight >  3022.11
|   |--- value: [12.94]

Answer 1: The feature used for splitting is 'vehicle_weight'.


In [None]:
# --- Question 2: Random Forest (RMSE) ---
print("### Question 2 ###")

# Build and fit a 10-tree forest in one expression (fit() returns the estimator)
rf = RandomForestRegressor(n_estimators=10, random_state=1, n_jobs=-1).fit(X_train, y_train)

# Score the validation split
y_pred_val = rf.predict(X_val)

# RMSE is the square root of the mean squared error
mse_val = mean_squared_error(y_val, y_pred_val)
rmse = np.sqrt(mse_val)
print(f"RMSE on validation data: {rmse:.3f}")

### Question 2 ###
RMSE on validation data: 0.460


In [None]:
# --- Question 3: Tuning n_estimators ---
print("### Question 3 ###")

# (n_estimators, rounded validation RMSE) pairs, in increasing n_estimators order
scores = []
estimators_range = range(10, 201, 10)

for n in estimators_range:
    rf = RandomForestRegressor(n_estimators=n, random_state=1, n_jobs=-1)
    rf.fit(X_train, y_train)
    y_pred = rf.predict(X_val)
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))
    # Rounding to 3 decimal places as requested
    scores.append((n, round(rmse, 3)))

# Print the full table so the header is actually followed by the scores
# (previously the header was printed but the table itself was commented out).
print("RMSE scores by n_estimators (rounded to 3 decimals):")
print(pd.DataFrame(scores, columns=['n_estimators', 'rmse']).to_string(index=False))

# min() returns the first pair with the lowest rounded RMSE; because scores is
# ordered by n_estimators, that is the smallest forest reaching the minimum,
# i.e. the point after which the rounded RMSE no longer improves.
min_rmse = min(scores, key=lambda x: x[1])
print(f"Minimum RMSE ({min_rmse[1]}) first reached at n_estimators={min_rmse[0]}.")
print(f"Answer 3: RMSE stops improving after {min_rmse[0]} estimators.")

### Question 3 ###
RMSE scores by n_estimators (rounded to 3 decimals):
Minimum RMSE (0.442) first reached at n_estimators=180.
Answer 3: RMSE stops improving after 180 estimators.


In [None]:
# --- Question 4: Tuning max_depth ---
print("### Question 4 ###")

# Grid: every candidate depth is evaluated over the same range of forest sizes
depths = [10, 15, 20, 25]
estimators = range(10, 201, 10)

# max_depth -> mean validation RMSE across all forest sizes
mean_rmses = {}

for d in depths:
    # Validation RMSE of each forest size at this depth
    rmses = []
    for n in estimators:
        forest_params = dict(n_estimators=n, max_depth=d, random_state=1, n_jobs=-1)
        rf = RandomForestRegressor(**forest_params)
        rf.fit(X_train, y_train)
        y_pred = rf.predict(X_val)
        mse = mean_squared_error(y_val, y_pred)
        rmse = np.sqrt(mse)
        rmses.append(rmse)

    # Summarise this depth by its average RMSE over the whole n_estimators range
    mean_rmses[d] = np.mean(rmses)
    print(f"Depth: {d}, Mean RMSE: {mean_rmses[d]:.4f}")

# The depth with the lowest mean RMSE wins (first one on ties)
best_depth, _ = min(mean_rmses.items(), key=lambda item: item[1])
print(f"The best max_depth is {best_depth}.")

### Question 4 ###
Depth: 10, Mean RMSE: 0.4418
Depth: 15, Mean RMSE: 0.4454
Depth: 20, Mean RMSE: 0.4463
Depth: 25, Mean RMSE: 0.4459
The best max_depth is 10.


In [None]:
# --- Question 5: Feature Importance ---
print("### Question 5 ###")

# Train the model
rf_feat = RandomForestRegressor(n_estimators=10,
                                max_depth=20,
                                random_state=1,
                                n_jobs=-1)
rf_feat.fit(X_train, y_train)

# Get feature importances (one value per column of X_train)
importances = rf_feat.feature_importances_

# Map feature names to importances, most important first
feat_imp_df = pd.DataFrame({'feature': feature_names, 'importance': importances})
feat_imp_df = feat_imp_df.sort_values(by='importance', ascending=False)

print("Top 5 Feature Importances:")
print(feat_imp_df.head())

# Check the importances for the 4 options:
print("\nImportance of specific features:")
options = ['vehicle_weight', 'horsepower', 'acceleration', 'engine_displacement']
options_imp_df = feat_imp_df[feat_imp_df.feature.isin(options)]
print(options_imp_df)

# Derive the answer from the computed importances instead of hard-coding it,
# so the printed conclusion cannot drift from the table above.
top_option = options_imp_df.loc[options_imp_df.importance.idxmax(), 'feature']
print(f"Answer 5: The most important feature among the options is '{top_option}'.")

### Question 5 ###
Top 5 Feature Importances:
                feature  importance
13       vehicle_weight    0.959150
6            horsepower    0.015998
0          acceleration    0.011480
3   engine_displacement    0.003273
7            model_year    0.003212

Importance of specific features:
                feature  importance
13       vehicle_weight    0.959150
6            horsepower    0.015998
0          acceleration    0.011480
3   engine_displacement    0.003273
Answer 5: The most important feature among the options is 'vehicle_weight'.


In [None]:
# --- Question 6: XGBoost eta tuning ---
print("### Question 6 ###")


def train_xgb_with_eta(eta, dtrain, watchlist, num_boost_round=100):
    """Train an XGBoost regressor where only the learning rate varies.

    Args:
        eta: Learning rate placed in the parameter dict.
        dtrain: DMatrix used for training.
        watchlist: List of (DMatrix, name) pairs evaluated after each round.
        num_boost_round: Number of boosting rounds.

    Returns:
        Tuple (params, model, evals_result): the parameter dict that was used,
        the trained booster, and the per-round metrics recorded for each
        watchlist entry.
    """
    params = {
        'eta': eta,
        'max_depth': 6,
        'min_child_weight': 1,
        'objective': 'reg:squarederror',
        'nthread': 8,
        'seed': 1,
        'verbosity': 1,
    }
    evals_result = {}  # Filled in by xgb.train with the evaluation history
    model = xgb.train(params,
                      dtrain,
                      num_boost_round=num_boost_round,
                      evals=watchlist,
                      evals_result=evals_result,
                      verbose_eval=False)  # Suppress round-by-round output
    return params, model, evals_result


# Create DMatrix (XGBoost's internal data structure)
# Sparse matrices from DictVectorizer are compatible.
dtrain = xgb.DMatrix(X_train, label=y_train)
dval = xgb.DMatrix(X_val, label=y_val)

# Watchlist to monitor performance
watchlist = [(dtrain, 'train'), (dval, 'val')]

# --- Train with eta=0.3 ---
xgb_params_03, model_03, evals_result_03 = train_xgb_with_eta(0.3, dtrain, watchlist)

# Best (lowest) validation RMSE seen over all boosting rounds
rmse_03 = min(evals_result_03['val']['rmse'])
print(f"Best RMSE (eta=0.3): {rmse_03:.4f}")

# --- Train with eta=0.1 ---
xgb_params_01, model_01, evals_result_01 = train_xgb_with_eta(0.1, dtrain, watchlist)

rmse_01 = min(evals_result_01['val']['rmse'])
print(f"Best RMSE (eta=0.1): {rmse_01:.4f}")

# Derive the conclusion from the measured scores instead of hard-coding it
best_rmse_by_eta = {0.3: rmse_03, 0.1: rmse_01}
if rmse_03 == rmse_01:
    print("Answer 6: Both eta values give an equal RMSE score.")
else:
    best_eta = min(best_rmse_by_eta, key=best_rmse_by_eta.get)
    print(f"Answer 6: {best_eta} leads to the best RMSE score.")

### Question 6 ###
Best RMSE (eta=0.3): 0.4335
Best RMSE (eta=0.1): 0.4243
Answer 6: 0.1 leads to the best RMSE score.
