In [4]:
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor

In [5]:
# Source of the car fuel-efficiency dataset (fetched over HTTP on every run).
DATA_URL = 'https://raw.githubusercontent.com/alexeygrigorev/datasets/master/car_fuel_efficiency.csv'
df = pd.read_csv(DATA_URL)

In [6]:
# Preview the first five rows to check the column layout; note that some
# numeric cells (e.g. num_cylinders, horsepower) are missing in the raw data.
df.head()

Unnamed: 0,engine_displacement,num_cylinders,horsepower,vehicle_weight,acceleration,model_year,origin,fuel_type,drivetrain,num_doors,fuel_efficiency_mpg
0,170,3.0,159.0,3413.433759,17.7,2003,Europe,Gasoline,All-wheel drive,0.0,13.231729
1,130,5.0,97.0,3149.664934,17.8,2007,USA,Gasoline,Front-wheel drive,0.0,13.688217
2,170,,78.0,3079.038997,15.1,2018,Europe,Gasoline,Front-wheel drive,0.0,14.246341
3,220,4.0,,2542.392402,20.2,2009,USA,Diesel,All-wheel drive,2.0,16.912736
4,210,1.0,140.0,3460.87099,14.4,2009,Europe,Gasoline,All-wheel drive,2.0,12.488369


#### Data Preparation

In [7]:
# Impute every missing value with 0 before building features and target.
df = df.fillna(value=0)

# Separate the feature columns from the regression target.
X = df.drop(columns='fuel_efficiency_mpg')
y = df['fuel_efficiency_mpg'].values

# Stage 1: keep 60% of the rows for training and hold out the other 40%.
X_train_full, X_temp, y_train_full, y_temp = train_test_split(
    X, y, test_size=0.4, random_state=1
)

# Stage 2: cut the held-out 40% in half -> 20% validation, 20% test.
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.5, random_state=1
)

# DictVectorizer consumes one {column: value} dict per row.
train_dicts = X_train_full.to_dict(orient='records')
val_dicts = X_val.to_dict(orient='records')

# Fit the vectorizer on the training rows only, then reuse it for validation.
# NOTE: X_val is rebound here from a DataFrame to the encoded sparse matrix.
dv = DictVectorizer(sparse=True)
X_train = dv.fit_transform(train_dicts)
X_val = dv.transform(val_dicts)

#### Question 1

In [8]:
# A depth-1 tree makes exactly one split, so the feature chosen at its root
# is the one this question asks for.
dt = DecisionTreeRegressor(max_depth=1, random_state=1).fit(X_train, y_train_full)

# Index 0 of tree_.feature is read as the root split; map that column index
# back to a readable name via the vectorizer.
feature_index = dt.tree_.feature[0]
feature_name = dv.feature_names_[feature_index]
print(f"Feature used for splitting: {feature_name}")

Feature used for splitting: vehicle_weight


#### Question 2

In [9]:
# Baseline random forest: 10 trees, fixed seed for reproducibility, all CPU
# cores for training (n_jobs=-1).
rf = RandomForestRegressor(n_estimators=10, random_state=1, n_jobs=-1)
rf.fit(X_train, y_train_full)

# Score on the validation split; RMSE is the square root of the MSE.
y_pred = rf.predict(X_val)
rmse = np.sqrt(mean_squared_error(y_val, y_pred))
print(f"RMSE: {rmse:.2f}")

RMSE: 0.46


#### Question 3

In [20]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Sweep the forest size from 10 to 200 trees (step 10) and record the
# validation RMSE obtained for each size.
n_estimators_values = []
rmse_scores = []

print("n_estimators values:")
print("-" * 50)

for n in range(10, 201, 10):
    # Same seed for every forest so that only the tree count varies.
    rf = RandomForestRegressor(n_estimators=n, random_state=1, n_jobs=-1)
    rf.fit(X_train, y_train_full)

    # Validation error for this forest size.
    y_pred = rf.predict(X_val)
    rmse = np.sqrt(mean_squared_error(y_val, y_pred))

    # Keep the size alongside its RMSE rounded to 3 decimal places.
    n_estimators_values.append(n)
    rmse_scores.append(round(rmse, 3))

    print(f"n_estimators={n:3d}, RMSE={rmse:.3f}")

n_estimators values:
--------------------------------------------------
n_estimators= 10, RMSE=0.460
n_estimators= 20, RMSE=0.446
n_estimators= 30, RMSE=0.440
n_estimators= 40, RMSE=0.438
n_estimators= 50, RMSE=0.437
n_estimators= 60, RMSE=0.436
n_estimators= 70, RMSE=0.436
n_estimators= 80, RMSE=0.436
n_estimators= 90, RMSE=0.435
n_estimators=100, RMSE=0.435
n_estimators=110, RMSE=0.435
n_estimators=120, RMSE=0.435
n_estimators=130, RMSE=0.435
n_estimators=140, RMSE=0.435
n_estimators=150, RMSE=0.435
n_estimators=160, RMSE=0.435
n_estimators=170, RMSE=0.435
n_estimators=180, RMSE=0.435
n_estimators=190, RMSE=0.435
n_estimators=200, RMSE=0.435


#### Question 4

In [18]:
# Candidate tree depths; for each one the validation RMSE is averaged over
# every forest size from 10 to 200 trees (step 10).
max_depths = [10, 15, 20, 25]
results = {}  # max_depth -> mean validation RMSE across all forest sizes

for depth in max_depths:
    rmse_list = []

    for n in range(10, 201, 10):
        rf = RandomForestRegressor(
            n_estimators=n,
            max_depth=depth,
            random_state=1,
            n_jobs=-1
        )
        rf.fit(X_train, y_train_full)
        y_pred = rf.predict(X_val)
        rmse = np.sqrt(mean_squared_error(y_val, y_pred))
        rmse_list.append(rmse)

    # Averaging over forest sizes gives one comparable score per depth.
    mean_rmse = np.mean(rmse_list)
    results[depth] = mean_rmse
    print(f"max_depth={depth}, Mean RMSE={mean_rmse:.4f}")

# The depth with the lowest mean RMSE wins.
best_depth = min(results, key=results.get)
print(f"\nBest max_depth: {best_depth}")

max_depth=10, Mean RMSE=0.4362
max_depth=15, Mean RMSE=0.4378
max_depth=20, Mean RMSE=0.4377
max_depth=25, Mean RMSE=0.4377

Best max_depth: 10


#### Question 5

In [14]:
# Fit a small forest (10 trees, depth capped at 20) whose feature
# importances are inspected below.
rf = RandomForestRegressor(n_estimators=10, max_depth=20, random_state=1, n_jobs=-1)
rf.fit(X_train, y_train_full)

# Importances are positional; pair them with the vectorizer's feature names.
feature_names = dv.feature_names_
feature_importance = rf.feature_importances_

# Rank all features from most to least important.
importance_df = pd.DataFrame(
    {'feature': feature_names, 'importance': feature_importance}
).sort_values('importance', ascending=False)

print(importance_df.head(10))

# Report the four candidate features from the question explicitly.
features_to_check = ['vehicle_weight', 'horsepower',
                     'acceleration', 'engine_displacement']
for feat in features_to_check:
    idx = feature_names.index(feat)
    print(f"{feat}: {feature_importance[idx]:.4f}")

                feature  importance
13       vehicle_weight    0.959878
6            horsepower    0.015933
0          acceleration    0.011442
3   engine_displacement    0.003159
7            model_year    0.003066
8         num_cylinders    0.002323
9             num_doors    0.001576
12           origin=USA    0.000496
10          origin=Asia    0.000431
11        origin=Europe    0.000419
vehicle_weight: 0.9599
horsepower: 0.0159
acceleration: 0.0114
engine_displacement: 0.0032


#### Question 6

In [16]:
# xgb.train operates on DMatrix objects, so wrap the encoded feature matrices
# together with their targets here; without these definitions the cell only
# works if dtrain/dval happen to be left over in the kernel.
dtrain = xgb.DMatrix(X_train, label=y_train_full)
dval = xgb.DMatrix(X_val, label=y_val)

# Datasets evaluated after each boosting round. With several entries, XGBoost
# uses the last one ('val') for early stopping.
watchlist = [(dtrain, 'train'), (dval, 'val')]

params = {
    'eta': 0.3,
    'max_depth': 6,
    'min_child_weight': 1,
    'objective': 'reg:squarederror',
    'nthread': 8,
    'seed': 1,
    'verbosity': 1,
    'eval_metric': 'rmse',
}

# Run 1: eta=0.3. Training stops once the validation RMSE has not improved
# for 50 consecutive rounds (at most 1000 rounds); progress is logged every
# 10 rounds.
model_03 = xgb.train(
    params,
    dtrain,
    num_boost_round=1000,
    evals=watchlist,
    early_stopping_rounds=50,
    verbose_eval=10
)

# Run 2: identical settings except for the smaller learning rate eta=0.1.
params_01 = dict(params, eta=0.1)
model_01 = xgb.train(
    params_01,
    dtrain,
    num_boost_round=1000,
    evals=watchlist,
    early_stopping_rounds=50,
    verbose_eval=10
)

# best_score / best_iteration are set by early stopping: the best validation
# RMSE seen and the round at which it occurred.
print("eta=0.3 RMSE:", model_03.best_score)
print("eta=0.1 RMSE:", model_01.best_score)
print("Best iters:", model_03.best_iteration, model_01.best_iteration)

[0]	train-rmse:1.83282	val-rmse:1.82567
[10]	train-rmse:0.37400	val-rmse:0.43004
[20]	train-rmse:0.33724	val-rmse:0.42509
[30]	train-rmse:0.31757	val-rmse:0.42754
[40]	train-rmse:0.29886	val-rmse:0.43086
[50]	train-rmse:0.28277	val-rmse:0.43405
[60]	train-rmse:0.26525	val-rmse:0.43583
[64]	train-rmse:0.25800	val-rmse:0.43643
[0]	train-rmse:2.31334	val-rmse:2.30592
[10]	train-rmse:0.91846	val-rmse:0.92183
[20]	train-rmse:0.49316	val-rmse:0.52039
[30]	train-rmse:0.38552	val-rmse:0.43528
[40]	train-rmse:0.35546	val-rmse:0.41889
[50]	train-rmse:0.34115	val-rmse:0.41644
[60]	train-rmse:0.33254	val-rmse:0.41610
[70]	train-rmse:0.32553	val-rmse:0.41617
[80]	train-rmse:0.32012	val-rmse:0.41654
[90]	train-rmse:0.31718	val-rmse:0.41654
[100]	train-rmse:0.31114	val-rmse:0.41693
[110]	train-rmse:0.30496	val-rmse:0.41810
[118]	train-rmse:0.30032	val-rmse:0.41858
eta=0.3 RMSE: 0.4244959963107532
eta=0.1 RMSE: 0.4158992988437659
Best iters: 15 69
