In [1]:
import pandas as pd

In [2]:
df = pd.read_csv("https://raw.githubusercontent.com/alexeygrigorev/datasets/master/car_fuel_efficiency.csv")
df.head()

Unnamed: 0,engine_displacement,num_cylinders,horsepower,vehicle_weight,acceleration,model_year,origin,fuel_type,drivetrain,num_doors,fuel_efficiency_mpg
0,170,3.0,159.0,3413.433759,17.7,2003,Europe,Gasoline,All-wheel drive,0.0,13.231729
1,130,5.0,97.0,3149.664934,17.8,2007,USA,Gasoline,Front-wheel drive,0.0,13.688217
2,170,,78.0,3079.038997,15.1,2018,Europe,Gasoline,Front-wheel drive,0.0,14.246341
3,220,4.0,,2542.392402,20.2,2009,USA,Diesel,All-wheel drive,2.0,16.912736
4,210,1.0,140.0,3460.87099,14.4,2009,Europe,Gasoline,All-wheel drive,2.0,12.488369


In [3]:
df = df.fillna(0)
df.head()

Unnamed: 0,engine_displacement,num_cylinders,horsepower,vehicle_weight,acceleration,model_year,origin,fuel_type,drivetrain,num_doors,fuel_efficiency_mpg
0,170,3.0,159.0,3413.433759,17.7,2003,Europe,Gasoline,All-wheel drive,0.0,13.231729
1,130,5.0,97.0,3149.664934,17.8,2007,USA,Gasoline,Front-wheel drive,0.0,13.688217
2,170,0.0,78.0,3079.038997,15.1,2018,Europe,Gasoline,Front-wheel drive,0.0,14.246341
3,220,4.0,0.0,2542.392402,20.2,2009,USA,Diesel,All-wheel drive,2.0,16.912736
4,210,1.0,140.0,3460.87099,14.4,2009,Europe,Gasoline,All-wheel drive,2.0,12.488369


In [4]:
from sklearn.model_selection import train_test_split
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=1)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=1)

df_train = df_train.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
y_train = df_train['fuel_efficiency_mpg'].values
y_test = df_test['fuel_efficiency_mpg'].values
y_val = df_val['fuel_efficiency_mpg'].values

del df_train['fuel_efficiency_mpg']
del df_test['fuel_efficiency_mpg']
del df_val['fuel_efficiency_mpg']

In [6]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.feature_extraction import DictVectorizer
from sklearn.tree import export_text

train_dicts = df_train.to_dict(orient='records')
dv = DictVectorizer(sparse=True)
X_train = dv.fit_transform(train_dicts)

dt_reg = DecisionTreeRegressor(max_depth=1)
dt_reg.fit(X_train, y_train)

print(export_text(dt_reg, feature_names=dv.get_feature_names_out()))


|--- vehicle_weight <= 3022.11
|   |--- value: [16.88]
|--- vehicle_weight >  3022.11
|   |--- value: [12.94]



In [27]:
from sklearn.ensemble  import RandomForestRegressor
from sklearn.metrics import root_mean_squared_error

rf_reg = RandomForestRegressor(random_state=1, n_estimators=10, n_jobs=-1)
rf_reg.fit(X_train, y_train)
val_dicts = df_val.to_dict(orient='records')
X_val = dv.transform(val_dicts)

y_pred = rf_reg.predict(X_val)
rmse = root_mean_squared_error(y_val, y_pred)
print(f"RMSE: {rmse}")

RMSE: 0.4595777223092726


In [68]:
import numpy as np
n_estimators_array = np.arange(10, 210, 10)
for n_estimators in n_estimators_array:
    rf_reg = RandomForestRegressor(random_state=1, n_estimators=n_estimators, n_jobs=-1)
    rf_reg.fit(X_train, y_train)
    val_dicts = df_val.to_dict(orient='records')
    X_val = dv.transform(val_dicts)
    
    y_pred = rf_reg.predict(X_val)
    rmse = root_mean_squared_error(y_val, y_pred)
    print(f"for {n_estimators} n_estimators the RMSE: {rmse}")

for 10 n_estimators the RMSE: 0.4595777223092726
for 20 n_estimators the RMSE: 0.45359067251247054
for 30 n_estimators the RMSE: 0.45168672575457125
for 40 n_estimators the RMSE: 0.4487208301736997
for 50 n_estimators the RMSE: 0.4466568972416094
for 60 n_estimators the RMSE: 0.4454597026081121
for 70 n_estimators the RMSE: 0.4451263244986996
for 80 n_estimators the RMSE: 0.4449843119777284
for 90 n_estimators the RMSE: 0.4448614906399875
for 100 n_estimators the RMSE: 0.4446518680868042
for 110 n_estimators the RMSE: 0.4435787643986024
for 120 n_estimators the RMSE: 0.4439118681233817
for 130 n_estimators the RMSE: 0.443702590396687
for 140 n_estimators the RMSE: 0.4433549955101688
for 150 n_estimators the RMSE: 0.44289761494219454
for 160 n_estimators the RMSE: 0.4427612219659299
for 170 n_estimators the RMSE: 0.44280146504730905
for 180 n_estimators the RMSE: 0.44236195357041347
for 190 n_estimators the RMSE: 0.44249397112206923


In [None]:
import numpy as np
n_estimators_array = np.arange(10, 210, 10)
max_depth_array = [10, 15, 20, 25]
for max_depth in max_depth_array:
    for n_estimators in n_estimators_array:
        rf_reg = RandomForestRegressor(max_depth=max_depth, random_state=1, n_estimators=n_estimators, n_jobs=-1)
        rf_reg.fit(X_train, y_train)
        val_dicts = df_val.to_dict(orient='records')
        X_val = dv.transform(val_dicts)
        
        y_pred = rf_reg.predict(X_val)
        rmse = root_mean_squared_error(y_val, y_pred)
        print(f"for {max_depth} and for {n_estimators} n_estimators the RMSE: {rmse}")

for 10 and for 10 n_estimators the RMSE: 0.4502486597058524
for 10 and for 20 n_estimators the RMSE: 0.44685703362920204
for 10 and for 30 n_estimators the RMSE: 0.44547396459413735
for 10 and for 40 n_estimators the RMSE: 0.4430673112962584
for 10 and for 50 n_estimators the RMSE: 0.44195668621793566


In [25]:
from sklearn.ensemble  import RandomForestRegressor
from sklearn.metrics import root_mean_squared_error

train_dicts = df_train.to_dict(orient='records')
dv = DictVectorizer(sparse=True)
X_train = dv.fit_transform(train_dicts)
model  = RandomForestRegressor(random_state=1, n_estimators=10, max_depth=20, n_jobs=-1)
model .fit(X_train, y_train)

feature_names = dv.feature_names_
importances = model.feature_importances_

for name, importance in sorted(zip(feature_names, importances), reverse=True):
    print(name, importance)

vehicle_weight 0.9591499647407432
origin=USA 0.0005397216891829147
origin=Europe 0.000518739638586969
origin=Asia 0.0004622464955097423
num_doors 0.0016349895439306998
num_cylinders 0.0023433469524512048
model_year 0.003212300094794675
horsepower 0.015997897714266237
fuel_type=Gasoline 0.00036038360069172865
fuel_type=Diesel 0.000325424322869738
engine_displacement 0.0032727919136094864
drivetrain=Front-wheel drive 0.00034538411263183535
drivetrain=All-wheel drive 0.0003571085493021933
acceleration 0.01147970063142938


In [32]:
%pip install xgboost
import xgboost as xgb

dv = DictVectorizer(sparse=True)
X_train = dv.fit_transform(train_dicts)
val_dicts = df_val.to_dict(orient='records')
X_val = dv.transform(val_dicts)
dtrain = xgb.DMatrix(X_train, label=y_train)
dval = xgb.DMatrix(X_val, label=y_val)

watchlist = [(dtrain, 'train'), (dval, 'val')]

xgb_params = {
    'eta': 0.3, 
    'max_depth': 6,
    'min_child_weight': 1,
    
    'objective': 'reg:squarederror',
    'nthread': 8,
    
    'seed': 1,
    'verbosity': 1,
}

for eta in [0.3, 0.1]:
    xgb_params['eta'] = eta
    print(f'Training model with eta={eta}')
    xgb.train(
        params=xgb_params,
        dtrain=dtrain,
        num_boost_round=100,
        evals=watchlist
)


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.1.1[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.
Training model with eta=0.3
[0]	train-rmse:1.81393	val-rmse:1.85444
[1]	train-rmse:1.31919	val-rmse:1.35353
[2]	train-rmse:0.98120	val-rmse:1.01316
[3]	train-rmse:0.75443	val-rmse:0.78667
[4]	train-rmse:0.60680	val-rmse:0.64318
[5]	train-rmse:0.51381	val-rmse:0.55664
[6]	train-rmse:0.45470	val-rmse:0.50321
[7]	train-rmse:0.41881	val-rmse:0.47254
[8]	train-rmse:0.39534	val-rmse:0.45509
[9]	train-rmse:0.38038	val-rmse:0.44564
[10]	train-rmse:0.37115	val-rmse:0.43896
[11]	train-rmse:0.36361	val-rmse:0.43594
[12]	train-rmse:0.35850	val-rmse:0.43558
[13]	train-rmse:0.35365	val-rmse:0.43394
[14]	train-rmse:0.35025	val-rmse:0.43349
[15]	train-rmse:0.34666	val-rmse:0.43362