In [2]:
import pandas as pd
import numpy as np

import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline

In [20]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction import DictVectorizer
from sklearn.metrics import roc_auc_score
from sklearn.tree import export_text
from sklearn.tree import DecisionTreeRegressor

In [3]:
data = 'https://raw.githubusercontent.com/alexeygrigorev/datasets/master/car_fuel_efficiency.csv'

df = pd.read_csv(data)
df.head()

Unnamed: 0,engine_displacement,num_cylinders,horsepower,vehicle_weight,acceleration,model_year,origin,fuel_type,drivetrain,num_doors,fuel_efficiency_mpg
0,170,3.0,159.0,3413.433759,17.7,2003,Europe,Gasoline,All-wheel drive,0.0,13.231729
1,130,5.0,97.0,3149.664934,17.8,2007,USA,Gasoline,Front-wheel drive,0.0,13.688217
2,170,,78.0,3079.038997,15.1,2018,Europe,Gasoline,Front-wheel drive,0.0,14.246341
3,220,4.0,,2542.392402,20.2,2009,USA,Diesel,All-wheel drive,2.0,16.912736
4,210,1.0,140.0,3460.87099,14.4,2009,Europe,Gasoline,All-wheel drive,2.0,12.488369


In [4]:
df.shape[0]

9704

### Preparation step

In [7]:
df.isnull().sum().sort_values()

engine_displacement      0
vehicle_weight           0
origin                   0
model_year               0
fuel_type                0
drivetrain               0
fuel_efficiency_mpg      0
num_cylinders          482
num_doors              502
horsepower             708
acceleration           930
dtype: int64

In [13]:
df = df.fillna(0)
df.fuel_efficiency_mpg.value_counts()

fuel_efficiency_mpg
13.231729    1
13.688217    1
14.246341    1
16.912736    1
12.488369    1
            ..
15.101802    1
17.962326    1
17.186587    1
15.331551    1
14.884467    1
Name: count, Length: 9704, dtype: int64

In [16]:
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=1)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=1)

df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

y_train = (df_train.fuel_efficiency_mpg)
y_val = (df_val.fuel_efficiency_mpg)
y_test = (df_test.fuel_efficiency_mpg)
y_full_train = df_full_train.fuel_efficiency_mpg

del df_train['fuel_efficiency_mpg']
del df_val['fuel_efficiency_mpg']
del df_test['fuel_efficiency_mpg']
del df_full_train['fuel_efficiency_mpg']

In [17]:
train_dicts = df_train.fillna(0).to_dict(orient='records')
val_dicts = df_val.fillna(0).to_dict(orient='records')
test_dicts = df_test.fillna(0).to_dict(orient='records')
full_train_dicts = df_full_train.fillna(0).to_dict(orient='records')

dv = DictVectorizer(sparse=False)

X_train = dv.fit_transform(train_dicts)
X_val = dv.fit_transform(val_dicts)
X_test = dv.fit_transform(test_dicts)
X_full_train = dv.fit_transform(full_train_dicts)

### Quesion 1

In [21]:
dt = DecisionTreeRegressor(max_depth=1)
dt.fit(X_train, y_train)

0,1,2
,criterion,'squared_error'
,splitter,'best'
,max_depth,1
,min_samples_split,2
,min_samples_leaf,1
,min_weight_fraction_leaf,0.0
,max_features,
,random_state,
,max_leaf_nodes,
,min_impurity_decrease,0.0


In [23]:
export_text(dt, feature_names=dv.get_feature_names_out())

'|--- vehicle_weight <= 3022.11\n|   |--- value: [16.88]\n|--- vehicle_weight >  3022.11\n|   |--- value: [12.94]\n'

### Question 2

In [24]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import root_mean_squared_error as rmse

In [25]:
rf = RandomForestRegressor(n_estimators=10, random_state=1, n_jobs=-1)

rf.fit(X_train, y_train)

y_pred = rf.predict(X_val)
rmse(y_pred, y_val)

0.4599777557336148

### Question 3

In [30]:
scores = []
for n in range(10, 201, 10):
    rf = RandomForestRegressor(n_estimators=n, random_state=1, n_jobs=-1)
    
    rf.fit(X_train, y_train)
    
    y_pred = rf.predict(X_val)
    scores.append((n, round(rmse(y_pred, y_val), 3)))


In [31]:
scores

[(10, 0.46),
 (20, 0.454),
 (30, 0.451),
 (40, 0.448),
 (50, 0.446),
 (60, 0.445),
 (70, 0.445),
 (80, 0.445),
 (90, 0.445),
 (100, 0.444),
 (110, 0.443),
 (120, 0.444),
 (130, 0.443),
 (140, 0.443),
 (150, 0.443),
 (160, 0.443),
 (170, 0.443),
 (180, 0.442),
 (190, 0.443),
 (200, 0.443)]

### Question 4

In [39]:
scores = []
for m in [10, 15, 20, 25]:
    rmse_list = []
    for n in range(10, 201, 10):
        rf = RandomForestRegressor(n_estimators=n, random_state=1, max_depth=m, n_jobs=-1)
        
        rf.fit(X_train, y_train)
        
        y_pred = rf.predict(X_val)
        rmse_list.append(rmse(y_pred, y_val))

    scores.append((m, sum(rmse_list)/len(rmse_list)))


In [44]:
scores.sort(key=lambda x: x[1])
scores

[(10, 0.44232130237115186),
 (15, 0.44505999920137435),
 (20, 0.4456441321803526),
 (25, 0.4456606000029247)]

### Question 5

In [45]:
rf = RandomForestRegressor(n_estimators=10, random_state=1, max_depth=20, n_jobs=-1)

In [53]:
rf.fit(X_train, y_train)
df_features = pd.DataFrame(zip(list(dv.get_feature_names_out()), rf.feature_importances_),
                          columns=['feature', 'importance'])
df_features

Unnamed: 0,feature,importance
0,acceleration,0.011471
1,drivetrain=All-wheel drive,0.000382
2,drivetrain=Front-wheel drive,0.000312
3,engine_displacement,0.003269
4,fuel_type=Diesel,0.000344
5,fuel_type=Gasoline,0.000337
6,horsepower,0.01604
7,model_year,0.003182
8,num_cylinders,0.002359
9,num_doors,0.001591


In [58]:
df_features.sort_values(by=['importance'],ascending=False)

Unnamed: 0,feature,importance
13,vehicle_weight,0.959162
6,horsepower,0.01604
0,acceleration,0.011471
3,engine_displacement,0.003269
7,model_year,0.003182
8,num_cylinders,0.002359
9,num_doors,0.001591
12,origin=USA,0.000555
11,origin=Europe,0.00052
10,origin=Asia,0.000476


### Question 6

In [60]:
import xgboost as xgb

In [61]:
features = list(dv.get_feature_names_out())
dtrain = xgb.DMatrix(X_train, label=y_train, feature_names=features)
dval = xgb.DMatrix(X_val, label=y_val, feature_names=features)

In [62]:
xgb_params = {
    'eta': 0.3, 
    'max_depth': 6,
    'min_child_weight': 1,
    
    'objective': 'reg:squarederror',
    'nthread': 8,
    
    'seed': 1,
    'verbosity': 1,
}

watchlist = [(dtrain, 'train'), (dval, 'val')]

model = xgb.train(xgb_params, dtrain, num_boost_round=100, evals=watchlist)

[0]	train-rmse:1.81393	val-rmse:1.85444
[1]	train-rmse:1.31919	val-rmse:1.35353
[2]	train-rmse:0.98120	val-rmse:1.01316
[3]	train-rmse:0.75443	val-rmse:0.78667
[4]	train-rmse:0.60680	val-rmse:0.64318
[5]	train-rmse:0.51381	val-rmse:0.55664
[6]	train-rmse:0.45470	val-rmse:0.50321
[7]	train-rmse:0.41881	val-rmse:0.47254
[8]	train-rmse:0.39534	val-rmse:0.45509
[9]	train-rmse:0.38038	val-rmse:0.44564
[10]	train-rmse:0.37115	val-rmse:0.43896
[11]	train-rmse:0.36361	val-rmse:0.43594
[12]	train-rmse:0.35850	val-rmse:0.43558
[13]	train-rmse:0.35365	val-rmse:0.43394
[14]	train-rmse:0.35025	val-rmse:0.43349
[15]	train-rmse:0.34666	val-rmse:0.43362
[16]	train-rmse:0.34459	val-rmse:0.43378
[17]	train-rmse:0.34128	val-rmse:0.43405
[18]	train-rmse:0.33822	val-rmse:0.43391
[19]	train-rmse:0.33709	val-rmse:0.43374
[20]	train-rmse:0.33553	val-rmse:0.43376
[21]	train-rmse:0.33243	val-rmse:0.43453
[22]	train-rmse:0.33031	val-rmse:0.43510
[23]	train-rmse:0.32815	val-rmse:0.43601
[24]	train-rmse:0.32670	va

In [63]:
xgb_params = {
    'eta': 0.1, 
    'max_depth': 6,
    'min_child_weight': 1,
    
    'objective': 'reg:squarederror',
    'nthread': 8,
    
    'seed': 1,
    'verbosity': 1,
}

watchlist = [(dtrain, 'train'), (dval, 'val')]

model = xgb.train(xgb_params, dtrain, num_boost_round=100, evals=watchlist)

[0]	train-rmse:2.28944	val-rmse:2.34561
[1]	train-rmse:2.07396	val-rmse:2.12434
[2]	train-rmse:1.88066	val-rmse:1.92597
[3]	train-rmse:1.70730	val-rmse:1.74987
[4]	train-rmse:1.55163	val-rmse:1.59059
[5]	train-rmse:1.41247	val-rmse:1.44988
[6]	train-rmse:1.28796	val-rmse:1.32329
[7]	train-rmse:1.17660	val-rmse:1.20930
[8]	train-rmse:1.07736	val-rmse:1.10830
[9]	train-rmse:0.98883	val-rmse:1.02009
[10]	train-rmse:0.91008	val-rmse:0.94062
[11]	train-rmse:0.84030	val-rmse:0.87100
[12]	train-rmse:0.77874	val-rmse:0.80916
[13]	train-rmse:0.72417	val-rmse:0.75465
[14]	train-rmse:0.67626	val-rmse:0.70780
[15]	train-rmse:0.63402	val-rmse:0.66672
[16]	train-rmse:0.59690	val-rmse:0.63062
[17]	train-rmse:0.56447	val-rmse:0.60016
[18]	train-rmse:0.53619	val-rmse:0.57383
[19]	train-rmse:0.51138	val-rmse:0.55044
[20]	train-rmse:0.48983	val-rmse:0.53064
[21]	train-rmse:0.47135	val-rmse:0.51451
[22]	train-rmse:0.45501	val-rmse:0.49998
[23]	train-rmse:0.44120	val-rmse:0.48790
[24]	train-rmse:0.42929	va