### Tree-based Modelling EDA
Using the existing DataLoader and DataCleaner functionality, I aim to explore different tree-based models. I expect to settle on an XGBoost model.

I will also use this notebook to explore feature selection techniques.

In [1]:
# Data Preprocessing
from data_preprocessing.DataLoader import DataLoader
from data_preprocessing.DataCleaner import DataCleaner

# Modelling
import xgboost as xgb
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesClassifier

# Feature Selection
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import cross_val_score

# General
import numpy as np

  from pandas import MultiIndex, Int64Index


In [2]:
# Load the parquet file and hand the resulting frame to the cleaner.
DATA_PATH = "car_data.parquet.gzip"

data_loader = DataLoader(path=DATA_PATH)
df = data_loader.load_parquet()
data_cleaner = DataCleaner(df)

In [3]:
# Run the cleaning step and fetch the cleaned frame.
data_cleaner.clean_data()
new_df = data_cleaner.get_df()

# Split the data, then drop the old columns from both feature sets.
X_train, X_test, y_train, y_test = data_loader.split_data(new_df)
for feature_frame in (X_train, X_test):
    data_cleaner.drop_columns(feature_frame)

# Encode to ordinal based on the train set.
columns_to_ordinal = [
    "co2_grouped",
    "engine_size_grouped",
    "owners_grouped",
    "fuel_type_grouped",
    "make_grouped",
    "doors_grouped",
    "seats_grouped",
]
data_cleaner.convert_columns_to_ordinal(columns_to_ordinal, X_train, y_train, X_test)

## Feature Selection using an ExtraTreesClassifier
<u> What is an ExtraTrees Model and How does it differ from a Random Forest? </u>  
Extra Trees are also known as <a href="https://orbi.uliege.be/bitstream/2268/9357/1/geurts-mlj-advance.pdf"><b>Extremely Randomized Trees</b></a>.

- Random Forest uses bootstrap replicas (subsamples the input with replacement)
- Extra Trees use the whole original sample (bootstrapping is an optional argument in scikit-learn)
- Random Forest chooses the optimum split for each branch
- Extra Trees choose these splits randomly
- Extra Trees are typically computationally cheaper than other tree ensemble methods, because no search for the optimum split is needed

In [4]:
# Fit the Extra-Trees ensemble whose feature importances are used for
# feature selection in the following cells.
# NOTE(review): a classifier is fitted here, while the later models are
# regressors scored with mean absolute error — confirm that treating
# y_train as class labels is intended.
extra_tree_clf = ExtraTreesClassifier(
    n_estimators=20,
    max_depth=16,
    min_samples_split=4,
    bootstrap=True,
    random_state=1337,
)
extra_tree_clf.fit(X_train, y_train)

ExtraTreesClassifier(bootstrap=True, max_depth=16, min_samples_split=4,
                     n_estimators=20, random_state=1337)

In [5]:
# Pair each training column with its importance, then rank the pairs from
# most to least important. Dicts keep insertion order, so importance_dict
# iterates in ranked order as well.
feature_importances = dict(zip(X_train.columns, extra_tree_clf.feature_importances_))
ranked_features = sorted(
    feature_importances.items(),
    key=lambda pair: pair[1],
    reverse=True,
)
importance_dict = dict(ranked_features)
for feature_name, importance in ranked_features:
    print(f"{feature_name} : {importance}")

image_count : 0.18240699816356115
co2_grouped : 0.09499500390796864
make_grouped : 0.0816674599937495
owners_grouped : 0.07965179299610939
advert_sentiment : 0.06877785918853063
advert_title_sentiment : 0.06844241882328977
mileage_deviation_encoded : 0.055499378800171674
seats_grouped : 0.047691590453083875
doors_grouped : 0.039554421513674116
fuel_type_grouped : 0.03735740094943999
engine_size_grouped : 0.02968713384556957
is_private_plate : 0.02961423392545983
is_manual : 0.018355057723976404
is_ulez : 0.014817188549698063
is_convertible : 0.012088344975618951
is_new : 0.010431301818927007
model_col_0 : 0.009584391005332748
model_col_10 : 0.009491987816772417
model_col_1 : 0.008309373940534277
model_col_11 : 0.00818655603266088
model_col_6 : 0.008162785736843257
model_col_2 : 0.00801349787604391
model_col_14 : 0.007715090222761606
model_col_8 : 0.007629226600500287
model_col_13 : 0.007478019375458415
model_col_3 : 0.0073886617961535575
model_col_4 : 0.0072428139140146746
model_col_15

In [6]:
# Reduce the training features with SelectFromModel, reusing the already
# fitted Extra-Trees estimator (prefit=True) and its default threshold.
print(f"Old shape: {X_train.shape}")
model = SelectFromModel(extra_tree_clf, prefit=True)
# Names of the columns that survive the selection.
features_out = model.get_feature_names_out(input_features=X_train.columns)
X_new = model.transform(X_train)
print(f"New shape: {X_new.shape}")

Old shape: (157838, 39)
New shape: (157838, 12)




In [7]:
# Show the importance of every feature that survived the selection.
for feature_name in features_out:
    importance = importance_dict[feature_name]
    print(f"{feature_name} : {importance:.4f}")

is_private_plate : 0.0296
mileage_deviation_encoded : 0.0555
image_count : 0.1824
advert_sentiment : 0.0688
advert_title_sentiment : 0.0684
make_grouped : 0.0817
fuel_type_grouped : 0.0374
doors_grouped : 0.0396
seats_grouped : 0.0477
owners_grouped : 0.0797
engine_size_grouped : 0.0297
co2_grouped : 0.0950


## Base Model - Decision Tree

In [8]:
# From here on X_train refers to the reduced feature matrix (X_new).
# NOTE(review): after this rebinding, earlier cells that read
# X_train.columns expect the pre-selection frame, and X_test is not
# reduced in the visible cells — confirm before re-running or evaluating.
X_train = X_new
dt = DecisionTreeRegressor(
    max_depth=32,
    random_state=1337,
)
dt.fit(X_train, y_train)

DecisionTreeRegressor(max_depth=32, random_state=1337)

In [9]:
# 10-fold cross-validation of the decision tree, scored by negated MAE.
basic_dt_scores = cross_val_score(
    dt,
    X_train,
    y_train,
    cv=10,
    scoring='neg_mean_absolute_error',
)
def output_score(scores):
    """Print the absolute mean and the standard deviation of ``scores``.

    Args:
        scores: Array-like object exposing ``mean()`` and ``std()``, such as
            the array of fold scores returned by ``cross_val_score``.

    Returns:
        None. The summary is written to stdout, followed by blank lines.
    """
    # The absolute value turns a negated error metric back into a magnitude.
    mean_magnitude = abs(scores.mean())
    spread = scores.std()
    print(f"Mean: {mean_magnitude:.3f} (std: {spread:.3f})\n\n")

## Random Forest

In [11]:
# Random forest baseline, seeded for reproducibility.
rf = RandomForestRegressor(
    n_estimators=100,
    max_depth=32,
    random_state=1337,
)
rf.fit(X_train, y_train)

RandomForestRegressor(max_depth=32, random_state=1337)

In [12]:
# 10-fold cross-validation of the random forest, scored by negated MAE.
basic_rf_scores = cross_val_score(
    rf,
    X_train,
    y_train,
    cv=10,
    scoring='neg_mean_absolute_error',
)

## Gradient Boosting Regressor

In [None]:
# Gradient boosting baseline. Seeded with the same random_state as the other
# models in this notebook so that repeated runs give the same trees.
gb = GradientBoostingRegressor(n_estimators=100, max_depth=16, random_state=1337)


In [None]:
# Report the cross-validated score summary of each model. Every summary is
# preceded by the model's name, because the "Mean: ..." lines are otherwise
# indistinguishable from one another.
for model_name, cv_scores in (
    ("Decision Tree", basic_dt_scores),
    ("Random Forest", basic_rf_scores),
):
    print(model_name)
    output_score(cv_scores)