### Tree-based Modelling EDA
Using the existing DataLoader and DataCleaner functionality, I aim to explore different tree-based models. I am expecting to settle for a XGBoost model.

I will also use this notebook to explore feature selection techniques 

In [1]:
# Data Preprocessing
from data_preprocessing.DataLoader import DataLoader
from data_preprocessing.DataCleaner import DataCleaner

# Modelling
import xgboost as xgb
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesClassifier

# Feature Selection
from sklearn.feature_selection import RFECV, SelectFromModel

# General
import numpy as np

  from pandas import MultiIndex, Int64Index


In [2]:
data_loader = DataLoader(path="car_data.parquet.gzip")
df = data_loader.load_parquet()
data_cleaner = DataCleaner(df)

In [3]:
data_cleaner.clean_data()
new_df = data_cleaner.get_df()
# Split data and drop old columns
X_train, X_test, y_train, y_test = data_loader.split_data(new_df)
data_cleaner.drop_columns(X_train)
data_cleaner.drop_columns(X_test)
# Encode to ordinal bsaed on train set
columns_to_ordinal = ["co2_grouped", "engine_size_grouped", "owners_grouped", "fuel_type_grouped", "make_grouped", "doors_grouped", "seats_grouped"]
data_cleaner.convert_columns_to_ordinal(columns_to_ordinal, X_train, y_train, X_test)

#### Feature Selection using an ExtraTreesClassifier

In [19]:
extra_tree_clf = ExtraTreesClassifier(n_estimators=100, max_depth=8)
extra_tree_clf.fit(X_train, y_train)

ExtraTreesClassifier(max_depth=8)

In [22]:
importance_dict = dict(zip(X_train.columns, extra_tree_clf.feature_importances_))
importance_dict = dict(sorted(importance_dict.items(), key=lambda item: item[1], reverse=True))

In [37]:
# importance_dict
model = SelectFromModel(extra_tree_clf, prefit=True)
X_new = model.transform(X_train)
print(X_train.shape)
print(X_new.shape)

(157838, 39)
(157838, 14)




### RFECV Feature Selection

In [33]:
dt = DecisionTreeRegressor(
    random_state=1337, max_depth=8, min_samples_split=15)

# recursive feature elimination
selector_dt = RFECV(dt, cv=2)

# fit training data
selector_dt.fit(X_train, y_train)

selected_dt = X_train.columns[(selector_dt.get_support())]

In [38]:
rf = RandomForestRegressor(
    n_estimators=20, random_state=1984, max_depth=8, min_samples_split=15, bootstrap=True)
selector_rf = RFECV(rf, cv=2)
selector_rf.fit(X_train, y_train)
selected_rf = X_train.columns[(selector_rf.get_support())]

In [39]:
print(selected_dt)
print(selected_rf)

Index(['is_ulez', 'is_manual', 'is_new', 'make_grouped', 'fuel_type_grouped',
       'seats_grouped', 'owners_grouped', 'engine_size_grouped',
       'co2_grouped'],
      dtype='object')
Index(['is_ulez', 'is_manual', 'is_new', 'image_count', 'make_grouped',
       'fuel_type_grouped', 'seats_grouped', 'owners_grouped',
       'engine_size_grouped', 'co2_grouped'],
      dtype='object')
