### Tree-based Modelling EDA
Using the existing DataLoader and DataCleaner functionality, I aim to explore different tree-based models. I am expecting to settle for a XGBoost model.

I will also use this notebook to explore feature selection techniques 

In [1]:
# Data Preprocessing
from data_preprocessing.DataLoader import DataLoader
from data_preprocessing.DataCleaner import DataCleaner
from sklearn.model_selection import train_test_split

# Modelling
import xgboost as xgb
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesClassifier

# Feature Selection
from sklearn.feature_selection import SelectFromModel
from sklearn.model_selection import cross_val_score, GridSearchCV

# Hyper param Tuning
from hpsklearn import HyperoptEstimator, random_forest_regressor, xgboost_regression
from hyperopt import hp

# General
import numpy as np
import pandas as pd

  from pandas import MultiIndex, Int64Index


In [2]:
X_train = pd.read_csv('X_train.csv')
X_test = pd.read_csv('X_test.csv')
y_train = pd.read_csv('y_train.csv')
y_test = pd.read_csv('y_test.csv')

In [3]:
# data_loader = DataLoader(path="car_data.parquet.gzip")
# df = data_loader.load_parquet()
# data_cleaner = DataCleaner(df)

In [4]:
# data_cleaner.clean_data()
# new_df = data_cleaner.get_df()
# # Split data and drop old columns
# X_train, X_test, y_train, y_test = data_loader.split_data(new_df)
# data_cleaner.drop_columns(X_train)
# data_cleaner.drop_columns(X_test)
# # Encode to ordinal bsaed on train set
# columns_to_ordinal = ["co2_grouped", "engine_size_grouped", "owners_grouped", "fuel_type_grouped", "make_grouped", "doors_grouped", "seats_grouped"]
# data_cleaner.convert_columns_to_ordinal(columns_to_ordinal, X_train, y_train, X_test)

## Feature Selection using an ExtraTreesClassifier
<u> What is an ExtraTrees Model and How does it differ to a Random Forest? </u>  
Extra trees are also know as <a href="https://orbi.uliege.be/bitstream/2268/9357/1/geurts-mlj-advance.pdf"><b>Extremely Random Trees</b></a>

- Random forest uses boostrap replicas (subsamples input with replacement)
- Extra Trees use whole original sample (bootstrapping is optional arg in scikit)
- Random forest chooses the optimum split for each branch
- Extra Trees choose these splits randomly
- Extra trees are computationally more efficient than other ensemble methods

In [5]:
columns_to_keep = ["is_private_plate",
"mileage_deviation_encoded",
"image_count",
"advert_sentiment",
"advert_title_sentiment",
"make_grouped",
"fuel_type_grouped",
"doors_grouped",
"seats_grouped",
"owners_grouped",
"engine_size_grouped",
"co2_grouped"]

X_train = np.array(X_train[columns_to_keep])
X_test = np.array(X_test[columns_to_keep])
y_train = np.reshape(np.array(y_train), (-1, ))
y_test = np.reshape(np.array(y_test), (-1, ))
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, random_state=1337, test_size=0.1)
# extra_tree_clf = ExtraTreesClassifier(n_estimators=20, max_depth=16, min_samples_split=4, random_state=1337, bootstrap=True)
# extra_tree_clf.fit(X_train, y_train)

In [6]:
# importance_dict = dict(zip(X_train.columns, extra_tree_clf.feature_importances_))
# importance_dict = dict(sorted(importance_dict.items(), key=lambda item: item[1], reverse=True))
# for k,v in importance_dict.items():
#     print(f"{k} : {v}")

In [7]:
# importance_dict
# print(f"Old shape: {X_train.shape}")
# model = SelectFromModel(extra_tree_clf, prefit=True)
# X_new = model.transform(X_train)
# features_out = model.get_feature_names_out(input_features=X_train.columns)
# print(f"New shape: {X_new.shape}")

In [8]:
# for f in features_out:
#     print(f"{f} : {importance_dict[f]:.4f}")

### Metric Choice
1. MAE
 - Easy to interpret and gives less weight to outliers
2. MSE
- Will  be sensitive to hypercar outliers and needs reformatting for interpretation
3. RMSE
- Also sensitive to hypercar outliers but already rooted so easier to interpret

## Base Model - Decision Tree

In [9]:
# X_train = X_new
# X_test = model.transform(X_test)
def output_score(scores):
    print(f"Mean: {abs(scores.mean()):.3f} (std: {scores.std():.3f})\n\n")

In [10]:
dt = DecisionTreeRegressor(max_depth=32, random_state=1337)
basic_dt_scores = cross_val_score(dt, X_train, y_train, cv=5, scoring='neg_mean_absolute_error')


## Random Forest

In [11]:
rf = RandomForestRegressor(n_estimators=100, max_depth=16, random_state=1337)
basic_rf_scores = cross_val_score(rf, X_train, y_train, cv=5, scoring='neg_mean_absolute_error')

### Gradient Boosting Regressor

In [12]:
gb = GradientBoostingRegressor(n_estimators=100, max_depth=16, random_state=1337, learning_rate=0.3)
basic_gb_scores = cross_val_score(gb, X_train, y_train, cv=5, scoring='neg_mean_absolute_error')

In [13]:
print("DT")
output_score(basic_dt_scores)
print("RF")
output_score(basic_rf_scores)
print("GB")
output_score(basic_gb_scores)

DT
Mean: 7094.560 (std: 97.362)


RF
Mean: 5914.527 (std: 58.325)


GB
Mean: 6591.903 (std: 64.129)




## Best Performing 'Default' Model: Random Forest
#### Random Forest Hyperparam Tuning

In [14]:
# cv_rf = RandomForestRegressor(random_state=1337)
# param_grid = {'n_estimators':[120], 'max_depth':[16], 'min_samples_split': [16]}
# grid_cv_clf = GridSearchCV(cv_rf, param_grid)
# grid_cv_clf.fit(X_train, y_train)

In [15]:
"""
RandomForestRegressor(max_depth=16, min_samples_split=16, n_estimators=120,
                      random_state=1337)
"""
best_rf = RandomForestRegressor(max_depth=16, min_samples_split=16, n_estimators=120,
                      random_state=1337)
                      
tuned_rf_scores = cross_val_score(best_rf, X_train, y_train, cv=5, scoring='neg_mean_absolute_error')                      

In [16]:
print("Basic RF")
output_score(basic_rf_scores)
print("Tuned RF")
output_score(tuned_rf_scores)

Basic RF
Mean: 5914.527 (std: 58.325)


Tuned RF
Mean: 5879.341 (std: 57.489)




I think I accidentally chose some good starting params for my base model as my Grid search returned very similar values

### XGBoost
I am going to use their Scikit API wrapper to keep the implementation similar to previous models

TODO: Fill in what XGBoost is

In [27]:
# Creating a validation set to enable the use of early stopping
# final split: 60 train : 10 val : 30 test
# will move this up so other base models are trained on same amount of data

# X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, random_state=1337, test_size=0.1)


reg = xgb.XGBRegressor(
    # Number of boosting rounds
    n_estimators=200, 
    max_depth=16,
    learning_rate=0.3,
    verbosity=1,
    booster='gbtree',
    random_state=1337,
    predictor='cpu_predictor',   # [cpu_predictor, gpu_predictor]
    eval_metric=['rmse', 'mae'] # last eval metric will be evaluated on for early stopping
)



reg.fit(X_train, y_train, eval_set=[(X_val, y_val)], early_stopping_rounds=5)

[0]	validation_0-rmse:26372.56836	validation_0-mae:13414.70606
[1]	validation_0-rmse:22661.56055	validation_0-mae:10188.07129
[2]	validation_0-rmse:20781.36719	validation_0-mae:8238.74609
[3]	validation_0-rmse:19783.68555	validation_0-mae:7076.14502
[4]	validation_0-rmse:19489.26758	validation_0-mae:6418.01562
[5]	validation_0-rmse:19178.58203	validation_0-mae:6038.39941
[6]	validation_0-rmse:19196.21289	validation_0-mae:5848.92432
[7]	validation_0-rmse:19297.47266	validation_0-mae:5752.18799
[8]	validation_0-rmse:19372.88672	validation_0-mae:5704.11084
[9]	validation_0-rmse:19353.30469	validation_0-mae:5671.17676
[10]	validation_0-rmse:19500.33984	validation_0-mae:5666.64697
[11]	validation_0-rmse:19652.96289	validation_0-mae:5667.53906
[12]	validation_0-rmse:19766.30859	validation_0-mae:5677.73340
[13]	validation_0-rmse:19896.79297	validation_0-mae:5692.77197
[14]	validation_0-rmse:20008.83203	validation_0-mae:5706.61084
[15]	validation_0-rmse:20088.37305	validation_0-mae:5717.26514


XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
             eval_metric=['rmse', 'mae'], gamma=0, gpu_id=-1,
             importance_type=None, interaction_constraints='',
             learning_rate=0.3, max_delta_step=0, max_depth=16,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=200, n_jobs=12, num_parallel_tree=1,
             predictor='cpu_predictor', random_state=1337, reg_alpha=0,
             reg_lambda=1, scale_pos_weight=1, subsample=1, tree_method='exact',
             validate_parameters=1, verbosity=1)

Even the base model is able to attain a reasonable validation MAE of approx. 5.7k

## Using Hyperopt for Tuning XGBoost

In [28]:
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe

space={'max_depth': hp.quniform("max_depth", 2, 18, 2),
        'gamma': hp.uniform ('gamma', 1,9),
        'reg_alpha' : hp.quniform('reg_alpha', 40,180,1),
        'reg_lambda' : hp.uniform('reg_lambda', 0,1),
        'colsample_bytree' : hp.uniform('colsample_bytree', 0.5,1),
        'min_child_weight' : hp.quniform('min_child_weight', 0, 10, 1),
        'n_estimators': 180,
        'seed': 1337
    }

In [29]:
from sklearn.metrics import mean_absolute_error
def objective(space):
    clf = xgb.XGBRegressor(
                    n_estimators = space['n_estimators'], 
                    max_depth = int(space['max_depth']), 
                    gamma = space['gamma'],
                    reg_alpha = int(space['reg_alpha']),
                    min_child_weight = int(space['min_child_weight']),
                    colsample_bytree = int(space['colsample_bytree']))
    
    evaluation = [( X_train, y_train), (X_val, y_val)]
    
    clf.fit(X_train, y_train,
            eval_set=evaluation, eval_metric="mae",
            early_stopping_rounds=10,verbose=False)
    

    y_pred = clf.predict(X_val)
    mae = mean_absolute_error(y_val, y_pred)
    print ("MAE:", mae)
    return {'loss': -mae, 'status': STATUS_OK }

In [30]:
trials = Trials()

best_hyperparams = fmin(fn = objective,
                        space = space,
                        algo = tpe.suggest,
                        max_evals = 100,
                        trials = trials)

MAE:                                                   
9437.862823619736                                      
MAE:                                                                             
9440.020801302508                                                                
MAE:                                                                             
9465.854586766083                                                                
MAE:                                                                             
9465.854584958308                                                                
MAE:                                                                             
9437.712209012263                                                                
MAE:                                                                             
9437.50248353033                                                                 
MAE:                                                                

# Exploring Different XGB Implementation with Cross-Val

In [16]:
# X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, random_state=1337, test_size=0.1)
xgb_dtrain = xgb.DMatrix(X_train, y_train)
# xgb_dval = xgb.DMatrix(X_val, y_val)
xgb_dtest = xgb.DMatrix(X_test, y_test)
param = {
    'max_depth': 16, 
    'eta': 0.1, 
    'objective': 'reg:squarederror',
    'eval_metric': 'mae'
    }
evallist = [(xgb_dtrain, 'train')]

In [17]:
num_round = 1000
bst = xgb.cv(param, xgb_dtrain, num_round, metrics='mae', early_stopping_rounds=5)

In [18]:
bst

Unnamed: 0,train-mae-mean,train-mae-std,test-mae-mean,test-mae-std
0,16925.475912,63.494392,16937.354818,143.993683
1,15323.347005,59.199637,15371.183268,144.886182
2,13912.18457,56.058313,14018.279622,144.999213
3,12671.313477,52.929467,12844.576823,144.611386
4,11574.495443,47.230916,11823.134115,147.078306
5,10608.156576,44.584086,10936.218424,145.644828
6,9755.651693,39.352714,10173.391602,143.546164
7,9005.082031,36.323444,9518.486979,144.321584
8,8343.281901,32.993114,8952.847331,141.189245
9,7757.85791,30.422887,8467.227214,137.01582
