# Ensemble Learning - Basis-Modelle beschädigte Bäume
Build and optimize base models for damaged vegetation classes

In [2]:
%matplotlib inline

In [3]:
import warnings
warnings.filterwarnings('ignore')

In [4]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

import os
import json

import pickle

from sklearn import metrics
from sklearn.base import clone
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split, StratifiedKFold, RandomizedSearchCV
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, HistGradientBoostingClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
import xgboost as xgb

from imblearn.ensemble import BalancedRandomForestClassifier, RUSBoostClassifier

from collections import Counter

pd.set_option("display.max_columns", None)

## Data Import & Data Preparation

In [5]:
DATA_PATH = "./data"
EXPORT_PATH = f"{DATA_PATH}/BaseLearner/v3"

SPECIES_CODE = {
    "1":"Deciduous",
    "2":"Conifer",
    "1 2":"Mixed"
}

SPECIES_CODE_INVERSE = {
    "Deciduous": "1",
    "Conifer": "2",
    "Mixed": "1 2"
}

In [6]:
df = pd.read_csv(f"{DATA_PATH}/location_cluster.csv")

df = df.sort_values(by="year")

# -----

def is_low(damage_class:float)->int:
    if damage_class == 1.0:
        return 1
    else:
        return 0
    
def is_medium(damage_class:str)->int:
    if damage_class == 2.0:
        return 1
    else:
        return 0
    
def is_high(damage_class:str)->int:
    if damage_class == 3.0:
        return 1
    else:
        return 0
    
df["is_low"] = df["damage_class"].apply(lambda x: is_low(x))
df["is_medium"] = df["damage_class"].apply(lambda x: is_medium(x))
df["is_high"] = df["damage_class"].apply(lambda x: is_high(x))

# -----

static_cols = [
    col for col in df.columns if "location_cluster_" in col
]
static_cols.append("tree_1")

vegetation_cols = [
    col for col in df.columns if (("ndvi" in col) or ("evi2" in col)) and not col.endswith('max')
]

weather_cols = [
    'autumn_humidity',
    'autumn_rain_mm',
    'autumn_sun_h',
    'extreme_heat_day',
    'frozen_day',
    'heat_day',
    'heavy_rain',
    'spring_humidity',
    'spring_rain_mm',
    'spring_sun_h',
    'storm',
    'summer_humidity',
    'summer_longest_heat_wave',
    'summer_periods_without_rain_summer',
    'summer_rain_mm',
    'summer_sun_h',
    'winter_humidity',
    'winter_longest_cold_wave',
    'winter_rain_mm',
    'winter_snow_cm',
    'winter_sun_h',
]

all_features = static_cols + vegetation_cols + weather_cols

df[all_features].head()

Unnamed: 0,location_cluster_0,location_cluster_1,location_cluster_2,location_cluster_3,location_cluster_4,location_cluster_5,location_cluster_6,location_cluster_7,location_cluster_8,location_cluster_9,location_cluster_10,location_cluster_11,location_cluster_12,location_cluster_13,tree_1,evi2,evi2-1,evi2_diff,evi2_max_diff,evi2_raster,evi2_raster-1,evi2_raster_diff,evi2_vegetation_raster,evi2_vegetation_raster-1,evi2_vegetation_raster_diff,ndvi,ndvi-1,ndvi_diff,ndvi_max_diff,ndvi_raster,ndvi_raster-1,ndvi_raster_diff,ndvi_vegetation_raster,ndvi_vegetation_raster-1,ndvi_vegetation_raster_diff,autumn_humidity,autumn_rain_mm,autumn_sun_h,extreme_heat_day,frozen_day,heat_day,heavy_rain,spring_humidity,spring_rain_mm,spring_sun_h,storm,summer_humidity,summer_longest_heat_wave,summer_periods_without_rain_summer,summer_rain_mm,summer_sun_h,winter_humidity,winter_longest_cold_wave,winter_rain_mm,winter_snow_cm,winter_sun_h
0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,-0.780422,0.247127,0.134649,-0.063166,0.558887,-1.965227,2.321263,2.190397,-1.591913,1.467303,2.389869,0.269184,0.155518,-0.056732,0.396759,-2.000923,2.376153,4.699451,-1.625971,1.476802,3.188868,2.349058,-0.032661,-0.510027,-0.844847,0.729139,1.259041,0.020538,0.030137,-0.308301,0.671584,-1.489254,0.59512,0.671331,1.252978,0.111377,-0.265597,1.79314,0.343276,-1.081905,0.637993,-0.509518
5188,0,0,0,1,0,0,0,0,0,0,0,0,0,0,-0.630696,-1.377409,0.134649,1.21099,1.661654,-1.010216,0.702443,0.638921,-0.897612,0.639956,0.116478,-1.394638,0.155518,1.023457,1.773875,-1.02063,0.601524,-0.686361,-0.906359,0.610358,-1.136011,1.391816,0.207266,-0.081468,1.680325,-0.857075,1.889191,0.69102,-0.369506,0.32383,0.515489,0.852785,-0.814671,0.897376,0.046127,-0.719613,-0.140728,0.69179,-0.815622,0.119561,-0.193376,-1.501934
5187,0,0,0,1,0,0,0,0,0,0,0,0,0,0,-0.630696,-1.606661,0.134649,1.546983,1.771145,-1.010216,0.702443,0.372483,-0.897612,0.639956,-0.247533,-1.639614,0.155518,1.352077,1.90948,-1.02063,0.601524,-1.216333,-0.906359,0.610358,-1.472954,1.391816,0.207266,-0.081468,1.680325,-0.857075,1.889191,0.69102,-0.369506,0.32383,0.515489,0.852785,-0.814671,0.897376,0.046127,-0.719613,-0.140728,0.69179,-0.815622,0.119561,-0.193376,-1.501934
5186,0,0,0,1,0,0,0,0,0,0,0,0,0,0,-0.630696,-1.074025,0.134649,0.871901,1.478956,-1.010216,0.702443,0.858797,-0.897612,0.639956,0.44989,-1.07286,0.155518,0.711611,1.542268,-1.02063,0.601524,-0.168402,-0.906359,0.610358,-0.69591,1.391816,0.207266,-0.081468,1.680325,-0.857075,1.889191,0.69102,-0.369506,0.32383,0.515489,0.852785,-0.814671,0.897376,0.046127,-0.719613,-0.140728,0.69179,-0.815622,0.119561,-0.193376,-1.501934
5185,0,0,0,1,0,0,0,0,0,0,0,0,0,0,-0.630696,-1.332807,0.134649,1.155192,1.658395,-1.010216,0.702443,0.67814,-0.897612,0.639956,0.173522,-1.347134,0.155518,0.970864,1.763622,-1.02063,0.601524,-0.60006,-0.906359,0.610358,-1.071161,1.391816,0.207266,-0.081468,1.680325,-0.857075,1.889191,0.69102,-0.369506,0.32383,0.515489,0.852785,-0.814671,0.897376,0.046127,-0.719613,-0.140728,0.69179,-0.815622,0.119561,-0.193376,-1.501934


### Definition of Functions

In [7]:
def train_test_split_by_observation(data_frame:pd.DataFrame, target:str, features:list):
    data_frame["key"] = data_frame["observation_id"] + ":" + data_frame["year"].astype(str)
    
    # group by observation_id and year to get unique observations by year only to prevent overfitting
    df_group = data_frame.groupby(["observation_id", "year"]).agg({"damage_class": lambda x: x.value_counts().index[0], target: lambda x: x.value_counts().index[0], "aoi": "first"}).reset_index()
    # remove all damaged classes that are not in BF
    #df_group = df_group.loc[~((df_group.damage_class_group == "damaged") & (df_group.aoi != "blackforest"))]
    y_observation_id = df_group.pop(target)
    X_observation_id = df_group

    # key to identify splitted data
    X_observation_id["key"] = X_observation_id["observation_id"] + ":" + X_observation_id["year"].astype(str)
    # train test split
    X_train_observation_id, X_test_observation_id, y_train_observation_id, y_test_observation_id = train_test_split(X_observation_id, y_observation_id, train_size=0.8, stratify=y_observation_id, random_state=2022)

    # filter for train test samples based on observation:year
    _X_train = data_frame.loc[data_frame.key.isin(list(X_train_observation_id.key))][features]
    _X_test = data_frame.loc[data_frame.key.isin(list(X_test_observation_id.key))][features]

    return _X_train, _X_test

def define_class_weight(y:np.array)->dict:
    class_weights = dict()

    for _class, count in Counter(y).items():
        weight = len(y) / (len(Counter(y)) * count)
        class_weights[_class] = weight
    return class_weights

def define_pos_scale(y:np.array)->float:
    return dict(Counter(y))[0] / dict(Counter(y))[1]
    
# https://www.analyticsvidhya.com/blog/2016/03/complete-guide-parameter-tuning-xgboost-with-codes-python/
def parameter_tuning(_model=None, _params=None, _scoring=None, _X=None, _y=None):
    sk_fold = StratifiedKFold(n_splits=3, shuffle=True, random_state=2022)

    randomized_clf_param_tuning = RandomizedSearchCV(estimator=_model, param_distributions=_params, scoring=_scoring, cv=sk_fold, n_jobs=58, n_iter=100, verbose=0)

    randomized_clf_param_tuning.fit(_X, _y)
    return randomized_clf_param_tuning.best_params_

def apply_prediction(pos_probs, threshold):
     return (pos_probs >= threshold).astype('int')
    
def get_model_params(_model: str, _target: str, _species:str)->dict:
    base_model = clone(pickle.load(open(f"{DATA_PATH}/BaseLearner/tmp/{_model}_{_species}_{_target}.sav", 'rb')))
    return base_model.get_params()

def roc_auc(actual, pred_proba, axes):
    fpr, tpr, thresholds = metrics.roc_curve(actual, pred_proba)
    # calculate the g-mean for each threshold
    gmeans = np.sqrt(tpr * (1-fpr))
    # locate the index of the largest g-mean
    ix = np.argmax(gmeans)
    if axes:
        # plot the roc curve for the model
        sns.lineplot([0,1], [0,1], linestyle='--', label='No Skill', color="tab:gray", ax=axes, err_style=None)
        sns.lineplot(fpr, tpr, label='Model', color="black", ax=axes, err_style=None)
        sns.scatterplot([fpr[ix]], [tpr[ix]], marker='^', color='tab:red', label=f'Optimal threshold @ {thresholds[ix]:.3f}', s=150, zorder=2, ax=axes)
    
    return thresholds[ix], gmeans[ix]

def prc_rc(actual, pred_proba, axes):
    precision, recall, thresholds = metrics.precision_recall_curve(actual, pred_proba)
    fscore = (2 * precision * recall) / (precision + recall)
    # locate the index of the largest f score
    ix = np.argmax(fscore)
    # plot the roc curve for the model
    no_skill = len(actual[actual==1]) / len(actual)
    if axes:
        sns.lineplot([0,1], [no_skill,no_skill], linestyle='--', label='No Skill', color="tab:gray", ax=axes, err_style=None)
        sns.lineplot(recall, precision, label='Model', color="black", ax=axes, err_style=None)
        sns.scatterplot([recall[ix]], [precision[ix]], marker='^', color='tab:red', label=f'Optimal threshold @ {thresholds[ix]:.3f}', s=150, zorder=2, ax=axes)

    return thresholds[ix], fscore[ix]
    
def tune_threshold(actual, pred_proba, axes):
    # Array for finding the optimal threshold
    thresholds = np.arange(0.0, 1.0, 0.001)
    fscore = np.zeros(shape=(len(thresholds)))
    # Fit the model
    def to_labels(pos_probs, threshold):
         return (pos_probs >= threshold).astype('int')
    fscore = [metrics.f1_score(actual, to_labels(pred_proba, t)) for t in thresholds]
        
    # Find the optimal threshold
    ix = np.argmax(fscore)
    if axes:
        sns.lineplot(thresholds, fscore, color="black", ax=axes, err_style=None)
        sns.scatterplot([thresholds[ix]], [fscore[ix]], marker='^', color='tab:red', label=f'Optimal threshold @ {thresholds[ix]:.3f}', s=150, zorder=2, ax=axes)

    return thresholds[ix], fscore[ix]


def parse_results(data_frame:pd.DataFrame):
    param_cols = [col for col in data_frame.columns if col.startswith("params")]
    metric_cols = [col for col in data_frame.columns if col.startswith("metrics")]
    feat_permutation_cols = [col for col in data_frame.columns if col.startswith("feature_permutaion_importance")]

    param_cols.extend(["model", "target", "species"])
    metric_cols.extend(["model", "target", "species"])
    feat_permutation_cols.extend(["model", "target", "species"])
    
    metric_groups = ["roc", "prc", "tuning"]
    actual_metrics = ["threshold", "cohen_kappa", "f1", "recall", "precision", "tn", "fp", "fn", "tp"]

    metrics_values = []
    for metric_group in metric_groups:
        for _i, _r in data_frame.iterrows():
            vals = {"group": metric_group, "model": _r["model"], "target": _r["target"], "species": _r["species"], "auc": _r["metrics.auc"]}
            for col in metric_cols:
                for _m in actual_metrics:
                    if _m in col and metric_group in col:
                        vals[_m] = _r[col]

            metrics_values.append(vals)

    df_metrics = pd.json_normalize(metrics_values)
    df_metrics = df_metrics.sort_values(by="f1", ascending=False)
    #print(f"model results -> {df_metrics.shape[0]} x {list(df_metrics.columns)}")

    best_model_selection = list()
    for _i, _r in df_metrics.groupby(["target", "species", "model"]).size().reset_index().iterrows():
        df_filter = df_metrics.loc[(df_metrics.target == _r.target) & (df_metrics.species == _r.species) & (df_metrics.model == _r.model)].sort_values(by="threshold", ascending=False)
        best_model_selection.append(df_filter.loc[df_filter.cohen_kappa == df_filter.cohen_kappa.max()].iloc[0][["target", "species", "model", "group"]].values)

    df_model_selection = pd.DataFrame(data=best_model_selection, columns=["target", "species", "model", "group"])
    df_best_metrics = df_metrics.merge(df_model_selection, on=["target", "species", "model", "group"], how="inner").drop_duplicates()
    #print(f"best model results -> {df_best_metrics.shape[0]} x {list(df_best_metrics.columns)}")

    df_parameter = data_frame[param_cols].melt(id_vars=param_cols[-3:], var_name="param", value_name="val")
    df_parameter["param"] = df_parameter["param"].apply(lambda x: x.replace("params.", ""))
    df_parameter = df_parameter.dropna()
    #print(f"model params -> {df_parameter.shape[0]} x {list(df_parameter.columns)}")

    df_feature_importance = data_frame[feat_permutation_cols].melt(id_vars=feat_permutation_cols[-3:], var_name="feature", value_name="feature_importance")
    df_feature_importance["feature"] = df_feature_importance["feature"].apply(lambda x: x.replace("feature_permutaion_importance.", ""))
    df_feature_importance = df_feature_importance.sort_values(by="feature_importance", ascending=False)
    #print(f"features -> {df_feature_importance.shape[0]} x {list(df_feature_importance.columns)}")
    
    return df_metrics, df_best_metrics, df_parameter, df_feature_importance

### Model Parameters

In [8]:
PARAMS_RF = {
    'max_depth': np.arange(16, 256, 16),
    'max_features': ["sqrt"],
    'min_samples_leaf': np.arange(2, 16, 2),
    'min_samples_split': np.arange(2, 16, 2),
    'n_estimators': np.arange(32, 512, 16),
    'n_jobs': [58],
    'warm_start': [True, False],
    'random_state': [2022],
}
PARAMS_XGB = {
    'max_depth': np.arange(3, 8, 1),
    'learning_rate': np.arange(0.001, 0.01, 0.002),
    'n_estimators': np.arange(50, 500, 15),
    'colsample_bytree': np.arange(0.5, 1, 0.05),
    'subsample': np.arange(0.5, 1, 0.05),
    'eval_metric': ["aucpr", "auc"],
    'objective': ['binary:logistic'],
    'gamma': np.arange(0, 1, 0.05),
    'lambda': np.arange(0, 1, 0.1),
    'seed': [2022],
    'n_jobs': [58],
}
PARAMS_KN = {
    'n_neighbors': np.arange(2, 18, 1),
    'weights': ["distance", "uniform"],
    'algorithm': ['ball_tree', 'kd_tree', 'brute'],
    'leaf_size': np.arange(10, 50, 2),
    'p': [1, 2],
    'n_jobs': [58],
    'metric': ['euclidean', 'manhattan', 'chebyshev', 'minkowski', 'wminkowski', 'seuclidean', 'mahalanobis']
}
PARAMS_DT = {
    'criterion': ["gini", "entropy"],
    'splitter': ["random", "best"],
    'max_depth': np.arange(2, 32, 1),
    'min_samples_split': np.arange(2, 32, 2),
    'min_samples_leaf': np.arange(1, 16, 1),
    'max_features': ["sqrt", "log2"],
    'random_state': [2022],
    'min_weight_fraction_leaf': np.arange(0.0, 1.0, 0.05),
    'max_leaf_nodes': np.arange(1, 2048, 32),
    'min_impurity_decrease': np.arange(0.0, 1.0, 0.05),
    'ccp_alpha': np.arange(0, 1, 1),
}
PARAMS_HGB = {
    'loss': ['binary_crossentropy'],
    'learning_rate': np.arange(0.01, 0.05, 0.0005),
    'max_iter': np.arange(25, 500, 25),
    'max_leaf_nodes': np.arange(1, 128, 7),
    'max_depth': np.arange(1, 128, 7),
    'min_samples_leaf': np.arange(2, 32, 1),
    'l2_regularization': np.arange(0.0, 1.0, 0.05),
    #'warm_start': [True, False],
    'scoring': ["f1", "roc_auc"],
    #'tol': np.arange(1e-7, 9e-7, 1e-8),
    'random_state': [2022],
}
PARAMS_BRF = {
    'max_depth': np.arange(16, 256, 16),
    'max_features': ["sqrt"],
    'min_samples_leaf': np.arange(2, 24, 2),
    'min_samples_split': np.arange(2, 24, 2),
    'n_estimators': np.arange(32, 512, 16),
    'n_jobs': [58],
    'warm_start': [True, False],
    'random_state': [2022],
    #'class_weight': [define_class_weight(_y.values)]
}
PARAMS_RUS = {
    'replacement': [True, False],
    'learning_rate': np.arange(0.001, 0.1, 0.002),
    'n_estimators': np.arange(50, 500, 15),
    'algorithm': ["SAMME", "SAMME.R"],
    'random_state': [2022],
}
PARAMS_ADA = {
    'learning_rate': np.arange(0.05, 0.5, 0.025),
    'n_estimators': np.arange(50, 500, 15),
    'algorithm': ["SAMME", "SAMME.R"],
    'random_state': [2022],
}
PARAMS_SVM = {
    'gamma': np.arange(0.001, 1, 0.005),
    'C': np.arange(0.1, 10, 0.25),
    'shrinking': [False, True],
    'kernel': ['rbf', 'poly', 'sigmoid'],
    'probability': [True],
    'tol': np.arange(0.001, 0.01, 0.001),
    'decision_function_shape': ['ovo'],
    'random_state': [2022],
}

### Define Hold-Out

In [9]:
np.random.seed(2022)

binned_data = dict()
for species in ["1", "2", "1 2"]:
    _binned = df.copy()

    _binned = _binned.loc[(_binned.list_tree_sp_cat_ids == species) & (_binned.damage_class != 0.0) & (_binned.damage_class != 4.0)]

    for col in all_features:
        _binned[col] = pd.qcut(_binned[col].rank(method='first'), 10) # _binned[col]
        _binned[col] =_binned[col].cat.codes

    min_max_scaler = MinMaxScaler()
    _binned[all_features] = min_max_scaler.fit_transform(_binned[all_features])

    validation_observation_ids = np.random.choice(_binned.loc[_binned.aoi == "blackforest"].observation_id.unique(), size=2)
    _validation = _binned.loc[_binned.observation_id.isin(validation_observation_ids)]
    
    _train, _test = train_test_split_by_observation(data_frame=_binned.loc[~_binned.observation_id.isin(validation_observation_ids)], target="damage_class", features=_binned.columns)
    _test = pd.concat([_test, _validation])
    
    splitted_data = {
        'train': _train,
        'test': _test,
        'validation': _validation
    }
    
    binned_data[species] = splitted_data

## Base Learner
The following steps are performed for all damaged damage classes:
1) Backward Feature Elemination to identify the 50% most relevant features.
2) Perform parameter optimization of the base models.
3) Bagging ensemble
4) Perform Feature Permutation Importance to calculate the relevance of the features used.
5) Evaluate estimates including the determination of the decision boundary as well as the metrics
6) Export the results

### Small Damaged Vegetation

In [17]:
SPECIESES = ["1", "2", "1 2"]
MODELS = ["RandomForestClassifier", "BalancedRandomForestClassifier", "DecisionTreeClassifier", "KNeighborsClassifier", "SVM", "RUSBoostClassifier" , "AdaBoostClassifier", "XGBClassifier", "HistGradientBoostingClassifier"]
TARGET = "is_low"

binary_classifcation_results = list()

for SPECIES in SPECIESES:
    for MODEL in MODELS:
        if os.path.exists(f"{EXPORT_PATH}/{MODEL}_{SPECIES_CODE[SPECIES]}_{TARGET}.json"):
            continue
        print("-"*100)
        print(f"{MODEL} -> {SPECIES_CODE[SPECIES]}@{TARGET}")
        print("\t performing backward feature elemination...")
        try:
            init_model_params = get_model_params(_model=MODEL, _target=TARGET, _species=SPECIES_CODE[SPECIES])
        except FileNotFoundError:
            init_model_params = {}
            
        if "class_weight" in init_model_params:
            init_model_params["class_weight"] = define_class_weight(binned_data[SPECIES]["train"][TARGET])
        if "pos_scale" in init_model_params:
            init_model_params["pos_scale"] = define_pos_scale(binned_data[SPECIES]["train"][TARGET])

        skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=2022)
        f1_scorer = metrics.make_scorer(metrics.f1_score, **{"average": "weighted"})

        if MODEL == "RandomForestClassifier":
            model = RandomForestClassifier(**init_model_params)
        elif MODEL == "XGBClassifier":
            model = xgb.XGBClassifier(**init_model_params)
        elif MODEL == "KNeighborsClassifier":
            model = KNeighborsClassifier(**init_model_params)
        elif MODEL == "DecisionTreeClassifier":
            model = DecisionTreeClassifier(**init_model_params)
        elif MODEL == "HistGradientBoostingClassifier":
            model = HistGradientBoostingClassifier(**init_model_params)
        elif MODEL == "BalancedRandomForestClassifier":
            model = BalancedRandomForestClassifier(**init_model_params)
        elif MODEL == "RUSBoostClassifier":
            model = RUSBoostClassifier(**init_model_params)
        elif MODEL == "AdaBoostClassifier":
            model = AdaBoostClassifier(**init_model_params)
        else: #SVM
            model = SVC(**init_model_params)

        sfs = SequentialFeatureSelector(estimator=model, direction="backward", scoring=f1_scorer, cv=skf, n_jobs=58)
        sfs.fit(binned_data[SPECIES]["train"][all_features], binned_data[SPECIES]["train"][TARGET])
        relevant_features = list(binned_data[SPECIES]["train"][all_features].columns[sfs.get_support()])

        # TRAINING DATA
        X_train = binned_data[SPECIES]["train"][relevant_features]
        y_train = binned_data[SPECIES]["train"][TARGET]
        # TEST DATA
        X_test = binned_data[SPECIES]["test"][relevant_features]
        y_test = binned_data[SPECIES]["test"][TARGET]

        print("\t performing hyperparameter tuning...")

        if MODEL == "RandomForestClassifier":
            params_rf = PARAMS_RF
            params_rf["class_weight"] = [define_class_weight(y_train.values)]
            fitted_model_params = parameter_tuning(_model=RandomForestClassifier(), _params=params_rf, _scoring=f1_scorer, _X=X_train, _y=y_train)
            model = RandomForestClassifier(**fitted_model_params)
        elif MODEL == "XGBClassifier":
            params_xgb = PARAMS_XGB
            params_xgb['scale_pos_weight'] = [define_pos_scale(y_train.values)]
            fitted_model_params = parameter_tuning(_model=xgb.XGBClassifier(), _params=params_xgb, _scoring=f1_scorer, _X=X_train, _y=y_train)
            model = xgb.XGBClassifier(**fitted_model_params)
        elif MODEL == "KNeighborsClassifier":
            params_kn = PARAMS_KN
            fitted_model_params = parameter_tuning(_model=KNeighborsClassifier(), _params=params_kn, _scoring=f1_scorer, _X=X_train, _y=y_train)
            model = KNeighborsClassifier(**fitted_model_params)
        elif MODEL == "DecisionTreeClassifier":
            params_dt = PARAMS_DT
            params_dt['class_weight'] = [define_class_weight(y_train.values)]
            fitted_model_params = parameter_tuning(_model=DecisionTreeClassifier(), _params=params_dt, _scoring=f1_scorer, _X=X_train, _y=y_train)
            model = DecisionTreeClassifier(**fitted_model_params)
        elif MODEL == "HistGradientBoostingClassifier":
            params_hgb = PARAMS_HGB
            fitted_model_params = parameter_tuning(_model=HistGradientBoostingClassifier(), _params=params_hgb, _scoring=f1_scorer, _X=X_train, _y=y_train)
            model = HistGradientBoostingClassifier(**fitted_model_params)
        elif MODEL == "BalancedRandomForestClassifier":
            params_brf = PARAMS_BRF
            fitted_model_params = parameter_tuning(_model=BalancedRandomForestClassifier(), _params=params_brf, _scoring=f1_scorer, _X=X_train, _y=y_train)
            model = BalancedRandomForestClassifier(**fitted_model_params)
        elif MODEL == "RUSBoostClassifier":
            params_rus = PARAMS_RUS
            fitted_model_params = parameter_tuning(_model=RUSBoostClassifier(), _params=params_rus, _scoring=f1_scorer, _X=X_train, _y=y_train)
            model = RUSBoostClassifier(**fitted_model_params)
        elif MODEL == "AdaBoostClassifier":
            params_ada = PARAMS_ADA
            fitted_model_params = parameter_tuning(_model=AdaBoostClassifier(), _params=params_ada, _scoring=f1_scorer, _X=X_train, _y=y_train)
            model = AdaBoostClassifier(**fitted_model_params)
        else:
            params_svm = PARAMS_SVM
            params_svm['class_weight'] = [define_class_weight(y_train.values)]
            fitted_model_params = parameter_tuning(_model=SVC(), _params=params_svm, _scoring=f1_scorer, _X=X_train, _y=y_train)
            model = SVC(**fitted_model_params)

        # CREATING BAGGING CLASSIFIER
        model = BaggingClassifier(base_estimator=model, n_estimators=15, max_samples=.2, max_features=1., bootstrap=True, bootstrap_features=True, oob_score=True, n_jobs=16)
        print("\t fitting model...")
        model.fit(X_train, y_train)
        # save model
        pickle.dump(model, open(f"{EXPORT_PATH}/{MODEL}_{SPECIES_CODE[SPECIES]}_{TARGET}.sav", 'wb'))

        # predict probabilities
        y_pred_proba = model.predict_proba(X_test)

        print("\t performing permutation feature importance...")
        from sklearn.inspection import permutation_importance
        permutation_importance = permutation_importance(model, X_test, y_test, n_repeats=5, random_state=2022, n_jobs=32, scoring=f1_scorer)
        permutation_importance_obj = {}
        for i, feat_imp in enumerate(permutation_importance.importances_mean):
            permutation_importance_obj[all_features[i]] = feat_imp

        print("\t evaluating...")
        results = {}
        results["species"] = SPECIES_CODE[SPECIES]
        results["model"] = MODEL
        results["target"] = TARGET
        results["params"] = fitted_model_params
        results["relevant_features"] = relevant_features
        results["feature_permutaion_importance"] = permutation_importance_obj

        results["metrics"] = {}
        results["metrics"]["auc"] = metrics.roc_auc_score(y_test.values, y_pred_proba[:, 1])

        ## ROC AUC
        results["metrics"]["roc"] = {}
        threshold_roc, gmean_roc = roc_auc(y_test.values, y_pred_proba[:,1], axes=None)
        results["metrics"]["roc"]["threshold"] = threshold_roc
        y_pred_roc = apply_prediction(y_pred_proba[:, 1], threshold_roc)
        results["metrics"]["roc"]["cohen_kappa"] = metrics.cohen_kappa_score(y_test.values, y_pred_roc)
        results["metrics"]["roc"]["f1"] = metrics.f1_score(y_test.values, y_pred_roc)
        results["metrics"]["roc"]["recall"] = metrics.recall_score(y_test.values, y_pred_roc)
        results["metrics"]["roc"]["precision"] = metrics.precision_score(y_test.values, y_pred_roc)
        tn, fp, fn, tp = metrics.confusion_matrix(y_test, y_pred_roc).ravel()
        results["metrics"]["roc"]["tn"] = tn
        results["metrics"]["roc"]["fp"] = fp
        results["metrics"]["roc"]["fn"] = fn
        results["metrics"]["roc"]["tp"] = tp

        ## PRC
        results["metrics"]["prc"] = {}
        threshold_prc, fscore_prc = prc_rc(y_test.values, y_pred_proba[:,1], axes=None)
        results["metrics"]["prc"]["threshold"] = threshold_prc
        y_pred_prc = apply_prediction(y_pred_proba[:, 1], threshold_prc)
        results["metrics"]["prc"]["cohen_kappa"] = metrics.cohen_kappa_score(y_test.values, y_pred_prc)
        results["metrics"]["prc"]["f1"] = metrics.f1_score(y_test.values, y_pred_prc)
        results["metrics"]["prc"]["recall"] = metrics.recall_score(y_test.values, y_pred_prc)
        results["metrics"]["prc"]["precision"] = metrics.precision_score(y_test.values, y_pred_prc)
        tn, fp, fn, tp = metrics.confusion_matrix(y_test, y_pred_prc).ravel()
        results["metrics"]["prc"]["tn"] = tn
        results["metrics"]["prc"]["fp"] = fp
        results["metrics"]["prc"]["fn"] = fn
        results["metrics"]["prc"]["tp"] = tp

        ## TUNING
        results["metrics"]["tuning"] = {}
        threshold_tuning, fscore_tuneing = tune_threshold(y_test.values, y_pred_proba[:,1], axes=None)
        results["metrics"]["tuning"]["threshold"] = threshold_tuning
        y_pred_tuning = apply_prediction(y_pred_proba[:, 1], threshold_tuning)
        results["metrics"]["tuning"]["cohen_kappa"] = metrics.cohen_kappa_score(y_test.values, y_pred_tuning)
        results["metrics"]["tuning"]["f1"] = metrics.f1_score(y_test.values, y_pred_tuning)
        results["metrics"]["tuning"]["recall"] = metrics.recall_score(y_test.values, y_pred_tuning)
        results["metrics"]["tuning"]["precision"] = metrics.precision_score(y_test.values, y_pred_tuning)
        tn, fp, fn, tp = metrics.confusion_matrix(y_test, y_pred_tuning).ravel()
        results["metrics"]["tuning"]["tn"] = tn
        results["metrics"]["tuning"]["fp"] = fp
        results["metrics"]["tuning"]["fn"] = fn
        results["metrics"]["tuning"]["tp"] = tp

        # quick export to dump data
        with open(f"{EXPORT_PATH}/{MODEL}_{SPECIES_CODE[SPECIES]}_{TARGET}.json", 'w', encoding='utf-8') as f:
            json.dump(str(dict((k, v) for k,v in results.items())), f, ensure_ascii=False, indent=4)
        pd.json_normalize(results).to_csv(f"{EXPORT_PATH}/{MODEL}_{SPECIES_CODE[SPECIES]}_{TARGET}.csv", index=False)

        binary_classifcation_results.append(results)

print("-"*75)
print(f"EXPORTING OVERALL RESULTS TO {EXPORT_PATH}/modelling_pipeline_results.csv")
print("-"*75)
df_result = pd.json_normalize(binary_classifcation_results)
df_result.to_csv(f"{EXPORT_PATH}/modelling_pipeline_results_low.csv", index=False)
df_result

----------------------------------------------------------------------------------------------------
SVM -> Deciduous@is_low
	 performing backward feature elemination...
	 performing hyperparameter tuning...
	 fitting model...
	 performing permutation feature importance...
	 evaluating...
----------------------------------------------------------------------------------------------------
RUSBoostClassifier -> Deciduous@is_low
	 performing backward feature elemination...
	 performing hyperparameter tuning...
	 fitting model...
	 performing permutation feature importance...
	 evaluating...
----------------------------------------------------------------------------------------------------
AdaBoostClassifier -> Deciduous@is_low
	 performing backward feature elemination...
	 performing hyperparameter tuning...
	 fitting model...
	 performing permutation feature importance...
	 evaluating...
----------------------------------------------------------------------------------------------------

Unnamed: 0,species,model,target,relevant_features,params.tol,params.shrinking,params.random_state,params.probability,params.kernel,params.gamma,params.decision_function_shape,params.class_weight.0,params.class_weight.1,params.C,feature_permutaion_importance.location_cluster_0,feature_permutaion_importance.location_cluster_1,feature_permutaion_importance.location_cluster_2,feature_permutaion_importance.location_cluster_3,feature_permutaion_importance.location_cluster_4,feature_permutaion_importance.location_cluster_5,feature_permutaion_importance.location_cluster_6,feature_permutaion_importance.location_cluster_7,feature_permutaion_importance.location_cluster_8,feature_permutaion_importance.location_cluster_9,feature_permutaion_importance.location_cluster_10,feature_permutaion_importance.location_cluster_11,feature_permutaion_importance.location_cluster_12,feature_permutaion_importance.location_cluster_13,feature_permutaion_importance.tree_1,feature_permutaion_importance.evi2,feature_permutaion_importance.evi2-1,feature_permutaion_importance.evi2_diff,feature_permutaion_importance.evi2_max_diff,feature_permutaion_importance.evi2_raster,feature_permutaion_importance.evi2_raster-1,feature_permutaion_importance.evi2_raster_diff,feature_permutaion_importance.evi2_vegetation_raster,feature_permutaion_importance.evi2_vegetation_raster-1,feature_permutaion_importance.evi2_vegetation_raster_diff,feature_permutaion_importance.ndvi,feature_permutaion_importance.ndvi-1,feature_permutaion_importance.ndvi_diff,metrics.auc,metrics.roc.threshold,metrics.roc.cohen_kappa,metrics.roc.f1,metrics.roc.recall,metrics.roc.precision,metrics.roc.tn,metrics.roc.fp,metrics.roc.fn,metrics.roc.tp,metrics.prc.threshold,metrics.prc.cohen_kappa,metrics.prc.f1,metrics.prc.recall,metrics.prc.precision,metrics.prc.tn,metrics.prc.fp,metrics.prc.fn,metrics.prc.tp,metrics.tuning.threshold,metrics.tuning.cohen_kappa,metrics.tuning.f1,metrics.tuning.recall,metrics.tuning.precision,metrics.tuning.tn,metrics.tuning.fp,metrics.tuning.fn,metrics.tuning.tp,params.replacement,params.n_estimators,params.learning_rate,params.algorithm,params.subsample,params.seed,params.scale_pos_weight,params.objective,params.n_jobs,params.max_depth,params.lambda,params.eval_metric,params.colsample_bytree,params.scoring,params.min_samples_leaf,params.max_leaf_nodes,params.max_iter,params.loss,params.l2_regularization,params.warm_start,params.min_samples_split,params.max_features,params.splitter,params.min_weight_fraction_leaf,params.min_impurity_decrease,params.criterion,params.ccp_alpha,params.weights,params.p,params.n_neighbors,params.metric,params.leaf_size
0,Deciduous,SVM,is_low,"[location_cluster_5, location_cluster_7, locat...",0.007,True,2022.0,True,rbf,0.871,ovo,0.755257,1.479406,9.1,0.015431,0.002484,0.0034,0.031061,0.020651,-0.006838,0.02315,0.0057,0.017343,-0.000778,0.009398,0.000765,0.004155,0.009493,-0.001677,-0.0142,0.015568,0.032316,-0.040274,0.019212,-0.001358,0.004425,0.005854,0.032869,0.01437,0.014108,-0.001613,0.008946,0.581654,0.327453,0.260385,0.503676,0.480702,0.528958,418,122,148,137,0.900988,-0.002422,0.0,0.0,0.0,539,1,285,0,0.098,0.052185,0.530303,0.982456,0.363165,49,491,5,280,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,Deciduous,RUSBoostClassifier,is_low,"[location_cluster_0, location_cluster_3, locat...",,,2022.0,,,,,,,,0.018157,0.011579,0.028542,0.00641,0.011186,-0.004962,0.008617,0.000492,0.002054,0.005541,-0.001858,0.001535,0.001781,-0.012544,0.008536,0.004376,-0.011907,0.013282,-0.008555,0.004001,0.004134,-0.011801,0.009087,0.001112,-0.011276,4.1e-05,0.03713,0.033506,0.678772,0.493961,0.298294,0.623557,0.947368,0.464716,229,311,15,270,0.493961,0.298294,0.623557,0.947368,0.464716,229,311,15,270,0.494,0.295673,0.621965,0.94386,0.463793,229,311,16,269,True,395.0,0.093,SAMME.R,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,Deciduous,AdaBoostClassifier,is_low,"[location_cluster_0, location_cluster_1, locat...",,,2022.0,,,,,,,,-0.006788,0.017482,-0.014771,0.005751,0.003137,0.015852,0.000289,0.007033,-0.004098,0.000717,0.004537,-0.001412,-0.003711,-0.005747,0.002555,-0.006481,-0.008194,0.002035,0.010572,0.016224,-0.001003,0.000682,-0.009286,-0.00185,-0.013923,-0.00849,-0.000957,0.033643,0.627706,0.497273,0.15048,0.515272,0.680702,0.41453,266,274,91,194,0.493376,0.151067,0.566154,0.968421,0.4,126,414,9,276,0.494,0.155158,0.564854,0.947368,0.402385,139,401,15,270,,485.0,0.475,SAMME.R,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,Deciduous,XGBClassifier,is_low,"[location_cluster_3, location_cluster_4, locat...",,,,,,0.9,,,,,-0.020315,-0.008266,-0.016716,0.001689,0.006774,-0.005036,0.003061,0.003413,0.002777,-0.001009,-0.004327,-0.004681,0.004649,0.005594,-0.002621,-0.002804,0.003959,-0.00453,0.0031,-0.006819,-0.002262,-0.01335,-0.010282,-0.007721,0.007209,0.010354,-0.015326,0.0016,0.634409,0.318495,0.225552,0.55144,0.705263,0.452703,297,243,84,201,0.778927,-0.002422,0.0,0.0,0.0,539,1,285,0,0.167,0.206661,0.587353,0.961404,0.42284,166,374,11,274,,455.0,0.009,,0.95,2022.0,1.958812,binary:logistic,58.0,7.0,0.2,aucpr,0.7,,,,,,,,,,,,,,,,,,,
4,Deciduous,HistGradientBoostingClassifier,is_low,"[location_cluster_4, location_cluster_7, locat...",,,2022.0,,,,,,,,0.007144,-0.00237,-0.00318,-0.001325,0.054731,0.001622,0.003104,0.004017,0.011036,-0.003684,0.000327,-0.003629,-0.005221,0.004214,0.001546,0.004415,-0.008214,-6.8e-05,0.011644,-0.001898,0.007775,0.004741,0.042029,-0.006293,0.005279,-0.003823,0.005584,0.000178,0.730234,0.171965,0.322928,0.61599,0.824561,0.491632,297,243,50,235,0.164427,0.325251,0.622618,0.859649,0.488048,283,257,40,245,0.166,0.326103,0.622449,0.85614,0.488978,285,255,41,244,,,0.034,,,,,,,127.0,,,,roc_auc,2.0,43.0,350.0,binary_crossentropy,0.75,,,,,,,,,,,,,
5,Conifer,RandomForestClassifier,is_low,"[location_cluster_2, location_cluster_5, locat...",,,2022.0,,,,,0.818418,1.285131,,-0.004555,-0.00439,0.002717,0.003511,0.003523,0.002567,-0.003082,0.000454,-0.000813,-0.000731,0.006463,0.000415,-0.002759,-0.000814,0.001168,-0.002229,0.002277,0.005728,0.007678,0.003161,0.00509,-0.022202,-0.006559,-0.014839,-0.068217,-0.02153,-0.031094,-0.001389,0.650492,0.364162,0.252994,0.588435,0.679319,0.519,684,481,245,519,0.86475,-0.004143,0.0,0.0,0.0,1161,4,764,0,0.242,0.160498,0.611298,0.948953,0.450871,282,883,39,725,,496.0,,,,,,,58.0,208.0,,,,,2.0,,,,,False,4.0,sqrt,,,,,,,,,,
6,Conifer,BalancedRandomForestClassifier,is_low,"[location_cluster_0, location_cluster_2, locat...",,,2022.0,,,,,,,,-0.001959,-0.000718,-0.004545,-0.007095,-0.000424,0.005262,0.0069,-0.001585,-0.001834,-0.003507,0.002813,-0.006472,-0.001624,0.003772,0.007239,-0.004742,-0.007937,-0.005284,0.001418,0.01724,-0.004509,-0.00414,0.003845,0.002655,-0.018833,-0.006464,0.01154,-0.000665,0.586663,0.427172,0.166533,0.548066,0.649215,0.474187,615,550,268,496,0.906459,-0.003108,0.0,0.0,0.0,1162,3,764,0,0.34,0.15408,0.58791,0.83377,0.454027,399,766,127,637,,304.0,,,,,,,58.0,128.0,,,,,2.0,,,,,True,2.0,sqrt,,,,,,,,,,
7,Conifer,DecisionTreeClassifier,is_low,"[ndvi_max_diff, ndvi_raster, ndvi_raster-1, nd...",,,2022.0,,,,,0.818418,1.285131,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.5,1.503603,0.0,0.0,0.0,0.0,1165,0,764,0,0.503603,0.0,0.567397,1.0,0.39606,0,1165,0,764,0.0,0.0,0.567397,1.0,0.39606,0,1165,0,764,,,,,,,,,,2.0,,,,,10.0,1441.0,,,,,16.0,log2,random,0.5,0.7,gini,0.0,,,,,
8,Conifer,KNeighborsClassifier,is_low,"[location_cluster_0, location_cluster_2, locat...",,,,,,,,,,,-0.002191,-0.003031,-0.001295,-0.011599,-0.003819,-0.000492,-0.004577,-0.0195,0.003242,-0.011712,0.006508,0.005549,-0.010294,-0.005556,-0.012504,-0.010744,-0.008263,-0.014114,-0.002107,-0.01407,0.008989,0.012688,-0.015426,0.013198,-0.017471,-0.002678,-0.01624,0.00918,0.560899,0.274245,0.091042,0.53082,0.681937,0.434529,487,678,243,521,0.979501,-0.01034,0.0,0.0,0.0,1155,10,764,0,0.001,0.032532,0.573953,0.977749,0.406199,73,1092,17,747,,,,ball_tree,,,,,58.0,,,,,,,,,,,,,,,,,,,distance,1.0,3.0,minkowski,46.0
9,Conifer,SVM,is_low,"[location_cluster_0, location_cluster_1, locat...",0.005,False,2022.0,True,rbf,0.961,ovo,0.818418,1.285131,9.1,0.020844,0.003953,-0.002037,0.012435,0.020739,0.015927,-0.003948,0.029523,0.024775,0.039589,0.027588,0.008775,0.013002,0.005935,0.00804,0.009523,0.007958,0.021575,0.017114,0.015665,-0.01345,0.027691,0.034846,0.019479,0.023085,0.029379,0.007835,0.027636,0.673415,0.325296,0.291031,0.594465,0.646597,0.550111,761,404,270,494,0.258161,0.216404,0.594788,0.76178,0.487846,554,611,182,582,0.258,0.216404,0.594788,0.76178,0.487846,554,611,182,582,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


### Medium Damaged Trees

In [None]:
SPECIESES = ["1", "2", "1 2"]
MODELS = ["RandomForestClassifier", "BalancedRandomForestClassifier", "DecisionTreeClassifier", "KNeighborsClassifier", "SVM", "RUSBoostClassifier" , "AdaBoostClassifier", "XGBClassifier", "HistGradientBoostingClassifier"]
TARGET = "is_medium"

binary_classifcation_results = list()

for SPECIES in SPECIESES:
    for MODEL in MODELS:
        if os.path.exists(f"{EXPORT_PATH}/{MODEL}_{SPECIES_CODE[SPECIES]}_{TARGET}.json"):
            continue
            
        # DO NOT INCLUDE DAMAGE CLASS 2.0 FOR DEAD OR HEALTHY CLASSIFICATION
        data_selection = binned_data[SPECIES]["train"].copy()
        
        print("-"*100)
        print(f"{MODEL} -> {SPECIES_CODE[SPECIES]}@{TARGET}")
        print("\t performing backward feature elemination...")
        try:
            init_model_params = get_model_params(_model=MODEL, _target=TARGET, _species=SPECIES_CODE[SPECIES])
        except FileNotFoundError:
            init_model_params = {}

        if "class_weight" in init_model_params:
            init_model_params["class_weight"] = define_class_weight(data_selection[TARGET])
        if "pos_scale" in init_model_params:
            init_model_params["pos_scale"] = define_pos_scale(data_selection[TARGET])

        skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=2022)
        f1_scorer = metrics.make_scorer(metrics.f1_score, **{"average": "weighted"})

        if MODEL == "RandomForestClassifier":
            model = RandomForestClassifier(**init_model_params)
        elif MODEL == "XGBClassifier":
            model = xgb.XGBClassifier(**init_model_params)
        elif MODEL == "KNeighborsClassifier":
            model = KNeighborsClassifier(**init_model_params)
        elif MODEL == "DecisionTreeClassifier":
            model = DecisionTreeClassifier(**init_model_params)
        elif MODEL == "HistGradientBoostingClassifier":
            model = HistGradientBoostingClassifier(**init_model_params)
        elif MODEL == "BalancedRandomForestClassifier":
            model = BalancedRandomForestClassifier(**init_model_params)
        elif MODEL == "RUSBoostClassifier":
            model = RUSBoostClassifier(**init_model_params)
        elif MODEL == "AdaBoostClassifier":
            model = AdaBoostClassifier(**init_model_params)
        else: #SVM
            model = SVC(**init_model_params)

        sfs = SequentialFeatureSelector(estimator=model, direction="backward", scoring=f1_scorer, cv=skf, n_jobs=58)
        sfs.fit(data_selection[all_features], data_selection[TARGET])
        relevant_features = list(data_selection[all_features].columns[sfs.get_support()])

        # TRAINING DATA
        X_train = data_selection[relevant_features]
        y_train = data_selection[TARGET]
        # TEST DATA
        X_test = binned_data[SPECIES]["test"][relevant_features]
        y_test = binned_data[SPECIES]["test"][TARGET]

        print("\t performing hyperparameter tuning...")

        if MODEL == "RandomForestClassifier":
            params_rf = PARAMS_RF
            params_rf["class_weight"] = [define_class_weight(y_train.values)]
            fitted_model_params = parameter_tuning(_model=RandomForestClassifier(), _params=params_rf, _scoring=f1_scorer, _X=X_train, _y=y_train)
            model = RandomForestClassifier(**fitted_model_params)
        elif MODEL == "XGBClassifier":
            params_xgb = PARAMS_XGB
            params_xgb['scale_pos_weight'] = [define_pos_scale(y_train.values)]
            fitted_model_params = parameter_tuning(_model=xgb.XGBClassifier(), _params=params_xgb, _scoring=f1_scorer, _X=X_train, _y=y_train)
            model = xgb.XGBClassifier(**fitted_model_params)
        elif MODEL == "KNeighborsClassifier":
            params_kn = PARAMS_KN
            fitted_model_params = parameter_tuning(_model=KNeighborsClassifier(), _params=params_kn, _scoring=f1_scorer, _X=X_train, _y=y_train)
            model = KNeighborsClassifier(**fitted_model_params)
        elif MODEL == "DecisionTreeClassifier":
            params_dt = PARAMS_DT
            params_dt['class_weight'] = [define_class_weight(y_train.values)]
            fitted_model_params = parameter_tuning(_model=DecisionTreeClassifier(), _params=params_dt, _scoring=f1_scorer, _X=X_train, _y=y_train)
            model = DecisionTreeClassifier(**fitted_model_params)
        elif MODEL == "HistGradientBoostingClassifier":
            params_hgb = PARAMS_HGB
            fitted_model_params = parameter_tuning(_model=HistGradientBoostingClassifier(), _params=params_hgb, _scoring=f1_scorer, _X=X_train, _y=y_train)
            model = HistGradientBoostingClassifier(**fitted_model_params)
        elif MODEL == "BalancedRandomForestClassifier":
            params_brf = PARAMS_BRF
            fitted_model_params = parameter_tuning(_model=BalancedRandomForestClassifier(), _params=params_brf, _scoring=f1_scorer, _X=X_train, _y=y_train)
            model = BalancedRandomForestClassifier(**fitted_model_params)
        elif MODEL == "RUSBoostClassifier":
            params_rus = PARAMS_RUS
            fitted_model_params = parameter_tuning(_model=RUSBoostClassifier(), _params=params_rus, _scoring=f1_scorer, _X=X_train, _y=y_train)
            model = RUSBoostClassifier(**fitted_model_params)
        elif MODEL == "AdaBoostClassifier":
            params_ada = PARAMS_ADA
            fitted_model_params = parameter_tuning(_model=AdaBoostClassifier(), _params=params_ada, _scoring=f1_scorer, _X=X_train, _y=y_train)
            model = AdaBoostClassifier(**fitted_model_params)
        else:
            params_svm = PARAMS_SVM
            params_svm['class_weight'] = [define_class_weight(y_train.values)]
            fitted_model_params = parameter_tuning(_model=SVC(), _params=params_svm, _scoring=f1_scorer, _X=X_train, _y=y_train)
            model = SVC(**fitted_model_params)

        # CREATING BAGGING CLASSIFIER
        model = BaggingClassifier(base_estimator=model, n_estimators=15, max_samples=.2, max_features=1., bootstrap=True, bootstrap_features=True, oob_score=True, n_jobs=16)
        print("\t fitting model...")
        model.fit(X_train, y_train)
        # save model
        pickle.dump(model, open(f"{EXPORT_PATH}/{MODEL}_{SPECIES_CODE[SPECIES]}_{TARGET}.sav", 'wb'))

        # predict probabilities
        y_pred_proba = model.predict_proba(X_test)
        
        # fallback for only 1.0 for 0 predition
        try:
            y_pred_proba[:, 1]
        except IndexError:
            y_pred_proba = np.full((y_pred_proba.shape[0], 2), [1., 0.])

        print("\t performing permutation feature importance...")
        from sklearn.inspection import permutation_importance
        permutation_importance = permutation_importance(model, X_test, y_test, n_repeats=5, random_state=2022, n_jobs=32, scoring=f1_scorer)
        permutation_importance_obj = {}
        for i, feat_imp in enumerate(permutation_importance.importances_mean):
            permutation_importance_obj[all_features[i]] = feat_imp

        print("\t evaluating...")
        results = {}
        results["species"] = SPECIES_CODE[SPECIES]
        results["model"] = MODEL
        results["target"] = TARGET
        results["params"] = fitted_model_params
        results["relevant_features"] = relevant_features
        results["feature_permutaion_importance"] = permutation_importance_obj

        results["metrics"] = {}
        results["metrics"]["auc"] = metrics.roc_auc_score(y_test.values, y_pred_proba[:, 1])

        ## ROC AUC
        results["metrics"]["roc"] = {}
        threshold_roc, gmean_roc = roc_auc(y_test.values, y_pred_proba[:,1], axes=None)
        results["metrics"]["roc"]["threshold"] = threshold_roc
        y_pred_roc = apply_prediction(y_pred_proba[:, 1], threshold_roc)
        results["metrics"]["roc"]["cohen_kappa"] = metrics.cohen_kappa_score(y_test.values, y_pred_roc)
        results["metrics"]["roc"]["f1"] = metrics.f1_score(y_test.values, y_pred_roc)
        results["metrics"]["roc"]["recall"] = metrics.recall_score(y_test.values, y_pred_roc)
        results["metrics"]["roc"]["precision"] = metrics.precision_score(y_test.values, y_pred_roc)
        tn, fp, fn, tp = metrics.confusion_matrix(y_test, y_pred_roc).ravel()
        results["metrics"]["roc"]["tn"] = tn
        results["metrics"]["roc"]["fp"] = fp
        results["metrics"]["roc"]["fn"] = fn
        results["metrics"]["roc"]["tp"] = tp

        ## PRC
        results["metrics"]["prc"] = {}
        threshold_prc, fscore_prc = prc_rc(y_test.values, y_pred_proba[:,1], axes=None)
        results["metrics"]["prc"]["threshold"] = threshold_prc
        y_pred_prc = apply_prediction(y_pred_proba[:, 1], threshold_prc)
        results["metrics"]["prc"]["cohen_kappa"] = metrics.cohen_kappa_score(y_test.values, y_pred_prc)
        results["metrics"]["prc"]["f1"] = metrics.f1_score(y_test.values, y_pred_prc)
        results["metrics"]["prc"]["recall"] = metrics.recall_score(y_test.values, y_pred_prc)
        results["metrics"]["prc"]["precision"] = metrics.precision_score(y_test.values, y_pred_prc)
        tn, fp, fn, tp = metrics.confusion_matrix(y_test, y_pred_prc).ravel()
        results["metrics"]["prc"]["tn"] = tn
        results["metrics"]["prc"]["fp"] = fp
        results["metrics"]["prc"]["fn"] = fn
        results["metrics"]["prc"]["tp"] = tp

        ## TUNING
        results["metrics"]["tuning"] = {}
        threshold_tuning, fscore_tuneing = tune_threshold(y_test.values, y_pred_proba[:,1], axes=None)
        results["metrics"]["tuning"]["threshold"] = threshold_tuning
        y_pred_tuning = apply_prediction(y_pred_proba[:, 1], threshold_tuning)
        results["metrics"]["tuning"]["cohen_kappa"] = metrics.cohen_kappa_score(y_test.values, y_pred_tuning)
        results["metrics"]["tuning"]["f1"] = metrics.f1_score(y_test.values, y_pred_tuning)
        results["metrics"]["tuning"]["recall"] = metrics.recall_score(y_test.values, y_pred_tuning)
        results["metrics"]["tuning"]["precision"] = metrics.precision_score(y_test.values, y_pred_tuning)
        tn, fp, fn, tp = metrics.confusion_matrix(y_test, y_pred_tuning).ravel()
        results["metrics"]["tuning"]["tn"] = tn
        results["metrics"]["tuning"]["fp"] = fp
        results["metrics"]["tuning"]["fn"] = fn
        results["metrics"]["tuning"]["tp"] = tp

        # quick export to dump data
        with open(f"{EXPORT_PATH}/{MODEL}_{SPECIES_CODE[SPECIES]}_{TARGET}.json", 'w', encoding='utf-8') as f:
            json.dump(str(dict((k, v) for k,v in results.items())), f, ensure_ascii=False, indent=4)
        pd.json_normalize(results).to_csv(f"{EXPORT_PATH}/{MODEL}_{SPECIES_CODE[SPECIES]}_{TARGET}.csv", index=False)

        binary_classifcation_results.append(results)

print("-"*75)
print(f"EXPORTING OVERALL RESULTS TO {EXPORT_PATH}/modelling_pipeline_results.csv")
print("-"*75)
df_result = pd.json_normalize(binary_classifcation_results)
df_result.to_csv(f"{EXPORT_PATH}/modelling_pipeline_results_medium.csv", index=False)
df_result

----------------------------------------------------------------------------------------------------
RandomForestClassifier -> Deciduous@is_medium
	 performing backward feature elemination...
	 performing hyperparameter tuning...
	 fitting model...
	 performing permutation feature importance...
	 evaluating...
----------------------------------------------------------------------------------------------------
BalancedRandomForestClassifier -> Deciduous@is_medium
	 performing backward feature elemination...
	 performing hyperparameter tuning...
	 fitting model...
	 performing permutation feature importance...
	 evaluating...
----------------------------------------------------------------------------------------------------
DecisionTreeClassifier -> Deciduous@is_medium
	 performing backward feature elemination...
	 performing hyperparameter tuning...
	 fitting model...
	 performing permutation feature importance...
	 evaluating...
--------------------------------------------------------

### Heavy Damaged Trees

In [None]:
SPECIESES = ["1", "2", "1 2"]
MODELS = ["RandomForestClassifier", "BalancedRandomForestClassifier", "DecisionTreeClassifier", "KNeighborsClassifier", "SVM", "RUSBoostClassifier" , "AdaBoostClassifier", "XGBClassifier", "HistGradientBoostingClassifier"]
TARGET = "is_high"

binary_classifcation_results = list()

for SPECIES in SPECIESES:
    for MODEL in MODELS:
        #if os.path.exists(f"{EXPORT_PATH}/{MODEL}_{SPECIES_CODE[SPECIES]}_{TARGET}.json"):
        #    continue
            
        # DO NOT INCLUDE DAMAGE CLASS 2.0 FOR DEAD OR HEALTHY CLASSIFICATION
        data_selection = binned_data[SPECIES]["train"].copy()
        
        print("-"*100)
        print(f"{MODEL} -> {SPECIES_CODE[SPECIES]}@{TARGET}")
        print("\t performing backward feature elemination...")
        try:
            init_model_params = get_model_params(_model=MODEL, _target=TARGET, _species=SPECIES_CODE[SPECIES])
        except FileNotFoundError:
            init_model_params = {}

        if "class_weight" in init_model_params:
            init_model_params["class_weight"] = define_class_weight(data_selection[TARGET])
        if "pos_scale" in init_model_params:
            init_model_params["pos_scale"] = define_pos_scale(data_selection[TARGET])

        skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=2022)
        f1_scorer = metrics.make_scorer(metrics.f1_score, **{"average": "weighted"})

        if MODEL == "RandomForestClassifier":
            model = RandomForestClassifier(**init_model_params)
        elif MODEL == "XGBClassifier":
            model = xgb.XGBClassifier(**init_model_params)
        elif MODEL == "KNeighborsClassifier":
            model = KNeighborsClassifier(**init_model_params)
        elif MODEL == "DecisionTreeClassifier":
            model = DecisionTreeClassifier(**init_model_params)
        elif MODEL == "HistGradientBoostingClassifier":
            model = HistGradientBoostingClassifier(**init_model_params)
        elif MODEL == "BalancedRandomForestClassifier":
            model = BalancedRandomForestClassifier(**init_model_params)
        elif MODEL == "RUSBoostClassifier":
            model = RUSBoostClassifier(**init_model_params)
        elif MODEL == "AdaBoostClassifier":
            model = AdaBoostClassifier(**init_model_params)
        else: #SVM
            model = SVC(**init_model_params)

        sfs = SequentialFeatureSelector(estimator=model, direction="backward", scoring=f1_scorer, cv=skf, n_jobs=58)
        sfs.fit(data_selection[all_features], data_selection[TARGET])
        relevant_features = list(data_selection[all_features].columns[sfs.get_support()])

        # TRAINING DATA
        X_train = data_selection[relevant_features]
        y_train = data_selection[TARGET]
        # TEST DATA
        X_test = binned_data[SPECIES]["test"][relevant_features]
        y_test = binned_data[SPECIES]["test"][TARGET]

        print("\t performing hyperparameter tuning...")

        if MODEL == "RandomForestClassifier":
            params_rf = PARAMS_RF
            params_rf["class_weight"] = [define_class_weight(y_train.values)]
            fitted_model_params = parameter_tuning(_model=RandomForestClassifier(), _params=params_rf, _scoring=f1_scorer, _X=X_train, _y=y_train)
            model = RandomForestClassifier(**fitted_model_params)
        elif MODEL == "XGBClassifier":
            params_xgb = PARAMS_XGB
            params_xgb['scale_pos_weight'] = [define_pos_scale(y_train.values)]
            fitted_model_params = parameter_tuning(_model=xgb.XGBClassifier(), _params=params_xgb, _scoring=f1_scorer, _X=X_train, _y=y_train)
            model = xgb.XGBClassifier(**fitted_model_params)
        elif MODEL == "KNeighborsClassifier":
            params_kn = PARAMS_KN
            fitted_model_params = parameter_tuning(_model=KNeighborsClassifier(), _params=params_kn, _scoring=f1_scorer, _X=X_train, _y=y_train)
            model = KNeighborsClassifier(**fitted_model_params)
        elif MODEL == "DecisionTreeClassifier":
            params_dt = PARAMS_DT
            params_dt['class_weight'] = [define_class_weight(y_train.values)]
            fitted_model_params = parameter_tuning(_model=DecisionTreeClassifier(), _params=params_dt, _scoring=f1_scorer, _X=X_train, _y=y_train)
            model = DecisionTreeClassifier(**fitted_model_params)
        elif MODEL == "HistGradientBoostingClassifier":
            params_hgb = PARAMS_HGB
            fitted_model_params = parameter_tuning(_model=HistGradientBoostingClassifier(), _params=params_hgb, _scoring=f1_scorer, _X=X_train, _y=y_train)
            model = HistGradientBoostingClassifier(**fitted_model_params)
        elif MODEL == "BalancedRandomForestClassifier":
            params_brf = PARAMS_BRF
            fitted_model_params = parameter_tuning(_model=BalancedRandomForestClassifier(), _params=params_brf, _scoring=f1_scorer, _X=X_train, _y=y_train)
            model = BalancedRandomForestClassifier(**fitted_model_params)
        elif MODEL == "RUSBoostClassifier":
            params_rus = PARAMS_RUS
            fitted_model_params = parameter_tuning(_model=RUSBoostClassifier(), _params=params_rus, _scoring=f1_scorer, _X=X_train, _y=y_train)
            model = RUSBoostClassifier(**fitted_model_params)
        elif MODEL == "AdaBoostClassifier":
            params_ada = PARAMS_ADA
            fitted_model_params = parameter_tuning(_model=AdaBoostClassifier(), _params=params_ada, _scoring=f1_scorer, _X=X_train, _y=y_train)
            model = AdaBoostClassifier(**fitted_model_params)
        else:
            params_svm = PARAMS_SVM
            params_svm['class_weight'] = [define_class_weight(y_train.values)]
            fitted_model_params = parameter_tuning(_model=SVC(), _params=params_svm, _scoring=f1_scorer, _X=X_train, _y=y_train)
            model = SVC(**fitted_model_params)

        # CREATING BAGGING CLASSIFIER
        model = BaggingClassifier(base_estimator=model, n_estimators=15, max_samples=.2, max_features=1., bootstrap=True, bootstrap_features=True, oob_score=True, n_jobs=16)
        print("\t fitting model...")
        model.fit(X_train, y_train)
        # save model
        pickle.dump(model, open(f"{EXPORT_PATH}/{MODEL}_{SPECIES_CODE[SPECIES]}_{TARGET}.sav", 'wb'))

        # predict probabilities
        y_pred_proba = model.predict_proba(X_test)

        # fallback for only 1.0 for 0 predition
        try:
            y_pred_proba[:, 1]
        except IndexError:
            y_pred_proba = np.full((y_pred_proba.shape[0], 2), [1., 0.])
        
        print("\t performing permutation feature importance...")
        from sklearn.inspection import permutation_importance
        permutation_importance = permutation_importance(model, X_test, y_test, n_repeats=5, random_state=2022, n_jobs=32, scoring=f1_scorer)
        permutation_importance_obj = {}
        for i, feat_imp in enumerate(permutation_importance.importances_mean):
            permutation_importance_obj[all_features[i]] = feat_imp

        print("\t evaluating...")
        results = {}
        results["species"] = SPECIES_CODE[SPECIES]
        results["model"] = MODEL
        results["target"] = TARGET
        results["params"] = fitted_model_params
        results["relevant_features"] = relevant_features
        results["feature_permutaion_importance"] = permutation_importance_obj

        results["metrics"] = {}
        results["metrics"]["auc"] = metrics.roc_auc_score(y_test.values, y_pred_proba[:, 1])

        ## ROC AUC
        results["metrics"]["roc"] = {}
        threshold_roc, gmean_roc = roc_auc(y_test.values, y_pred_proba[:,1], axes=None)
        results["metrics"]["roc"]["threshold"] = threshold_roc
        y_pred_roc = apply_prediction(y_pred_proba[:, 1], threshold_roc)
        results["metrics"]["roc"]["cohen_kappa"] = metrics.cohen_kappa_score(y_test.values, y_pred_roc)
        results["metrics"]["roc"]["f1"] = metrics.f1_score(y_test.values, y_pred_roc)
        results["metrics"]["roc"]["recall"] = metrics.recall_score(y_test.values, y_pred_roc)
        results["metrics"]["roc"]["precision"] = metrics.precision_score(y_test.values, y_pred_roc)
        tn, fp, fn, tp = metrics.confusion_matrix(y_test, y_pred_roc).ravel()
        results["metrics"]["roc"]["tn"] = tn
        results["metrics"]["roc"]["fp"] = fp
        results["metrics"]["roc"]["fn"] = fn
        results["metrics"]["roc"]["tp"] = tp

        ## PRC
        results["metrics"]["prc"] = {}
        threshold_prc, fscore_prc = prc_rc(y_test.values, y_pred_proba[:,1], axes=None)
        results["metrics"]["prc"]["threshold"] = threshold_prc
        y_pred_prc = apply_prediction(y_pred_proba[:, 1], threshold_prc)
        results["metrics"]["prc"]["cohen_kappa"] = metrics.cohen_kappa_score(y_test.values, y_pred_prc)
        results["metrics"]["prc"]["f1"] = metrics.f1_score(y_test.values, y_pred_prc)
        results["metrics"]["prc"]["recall"] = metrics.recall_score(y_test.values, y_pred_prc)
        results["metrics"]["prc"]["precision"] = metrics.precision_score(y_test.values, y_pred_prc)
        tn, fp, fn, tp = metrics.confusion_matrix(y_test, y_pred_prc).ravel()
        results["metrics"]["prc"]["tn"] = tn
        results["metrics"]["prc"]["fp"] = fp
        results["metrics"]["prc"]["fn"] = fn
        results["metrics"]["prc"]["tp"] = tp

        ## TUNING
        results["metrics"]["tuning"] = {}
        threshold_tuning, fscore_tuneing = tune_threshold(y_test.values, y_pred_proba[:,1], axes=None)
        results["metrics"]["tuning"]["threshold"] = threshold_tuning
        y_pred_tuning = apply_prediction(y_pred_proba[:, 1], threshold_tuning)
        results["metrics"]["tuning"]["cohen_kappa"] = metrics.cohen_kappa_score(y_test.values, y_pred_tuning)
        results["metrics"]["tuning"]["f1"] = metrics.f1_score(y_test.values, y_pred_tuning)
        results["metrics"]["tuning"]["recall"] = metrics.recall_score(y_test.values, y_pred_tuning)
        results["metrics"]["tuning"]["precision"] = metrics.precision_score(y_test.values, y_pred_tuning)
        tn, fp, fn, tp = metrics.confusion_matrix(y_test, y_pred_tuning).ravel()
        results["metrics"]["tuning"]["tn"] = tn
        results["metrics"]["tuning"]["fp"] = fp
        results["metrics"]["tuning"]["fn"] = fn
        results["metrics"]["tuning"]["tp"] = tp

        # quick export to dump data
        with open(f"{EXPORT_PATH}/{MODEL}_{SPECIES_CODE[SPECIES]}_{TARGET}.json", 'w', encoding='utf-8') as f:
            json.dump(str(dict((k, v) for k,v in results.items())), f, ensure_ascii=False, indent=4)
        pd.json_normalize(results).to_csv(f"{EXPORT_PATH}/{MODEL}_{SPECIES_CODE[SPECIES]}_{TARGET}.csv", index=False)

        binary_classifcation_results.append(results)

print("-"*75)
print(f"EXPORTING OVERALL RESULTS TO {EXPORT_PATH}/modelling_pipeline_results.csv")
print("-"*75)
df_result = pd.json_normalize(binary_classifcation_results)
df_result.to_csv(f"{EXPORT_PATH}/modelling_pipeline_results_high.csv", index=False)
df_result

----------------------------------------------------------------------------------------------------
RandomForestClassifier -> Deciduous@is_high
	 performing backward feature elemination...
	 performing hyperparameter tuning...
	 fitting model...
	 performing permutation feature importance...
	 evaluating...
----------------------------------------------------------------------------------------------------
BalancedRandomForestClassifier -> Deciduous@is_high
	 performing backward feature elemination...
	 performing hyperparameter tuning...
	 fitting model...
	 performing permutation feature importance...
	 evaluating...
----------------------------------------------------------------------------------------------------
DecisionTreeClassifier -> Deciduous@is_high
	 performing backward feature elemination...
	 performing hyperparameter tuning...
	 fitting model...
	 performing permutation feature importance...
	 evaluating...
--------------------------------------------------------------

## Ensemble Learning: Basis Model Results

In [None]:
df_results = [pd.read_csv(f"{EXPORT_PATH}/{file}.csv") for file in ["modelling_pipeline_results_low", "modelling_pipeline_results_medium", "modelling_pipeline_results_high"]]
df_results = pd.concat(df_results, ignore_index=True)
df_metrics, df_best_metrics, df_parameter, df_feature_importance = parse_results(df_results)
df_best_metrics