# Train a boosted tree model predicting emission intensity of HP reservoirs and use DALEX to interpret the results
### T. Janus
### 15/04/24

## Outline:
1. Load ML and xAI libraries
2. Load emissions and HP production data from file(s) generated in Notebook_9b
3. Filter the data to include only the reservoirs and exclude RoR
4. Fit and test the ML model
5. Import emission interpretations generated in Notebook_7
6. Generate figures for the second composite figure

## Load ML and xAI libraries

In [None]:
from typing import List
import pathlib
import os
import numpy as np
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt
import seaborn as sns

# Enable the output data from scikit-learn's Pipeline to be in Pandas, rather than numpy ndarray format
from sklearn import set_config
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold, KFold
set_config(transform_output="pandas")

# Load tree-based regression models
import catboost as cb
import xgboost as xgb
import lightgbm as lgbm
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

from lib.hypertune import HyperTuner, hypertune_model
from lib.utils import (
    save_model, load_model, plot_gini_feature_importances, plot_permutation_feature_importances,
    plot_shap_feature_importances, model_check, plot_scores)
from lib.utils import (
    calculate_gini_feature_importances, calculate_permutation_feature_importances,
    calculate_shap_feature_importances)
from lib.utils import model_feature_importances

import dalex as dx
import shap

In [None]:
def plot_feat_importances(
        model, X_train, X_test, y_test, title: str = "Feature importances",
        file_name: str| None = None, **kwargs) -> None:
    """Draw three feature-importance panels for ``model`` on a 2x2 grid.

    Panels, in order top-left, top-right, bottom-left: Gini-based importances
    drawn with ``X_train``, permutation importances drawn with ``X_test`` and
    ``y_test``, and mean SHAP values drawn with ``X_test``. The unused
    bottom-right axes is removed from the figure.

    Args:
        model: Fitted model handed to the plotting helpers.
        X_train: Features passed to the Gini-based importance plot.
        X_test: Features passed to the permutation and SHAP plots.
        y_test: Target values passed to the permutation plot.
        title: Figure-level title.
        file_name: When truthy, the figure is saved to this path at 300 dpi.
        **kwargs: Extra keyword arguments forwarded to ``fig.savefig``.
    """
    fig, axs = plt.subplots(2, 2, figsize=(10,7))
    fig.suptitle(title)
    gini_ax, permutation_ax, shap_ax, spare_ax = axs.flat

    plot_gini_feature_importances(
        model, X_train, 15, 'GINI-based Feature Importances', ax=gini_ax)
    # Permutation importances are computed on the data passed as X_test / y_test
    plot_permutation_feature_importances(
        model, X_test, y_test, max_vars=15, n_repeats=7,
        title='Permutation-based Feature Importances', ax=permutation_ax)
    plot_shap_feature_importances(
        model, X_test, max_vars=15, title='Mean SHAP values',
        plot_type='bar', ax=shap_ax)

    # Only three panels are used; drop the fourth (bottom-right) axes
    fig.delaxes(spare_ax)
    plt.tight_layout()
    if not file_name:
        return
    fig.savefig(file_name, dpi=300, bbox_inches='tight', **kwargs)

In [None]:
# Execution options
# Refit the quick baseline model even when a stored copy of it is found on disk
rerun_initial_fit = False
# Where the models are fitted: 'local' (local computer, CPU) or 'colab' (Google colab platform)
simu_type = "local"
# When True the model is saved after fitting even if a saved model already exists
override = True
fitted_models = {'xgboost', 'catboost', 'lightgbm'}

In [None]:
# Read the tabular data for ML and xAI and keep only the columns used in this notebook
input_output = (
    pd.read_excel(pathlib.Path("intermediate/out_par_em_ifc.xlsx"))
    .rename(columns={'Unnamed: 0': 'Reservoir'})
    .loc[:, [
        'Reservoir', 'flow', 'hp', 'reservoir_type', 'Status',
        'res_area', 'em_intensity', 'tot_em_net', 'plant_factor',
        'des_head', 'des_flow', 'q_mean_des', 'h_mean_des']]
)
# Keep only the rows for which a reservoir area is available
input_output_sto = input_output.loc[input_output['res_area'].notna()]
# Read the GeoJSON file used for plotting
input_output_gdf = gpd.read_file(pathlib.Path("intermediate/out_par_em_ifc.geojson"))

In [None]:
# Assemble the feature table and the target used for training / testing
# NOTE(review): tot_em_net is used as a predictor of em_intensity - confirm this is intended
X = input_output_sto.loc[
    :, ['des_flow', 'q_mean_des', 'des_head', 'h_mean_des', 'tot_em_net', 'res_area']]
y = input_output_sto['em_intensity']
# Fixed seed so that the 90% train / 10% test split is reproducible
random_seed = 666
X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.9, test_size=0.1, random_state=random_seed)
# Recombined data: training rows first, test rows last
X_train_test = pd.concat([X_train, X_test])
y_train_test = pd.concat([y_train, y_test])

In [None]:
# Correlation heatmap of the training features together with the target
Xy_train = pd.concat([X_train, y_train], axis=1)
corr_matrix = Xy_train.corr()
# Hide the diagonal and the upper triangle. The mask is built from the shape of the matrix and
# not from its values, so that a correlation equal to zero cannot leave a cell unmasked.
mask_matrix = np.triu(np.ones_like(corr_matrix, dtype=bool))
plt.figure(figsize=(9,6))
sns.set(font_scale=0.7)
heatmap = sns.heatmap(corr_matrix, vmin=-1, vmax=1, annot=True, cmap="YlGnBu", mask=mask_matrix)
heatmap.set_title('Correlation Matrix Heatmap - all features', fontdict={'fontsize':12}, pad=14)
heatmap.tick_params(axis='both', which='major', labelsize=8)

In [None]:
# NOTE: We save the model to a separate folder called saved_models but read from bin/regression_models/..
# This is done in order not to overwrite already saved models by mistake.
# The saved files need to be moved/copied manually

# Make fast and dirty boosted tree regression fitting first
cat_features = [] # If empty then no categorical features included in the model/data
if simu_type == "local":
    em_intensity_model = CatBoostRegressor(loss_function = 'RMSE', task_type="CPU", iterations=5000)
elif simu_type == "colab":
    em_intensity_model = CatBoostRegressor(loss_function = 'RMSE', task_type="GPU" )
else:
    # Fail here rather than with a NameError on the first use of em_intensity_model
    raise ValueError(f"Unknown simu_type {simu_type!r}; expected 'local' or 'colab'.")
em_intensity_model_quick_path = pathlib.Path("bin/regression_models/em_intensity_model_catboost_quick.cbm")
saved_model_path = pathlib.Path("saved_models")

if rerun_initial_fit or not os.path.isfile(em_intensity_model_quick_path):
    em_intensity_model.fit(X_train, y_train, cat_features = cat_features, silent=True)
    file_path = saved_model_path / "em_intensity_model_catboost_quick.cbm"
    # The output folder must exist before anything is written into it
    saved_model_path.mkdir(parents=True, exist_ok=True)
    # Write once: either no saved copy exists yet, or overwriting was requested
    if override or not file_path.exists():
        em_intensity_model.save_model(file_path, format="cbm")
else:
    # A stored quick model exists and no refit was requested: reuse it
    em_intensity_model.load_model(em_intensity_model_quick_path)

In [None]:
# Run the project's model_check helper for the quick CatBoost model on the train and test subsets
model_check(
    model=em_intensity_model, 
    X_train = X_train, X_test = X_test, 
    y_train = y_train, y_test = y_test)

## Boosted tree model fitting with hyperparameter tuning

In [None]:
override = False # Set override OFF for hypertuned models
# When True, the tuned models are refitted on the combined train + test data in the `if refit:` cells below
refit = False

### Tune the CATBoost model

In [None]:
# Obtain the CatBoost model from the project's hypertune_model helper using the training subset.
# NOTE(review): presumably the pickle at `file` is reused when present and `override` is False - confirm
# against lib.hypertune.
model_catboost = hypertune_model(
    X_train, y_train, num_evals = 2000, hypertuner=HyperTuner.CATBOOST,
    file=os.path.join('bin', 'regression_models', 'catboost_em_intensity.pkl'), override = override)

In [None]:
# Presumably intended to silence CatBoost's training output.
# NOTE(review): these are plain attribute assignments on the object returned by hypertune_model;
# whether that object reads them is not visible here - confirm. Also note that metric_period is
# given as 100 inside `params` but as 10000 as a separate attribute.
model_catboost.params = {
    'silent': True, 'verbose': False, 'logging_level': 'Silent',
    'metric_period':100}
model_catboost.metric_period = 10000
model_catboost.logging_level = 'Silent'
model_catboost.verbose = False
model_catboost.silent = True
# Optionally refit on the combined train + test data
if refit:
    model_catboost.fit(X_train_test, y_train_test)

In [None]:
# Print a header and run model_check for the tuned CatBoost model
print("CATBOOST MODEL REGRESSION STATISTICS")
print("---------------------------------------")
model_check(model=model_catboost, 
    X_train = X_train, X_test = X_test, 
    y_train = y_train, y_test = y_test)

In [None]:
# Check feature importances: SHAP bar plot for the CatBoost model drawn with the combined
# train + test data (up to 15 variables, blank title)
fig, axs = plt.subplots(1,1, figsize=(6,3))
plot_shap_feature_importances(
                model_catboost, X_train_test,
                max_vars = 15,
                title=" ",
                plot_type = 'bar', ax=axs)

### Tune the XGBoost models

In [None]:
# Obtain the XGBoost model from the project's hypertune_model helper using the training subset.
# NOTE(review): presumably the pickle at `file` is reused when present and `override` is False - confirm
# against lib.hypertune.
model_xgboost = hypertune_model(
    X_train, y_train, num_evals = 1_000, hypertuner=HyperTuner.XGBOOST,
    file=os.path.join('bin', 'regression_models', 'xgboost_em_intensity.pkl'), override = override)

In [None]:
# Retrain on full data set (for model explainability analysis) - only when `refit` is True
if refit:
    model_xgboost.fit(X_train_test, y_train_test)
# Print a header and run model_check for the XGBoost model
print("XGBOOST MODEL REGRESSION STATISTICS")
print("---------------------------------------")
model_check(model=model_xgboost, 
    X_train = X_train, X_test = X_test, 
    y_train = y_train, y_test = y_test)

In [None]:
# Check feature importances: SHAP bar plot for the XGBoost model drawn with the combined
# train + test data (up to 15 variables, blank title)
fig, axs = plt.subplots(1,1, figsize=(6,3))
plot_shap_feature_importances(
                model_xgboost, X_train_test,
                max_vars = 15,
                title=" ",
                plot_type = 'bar', ax=axs)

### Tune the LightGBM models

In [None]:
# Obtain the LightGBM model from the project's hypertune_model helper using the training subset.
# NOTE(review): presumably the pickle at `file` is reused when present and `override` is False - confirm
# against lib.hypertune.
model_lightgbm = hypertune_model(
    X_train, y_train, num_evals = 1_000, hypertuner=HyperTuner.LIGHTGBM,
    file=os.path.join('bin', 'regression_models', 'lightgbm_em_intensity.pkl'), override = override)

In [None]:
# Remove warnings in the LightGBM CO2 regression model - ONLY WORKS FOR PRE-TRAINED MODELS
# IF TRAINING NEW MODELS - COMMENT OUT AND SEE THE WARNINGS FIRST BEFORE TURNING SOME CONFLICTING
# REGRESSION PARAMETERS OFF
# NOTE(review): lightgbm is already imported as `lgbm` at the top of the file and the `lgb` alias
# is not used in this cell.
import lightgbm as lgb

# Reset a number of model attributes to None and replace `params` with verbosity settings only.
# NOTE(review): the effect of these assignments depends on the object returned by hypertune_model,
# which is not visible here - confirm.
model_lightgbm.min_child_samples = None
model_lightgbm.min_split_gain=None
model_lightgbm.subsample=None
model_lightgbm.boosting_type=None
model_lightgbm.colsample_bytree=None
model_lightgbm.reg_alpha = None
model_lightgbm.reg_lambda = None
model_lightgbm.params={'verbose': -1, 'verbose_eval' : -1}
model_lightgbm.free_raw_data=False
# Retrain on full data set (for model explainability analysis)
#model_co2_lightgbm.predict_raw_score = False
if refit:
    model_lightgbm.metric = {'rmse'}
    model_lightgbm.fit(X_train_test, y_train_test)

In [None]:
# Print a header and run model_check for the LightGBM model
print("LIGHTGBM MODEL REGRESSION STATISTICS")
print("---------------------------------------")
model_check(model=model_lightgbm, 
    X_train = X_train, X_test = X_test, 
    y_train = y_train, y_test = y_test)

In [None]:
# Check feature importances: SHAP bar plot for the LightGBM model drawn with the combined
# train + test data (up to 15 variables, blank title)
fig, axs = plt.subplots(1,1, figsize=(6,3))
plot_shap_feature_importances(
                model_lightgbm, X_train_test,
                max_vars = 15,
                title=" ",
                plot_type = 'bar', ax=axs)

## Plot feature importances

In [None]:
# Three-panel feature-importance figure for the XGBoost model, saved as PNG.
# The combined train + test data is passed both as the "train" and as the "test" argument.
plot_feat_importances(
    model_xgboost, X_train_test, X_train_test, y_train_test, 
    title = "Feature importances - XGBoost model",
    file_name = pathlib.Path('figures/model_explanation/feature_importances_xgboost_em_intensity.png'),
    transparent=False)

In [None]:
# Three-panel feature-importance figure for the CatBoost model, saved as PNG.
# The combined train + test data is passed both as the "train" and as the "test" argument.
plot_feat_importances(
    model_catboost, X_train_test, X_train_test, y_train_test, 
    title = "Feature importances - CATBoost model",
    file_name = pathlib.Path('figures/model_explanation/feature_importances_catboost_em_intensity.png'),
    transparent=False)

In [None]:
# Three-panel feature-importance figure for the LightGBM model, saved as PNG.
# The combined train + test data is passed both as the "train" and as the "test" argument.
plot_feat_importances(
    model_lightgbm, X_train_test, X_train_test, y_train_test, 
    title = "Feature importances - LightGBM model",
    file_name = pathlib.Path('figures/model_explanation/feature_importances_lightgbm_em_intensity.png'),
    transparent=False)

# Model and predictions explanation with DALEX

## DALEX instance-level explanations

In [None]:
def find_index_by_name(name: str, X: pd.DataFrame, df_full: pd.DataFrame = input_output_sto) -> pd.Int64Index | None:
    """Uses full dataset with Reservoir column to obtain an index of a row containing the input data for the
    reservoir which can be used to select data in the train/test dataset, e.g. for inspecting variable
    importance for each reservoir"""
    ix = X[df_full['Reservoir']==name].index
    if not ix.empty:
        return ix
    else:
        print(f"Reservoir with name {name} not found")
        return None
    
def loc_index_to_iloc(loc_index: pd.Int64Index, data: pd.DataFrame = X_train_test) -> int:
    """ """
    loc_index_int = int(np.mean(loc_index))
    return data.index.get_loc(loc_index_int)

def reservoir_names(df_full: pd.DataFrame =input_output_sto) -> List[str]:
    """Return the entries of the ``Reservoir`` column of ``df_full`` as a list."""
    # NOTE(review): the name `reservoir_names` is rebound to a list of names
    # further down in this file, which hides this function from then on.
    name_column = df_full['Reservoir']
    return [*name_column]

In [None]:
# Rename columns of the data and of the explainers for visualisation purposes
X_train_test_renamed = X_train_test.rename(
    columns = {
        "tot_em_net": "emission",
        "res_area": "area",
        "h_mean_des": "hmean/hdes",
        "des_head": "hdes",
        "q_mean_des" : "qmean/qdes",
        "des_flow" : "qdes"
    })

# Refit all three models on the renamed, combined train + test data.
# This happens unconditionally, i.e. irrespective of the `refit` flag.
model_lightgbm.fit(X_train_test_renamed, y_train_test)
model_catboost.fit(X_train_test_renamed, y_train_test)
model_xgboost.fit(X_train_test_renamed, y_train_test)
# One dalex explainer per model, all built on the same renamed data and target
exp_xgboost = dx.Explainer(
    model_xgboost, X_train_test_renamed, y_train_test, 
    label='xgboost model em intensity') # Uses dalex model explainer
exp_lightgbm = dx.Explainer(
    model_lightgbm, X_train_test_renamed, y_train_test, 
    label='lightgbm model em intensity') # Uses dalex model explainer
exp_catboost = dx.Explainer(
    model_catboost, X_train_test_renamed, y_train_test, 
    label='catboost model em intensity') # Uses dalex model explainer

In [None]:
# Diagnostic plots of residuals against y for each of the three explainers
variable = "y"
yvariable = "residuals"
exp_xgboost.model_diagnostics().plot(variable=variable, yvariable=yvariable)
exp_lightgbm.model_diagnostics().plot(variable=variable, yvariable=yvariable)
exp_catboost.model_diagnostics().plot(variable=variable, yvariable=yvariable)

In [None]:
# Model performance object of the XGBoost explainer (shown as the cell output)
exp_xgboost.model_performance()

In [None]:
# Model performance object of the CatBoost explainer (shown as the cell output)
exp_catboost.model_performance()

In [None]:
# Model performance object of the LightGBM explainer (shown as the cell output)
exp_lightgbm.model_performance()

In [None]:
def plot_emission_intensity_breakdown(
        reservoir_name: str, dataset: pd.DataFrame = X_train_test_renamed, 
        y_data: pd.Series = y_train_test, model: str = 'lightgbm',
        file_location: str = "figures/model_explanation/",
        print_titles: bool = False,
        interaction_preference: int = 1, seed: int | None = 42):
    """Explain the emission-intensity prediction for one reservoir and save the plot.

    Builds a dalex ``break_down_interactions`` explanation for the row of
    ``dataset`` that belongs to ``reservoir_name``, restyles the resulting
    plot and writes it to ``file_location`` as
    ``<reservoir_name>_breakdown_interactions_em_intensity`` in SVG and PNG
    format.

    Args:
        reservoir_name: Name of the reservoir, as listed in
            ``input_output_sto['Reservoir']``.
        dataset: Model inputs from which the reservoir's row is taken.
        y_data: Target values, positionally aligned with ``dataset``.
        model: Explainer to use: 'lightgbm', 'xgboost' or 'catboost'.
        file_location: Folder for the image files; created when missing.
        print_titles: When True the plot is titled 'GHG emission intensity',
            otherwise the title is a single blank.
        interaction_preference: Forwarded to ``explainer.predict_parts``.
        seed: Forwarded to ``explainer.predict_parts`` as ``random_state``.

    Returns:
        A tuple ``(figure, output_true, output_pred, cp)``: the restyled plot,
        the value of ``y_data`` for the reservoir, the explainer's prediction
        for the reservoir and the result of ``explainer.predict_profile``.

    Raises:
        ValueError: If ``model`` is not recognised or the reservoir is not
            found.
    """
    if model == 'lightgbm':
        explainer = exp_lightgbm
    elif model == "xgboost":
        explainer = exp_xgboost
    elif model == "catboost":
        explainer = exp_catboost
    else:
        raise ValueError(f"Model {model} not recognized.")
    ix = find_index_by_name(name=reservoir_name, X=dataset, df_full=input_output_sto)
    if ix is None:
        # find_index_by_name reports a missing reservoir by returning None
        raise ValueError(f"Reservoir {reservoir_name!r} not found in the dataset.")
    num_row = loc_index_to_iloc(loc_index=ix, data=dataset)
    # Double brackets keep the single row as a DataFrame
    input_reservoir = dataset.iloc[[num_row]]
    # Find true emission intensity value for the reservoir
    output_true = y_data.iloc[num_row]
    # Profile and prediction for the reservoir, using the selected explainer
    cp = explainer.predict_profile(input_reservoir)
    output_pred = explainer.predict(input_reservoir)
    # Calculate the prediction breakdown
    if print_titles:
        title = 'GHG emission intensity'
    else:
        title = " "
    explanation_sample = explainer.predict_parts(
        input_reservoir, 
        type='break_down_interactions', 
        interaction_preference = interaction_preference, 
        random_state = seed,
        label=title, B=25) 
    p1 = explanation_sample.plot(
        title=title,
        max_vars=10, 
        bar_width = 15,
        vertical_spacing = 0.05,
        vcolors=("#2471a3", '#89b38a', '#c7644c'), show=False)
    p1.update_layout(
        paper_bgcolor='rgba(0,0,0,0)',
        plot_bgcolor='rgba(0,0,0,0)',
        xaxis=dict(
            showgrid=False,  # Remove x-axis grid lines
            tickfont=dict(color='black')  # Set x-axis tick font color to black
        ),
        yaxis=dict(
            showgrid=False,  # Remove y-axis grid lines
            tickfont=dict(color='black')  # Set y-axis tick font color to black
        ),
        font=dict(color='black')  # Set general font color to black
    )
    p1.update_traces(opacity=0.90)
    p1.data[0].connector.line.color = 'black'
    for shape in p1.layout.shapes:
        if shape.type == 'line':
            shape.line.color = '#424345'  # Set line color to dark grey
            shape.line.width = 2

    # Both image files share one stem; the folder is created on demand
    output_dir = pathlib.Path(file_location)
    output_dir.mkdir(parents=True, exist_ok=True)
    file_svg = reservoir_name + "_breakdown_interactions_em_intensity" + ".svg"
    file_png = reservoir_name + "_breakdown_interactions_em_intensity" + ".png"
    p1.write_image(output_dir / file_svg)
    p1.write_image(output_dir / file_png)
    return p1, output_true, output_pred, cp

In [None]:
# Print all reservoir names of the filtered dataset on one line, separated by "  --  "
print("  --  ".join(input_output_sto['Reservoir']))

In [None]:
# Reservoirs for which prediction breakdowns are generated below.
# NOTE(review): this assignment rebinds `reservoir_names`, which is defined as a function
# earlier in this file; the function is no longer reachable under that name afterwards.
reservoir_names = [
    "Thaphanseik", "Sedawgyi", "Zawgyi II", "Belin", "Laza", 
    "Mone Chaung", "Yeywa (upper)",
    "Kyee Ohn Kyee Wa", "Hawkham (upper)", "Myitsone"]

In [None]:
# Generate and save the prediction breakdown for every selected reservoir (CatBoost explainer)
# and collect [name, true value, predicted value] for each of them.
# After the loop p1plt, output_true, output_pred and cp refer to the last reservoir in the list.
output_comparison = []
for reservoir_name in reservoir_names:
    p1plt, output_true, output_pred, cp = plot_emission_intensity_breakdown(
        reservoir_name, 
        model = 'catboost',
        dataset = X_train_test_renamed,
        seed = 42,
        interaction_preference = 2)
    output_comparison.append([reservoir_name, output_true, output_pred])
    #p1plt.show()

In [None]:
# Show the collected [name, true value, predicted value] entries
output_comparison

In [None]:
# Show the breakdown plot produced by the most recent call (last reservoir of the loop above)
p1plt

In [None]:
# True and predicted value from the most recent call of plot_emission_intensity_breakdown
print(f"True output: {output_true}, Output prediction: {output_pred}")

In [None]:
# Prediction breakdown for the 'Zawgyi II' reservoir (CatBoost explainer).
# This rebinds p1plt, output_true, output_pred and cp to the Zawgyi II results.
p1plt, output_true, output_pred, cp = plot_emission_intensity_breakdown(
    'Zawgyi II', 
    model = 'catboost',
    dataset = X_train_test_renamed,
    seed = 42,
    interaction_preference = 2)

In [None]:
# Show the breakdown plot of the most recent call
p1plt

In [None]:
# Show the `result` attribute of the profile object returned by explainer.predict_profile
cp.result

In [None]:
# Plot the profiles for 'qmean/qdes' and 'hmean/hdes' without showing them, so that the
# plot can be restyled first
p1 = cp.plot(variables = ['qmean/qdes', 'hmean/hdes'], show=False, size=3)

In [None]:
# Inspect the type of the object returned by cp.plot
type(p1)

In [None]:
# Restyle the profile plot: fonts of both panels, main title and figure size
p1.update_layout(
    xaxis=dict(
        tickfont=dict(color='rgba(0, 0, 0, 0.8)', size=16),  # x-axis ticks: black at 80% opacity, size 16
        title=dict(font=dict(size=18, color='black'))
    ),
    yaxis=dict(
        tickfont=dict(color='rgba(0, 0, 0, 0.8)', size=16),  # y-axis ticks: black at 80% opacity, size 16
        title=dict(font=dict(size=18, color='black'))
    ),
    xaxis2=dict(
        tickfont=dict(color='rgba(0, 0, 0, 0.8)', size=16)  # second x-axis ticks: same style
    ),
    yaxis2=dict(
        tickfont=dict(color='rgba(0, 0, 0, 0.8)', size=16)  # second y-axis ticks: same style
    ),
    title=dict(
        text='Ceteris Paribus Plots for Emission Intensity Prediction',  # Main plot title
        font=dict(size=18, color='black')  # Main title font: size 18, black
    ),
    font=dict(color='black'),
    width=1000,
    height=450
)
# Line width 3 in semi-transparent black, with a trace opacity of 0.70
p1.update_traces(
    line=dict(width=3, color='rgba(0, 0, 0, 0.6)'),
    opacity=0.70)
p1.show()

In [None]:
# Save the restyled profile plot as SVG
# NOTE(review): the file name reads "zagyi" whereas the reservoir is spelled "Zawgyi" elsewhere - confirm
p1.write_image("figures/model_explanation/zagyi2cp3.svg")

### Display the full dataframe of results with GHG emissions, emission intensities, and water-resources model derived parameters

In [None]:
# Show the filtered table (rows with a reservoir area) as the cell output
input_output_sto