# Creating Boosted Tree regression models from GHG emission model input-output data

### The analysis is performed via introspection of the fitted CatBoost, XGBoost and LightGBM gradient-boosted tree regression models using various explainable AI techniques from the DALEX package and the `shap` package

- Author: Tomasz Janus
- E-mail: tomasz.k.janus@gmail.com, tomasz.janus@manchester.ac.uk
- Mui Ne, 22/10/2023
- Modified on 28/06/2024

### The notebook proceeds in the following steps:
  1. Load required libraries and the input/output data
  2. Visualise relationships in the input data
  3. Prune / clean the input dataset
  4. Fit the catboost model using pre-set hyperparameter values and a fixed train/validation data-split to serve as a baseline quick check of what we can expect from boosted tree models
  5. Fit the catboost, lightgbm and xgboost models using hyperparameter tuning and KFOLD cross-validation
  6. Save the fitted models to files
  7. Explore the model structure using DALEX (that provides interface to SHAP and LIME) and additionally using the `shap` package
  
## NOTE:
#### Creating composite explanation figures requires prior calculation of emission intensity predictions that are calculated in `Notebook_9b_process_additional_information_from_water_resource_models`

## 1a. Import the required libraries

In [None]:
from typing import Protocol, Dict, Protocol, List, Any, Literal, Tuple
import os
import sys
import pathlib
import re
from dataclasses import dataclass
from collections import defaultdict
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.pylab as plt2
import seaborn as sns # For plotting data

# Load tree-based regression models
import catboost as cb
import xgboost as xgb
import lightgbm as lgbm
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

# Load scikit-learn's classes for model and feature selection, validation and data transformation
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold, KFold
from sklearn.inspection import permutation_importance
from sklearn.feature_selection import VarianceThreshold # Feature selecto
from sklearn.pipeline import Pipeline, make_pipeline
#from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.compose import make_column_transformer, ColumnTransformer
from sklearn.feature_selection import SelectKBest, f_regression, mutual_info_regression

# Enable the output data from scikit-learn's Pipeline to be in Pandas, rather than numpy ndarray format
from sklearn import set_config
set_config(transform_output="pandas")

# Import the hyperparameter optimal tuning tool
import lib.hypertune as hypertune

# Import library for automatic data profiling
# Although, it seems to be running very slowly on our input data hence we might resort to manual 
# data exploration
from ydata_profiling import ProfileReport

# Import explainers
import shap
from lime import lime_tabular
import dalex as dx

# Model loading/saving
import joblib

# Import from local library folder
from lib.pipeline import DataFrameOneHotEncoder
from lib.pipeline import (
    ProfileDropper, ReduceResAreaFractions, ReplaceTempProfileWithMean,
    ColumnDropper, StaticColRemover)

import importlib
from lib.hypertune import HyperTuner, hypertune_model
from lib.utils import (
    save_model, load_model, plot_gini_feature_importances, plot_permutation_feature_importances,
    plot_shap_feature_importances, model_check, plot_scores)
from lib.utils import (
    calculate_gini_feature_importances, calculate_permutation_feature_importances,
    calculate_shap_feature_importances)
from lib.utils import model_feature_importances
import pickle
from math import isclose

import svgutils.transform as sg 

plt.style.use('ggplot')
pd.set_option("display.max_columns", 200)
plt.rcParams['figure.figsize'] = (6,6)

# -----------------------------------------
# Internal functions
# -----------------------------------------
def filter_corr_matrix(corr_df: pd.DataFrame, threshold: float = 0.80) -> pd.DataFrame:
    """Keep only features that are strongly correlated with at least one other feature.

    A feature is retained when the absolute value of any of its off-diagonal
    correlation coefficients is greater than or equal to ``threshold``. The
    returned frame has the retained features on both the rows and the columns.
    """
    strongly_correlated = [
        feature
        for feature in corr_df.columns
        # Exclude the feature's own (self-correlation) entry before thresholding
        if (corr_df[feature].drop(feature, axis=0).abs() >= threshold).any()
    ]
    return corr_df.loc[strongly_correlated, strongly_correlated]

### Execution options

In [None]:
# EXECUTION OPTIONS
fitted_models = {'xgboost', 'catboost', 'lightgbm'}
# Remove some of the potentially redundant features
prune_additional_features: bool = True

# rerun:    runs model fitting even if the model in saved form already exists
# override: saves the model (after fitting) even if saved model already exists
rerun, override = False, False
recalculate_breakdown_interactions = False

## 1b. Input data load, check and merge

In [None]:
# Load the combined emission outputs and the elevation table
data_full = pd.read_csv(pathlib.Path("outputs/reemission/combined/combined_outputs.csv"))
elev_data = pd.read_csv(pathlib.Path("config/elev.csv"))

# Attach the 'fsl_masl' column to the emission estimates by matching 'Name' against 'ifc_name';
# the join key coming from the elevation table is not needed afterwards
merged_df = data_full.merge(
    elev_data[['ifc_name', 'fsl_masl']], left_on='Name', right_on='ifc_name', how='inner')
merged_df = merged_df.drop(columns=['ifc_name'])

# Prefiltering done outside the pipeline: the scenario label is split on '_' into soil type,
# landuse intensity and treatment factor (level of ww treatment in the catchment) so that
# the data can later be sorted / filtered by these options
scenario_parts = merged_df['Scenario'].str.split('_', expand=True)
merged_df[['Soil', 'Landuse intensity', 'Treatment']] = scenario_parts

# CH4 emissions without degassing - needed for the 'what-if' scenario in which reservoirs
# are operated with shallow water intakes
merged_df['ch4_net_nodegassing'] = merged_df['ch4_net'] - merged_df['ch4_degassing']

# The inner merge must not have removed any rows
assert len(merged_df) == len(data_full)

# Separate copy for plotting, with integer codes for reservoir names and scenarios
df_plot = merged_df.copy()
name_codes = pd.factorize(df_plot['Name'])[0]
df_plot['Name_Code'] = name_codes
scenario_codes = pd.factorize(df_plot['Scenario'])[0]
df_plot['Scenario_Code'] = scenario_codes

## 1c. Filter data using pipelines

In [None]:
# Specify columns to be dropped from data
corr_vars = ['trophic_status'] # It's probably correlated with other data as it's calculated from 
                              # other data within reemission
aux_vars = ['id', 'type', 'gasses_0', 'gasses_1', 'gasses_2']
interm_result_vars = [
    'co2_diffusion', 'co2_diffusion_nonanthro', 'co2_preimp', 'co2_minus_nonanthro', 'co2_net', 
    'co2_total_lifetime', 'ch4_diffusion', 'ch4_ebullition', 'ch4_degassing', 'ch4_preimp', 'ch4_net',
    'ch4_total_lifetime', 'n2o_methodA', 'n2o_methodB', 'n2o_mean', 'n2o_total_lifetime',
    'co2_total_per_year', 'ch4_total_per_year', 'n2o_total_per_year', 'nitrogen_downstream_conc']
marginal_vars = [
    'catch_riv_length', 'res_water_intake_depth', 
    'surface_density', 'bottom_density'] 
# comment: res water intake depth does not play part in regression (deep intake only)
duplicated_vars = ['mean_radiance_lat'] # duplicated with res_mean_radiance
additional_vars_prune = ['coordinates_0', 'coordinates_1', 'bottom_temperature', 
                         'surface_temperature', 'fsl_masl', 'thermocline_depth',
                         'nitrogen_load', 'phosphorus_load', 'res_max_depth',
                         'littoral_area_frac', 'retention_coeff']

columns_to_drop = corr_vars + aux_vars + interm_result_vars + marginal_vars + duplicated_vars + \
    additional_vars_prune
# Add additional columns that should not play part in the model as they're either internally calculated
# or highly correlated with other variables
if prune_additional_features:
    additional_columns_to_drop = [
            'reservoir_tn', 'reservoir_tp', 'inflow_p_conc', 'inflow_n_conc',
            'global_radiance', 'catch_precip', 'ch4_net_nodegassing']
else:
    additional_columns_to_drop = []
# reservoir_tn and reservoir_tp are internal variables, inflow_p_conc and inflow_n_conc are correlated with 
# nitrogen and phosphorus loads, global radiance is correlated with res_mean_radiance, catch_precip is highly
# correlated with catch_runoff
columns_to_drop.extend(additional_columns_to_drop)

# Pair-plot switches: a pair-plot is only drawn when none of the columns it needs has been dropped.
# Every switch is always assigned (True or False) so that the plotting cells never hit an
# undefined name, whichever combination of columns is dropped.
_dropped_columns = set(columns_to_drop)
# Pair-plot 1 compares the temperature features and colours the points by binned max depth
plot_pairplot1 = _dropped_columns.isdisjoint(
    {'bottom_temperature', 'surface_temperature', 'res_max_depth'})
# Pair-plot 2 compares P load with population; the raw population column is 'catch_population'
# (it is renamed to 'population' only after the pipeline), so both spellings are checked
plot_pairplot2 = _dropped_columns.isdisjoint(
    {'population', 'catch_population', 'phosphorus_load'})
plot_pairplot3 = True
# Pair-plot 4 compares max depth with mean depth
plot_pairplot4 = _dropped_columns.isdisjoint({'res_max_depth', 'res_mean_depth'})
plot_pairplot5 = True
    
print(f"Columns selected from dropping from the dataset: \n{' ** '.join(columns_to_drop)}")

### Create a preprocessing pipeline

#### Think of other columns to drop, e.g. `nitrogen_load`, `phosphorus_load`
#### Perhaps think of adding other preprocessing steps such as removing low variance columns

In [None]:
# Preprocessing steps, applied in the listed order
pipe_1_options = [
    # drop emission profile outputs from data
    ('prof_dropper', ProfileDropper()),
    ('red_res_area_fractions', ReduceResAreaFractions()),
    ('mean_temp', ReplaceTempProfileWithMean()),
    # drop the unwanted columns together with the 'Scenario' and 'Name' columns
    ('col_drop_1', ColumnDropper([*columns_to_drop, 'Scenario', 'Name'])),
    # alternative implementation of this step: ('stat_col_rem', StaticColRemover())
    ('stat_col_rem', VarianceThreshold()),
]
pipe_1 = Pipeline(steps=pipe_1_options)

### Create different data options for different types of analyses

In [None]:
# Divide raw data into mineral soils and organic soils
merged_df_min = merged_df.loc[merged_df['Soil'] == 'MIN'].drop(columns="Soil")
merged_df_org = merged_df.loc[merged_df['Soil'] == 'ORG'].drop(columns="Soil")

# Keep a single scenario (low landuse intensity, primary treatment) and drop the
# scenario-option columns, which are constant after filtering
_scenario_filter = "`Landuse intensity` == 'LOW' & Treatment == 'PRIM'"
_scenario_columns = ["Landuse intensity", "Treatment"]
merged_df_min_prim_low = merged_df_min.query(_scenario_filter).drop(columns=_scenario_columns)
merged_df_org_prim_low = merged_df_org.query(_scenario_filter).drop(columns=_scenario_columns)

# Features / targets for the CO2 and CH4 regression tasks: mineral soil, primary treatment,
# low landuse intensity
y_co2_min = merged_df_min_prim_low['co2_net']
X_co2_min = merged_df_min_prim_low.drop(columns=['co2_net'])
y_ch4_min = merged_df_min_prim_low['ch4_net']
X_ch4_min = merged_df_min_prim_low.drop(columns='ch4_net')
# Same for organic soil, primary treatment and low landuse intensity
y_co2_org = merged_df_org_prim_low['co2_net']
X_co2_org = merged_df_org_prim_low.drop(columns=['co2_net'])
y_ch4_org = merged_df_org_prim_low['ch4_net']
X_ch4_org = merged_df_org_prim_low.drop(columns='ch4_net')

# Perform data splitting - use 90% train and 10% test
_co2_data_random_seed = 42
_ch4_data_random_seed = 42

X_co2_train, X_co2_test, y_co2_train, y_co2_test = train_test_split(
    X_co2_min, y_co2_min,
    train_size=0.9, test_size=0.1, random_state=_co2_data_random_seed)
X_ch4_train, X_ch4_test, y_ch4_train, y_ch4_test = train_test_split(
    X_ch4_min, y_ch4_min,
    train_size=0.9, test_size=0.1, random_state=_ch4_data_random_seed)

### Apply pipelines

In [None]:
# CO2: fit the preprocessing pipeline on the training split and apply it to both splits.
# NOTE: Pipeline.fit returns the pipeline itself, so `fit_pipeline` and `pipe_1` are the
# same object; the CO2 transforms must therefore happen before the CH4 refit below.
fit_pipeline = pipe_1.fit(X_co2_train, y_co2_train)
X_co2_train, X_co2_test = (
    fit_pipeline.transform(split) for split in (X_co2_train, X_co2_test))
# CH4: refit the pipeline on the CH4 training split and apply it to the CH4 test split
X_ch4_train = pipe_1.fit_transform(X_ch4_train, y_ch4_train)
X_ch4_test = pipe_1.transform(X_ch4_test)
# Merge test and train data - for model introspection we would like to look into the model trained
# on all data. Train/test split is done to check how the model trained on train data generalizes to
# test data in order to sense if it's under or overfitting
X_co2_train_test, y_co2_train_test = (
    pd.concat([X_co2_train, X_co2_test]), pd.concat([y_co2_train, y_co2_test]))
X_ch4_train_test, y_ch4_train_test = (
    pd.concat([X_ch4_train, X_ch4_test]), pd.concat([y_ch4_train, y_ch4_test]))
### Rename some columns to improve the understanding of variables
# Land-cover class names, listed in the order of the numeric suffix of the
# `catch_area_fractions_<i>` / `res_area_fractions_red_<i>` columns
_land_cover_classes = (
    "bare soil", "snow and ice", "urban area", "water area", "wetland area",
    "crop area", "shrub area", "forest area", "unknown area")
col_name_map = {
    # Catchment and reservoir area fractions follow one naming pattern per land-cover class
    **{f"catch_area_fractions_{ix}": f"catchment {cover} fraction"
       for ix, cover in enumerate(_land_cover_classes)},
    **{f"res_area_fractions_red_{ix}": f"reservoir {cover} fraction"
       for ix, cover in enumerate(_land_cover_classes)},
    # Location
    "coordinates_0": "latitude",
    "coordinates_1": "longitude",
    # Catchment properties
    'catch_runoff': 'catchment runoff',
    'catch_area': 'catchment area',
    'catch_population': 'population',
    'catch_slope': 'catchment slope',
    'catch_etransp': 'evapotranspiration',
    'catch_soil_wetness': 'catchment soil wetness',
    'catch_mean_olsen': 'catchment mean olsen',
    # Reservoir properties
    'res_volume': 'reservoir volume',
    'res_area': 'reservoir area',
    'res_max_depth': 'max depth',
    'res_mean_depth': 'mean depth',
    'res_soil_carbon': 'reservoir soil carbon',
    'res_mean_radiance': 'reservoir mean radiance',
    'res_mean_radiance_may_sept': 'reservoir mean radiance may-sept',
    'res_mean_radiance_nov_mar': 'reservoir mean radiance nov-mar',
    'res_mean_monthly_windspeed': 'reservoir mean monthly windspeed',
    # Remaining variables
    'retention_coeff': 'retention coefficient',
    'littoral_area_frac': 'littoral area fraction',
    'bottom_temperature': 'bottom temp',
    'surface_temperature': 'surface temp',
    'thermocline_depth': 'thermocline depth',
    'nitrogen_load': 'N load',
    'phosphorus_load': 'P load',
    'fsl_masl': 'fsl',
    'ave_temp': 'air temperature',
}
# Apply the readable column names, in place, to every feature table
_feature_frames = (
    X_co2_train, X_co2_test, X_ch4_train, X_ch4_test, X_co2_train_test, X_ch4_train_test)
for data_frame in _feature_frames:
    data_frame.rename(columns=col_name_map, inplace=True)

## 2. Exploratory data analysis

#### Manual feature selection / analysis of features data

Manual feature analysis is part of EDA; it can also be used for checking multicollinearity in the data. Typical measures include:

* Information gain
* Correlation with target
* Pairwise correlation
* Variance threshold 
* ...

In [None]:
# Check distribution of outputs: one histogram panel per target variable
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(6, 3))
#fig.suptitle('Distributions of outputs variables')
_panels = (
    # (axes, target series, bar colour, panel title)
    (ax1, y_co2_train, 'red', "CO$_2$ net unit emissions"),    # CO2 emissions
    (ax2, y_ch4_train, 'green', "CH$_4$ net unit emission"),   # CH4 emissions
)
for _axis, _target, _colour, _panel_title in _panels:
    _target.plot(kind="hist", bins=10, ax=_axis, color=_colour)
    _axis.title.set_text(_panel_title)
    _axis.set_xlabel("Emission, gCO$_{2,eq}$/m$^2$/year", fontsize = 11)
    _axis.set_ylabel("Frequency", fontsize = 11)
fig.tight_layout()
plt.show()
fig.savefig(
    pathlib.Path('figures/data_exploration/co2_ch4_distributions.png'),
    transparent = False, dpi = 300, bbox_inches='tight')

### Find outliers - it's done via visual inspection  of histograms

In [None]:
# Histogram of catchment areas; the y-axis is capped at 20 so that sparsely populated
# bins remain visible
fig, ax = plt.subplots(1, 1, figsize=(7, 3))
ax.set_ylim([0, 20])
X_co2_train['catchment area'].hist(bins=75, ax=ax)

In [None]:
# Find reservoirs with very large catchment areas indicated in the plot above
_is_large_catchment = X_co2_train['catchment area'] > 150000
large_c_area_indices = X_co2_train.loc[_is_large_catchment].index
# Look the flagged rows up in the raw (unprocessed) mineral-soil data
merged_df_min.loc[large_c_area_indices, ['Name', 'co2_net', 'ch4_net', 'res_area', 'catch_area']]

## Find correlations among variables in the input data space and between inputs and the target variable

### A) For $CO_2$ emissions

In [None]:
# Features and the CO2 target side by side, with a readable name for the target column
Xy_co2_train = pd.concat([X_co2_train, y_co2_train], axis=1)
Xy_co2_train = Xy_co2_train.rename(columns={"co2_net": "Net CO2 emissions"})
corr_matrix = Xy_co2_train.corr()
# Mask built from the upper triangle so that each pair of variables is annotated only once
mask_matrix = np.triu(corr_matrix)
plt.figure(figsize=(28, 16))
sns.set(font_scale=1.1)
heatmap = sns.heatmap(
    corr_matrix, mask=mask_matrix, cmap="YlGnBu", annot=True, vmin=-1, vmax=1)
heatmap.set_title('Correlation Matrix Heatmap - all features', fontdict={'fontsize':22}, pad=14)
heatmap.tick_params(axis='both', which='major', labelsize=24)

In [None]:
corr_threshold = 0.80 # Value below which correlations are not displayed (e.g. in correlation plots)
without_outliers: bool = True

# Correlations recomputed without the reservoirs that have very large catchment areas
_keep_rows = ~Xy_co2_train.index.isin(large_c_area_indices)
corr_matrix_no_large_c_area = Xy_co2_train[_keep_rows].corr()
corr_matrix_filt_co2 = filter_corr_matrix(
    corr_matrix_no_large_c_area if without_outliers else corr_matrix,
    threshold=corr_threshold)

fig_corr = plt.figure(figsize=(14, 8))
sns.set(font_scale=1.1)
mask_matrix = np.triu(corr_matrix_filt_co2)
heatmap = sns.heatmap(
    corr_matrix_filt_co2, mask=mask_matrix, cmap="YlGnBu", annot=True, vmin=-1, vmax=1)
heatmap.set_title(
    f'Feature Correlation Matrix for correlation coefficients > {corr_threshold}', pad=14, fontsize = 20)
heatmap.tick_params(axis='both', which='major', labelsize=18)
heatmap.tick_params(axis='x', rotation=90)
fig_corr.savefig(
    pathlib.Path('figures/data_exploration/feature_correlation_matrix.png'),
    transparent = True, dpi = 300, bbox_inches='tight')

### B) For $CH_4$ emissions

In [None]:
# Features and the CH4 target side by side, with a readable name for the target column
Xy_ch4_train = pd.concat([X_ch4_train, y_ch4_train], axis=1)
Xy_ch4_train = Xy_ch4_train.rename(columns={"ch4_net": "Net CH4 emissions"})
corr_matrix = Xy_ch4_train.corr()
# Mask built from the upper triangle so that each pair of variables is annotated only once
mask_matrix = np.triu(corr_matrix)
plt.figure(figsize=(28, 16))
sns.set(font_scale=1.1)
heatmap = sns.heatmap(
    corr_matrix, mask=mask_matrix, cmap="YlGnBu", annot=True, vmin=-1, vmax=1)
heatmap.set_title('Correlation Matrix Heatmap - all features', fontdict={'fontsize':22}, pad=14)
heatmap.tick_params(axis='both', which='major', labelsize=24)

In [None]:
corr_threshold = 0.80 # Value below which correlations are not displayed (e.g. in correlation plots)
without_outliers: bool = True

# Correlations recomputed without the reservoirs that have very large catchment areas
_keep_rows = ~Xy_ch4_train.index.isin(large_c_area_indices)
corr_matrix_no_large_c_area = Xy_ch4_train[_keep_rows].corr()
corr_matrix_filt_ch4 = filter_corr_matrix(
    corr_matrix_no_large_c_area if without_outliers else corr_matrix,
    threshold=corr_threshold)

plt.figure(figsize=(14, 8))
sns.set(font_scale=1.1)
mask_matrix = np.triu(corr_matrix_filt_ch4)
heatmap = sns.heatmap(
    corr_matrix_filt_ch4, mask=mask_matrix, cmap="YlGnBu", annot=True, vmin=-1, vmax=1)
heatmap.set_title(
    f'Feature Correlation Matrix for correlation coefficients > {corr_threshold}', pad=14, fontsize = 20)
heatmap.tick_params(axis='both', which='major', labelsize=18)
heatmap.tick_params(axis='x', rotation=90)

We should remove latitude and longitude from the list of features since they have not been used in the calculations. We might want to eliminate either P load or population as they're significantly correlated. Perhaps bottom temp and surface temp can be removed and only air temperature be used as an input variable since it correlates with both bottom and surface temperatures. Mean depth and max depth can be replaced with a single variable depth. **See plots below**:

### Plot pairplots of selected features

In [None]:
# Pair-plot of air, bottom and surface temperatures, coloured by binned max depth
if plot_pairplot1:
    X_co2_train_binned = X_co2_train.copy()
    # Split max depth into 5 bins; the bin label is used as the hue variable
    X_co2_train_binned['max_depth_bins'] = pd.cut(X_co2_train_binned['max depth'], bins=5)
    # sns.pairplot creates and owns its figure, available as `pairplot.figure`
    pairplot = sns.pairplot(
        X_co2_train_binned, vars=['air temperature', 'bottom temp', 'surface temp'], hue='max_depth_bins',
        markers="+",
        kind='reg',
        corner=True,
        diag_kws= {'color': 'orange'})
    # Save before showing, and show via pyplot: the previous `fig.show()` acted on a
    # stale `fig` left over from an earlier, unrelated cell
    pairplot.figure.savefig(
        pathlib.Path('figures/data_exploration/temp_corr_pairplots.png'), 
        transparent = False, dpi = 300, bbox_inches='tight')
    plt.show()

In [None]:
# Note we plot the correlation for filtered data where we removed 'extremely' large catchments
if plot_pairplot2:
    catchment_area_threshold = 50_000
    # sns.pairplot creates and owns its figure (available as `.figure`), so no separate
    # plt.figure() is opened here - it would only produce an additional empty figure
    p_pop_pairplot = sns.pairplot(
        X_co2_train.query(f"`catchment area` < {catchment_area_threshold}"), vars=['P load', 'population'],
        markers="+",
        kind='reg',
        diag_kind="hist",
        plot_kws={'line_kws':{'color':'black'},
               'scatter_kws': {'alpha': 0.9,
                               'color': 'green'}},
        corner=True,
        diag_kws= {'color': 'orange'})
    # `.figure` is used consistently; the `.fig` alias is deprecated in recent seaborn releases
    p_pop_pairplot.figure.suptitle(
        f"Relationship between population and P load for catchments < {catchment_area_threshold} km2",
        y=1.04, fontsize = 12) 
    # Save before showing, and show via pyplot rather than via the unrelated empty figure
    p_pop_pairplot.figure.savefig(
        pathlib.Path('figures/data_exploration/pop_phosphorus_corr_pairplots.png'), 
        transparent = False, dpi = 300, bbox_inches='tight')
    plt.show()

In [None]:
# Pair-plot of catchment vs reservoir urban area fractions, restricted to catchments with
# an urban area fraction below 0.5
if plot_pairplot3:
    # sns.pairplot creates and owns its figure (available as `.figure`), so no separate
    # plt.figure() is opened here - it would only produce an additional empty figure
    urb_area_fractions_pairplot = sns.pairplot(
        X_co2_train.query(
            '`catchment urban area fraction` < 0.5'), 
        vars=['catchment urban area fraction', 'reservoir urban area fraction'],
        markers="+",
        kind='reg',
        corner=True,
        diag_kws= {'color': 'orange'})
    # Save before showing, and show via pyplot rather than via the unrelated empty figure
    urb_area_fractions_pairplot.figure.savefig(
        pathlib.Path('figures/data_exploration/res_catch_urban_fractions_corr_pairplots.png'), 
        transparent = False, dpi = 300, bbox_inches='tight')
    plt.show()

In [None]:
# Pair-plot of max depth vs mean depth, restricted to reservoirs with mean depth below 20
if plot_pairplot4:
    # sns.pairplot creates and owns its figure (available as `.figure`), so no separate
    # plt.figure() is opened here - it would only produce an additional empty figure
    min_max_depth_pairplot = sns.pairplot(
        X_co2_train.query('`mean depth` < 20'), vars=['max depth', 'mean depth'],
        markers="+",
        kind='reg',
        corner=True,
        diag_kws= {'color': 'orange'})
    # Save before showing, and show via pyplot rather than via the unrelated empty figure
    min_max_depth_pairplot.figure.savefig(
        pathlib.Path('figures/data_exploration/min_max_depth_corr_pairplots.png'), 
        transparent = False, dpi = 300, bbox_inches='tight')
    plt.show()

In [None]:
# Pair-plot of reservoir volume vs reservoir area, restricted to reservoir areas below 100
if plot_pairplot5:
    # sns.pairplot creates and owns its figure (available as `.figure`), so no separate
    # plt.figure() is opened here - it would only produce an additional empty figure
    area_volume_pairplot = sns.pairplot(
        X_co2_train.query('`reservoir area` < 100'), vars=['reservoir volume', 'reservoir area'],
        markers="+",
        kind='reg',
        corner=True,
        diag_kws= {'color': 'orange'})
    # Save before showing, and show via pyplot rather than via the unrelated empty figure
    area_volume_pairplot.figure.savefig(
        pathlib.Path('figures/data_exploration/area_volume_corr_pairplots.png'), 
        transparent = False, dpi = 300, bbox_inches='tight')
    plt.show()

In [None]:
# Distribution of the air temperature feature in the CO2 training data
air_temperature = X_co2_train['air temperature']
air_temperature.hist(bins=30)

### Observations:
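1. Unit CO$_2$ net emissions are approximately normally distributed whilst unit CH$_4$ net emissions have an L-shaped distribution (strongly right-skewed, i.e. most values are concentrated at the low end with a long tail towards high emissions).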
2. Dataset contains two outliers for reservoirs with VERY LARGE catchment areas (Ywathit and Mong Tong reservoirs). It is not clear if this is an error or it is physically possible to have such large catchment areas for reservoirs, e.g. large flat surfaces, etc.
3. There are a few correlations between features, namely:
  - Population vs P. load, although the correlation does not hold well for very small catchments, which are predominant in the dataset. 
  - Catchment urban area fraction vs. reservoir urban area fraction seem correlated but it's an artefact of the sparse histogram of the data. In reality, these two variables are not correlated for small fraction values as shown in one of the pair-plots above.
  - Similarly, the correlation between mean depth and max depth is high for the entire dataset but, as the distributions of the data are strongly skewed, it breaks down for e.g. small depths. HOWEVER, we decided to drop max_depth as less significant for predictions and only use mean_depth as a measure of the depth of the reservoir.
  - Correlations between air, bottom and surface temperatures are also high. The distributions are strongly skewed (J-shaped). We decided to only use air temperature as a feature as we assume that reservoir temperature is positively correlated with air temperature and negatively correlated with depth.
4. Reservoir volume vs reservoir area - although the correlation seems high, it does not hold for small reservoir volumes / areas. Therefore, we keep both the volume and the surface area in the feature dataset.

## Feature Scores

In [None]:
# Number of top-scoring features each selector is configured to keep
N_FEAT = 7

# One selector per scoring function (F-statistic / mutual information) and per target
best_features_f_reg_co2 = SelectKBest(score_func=f_regression, k=N_FEAT)
best_features_f_info_co2 = SelectKBest(score_func=mutual_info_regression, k=N_FEAT)
best_features_f_reg_ch4 = SelectKBest(score_func=f_regression, k=N_FEAT)
best_features_f_info_ch4 = SelectKBest(score_func=mutual_info_regression, k=N_FEAT)

# ============== CO2 ==============
# Only the fitted selectors are kept; the training data itself is not transformed here
fit_f_reg_co2 = best_features_f_reg_co2.fit(X_co2_train, y_co2_train)
fit_f_info_co2 = best_features_f_info_co2.fit(X_co2_train, y_co2_train)
# ============== CH4 ==============
fit_f_reg_ch4 = best_features_f_reg_ch4.fit(X_ch4_train, y_ch4_train)
fit_f_info_ch4 = best_features_f_info_ch4.fit(X_ch4_train, y_ch4_train)

In [None]:
#importlib.reload(lib.utils)
#from lib.utils import plot_scores
# PLOT CO2 REGRESSION FEATURE SCORES

In [None]:
# Number of features passed to plot_scores for each panel
n_features = 10
fig, axs = plt.subplots(1, 2, figsize=(10,6))
fig.suptitle("CO$_2$ regression feature scores")
# Left panel: F-value scores; right panel: mutual-information scores
_co2_score_panels = ((fit_f_reg_co2, "F value"), (fit_f_info_co2, "mutual information"))
for ax, (_selector, _panel_title) in zip(axs.flat, _co2_score_panels):
    plot_scores(
        _selector, X_co2_train, n_features, title=_panel_title, ax=ax,
        tick_fontsize=11, title_fontsize=13)
plt.tight_layout()
fig.savefig(
    pathlib.Path('figures/data_exploration/co2_regression_feature_scores.png'),
    transparent = False, dpi = 300, bbox_inches='tight')

In [None]:
# Same layout as for CO2; `n_features` is the value set in the CO2 feature-score cell
fig, axs = plt.subplots(1, 2, figsize=(10,6))
fig.suptitle("CH$_4$ regression feature scores")
# Left panel: F-value scores; right panel: mutual-information scores
_ch4_score_panels = ((fit_f_reg_ch4, "F value"), (fit_f_info_ch4, "mutual information"))
for ax, (_selector, _panel_title) in zip(axs.flat, _ch4_score_panels):
    plot_scores(
        _selector, X_ch4_train, n_features, title=_panel_title, ax=ax,
        tick_fontsize=11, title_fontsize=13)
plt.tight_layout()
fig.savefig(
    pathlib.Path('figures/data_exploration/ch4_regression_feature_scores.png'),
    transparent = False, dpi = 300, bbox_inches='tight')

#### SEEMS TO BE CRASHING THE KERNEL - DO NOT RUN
```
profile = ProfileReport(X_co2_train, title="Profiling Report")
profile.to_file("profile_report.html")
```

# Run CATBOOST, XGBOOST AND LIGHTGBM REGRESSIONS

Currently two regressions are run per regression model - one for CO2 emissions and one for CH4 emissions.
Neither regression includes any scenario options (Soil Type, Treatment Factor or Landuse intensity).
Instead, the regressions are fitted to data for a single scenario that is assumed constant for all reservoirs
in Myanmar, i.e. Mineral Soil, Primary Treatment, Low Landuse Intensity

### Make fast and dirty fitting first

In [None]:
simu_type = "local"
cat_features = [] # Removed as we're only fitting the model to mya data with preset-values of those
#                   ["Landuse intensity", "Treatment"]
if simu_type == "local":
    co2_model = CatBoostRegressor(loss_function = 'RMSE', task_type="CPU", iterations=5000)
    ch4_model = CatBoostRegressor(loss_function = 'RMSE', task_type="CPU", iterations=5000)
elif simu_type == "colab":
    co2_model = CatBoostRegressor(loss_function = 'RMSE', task_type="GPU" )
    ch4_model = CatBoostRegressor(loss_function = 'RMSE', task_type="GPU" )
else:
    # Fail early instead of hitting an undefined `co2_model` / `ch4_model` further down
    raise ValueError(f"Unsupported simu_type: {simu_type!r} (expected 'local' or 'colab')")
co2_model_catboost_quick_path = pathlib.Path("bin/regression_models/co2_model_catboost_quick.cbm")
ch4_model_catboost_quick_path = pathlib.Path("bin/regression_models/ch4_model_catboost_quick.cbm")

# No longer used in this cell; kept defined in case other cells reference it
saved_model_path = pathlib.Path("saved_models")


def _fit_or_load_quick_model(model, X, y, model_path):
    """Fit ``model`` on (X, y), or load it from ``model_path`` when a saved model exists.

    The model is fitted when ``rerun`` is set or when no file exists at ``model_path``;
    otherwise it is loaded from that file. After fitting, the model is written to
    ``model_path`` if ``override`` is set. Saving to the same path that is checked and
    loaded is what allows a later run to find the saved model (previously the model was
    written to a different folder and therefore never found again).

    Returns the fitted / loaded model (the same object that was passed in).
    """
    if rerun or not model_path.is_file():
        model.fit(X, y, cat_features=cat_features, silent=True)
        if override:
            # Create the full folder hierarchy; do not fail if it already exists
            model_path.parent.mkdir(parents=True, exist_ok=True)
            model.save_model(model_path, format="cbm")
    else:
        model.load_model(model_path)
    return model


co2_model = _fit_or_load_quick_model(
    co2_model, X_co2_train, y_co2_train, co2_model_catboost_quick_path)
ch4_model = _fit_or_load_quick_model(
    ch4_model, X_ch4_train, y_ch4_train, ch4_model_catboost_quick_path)

#### Make quick check of the quality of the models

## Boosted tree model fitting with hyperparameter tuning

### Tune the XGBoost models

In [None]:
# CO2 regression
# Change override to True to retune the model
_xgboost_co2_file = os.path.join('bin', 'regression_models', 'xgboost_co2.pkl')
model_co2_xgboost = hypertune_model(
    X_co2_train, y_co2_train,
    num_evals=2_000, hypertuner=HyperTuner.XGBOOST,
    file=_xgboost_co2_file, override=override)

In [None]:
# Retrain on full data set (for model explainability analysis)
# NOTE(review): the model-check cell that follows evaluates this model on X_co2_test, which
# is part of the train+test data used here. Unless model_check refits the model internally,
# the reported test statistics will be optimistic - confirm.
model_co2_xgboost.fit(X_co2_train_test, y_co2_train_test)

In [None]:
# Report header followed by the train / test regression statistics of the XGBoost CO2 model
print(
    "XGBOOST CO2 MODEL REGRESSION STATISTICS",
    "---------------------------------------",
    sep="\n")
model_check(
    model=model_co2_xgboost,
    X_train=X_co2_train, y_train=y_co2_train,
    X_test=X_co2_test, y_test=y_co2_test)

In [None]:
# CH4 regression
# Change override to True to retune the model
_xgboost_ch4_file = os.path.join('bin', 'regression_models', 'xgboost_ch4.pkl')
model_ch4_xgboost = hypertune_model(
    X_ch4_train, y_ch4_train,
    num_evals=2_000, hypertuner=HyperTuner.XGBOOST,
    file=_xgboost_ch4_file, override=override)

In [None]:
# Retrain on full data set (for model explainability analysis)
# NOTE(review): the model-check cell that follows evaluates this model on X_ch4_test, which
# is part of the train+test data used here. Unless model_check refits the model internally,
# the reported test statistics will be optimistic - confirm.
model_ch4_xgboost.fit(X_ch4_train_test, y_ch4_train_test)

In [None]:
# Report header followed by the train / test regression statistics of the XGBoost CH4 model
print(
    "XGBOOST CH4 MODEL REGRESSION STATISTICS",
    "---------------------------------------",
    sep="\n")
model_check(
    model=model_ch4_xgboost,
    X_train=X_ch4_train, y_train=y_ch4_train,
    X_test=X_ch4_test, y_test=y_ch4_test)

### Tune the LightGBM models

In [None]:
# CO2 regression
_lightgbm_co2_file = os.path.join('bin', 'regression_models', 'lightgbm_co2.pkl')
model_co2_lightgbm = hypertune_model(
    X_co2_train, y_co2_train,
    num_evals=1_000, hypertuner=HyperTuner.LIGHTGBM,
    file=_lightgbm_co2_file, override=override)

In [None]:
# Remove warnings in the LightGBM CO2 regression model - ONLY WORKS FOR PRE-TRAINED MODELS
# IF TRAINING NEW MODELS - COMMENT OUT AND SEE THE WARNINGS FIRST BEFORE TURNING SOME CONFLICTING
# REGRESSION PARAMETERS OFF
import lightgbm as lgb

# Attributes that are reset to None on the tuned model
_co2_lightgbm_reset_params = (
    "min_child_samples", "min_split_gain", "subsample", "boosting_type",
    "colsample_bytree", "reg_alpha", "reg_lambda")
for _param_name in _co2_lightgbm_reset_params:
    setattr(model_co2_lightgbm, _param_name, None)
model_co2_lightgbm.params = {'verbose': -1, 'verbose_eval' : -1}
model_co2_lightgbm.free_raw_data = False

In [None]:
# Retrain on full data set (for model explainability analysis)
# NOTE(review): the model-check cell that follows evaluates this model on X_co2_test, which
# is part of the train+test data used here. Unless model_check refits the model internally,
# the reported test statistics will be optimistic - confirm.
#model_co2_lightgbm.predict_raw_score = False
# Set the `metric` attribute of the model to RMSE before refitting
model_co2_lightgbm.metric = {'rmse'}
model_co2_lightgbm.fit(X_co2_train_test, y_co2_train_test)

In [None]:
# Print a header and report the regression statistics of the LightGBM CO2 model
# for the train and the test split.
print(
    "LIGHTGBM CO2 MODEL REGRESSION STATISTICS",
    "---------------------------------------",
    sep="\n")
model_check(
    model=model_co2_lightgbm,
    X_train=X_co2_train, y_train=y_co2_train,
    X_test=X_co2_test, y_test=y_co2_test)

In [None]:
# CH4 regression
# Tune (or reuse) the LightGBM CH4 model with 1000 tuning evaluations.
# NOTE(review): presumably hypertune_model reuses the model pickled at `file` when
# override is False - confirm in its definition.
model_ch4_lightgbm = hypertune_model(
    X_ch4_train, y_ch4_train, num_evals = 1_000, hypertuner=HyperTuner.LIGHTGBM,
    file=os.path.join('bin', 'regression_models', 'lightgbm_ch4.pkl'), override = override)

In [None]:
# Reset a set of parameters on the LightGBM CH4 model object to None, switch the
# data sampling strategy to 'goss' and lower the verbosity before refitting.
for _param_name in (
        'min_child_samples', 'colsample_bytree', 'boosting_type', 'min_split_gain',
        'reg_alpha', 'subsample', 'reg_lambda'):
    setattr(model_ch4_lightgbm, _param_name, None)
model_ch4_lightgbm.data_sample_strategy = 'goss'
model_ch4_lightgbm.params = {'verbose': -1, 'verbose_eval' : -1}

In [None]:
# Retrain on full data set (for model explainability analysis)
# NOTE(review): the model_check cell that follows scores this refitted model on
# X_ch4_test; if X_ch4_train_test contains the test split (presumably it does -
# confirm), the reported test statistics are no longer out-of-sample.
model_ch4_lightgbm.fit(X_ch4_train_test, y_ch4_train_test)

In [None]:
# Print a header and report the regression statistics of the LightGBM CH4 model
# for the train and the test split.
print(
    "LIGHTGBM CH4 MODEL REGRESSION STATISTICS",
    "---------------------------------------",
    sep="\n")
model_check(
    model=model_ch4_lightgbm,
    X_train=X_ch4_train, y_train=y_ch4_train,
    X_test=X_ch4_test, y_test=y_ch4_test)

### Tune the CATBoost models

#### Encountered problems with Catboost errors and therefore the models have not been fitted
* It is possible that the errors are caused by certain combinations of parameters during hypertuning. It may be possible to rectify this problem by removing some hyperparameters in hypertune or by reducing ranges of some hyperparameters

In [None]:
# CO2 regression
# Tune (or reuse) the CatBoost CO2 model; only 40 tuning evaluations are requested
# here, compared with 1000-2000 for the other two libraries.
# NOTE(review): presumably hypertune_model reuses the model pickled at `file` when
# override is False - confirm in its definition.
model_co2_catboost = hypertune_model(
    X_co2_train, y_co2_train, num_evals = 40, hypertuner=HyperTuner.CATBOOST,
    file=os.path.join('bin', 'regression_models', 'catboost_co2.pkl'), override = override)

In [None]:
# Attempt to silence CatBoost's training log by assigning logging-related
# attributes on the model object before it is refitted.
model_co2_catboost.params = {
    'silent': True, 'verbose': False, 'logging_level': 'Silent',
    'metric_period':100}
model_co2_catboost.metric_period = 10000
model_co2_catboost.logging_level = 'Silent'
model_co2_catboost.verbose = False
model_co2_catboost.silent = True
# NOTE(review): according to the original author none of the assignments above has
# any effect on the log output. Presumably CatBoost takes these options from its
# constructor / fit() arguments rather than from plain attributes - confirm against
# the CatBoost documentation.

In [None]:
# Retrain on full data set (for model explainability analysis)
# NOTE(review): the model_check cell that follows scores this refitted model on
# X_co2_test; if X_co2_train_test contains the test split (presumably it does -
# confirm), the reported test statistics are no longer out-of-sample.
model_co2_catboost.fit(X_co2_train_test, y_co2_train_test)

In [None]:
# Print a header and report the regression statistics of the CatBoost CO2 model
# for the train and the test split.
print(
    "CATBOOST CO2 MODEL REGRESSION STATISTICS",
    "---------------------------------------",
    sep="\n")
model_check(
    model=model_co2_catboost,
    X_train=X_co2_train, y_train=y_co2_train,
    X_test=X_co2_test, y_test=y_co2_test)

In [None]:
# CH4 regression
# Tune (or reuse) the CatBoost CH4 model; only 40 tuning evaluations are requested
# here, compared with 1000-2000 for the other two libraries.
# NOTE(review): presumably hypertune_model reuses the model pickled at `file` when
# override is False - confirm in its definition.
model_ch4_catboost = hypertune_model(
    X_ch4_train, y_ch4_train, num_evals = 40, hypertuner=HyperTuner.CATBOOST,
    file=os.path.join('bin', 'regression_models', 'catboost_ch4.pkl'), override = override)

In [None]:
# Attempt to silence CatBoost's training log via attributes on the model object.
# NOTE(review): the equivalent assignments on the CO2 CatBoost model were reported
# by the author not to work - presumably these have no effect either; confirm.
model_ch4_catboost.verbose = -1
model_ch4_catboost.logging_level = 'Silent'
# Retrain on full data set (for model explainability analysis)
model_ch4_catboost.fit(X_ch4_train_test, y_ch4_train_test)

In [None]:
# Print a header and report the regression statistics of the CatBoost CH4 model
# for the train and the test split.
print(
    "CATBOOST CH4 MODEL REGRESSION STATISTICS",
    "---------------------------------------",
    sep="\n")
model_check(
    model=model_ch4_catboost,
    X_train=X_ch4_train, y_train=y_ch4_train,
    X_test=X_ch4_test, y_test=y_ch4_test)

In [None]:
# Plot figure for the visual abstract
# Single-panel bar chart of SHAP feature importances (at most 15 variables) for the
# XGBoost CO2 model, written to a 300 dpi PNG file. The title is a single space.
fig, axs = plt.subplots(1,1, figsize=(6,3))
plot_shap_feature_importances(
                model_co2_xgboost, X_co2_train_test,
                max_vars = 15,
                title=" ",
                plot_type = 'bar', ax=axs)
fig.savefig(
    pathlib.Path('figures/model_explanation/model_shaps_for_graphical_abstract.png'), 
    dpi = 300, bbox_inches='tight')

In [None]:
def plot_feat_importances(
        model, X_train, X_test, y_test, title: str = "Feature importances",
        file_name: str| None = None, **kwargs) -> None:
    """Draw three feature-importance panels for one model on a 2 x 2 figure.

    The panels are, in order: GINI-based importances (uses ``X_train``),
    permutation-based importances (uses ``X_test`` and ``y_test``, 7 repeats)
    and mean SHAP values (uses ``X_test``). Each panel is limited to 15
    variables. The fourth axes of the grid is removed.

    Args:
        model: Fitted regression model handed to the plotting helpers.
        X_train: Feature data for the GINI-based panel.
        X_test: Feature data for the permutation and SHAP panels.
        y_test: Target values for the permutation panel.
        title: Figure super-title.
        file_name: If truthy, the figure is saved there at 300 dpi.
        **kwargs: Extra keyword arguments forwarded to ``fig.savefig``.
    """
    fig, axs = plt.subplots(2, 2, figsize=(10,7))
    fig.suptitle(title)
    gini_ax, permutation_ax, shap_ax, spare_ax = axs.flat

    plot_gini_feature_importances(
        model, X_train, 15, 'GINI-based Feature Importances', ax=gini_ax)
    # Computed on test data
    plot_permutation_feature_importances(
        model, X_test, y_test, max_vars=15, n_repeats=7,
        title='Permutation-based Feature Importances', ax=permutation_ax)
    plot_shap_feature_importances(
        model, X_test, max_vars=15, title='Mean SHAP values',
        plot_type='bar', ax=shap_ax)

    # Only three panels are drawn, so the bottom-right axes is dropped
    fig.delaxes(spare_ax)
    plt.tight_layout()
    if file_name:
        fig.savefig(file_name, dpi=300, bbox_inches='tight', **kwargs)

# Feature Importances for the CO$_2$ regression models

In [None]:
# The same full data set is passed as both X_train and X_test of
# plot_feat_importances, so all three panels are computed on X_co2_train_test.
# The original author (TJ) notes that the reason for this doubling of arguments
# has been forgotten.
plot_feat_importances(
    model_co2_xgboost, X_co2_train_test, X_co2_train_test, y_co2_train_test, 
    title = "Feature importances - XGBoost model - CO$_2$ emissions",
    file_name = pathlib.Path('figures/model_explanation/feature_importances_xgboost_co2.png'),
    transparent=False)

In [None]:
# Feature-importance panels for the LightGBM CO2 model; the full data set is
# passed as both X_train and X_test. The figure is saved with transparent=False.
plot_feat_importances(
    model_co2_lightgbm, X_co2_train_test, X_co2_train_test, y_co2_train_test, 
    title = "Feature importances - LightGBM model - CO$_2$ emissions",
    file_name = pathlib.Path('figures/model_explanation/feature_importances_lightgbm_co2.png'),
    transparent=False)

In [None]:
# Feature-importance panels for the CatBoost CO2 model; the full data set is
# passed as both X_train and X_test. The figure is saved with transparent=False.
plot_feat_importances(
    model_co2_catboost, X_co2_train_test, X_co2_train_test, y_co2_train_test, 
    title = "Feature importances - CatBoost model - CO$_2$ emissions",
    file_name = pathlib.Path('figures/model_explanation/feature_importances_catboost_co2.png'),
    transparent=False)

# Feature Importances for the CH$_4$ regression models

In [None]:
# Feature-importance panels for the XGBoost CH4 model; the full data set is
# passed as both X_train and X_test. The figure is saved with transparent=False.
plot_feat_importances(
    model_ch4_xgboost, X_ch4_train_test, X_ch4_train_test, y_ch4_train_test, 
    title = "Feature importances - XGBoost model - CH4 emissions",
    file_name = pathlib.Path('figures/model_explanation/feature_importances_xgboost_ch4.png'),
    transparent=False)

### Temporary code to export feature importances for the next notebook - move into a separate function / class and remove this temporary code

In [None]:
# --------------------- XGBOOST CH4 model ----------------------
# Export the mean permutation importances of the XGBoost CH4 model as a one-row
# CSV table whose columns are the feature names in descending order of importance.
feature_importance = permutation_importance(
    model_ch4_xgboost, X_ch4_train_test, y_ch4_train_test,
    n_repeats=5, random_state=42)
num_features = len(feature_importance.importances_mean)
# Positions of the features, most important first (all features are kept)
sorted_idx = np.argsort(feature_importance.importances_mean)[::-1][:num_features]
importances_df = pd.DataFrame(
    [feature_importance.importances_mean[sorted_idx]],
    columns=X_ch4_train_test.columns[sorted_idx])
# NOTE(review): the file name spells 'xgbost'; it is left as is because the next
# notebook presumably reads this exact name - confirm before renaming.
importances_df.to_csv(pathlib.Path("intermediate/ave_feature_importances_xgbost_ch4.csv"))
# ------------------- End of temporary code --------------------

In [None]:
# Feature-importance panels for the LightGBM CH4 model; the full data set is
# passed as both X_train and X_test. The figure is saved with transparent=False.
plot_feat_importances(
    model_ch4_lightgbm, X_ch4_train_test, X_ch4_train_test, y_ch4_train_test, 
    title = "Feature importances - LightGBM model - CH4 emissions",
    file_name = pathlib.Path('figures/model_explanation/feature_importances_lightgbm_ch4.png'),
    transparent=False)

In [None]:
# Feature-importance panels for the CatBoost CH4 model; the full data set is
# passed as both X_train and X_test. The figure is saved with transparent=False.
plot_feat_importances(
    model_ch4_catboost, X_ch4_train_test, X_ch4_train_test, y_ch4_train_test, 
    title = "Feature importances - CATBoost model - CH$_4$ emissions",
    file_name = pathlib.Path('figures/model_explanation/feature_importances_catboost_ch4.png'),
    transparent=False)

In [None]:
## Save all feature importances to a file
# Imported here so that this cell does not depend on a later cell having run
import pickle

shap_values_folder = pathlib.Path('intermediate/shap_values')
output_folder = shap_values_folder/'model_avg_feat_importances'
# Create the whole folder chain in one go; also works when 'intermediate' itself
# does not exist yet and when the folders are already present
output_folder.mkdir(parents=True, exist_ok=True)

model_feat_container = {}
feat_imp_type: str = 'permutation' # shap, gini

# (library, gas) -> (fitted model, features, targets). Each model is paired with
# the targets of its own gas: CO2 models with y_co2_train_test and CH4 models with
# y_ch4_train_test (the CO2 entries used to be paired with the CH4 targets).
model_data_maps = {
    ('xgboost', 'co2') : (model_co2_xgboost, X_co2_train_test, y_co2_train_test),
    ('xgboost', 'ch4') : (model_ch4_xgboost, X_ch4_train_test, y_ch4_train_test),
    ('lightgbm', 'co2') : (model_co2_lightgbm, X_co2_train_test, y_co2_train_test),
    ('lightgbm', 'ch4') : (model_ch4_lightgbm, X_ch4_train_test, y_ch4_train_test),
    ('catboost', 'co2') : (model_co2_catboost, X_co2_train_test, y_co2_train_test),
    ('catboost', 'ch4') : (model_ch4_catboost, X_ch4_train_test, y_ch4_train_test)}

# Compute the importances of every model and collect them under the same key
for key, pars in model_data_maps.items():
    feats, cols = model_feature_importances(pars[0], pars[1], pars[2], feature_type=feat_imp_type)
    model_feat_container[key] = (feats, cols)

with open(output_folder / 'model_feats.pkl', 'wb') as handle:
    pickle.dump(model_feat_container, handle, protocol=pickle.HIGHEST_PROTOCOL)

## Publication permutation feature importances with collinearities between features removed
## Figure for CO$_2$ and CH$_4$ XGBoost models
* We experimented with removing multicollinear features using clustering before creating the model feature importance plots. However, the methodology from scipy's documentation implemented here removes whole clusters, whereas we would rather keep one of the features in each cluster and remove the remaining correlated ones, because we still want one of the correlated features to be included in the feature space. The code needs more fine-tuning. In the meantime, we removed correlated features manually at the beginning of the scripts and used a cluster separation threshold of 0.0, which basically means that no features / clusters of features are removed.
* Instead, we've removed littoral area fraction and used mean_depth as a proxy for littoral area fraction

In [None]:
from scipy.cluster import hierarchy
from scipy.spatial.distance import squareform
from scipy.stats import spearmanr

def plot_feature_correlation_clusters(data) -> np.ndarray:
    """Plot a dendrogram and a heatmap of the Spearman feature correlations.

    The left panel shows the Ward-linkage dendrogram built from the distance
    ``1 - |rho|``; the right panel shows the correlation matrix with rows and
    columns ordered like the dendrogram leaves.

    Args:
        data: Table of features; ``data.columns`` supplies the leaf labels.

    Returns:
        The linkage matrix produced by ``scipy.cluster.hierarchy.ward``.
    """
    fig, (dendrogram_ax, heatmap_ax) = plt.subplots(1, 2, figsize=(12, 8))

    # Symmetrise the rank-correlation matrix and force a unit diagonal
    rho = spearmanr(data).correlation
    rho = (rho + rho.T) / 2
    np.fill_diagonal(rho, 1)

    # Correlations are turned into distances and clustered with Ward's linkage
    linkage_matrix = hierarchy.ward(squareform(1 - np.abs(rho)))
    tree = hierarchy.dendrogram(
        linkage_matrix, labels=data.columns.to_list(), ax=dendrogram_ax,
        leaf_rotation=90)
    leaf_order = tree["leaves"]
    leaf_labels = tree["ivl"]
    tick_positions = np.arange(0, len(leaf_labels))

    heatmap_ax.imshow(rho[leaf_order, :][:, leaf_order])
    heatmap_ax.set_xticks(tick_positions)
    heatmap_ax.set_yticks(tick_positions)
    heatmap_ax.set_xticklabels(leaf_labels, rotation="vertical")
    heatmap_ax.set_yticklabels(leaf_labels)
    fig.tight_layout()
    return linkage_matrix
dist_linkage = plot_feature_correlation_clusters(X_co2_train_test)

In [None]:
# Distance at which the dendrogram is cut into flat clusters
cluster_threshold = 0.0

# Reduce input features based on the results of clustering
cluster_ids = hierarchy.fcluster(dist_linkage, cluster_threshold, criterion="distance")
cluster_id_to_feature_ids = defaultdict(list)
for idx, cluster_id in enumerate(cluster_ids):
    cluster_id_to_feature_ids[cluster_id].append(idx)
# Keep the first feature encountered in every cluster
selected_features = [v[0] for v in cluster_id_to_feature_ids.values()]
selected_features_names = X_co2_train_test.columns[selected_features]

# Get subsets of features 
# The names selected from the CO2 feature table are applied to the CH4 tables too
X_co2_train_sel = X_co2_train[selected_features_names]
X_co2_test_sel = X_co2_test[selected_features_names]
X_co2_train_test_sel = X_co2_train_test[selected_features_names]

X_ch4_train_sel = X_ch4_train[selected_features_names]
X_ch4_test_sel = X_ch4_test[selected_features_names]
X_ch4_train_test_sel = X_ch4_train_test[selected_features_names]

# columns to drop
# (empty list, so the drop calls below return the tables unchanged)
columns_to_drop = []
X_co2_train_sel = X_co2_train_sel.drop(columns=columns_to_drop)
X_co2_test_sel = X_co2_test_sel.drop(columns=columns_to_drop)
X_co2_train_test_sel = X_co2_train_test_sel.drop(columns=columns_to_drop)
X_ch4_train_sel = X_ch4_train_sel.drop(columns=columns_to_drop)
X_ch4_test_sel = X_ch4_test_sel.drop(columns=columns_to_drop)
X_ch4_train_test_sel = X_ch4_train_test_sel.drop(columns=columns_to_drop)

# NOTE(review): the two fit calls below refit the tuned XGBoost models on the
# training split only, replacing the earlier fit on the full data set. Their
# scores are not printed in this cell, but the DALEX explainers created further
# down use these model objects - confirm that this refit is intended.
model_co2_xgboost.fit(X_co2_train_sel, y_co2_train)
model_ch4_xgboost.fit(X_ch4_train_sel, y_ch4_train)
# Baseline: XGBoost regressors with default settings fitted on the training split
mm1 = XGBRegressor()
mm2 = XGBRegressor()
mm1.fit(X_co2_train_sel, y_co2_train)
mm2.fit(X_ch4_train_sel, y_ch4_train)
print(
    "Baseline accuracy on test data with features removed - CO2 model:"
    f" {mm1.score(X_co2_test_sel, y_co2_test):.2}"
)
print(
    "Baseline accuracy on test data with features removed - CH4 model:"
    f" {mm2.score(X_ch4_test_sel, y_ch4_test):.2}"
)
# The scores printed after "selected_model" come from the tuned LightGBM models
print("selected_model")
print(
    "Baseline accuracy on test data with features removed - CO2 model:"
    f" {model_co2_lightgbm.score(X_co2_test_sel, y_co2_test):.2}"
)
print(
    "Baseline accuracy on test data with features removed - CH4 model:"
    f" {model_ch4_lightgbm.score(X_ch4_test_sel, y_ch4_test):.2}"
)

In [None]:
# Make a figure with two subplots - left subplot shows CO2 model permutation feature importances
# whilst the right subplot show the permutation feature importances for the CH4 model
# Save the figure to svg, pdf and png files.
# This cell computes the importances; plotting and saving happen in the cells below.
import random
plot_boxplot = True
# Number of top-ranked features kept for plotting
num_features: int = 8
random_state = 100 #random. randint(1,1_000)
# The LightGBM models are the ones explained in the publication figure
model_co2 = model_co2_lightgbm
model_ch4 = model_ch4_lightgbm

sns.set_theme(style="white")
sns.set_context("paper", rc={"grid.linewidth": 0.00})

# 50 permutation repeats per feature, scored with negative RMSE on the full data set
result_co2 = permutation_importance(
    model_co2, X_co2_train_test_sel, y_co2_train_test, n_repeats=50, 
    random_state=random_state, n_jobs=4, 
    scoring='neg_root_mean_squared_error'
)
result_ch4 = permutation_importance(
    model_ch4, X_ch4_train_test_sel, y_ch4_train_test, n_repeats=50, 
    random_state=random_state, n_jobs=4, 
    scoring='neg_root_mean_squared_error'
)

# Positions of the num_features features with the largest mean importance, largest first
sorted_importances_idx_co2 = result_co2.importances_mean.argsort()[::-1][:num_features]#[::-1]
sorted_importances_idx_ch4 = result_ch4.importances_mean.argsort()[::-1][:num_features]#[::-1]
# One column per selected feature, one row per permutation repeat
importances_co2 = pd.DataFrame(
    result_co2.importances[sorted_importances_idx_co2].T,
    columns=X_co2_train_test_sel.columns[sorted_importances_idx_co2],
)
importances_ch4 = pd.DataFrame(
    result_ch4.importances[sorted_importances_idx_ch4].T,
    columns=X_ch4_train_test_sel.columns[sorted_importances_idx_ch4],
)

In [None]:
# Publication figure: permutation importances of the CO2 model (left panel) and of
# the CH4 model (right panel). Grey bars show the median over the permutation
# repeats; the optional box plots drawn on top show the spread.
fig, (ax1, ax2) = plt.subplots(1,2,figsize=(12,3))
rename_labels: bool = False

if plot_boxplot:
    boxplot_co2 = sns.boxplot(
        data = importances_co2, orient = "h", hue=None, width=0.30,
        linewidth=0.75, ax=ax1,
        showcaps=True,
        flierprops={"marker": "o", "markersize": 2},
        whiskerprops={"color": "k"},
        boxprops={"facecolor": (.3, .6, .8, 1.0), "edgecolor": 'k'},
        medianprops={"color": "k", "linewidth": 0.7})
    boxplot_ch4 = sns.boxplot(
        data = importances_ch4, orient = "h", hue=None, width=0.30,
        linewidth=0.75, ax=ax2,
        showcaps=True,
        flierprops={"marker": "o", "markersize": 2},
        whiskerprops={"color": "k"},
        boxprops={"facecolor": (.3, .6, .8, 1.0), "edgecolor": 'k'},
        medianprops={"color": "k", "linewidth": 0.7})

# errorbar=None switches the error bars off; it replaces the deprecated `ci=0`,
# which requested a zero-width bootstrap interval
barplot_co2 = sns.barplot(
    data=importances_co2, orient="h", ax=ax1,
    facecolor = 'gainsboro', alpha=0.75,
    linewidth=0.5, edgecolor='k', estimator = np.median,
    width = 0.65, errorbar = None, zorder=0)

barplot_ch4 = sns.barplot(
    data=importances_ch4, orient="h", ax=ax2,
    facecolor = 'gainsboro', alpha=0.75,
    linewidth=0.5, edgecolor='k', estimator = np.median,
    width = 0.65, errorbar = None, zorder=0)

# Optional short variable names; the lists are hard-coded and have to match the
# order of the plotted features
if rename_labels:
    new_labels_left = ["fFr", "evap", "T_air", "pop", "slope", "hmean", "runoff", "fSr"]
    new_labels_right = ["hmean", "Ac", "T_air", "pop", "V", "evap", "slope", "fWc"]
    ax1.set_yticklabels(new_labels_left, fontsize=12)
    ax2.set_yticklabels(new_labels_right, fontsize=12)


ax1.set_xlabel("RMSE loss after permutations, gCO$_{2e}$ m$^{-2}$ yr$^{-1}$", fontsize=15)
ax1.spines['left'].set_linewidth(0.5)
ax1.spines['bottom'].set_linewidth(0.5)

ax2.set_xlabel("RMSE loss after permutations, gCO$_{2e}$ m$^{-2}$ yr$^{-1}$", fontsize=15)
ax2.spines['left'].set_linewidth(0.5)
ax2.spines['bottom'].set_linewidth(0.5)

ax1.tick_params(axis='x', which='major', labelsize=13)
ax1.tick_params(axis='y', which='major', labelsize=13)
ax2.tick_params(axis='x', which='major', labelsize=13)
ax2.tick_params(axis='y', which='major', labelsize=13)

sns.despine(offset=10, trim=False)
fig.tight_layout()

In [None]:
# INFORMATION: We use lightgbm models for the creation of feature importance plots
# The publication figure is written in three formats with identical settings
for _figure_file in (
        'figures/model_explanation/permutation_feat_importance_pub.png',
        'figures/model_explanation/permutation_feat_importance_pub.svg',
        'figures/model_explanation/permutation_feat_importance_pub.pdf'):
    fig.savefig(
        pathlib.Path(_figure_file),
        transparent=False, dpi=300, bbox_inches='tight')

In [None]:
# Plot one feature-importance plot for graphical abstract
fig2, ax1 = plt.subplots(1,1,figsize=(11,4))
boxplot_co2 = sns.boxplot(
    data = importances_co2, orient = "h", hue=None, width=0.30,
    linewidth=0.75, ax=ax1,
    showcaps=True,
    flierprops={"marker": "o", "markersize": 2},
    whiskerprops={"color": "k"},
    boxprops={"facecolor": (.3, .6, .8, 1.0), "edgecolor": 'k'},
    medianprops={"color": "k", "linewidth": 0.7}) 
barplot_co2 = sns.barplot(
    data=importances_co2, orient="h", ax=ax1,
    facecolor = 'gainsboro', alpha=0.75,
    linewidth=0.5, edgecolor='k', estimator = np.median,
    width = 0.65, ci = 0, zorder=0) 

ax1.set_xlabel("RMSE loss after permutations, gCO$_{2e}$ m$^{-2}$ yr$^{-1}$", fontsize=18)
ax1.spines['left'].set_linewidth(0.5)
ax1.spines['bottom'].set_linewidth(0.5)

ax1.tick_params(axis='x', which='major', labelsize=16)
ax1.tick_params(axis='y', which='major', labelsize=16)

sns.despine(offset=10, trim=False)
fig2.tight_layout()
fig2.savefig(
    pathlib.Path('figures/model_explanation/permutation_feat_importance_abstract.svg'), 
    transparent = False, dpi = 300, bbox_inches='tight')

# Model and predictions explanation with DALEX

### Create DALEX explainers for all 6 models

In [None]:
# One DALEX explainer per (library, gas) combination. Every explainer wraps the
# fitted model together with the full feature table and targets of its gas; the
# label identifies the model in DALEX outputs.
exp_co2_xgboost = dx.Explainer(
    model_co2_xgboost, X_co2_train_test, y_co2_train_test, 
    label='xgboost model co2 emissions') # Uses dalex model explainer
exp_co2_lightgbm = dx.Explainer(
    model_co2_lightgbm, X_co2_train_test, y_co2_train_test, 
    label='lightgbm model co2 emissions') # Uses dalex model explainer
exp_co2_catboost = dx.Explainer(
    model_co2_catboost, X_co2_train_test, y_co2_train_test, 
    label='catboost model co2 emissions') # Uses dalex model explainer

exp_ch4_xgboost = dx.Explainer(
    model_ch4_xgboost, X_ch4_train_test, y_ch4_train_test, 
    label='xgboost model ch4 emissions') # Uses dalex model explainer
exp_ch4_lightgbm = dx.Explainer(
    model_ch4_lightgbm, X_ch4_train_test, y_ch4_train_test, 
    label='lightgbm model ch4 emissions') # Uses dalex model explainer
exp_ch4_catboost = dx.Explainer(
    model_ch4_catboost, X_ch4_train_test, y_ch4_train_test, 
    label='catboost model ch4 emissions') # Uses dalex model explainer

## Model diagnostics

In [None]:
# Diagnostic plots of residuals against the target variable y for the three CO2 models
variable = "y"
yvariable = "residuals"
exp_co2_xgboost.model_diagnostics().plot(variable=variable, yvariable=yvariable)
exp_co2_lightgbm.model_diagnostics().plot(variable=variable, yvariable=yvariable)
exp_co2_catboost.model_diagnostics().plot(variable=variable, yvariable=yvariable)

In [None]:
# Diagnostic plots of residuals against the target variable y for the three CH4 models
variable = "y"
yvariable = "residuals"
exp_ch4_xgboost.model_diagnostics().plot(variable=variable, yvariable=yvariable)
exp_ch4_lightgbm.model_diagnostics().plot(variable=variable, yvariable=yvariable)
exp_ch4_catboost.model_diagnostics().plot(variable=variable, yvariable=yvariable)

## Model performance

In [None]:
# Performance summary of the XGBoost CO2 explainer (shown as the cell output)
exp_co2_xgboost.model_performance()

In [None]:
# Performance summary of the LightGBM CO2 explainer (shown as the cell output)
exp_co2_lightgbm.model_performance()

In [None]:
# Performance summary of the CatBoost CO2 explainer (shown as the cell output)
exp_co2_catboost.model_performance()

In [None]:
# Performance summary of the XGBoost CH4 explainer (shown as the cell output)
exp_ch4_xgboost.model_performance()

In [None]:
# Performance summary of the LightGBM CH4 explainer (shown as the cell output)
exp_ch4_lightgbm.model_performance()

In [None]:
# Performance summary of the CatBoost CH4 explainer (shown as the cell output)
exp_ch4_catboost.model_performance()

## Model explanations

### Variable importances

In [None]:
# Settings shared by the `model_parts` calls in the cells below
importances_type = 'permutational'
# Passed to `model_parts` as B
no_permutations = 30
# Passed to `model_parts` as processes
n_processes = 4
# Other attributes for the `model_parts` method
# types: permutational, variable_importance, feature_importance, ratio, difference, shap_wrapper, 
# shap_explainer_type = 'TreeExplainer'
# e.g. co2_lightgbm_shp_vals = exp_co2_lightgbm.model_parts(type='shap_wrapper', shap_explainer_type='TreeExplainer')

In [None]:
# Variable importances of the three CO2 models via DALEX `model_parts`, with
# B = no_permutations, the per-permutation distributions kept and n_processes
# passed as `processes`.
co2_xgboost_importances = exp_co2_xgboost.model_parts(
    type = importances_type , keep_distributions = True, label="XGBoost CO2 emissions", B=no_permutations,
    processes=n_processes)
co2_lightgbm_importances = exp_co2_lightgbm.model_parts(
    type = importances_type, keep_distributions = True, label="LightGBM CO2 emissions", B=no_permutations,
    processes=n_processes)
co2_catboost_importances = exp_co2_catboost.model_parts(
    type = importances_type, keep_distributions = True, label="CATBoost CO2 emissions", B=no_permutations,
    processes=n_processes)

In [None]:
# Experimental (comment it out)
# SHAP values of the XGBoost CO2 model through DALEX's 'shap_wrapper' with a
# TreeExplainer, shown as a bar plot limited to 6 variables.
mp = exp_co2_xgboost.model_parts(type='shap_wrapper', shap_explainer_type="TreeExplainer")
mp.plot(plot_type='bar', axis_color='k', color='k', show=True, max_display=6)

In [None]:
# Plot the 10 most important variables of the XGBoost CO2 model without a title
co2_xgboost_importances.plot(
    title=None, max_vars=10, bar_width=20, vertical_spacing=0, split='model', digits=2
)

In [None]:
# Plot the importances of all three CO2 models together (10 variables at most)
co2_xgboost_importances.plot([co2_lightgbm_importances, co2_catboost_importances], max_vars=10)

In [None]:
# Variable importances of the three CH4 models via DALEX `model_parts`, with
# B = no_permutations and the per-permutation distributions kept.
# NOTE(review): unlike the corresponding CO2 calls, `processes=n_processes` is not
# passed here - confirm whether that is intentional.
ch4_xgboost_importances = exp_ch4_xgboost.model_parts(
    type = importances_type , keep_distributions = True, label="XGBoost CH4 emissions", B=no_permutations)
ch4_lightgbm_importances = exp_ch4_lightgbm.model_parts(
    type = importances_type, keep_distributions = True, label="LightGBM CH4 emissions", B=no_permutations)
ch4_catboost_importances = exp_ch4_catboost.model_parts(
    type = importances_type, keep_distributions = True, label="CATBoost CH4 emissions", B=no_permutations)

In [None]:
# Plot the importances of all three CH4 models together (10 variables at most)
ch4_xgboost_importances.plot([ch4_lightgbm_importances, ch4_catboost_importances], max_vars=10)

### Partial, local and accumulated dependence profiles for XGBoost, LightGBM and CATBoost models

In [None]:
## TODO: add groups and make grouped profiles with keyword : value pair of groups = 'cat var col name'

### CO2 emissions

### 1. CO2 XGBoost

In [None]:
# Create a plot for visual abstract
# Three kinds of dependence profiles of the XGBoost CO2 model for two variables,
# each requested with N=50 and combined into one figure that is not shown yet
# (show=False) so that it can be restyled first.
selected_variables_co2 = ['reservoir forest area fraction', 'evapotranspiration']
# Partial dependence profiles
pd_co2_xgboost = exp_co2_xgboost.model_profile(
    variables=selected_variables_co2,
    N=50, label = 'Partial dependence')
# Local dependence profiles
ld_co2_xgboost = exp_co2_xgboost.model_profile(
    variables=selected_variables_co2,
    type='conditional',
    N=50, label = 'Local dependence')
# Accumulated dependence profiles
ad_co2_xgboost = exp_co2_xgboost.model_profile(
    variables=selected_variables_co2,
    type='accumulated',
    N=50, label = 'Accumulated dependence')
agg_plot = pd_co2_xgboost.plot([ld_co2_xgboost, ad_co2_xgboost], show=False, y_title="")

In [None]:
import copy

In [None]:
# Restyle a deep copy of the dependence-profile figure; the figure held in
# `agg_plot` is left untouched.
agg_plot2 = copy.deepcopy(agg_plot)
tick_font_size = 16
label_font_size = 18
# The layout changes go to the copy that is restyled and displayed. They used to be
# applied to `agg_plot` after the copy had been taken and therefore never reached
# `agg_plot2`.
agg_plot2.update_layout(
    font=dict(color='black'),
    yaxis_title="prediction",
    legend_title=""
)

# Identical axis styling for both subplot columns of the single row
for _col in (1, 2):
    agg_plot2.update_xaxes(
        linewidth=2,
        showgrid=False,
        tickfont=dict(size=tick_font_size), title_font=dict(size=label_font_size),
        color='black', row=1, col=_col)
    agg_plot2.update_yaxes(
        linewidth=2,
        showgrid=False,
        tickfont=dict(size=tick_font_size), title_font=dict(size=label_font_size),
        color='black', row=1, col=_col)
agg_plot2.update_traces(
    line_width=2,
    opacity=1
)
# Traces 0-2 and 3-5 receive the same three colours (presumably one trace per
# profile type in each of the two subplots - confirm)
for _trace_ix, _colour in enumerate(("#799ed9", '#89b38a', '#c7644c') * 2):
    agg_plot2['data'][_trace_ix]['line']['color'] = _colour


In [None]:
# Display the restyled figure
agg_plot2.show()

In [None]:
# Variables for which dependence profiles of the CO2 models are drawn; this list
# is reused by the LightGBM and CATBoost cells that follow.
selected_variables_co2 = [
    'reservoir forest area fraction', 'evapotranspiration',
    'population', 'catchment runoff', 'air temperature', 'catchment slope']
# The profiles are computed with the XGBoost explainer, in line with the variable
# names and labels used here (the CATBoost explainer used to be called instead).
# Partial dependence profiles
pd_co2_xgboost = exp_co2_xgboost.model_profile(
    variables=selected_variables_co2,
    N=50, label = 'Partial dependence XGBoost CO2 emissions')
# Local dependence profiles
ld_co2_xgboost = exp_co2_xgboost.model_profile(
    variables=selected_variables_co2,
    type='conditional',
    N=50, label = 'Local dependence XGBoost CO2 emissions')
# Accumulated dependence profiles
ad_co2_xgboost = exp_co2_xgboost.model_profile(
    variables=selected_variables_co2,
    type='accumulated',
    N=50, label = 'Accumulated dependence XGBoost CO2 emissions')
pd_co2_xgboost.plot([ld_co2_xgboost, ad_co2_xgboost])

### 2. CO2 LightGBM

In [None]:
# Dependence profiles of the LightGBM CO2 model for the variables listed in
# selected_variables_co2, plotted together in one figure.
# Partial dependence profiles
pd_co2_lightgbm = exp_co2_lightgbm.model_profile(
    variables=selected_variables_co2,
    N=50, label = 'Partial dependence LightGBM CO2 emissions')
# Local dependence profiles
ld_co2_lightgbm = exp_co2_lightgbm.model_profile(
    variables=selected_variables_co2,
    type='conditional',
    N=50, label = 'Local dependence LightGBM CO2 emissions')
# Accumulated dependence profiles
ad_co2_lightgbm = exp_co2_lightgbm.model_profile(
    variables=selected_variables_co2,
    type='accumulated',
    N=50, label = 'Accumulated dependence LightGBM CO2 emissions')
pd_co2_lightgbm.plot([ld_co2_lightgbm, ad_co2_lightgbm])

### 3. CO2 CATBoost

In [None]:
# Dependence profiles of the CATBoost CO2 model for the variables listed in
# selected_variables_co2, plotted together in one figure.
# Partial dependence profiles
pd_co2_catboost = exp_co2_catboost.model_profile(
    variables=selected_variables_co2,
    N=50, label = 'Partial dependence CATBoost CO2 emissions')
# Local dependence profiles
ld_co2_catboost = exp_co2_catboost.model_profile(
    variables=selected_variables_co2,
    type='conditional',
    N=50, label = 'Local dependence CATBoost CO2 emissions')
# Accumulated dependence profiles
# (label corrected from the misspelt 'CATGBoost')
ad_co2_catboost = exp_co2_catboost.model_profile(
    variables=selected_variables_co2,
    type='accumulated',
    N=50, label = 'Accumulated dependence CATBoost CO2 emissions')
pd_co2_catboost.plot([ld_co2_catboost, ad_co2_catboost])

### 4. CH4 XGBoost

In [None]:
# Variables for which dependence profiles of the CH4 models are drawn; this list
# is reused by the LightGBM and CATBoost cells that follow.
selected_variables_ch4 = [
   'mean depth', 'population', 'catchment area',
    'air temperature', 'reservoir shrub area fraction']
# Variables currently left out of the list:
#  'littoral area fraction', 'N load', 'retention coefficient', 

# Partial dependence profiles
pd_ch4_xgboost = exp_ch4_xgboost.model_profile(
    variables=selected_variables_ch4,
    N=50, label = 'Partial dependence XGBoost CH4 emissions')
# Local dependence profiles
ld_ch4_xgboost = exp_ch4_xgboost.model_profile(
    variables=selected_variables_ch4,
    type='conditional',
    N=50, label = 'Local dependence XGBoost CH4 emissions')
# Accumulated dependence profiles
ad_ch4_xgboost = exp_ch4_xgboost.model_profile(
    variables=selected_variables_ch4,
    type='accumulated',
    N=50, label = 'Accumulated dependence XGBoost CH4 emissions')
pd_ch4_xgboost.plot([ld_ch4_xgboost, ad_ch4_xgboost])

### 5. CH4 LightGBM

In [None]:
# Dependence profiles of the LightGBM CH4 model for the variables listed in
# selected_variables_ch4, plotted together in one figure.
# Partial dependence profiles
pd_ch4_lightgbm = exp_ch4_lightgbm.model_profile(
    variables=selected_variables_ch4,
    N=50, label = 'Partial dependence LightGBM CH4 emissions')
# Local dependence profiles
ld_ch4_lightgbm = exp_ch4_lightgbm.model_profile(
    variables=selected_variables_ch4,
    type='conditional',
    N=50, label = 'Local dependence LightGBM CH4 emissions')
# Accumulated dependence profiles
ad_ch4_lightgbm = exp_ch4_lightgbm.model_profile(
    variables=selected_variables_ch4,
    type='accumulated',
    N=50, label = 'Accumulated dependence LightGBM CH4 emissions')
pd_ch4_lightgbm.plot([ld_ch4_lightgbm, ad_ch4_lightgbm])

### 6. CH4 CATBoost

In [None]:
# Dependence profiles of the CATBoost CH4 model for the variables listed in
# selected_variables_ch4, plotted together in one figure.
# Partial dependence profiles
pd_ch4_catboost = exp_ch4_catboost.model_profile(
    variables=selected_variables_ch4,
    N=50, label = 'Partial dependence CATBoost CH4 emissions')
# Local dependence profiles
ld_ch4_catboost = exp_ch4_catboost.model_profile(
    variables=selected_variables_ch4,
    type='conditional',
    N=50, label = 'Local dependence CATBoost CH4 emissions')
# Accumulated dependence profiles
ad_ch4_catboost = exp_ch4_catboost.model_profile(
    variables=selected_variables_ch4,
    type='accumulated',
    N=50, label = 'Accumulated dependence CATBoost CH4 emissions')
pd_ch4_catboost.plot([ld_ch4_catboost, ad_ch4_catboost])

## Accumulated profiles for all models

### CO2 accumulated dependence profiles

In [None]:
# Create a plot for visual abstract
# NOTE(review): this cell repeats the visual-abstract cell of section
# "1. CO2 XGBoost" and rebinds selected_variables_co2, the three XGBoost profile
# variables and agg_plot - confirm whether the repetition is needed.
selected_variables_co2 = ['reservoir forest area fraction', 'evapotranspiration']
# Partial dependence profiles
pd_co2_xgboost = exp_co2_xgboost.model_profile(
    variables=selected_variables_co2,
    N=50, label = 'Partial dependence')
# Local dependence profiles
ld_co2_xgboost = exp_co2_xgboost.model_profile(
    variables=selected_variables_co2,
    type='conditional',
    N=50, label = 'Local dependence')
# Accumulated dependence profiles
ad_co2_xgboost = exp_co2_xgboost.model_profile(
    variables=selected_variables_co2,
    type='accumulated',
    N=50, label = 'Accumulated dependence')
agg_plot = pd_co2_xgboost.plot([ld_co2_xgboost, ad_co2_xgboost], show=False, y_title="")

In [None]:
# Accumulated dependence profiles of all three CO2 models for the two variables
# used in the visual abstract, combined into one figure.
selected_variables_co2_vis_abstract = ['reservoir forest area fraction', 'air temperature']
prof_lightgbm_co2 = exp_co2_lightgbm.model_profile(
    variables=selected_variables_co2_vis_abstract,
    type='accumulated',
    label="LightGBM",
    N=50)

prof_xgboost_co2 = exp_co2_xgboost.model_profile(
    variables=selected_variables_co2_vis_abstract,
    type='accumulated',
    label="XGBoost",
    N=50)

prof_catboost_co2 = exp_co2_catboost.model_profile(
    variables=selected_variables_co2_vis_abstract,
    type='accumulated',
    label="CATBoost",
    N=50)

# show=False returns the figure object so that it can be restyled in the next cell
plot_visabstract = prof_lightgbm_co2.plot(
    [prof_xgboost_co2, prof_catboost_co2], show=False, y_title="",
    title="Aggregated Profiles - CO2 Models") # type = 'accumulated', type = 'conditional', geom='profiles'
plot_visabstract.show()

In [None]:
# Restyle a deep copy of the visual-abstract figure; `plot_visabstract` itself is
# left untouched.
agg_plot2 = copy.deepcopy(plot_visabstract)
tick_font_size = 16
label_font_size = 18
agg_plot2.update_layout(
    width=800, height=400,
    font=dict(color='black'),
    yaxis_title="prediction",
    legend_title="")
line_width = 3

# Identical axis styling for both subplot columns of the single row
for _col in (1, 2):
    agg_plot2.update_xaxes(
        linewidth=line_width, showgrid=False,
        tickfont=dict(size=tick_font_size), title_font=dict(size=label_font_size),
        color='black', row=1, col=_col)
    agg_plot2.update_yaxes(
        linewidth=line_width, showgrid=False,
        tickfont=dict(size=tick_font_size), title_font=dict(size=label_font_size),
        color='black', row=1, col=_col)
agg_plot2.update_traces(line_width=line_width, opacity=0.85)
# Traces 0-2 and 3-5 receive the same three colours
for _trace_ix, _colour in enumerate(("#799ed9", '#89b38a', '#c7644c') * 2):
    agg_plot2['data'][_trace_ix]['line']['color'] = _colour

In [None]:
# Display the restyled figure
agg_plot2.show()

In [None]:
# Export the restyled figure as SVG and PNG
agg_plot2.write_image(pathlib.Path("figures/model_explanation/dependency_profiles_visabstract.svg"))
agg_plot2.write_image(pathlib.Path("figures/model_explanation/dependency_profiles_visabstract.png"))

### CH4 accumulated dependence profiles

In [None]:
# Accumulated dependence profiles of all three CH4 models for the variables in
# selected_variables_ch4, plotted together. No `label` is passed here, unlike in
# the corresponding CO2 cell.
prof_lightgbm_ch4 = exp_ch4_lightgbm.model_profile(
    variables=selected_variables_ch4,
    type='accumulated',
    N=50)
prof_xgboost_ch4 = exp_ch4_xgboost.model_profile(
    variables=selected_variables_ch4,
    type='accumulated',
    N=50)
prof_catboost_ch4 = exp_ch4_catboost.model_profile(
    variables=selected_variables_ch4,
    type='accumulated',
    N=50)

prof_lightgbm_ch4.plot(
    [prof_xgboost_ch4, prof_catboost_ch4]) # type = 'accumulated', type = 'conditional', geom='profiles'

# Instance explanations

In [None]:
def find_index_by_name(name: str, df_full: pd.DataFrame = merged_df_min_prim_low) -> pd.Index | None:
    """Return the index label(s) of the row(s) of ``df_full`` whose ``Name`` equals ``name``.

    The returned labels can be used to select the same reservoir in the
    train/test datasets, e.g. for inspecting variable importance for each
    reservoir.

    Args:
        name: Reservoir name to look up in the ``Name`` column.
        df_full: Full dataset containing a ``Name`` column.

    Returns:
        The index of the matching row(s), or ``None`` (after printing a
        message) when no row matches.
    """
    # Filter the frame that was passed in (previously the module-level frame was
    # always filtered, so a custom ``df_full`` only supplied the boolean mask).
    ix = df_full.loc[df_full['Name'] == name].index
    if ix.empty:
        print(f"Reservoir with name {name} not found")
        return None
    return ix
    
def loc_index_to_iloc(loc_index: pd.Index, data: pd.DataFrame = X_co2_train_test) -> int:
    """Convert index label(s) into the positional (``iloc``) location within ``data``.

    Args:
        loc_index: Index label(s) of the row of interest. When several labels
            are given, the first one is used.
        data: Frame whose index is searched for the label.

    Returns:
        Position of the label in ``data.index`` as returned by ``Index.get_loc``.

    Raises:
        ValueError: If ``loc_index`` is ``None`` or empty.
        KeyError: If the label is not present in ``data.index``.
    """
    if loc_index is None:
        raise ValueError("No index label given (reservoir not found?).")
    labels = np.atleast_1d(loc_index)
    if labels.size == 0:
        raise ValueError("Empty index given (reservoir not found?).")
    # Use the first label: averaging several labels (the previous approach)
    # can produce a label that belongs to an unrelated row or to no row at all.
    return data.index.get_loc(int(labels[0]))

def reservoir_names(df_full: pd.DataFrame = merged_df_min_prim_low) -> List[str]:
    """Return the values of the ``Name`` column of ``df_full`` as a list."""
    name_column = df_full['Name']
    return list(name_column)

In [None]:
# Number of names returned by the `reservoir_names` helper
len(reservoir_names())

In [None]:
# Fetch the names once and report their count followed by the full list
all_reservoir_names = reservoir_names()
print(f"We have {len(all_reservoir_names)} reservoirs ...")
print(" -- ".join(all_reservoir_names))

### PICK RESERVOIR

In [None]:
# Reservoir looked up by the single-instance explanation cells that read `reservoir_name`
reservoir_name = 'Zawgyi II'

## Feature Importances on an instance

## Breakdown interactions for all reservoirs - may take some time to compute

In [None]:
# May take a while to run - uses the dalex interface to calculate SHAP values
from typing import List
import pickle
from ipywidgets import IntProgress
from IPython.display import display

def run_and_save_breakdown_interactions_via_dalex(
        reservoir_list: List[str], 
        input_data: pd.DataFrame = X_co2_train_test,
        input_data_trimmed: pd.DataFrame = X_co2_train_test_sel,
        output_path: str = "outputs/model_explanations",
        B: int = 50, 
        interaction_preference: int = 1,
        random_state: int = 42) -> None:
    """Compute break-down-interaction explanations for every reservoir and model and save them.

    For each of the six explainers (LightGBM, XGBoost, CatBoost for CO2 and CH4)
    and each reservoir, ``predict_parts`` is called with
    ``type='break_down_interactions'``. Per explainer, three files are written
    to ``<output_path>/interaction_preference_<interaction_preference>/``:
    a pickle with the raw explanation objects keyed by reservoir name, and a
    csv and an xlsx table with the mean contribution per variable (one row per
    reservoir).

    Args:
        reservoir_list: Names of the reservoirs to explain.
        input_data: Feature table used for all explainers except XGBoost.
        input_data_trimmed: Feature table used for the two XGBoost explainers.
        output_path: Root folder for the result files.
        B: Forwarded to ``predict_parts``.
        interaction_preference: Forwarded to ``predict_parts``; also part of
            the output sub-folder name.
        random_state: Forwarded to ``predict_parts``.

    Raises:
        ValueError: If ``reservoir_list`` is empty.
    """
    def to_dataframe(shaps, reservoir_name) -> pd.DataFrame:
        """Collapse an explanation into one row of mean contributions per variable."""
        df = \
            shaps.result[['contribution', 'variable_name']]\
            .groupby('variable_name').mean().T  
        df['reservoir name'] = reservoir_name
        return df

    if not reservoir_list:
        raise ValueError("reservoir_list is empty - nothing to explain.")

    # NOTE: the keys are used verbatim in the output file names
    # ('breakdown_lightbm_co2' is spelled without the 'g').
    shp_conversion_config = {
        'breakdown_lightbm_co2': (exp_co2_lightgbm, 'CO2 emissions for '),
        'breakdown_xgboost_co2': (exp_co2_xgboost, 'CO2 emissions for '),
        'breakdown_catboost_co2': (exp_co2_catboost, 'CO2 emissions for '),
        'breakdown_lightgbm_ch4': (exp_ch4_lightgbm, 'CH4 emissions for '),
        'breakdown_catboost_ch4': (exp_ch4_catboost,'CH4 emissions for '),
        'breakdown_xgboost_ch4': (exp_ch4_xgboost, 'CH4 emissions for ')
    }
    # Explainers that are evaluated on the trimmed feature table
    trimmed_identifiers = {'breakdown_xgboost_co2', 'breakdown_xgboost_ch4'}

    num_iter = len(reservoir_list) * len(shp_conversion_config)
    progress_bar = IntProgress(min=0, max=num_iter) # instantiate the bar
    display(progress_bar)
    
    print("Calculating breakdown interactions using DALEX")
    print("Note that the pre-calculated interaction values with DALEX can be found in `model_explanations_precalculated`")

    save_dir = os.path.join(output_path, f'interaction_preference_{interaction_preference}')

    for identifier, (explainer, label_prefix) in shp_conversion_config.items():
        # Pick the table per explainer in a separate local so that the choice
        # made for one explainer cannot leak into the following ones.
        if identifier in trimmed_identifiers:
            model_input = input_data_trimmed
        else:
            model_input = input_data
        print(f"Calculating breakdown interaction values for {identifier}...")
        shaps_dict = {}
        shap_rows = []
        for reservoir_name in reservoir_list:
            num_row = loc_index_to_iloc(find_index_by_name(name=reservoir_name), model_input)
            # Double brackets keep the selection as a one-row DataFrame
            input_reservoir = model_input.iloc[[num_row]]
            shaps = explainer.predict_parts(
                input_reservoir, type='break_down_interactions',
                interaction_preference=interaction_preference,
                label=f'{label_prefix}{reservoir_name}',
                B=B,
                random_state=random_state)
            shaps_dict[reservoir_name] = shaps
            shap_rows.append(to_dataframe(shaps, reservoir_name))
            progress_bar.value += 1
        # Build the table once instead of re-concatenating on every iteration
        shaps_df = pd.concat(shap_rows).set_index('reservoir name', drop=True)
        # Save the results: pickle (raw objects), csv and xlsx (tables)
        os.makedirs(save_dir, exist_ok=True)
        pickle_file_path = os.path.join(save_dir, identifier+'_dalex.pkl')
        with open(pickle_file_path, 'wb') as fp:
            pickle.dump(shaps_dict, fp)
        shaps_df.to_csv(os.path.join(save_dir, identifier+'_dalex.csv'))
        shaps_df.to_excel(os.path.join(save_dir, identifier+'_dalex.xlsx'))

In [None]:
# Switch for recomputing the breakdown interactions.
# BE CAREFUL WHEN ENABLING IT - THE RUN TAKES A VERY LONG TIME: the interactions are
# computed for 4 different levels of interaction preference and for 3 different models.
recalculate_breakdown_interactions = False

if not recalculate_breakdown_interactions:
    print("Breakdown interactions have not been recalculated")
    print("You can find pre-calculated values in `bin/model_explanations_precalculated/breakdown_interactions`")
else:
    for preference_level in range(4):
        run_and_save_breakdown_interactions_via_dalex(
            reservoir_names(), interaction_preference=preference_level)


### Breakdown interactions (individual)

In [None]:
# Prediction profile of the CH4 LightGBM explainer for a single observation.
# NOTE(review): `input_reservoir` is not assigned in this cell - it has to exist
# already from another cell; confirm which one is intended.
output_predch4 = exp_ch4_lightgbm.predict_profile(input_reservoir)

In [None]:
# Summary statistics of the profile's result table
output_predch4.result.describe()

In [None]:
# Build (without showing) the profile figure for the two selected variables;
# the figure is styled and displayed in a later cell.
p1ch4 = output_predch4.plot(
    variables = [
        'Tair',
        'hmean'], show=False, size=3)

In [None]:
# Font settings reused across the axes and the main title
cp_tick_font = dict(color='rgba(0, 0, 0, 0.8)', size=16)
cp_title_font = dict(size=18, color='black')

p1ch4.update_layout(
    # First subplot: tick labels and axis titles
    xaxis=dict(tickfont=cp_tick_font, title=dict(font=cp_title_font)),
    yaxis=dict(tickfont=cp_tick_font, title=dict(font=cp_title_font)),
    # Second subplot: tick labels only
    xaxis2=dict(tickfont=cp_tick_font),
    yaxis2=dict(tickfont=cp_tick_font),
    # Main plot title
    title=dict(
        text='Ceteris Paribus Plots for CH4 Aerial Emission Prediction',
        font=cp_title_font),
    font=dict(color='black'),
    width=1000,
    height=450)
# Thicker, semi-transparent dark lines for every trace
p1ch4.update_traces(
    line=dict(width=3, color='rgba(0, 0, 0, 0.6)'),
    opacity=0.70)
p1ch4.show()
p1ch4.write_image("figures/model_explanation/zagyi2cp4ch4.svg")

In [None]:
# Prediction profile of the CO2 LightGBM explainer for the observation in `input_reservoir`
output_pred = exp_co2_lightgbm.predict_profile(input_reservoir)

In [None]:
# Summary statistics of the profile's result table
output_pred.result.describe()

In [None]:
# Plot the profile for four selected variables (long, un-renamed column names)
output_pred.plot(variables = ['evapotranspiration', 'reservoir forest area fraction', 'mean depth', 'reservoir soil carbon'])

In [None]:
# Break-down-interaction explanation of the CO2 LightGBM prediction for `reservoir_name`
num_row = loc_index_to_iloc(find_index_by_name(name=reservoir_name), X_co2_train_test)
input_reservoir = X_co2_train_test.iloc[[num_row]]  # double brackets keep a one-row DataFrame
output_true = y_co2_train_test.iloc[num_row]
output_pred = exp_co2_lightgbm.predict(input_reservoir)
# Calculate, explain and plot the prediction using DALEX
explanation_sample = exp_co2_lightgbm.predict_parts(
    input_reservoir, type='break_down_interactions', interaction_preference = 1, 
    label=f'CO2 emissions for {reservoir_name}', B=25) 
# type="shap_wrapper", type='break_down' keep_distributions = True does not have any effect
explanation_sample.plot(max_vars=5)

In [None]:
# Break-down-interaction explanation of the CH4 LightGBM prediction for `reservoir_name`
num_row = loc_index_to_iloc(find_index_by_name(name=reservoir_name), X_ch4_train_test)
input_reservoir = X_ch4_train_test.iloc[[num_row]]  # double brackets keep a one-row DataFrame
output_true = y_ch4_train_test.iloc[num_row]
output_pred = exp_ch4_lightgbm.predict(input_reservoir)
# Calculate, explain and plot the prediction using DALEX
explanation_sample = exp_ch4_lightgbm.predict_parts(
    input_reservoir, type='break_down_interactions', interaction_preference = 2, 
    label=f'CH4 emissions for {reservoir_name}') 
# type="shap_wrapper", type='break_down' keep_distributions = True does not have any effect
explanation_sample.plot(max_vars=5)

### SHAP values for individual reservoirs

In [None]:
# May take a while to run - uses the dalex interface to calculate SHAP values
from typing import List
import pickle
from ipywidgets import IntProgress
from IPython.display import display

def run_and_save_shaps_via_dalex(
        reservoir_list: List[str], 
        input_data: pd.DataFrame = X_co2_train_test,
        output_path: str = "outputs/model_explanations/dalex",
        B: int = 50, 
        random_state: int = 42) -> None:
    """Compute instance-level SHAP explanations for every reservoir and model and save them.

    For each of the six explainers (XGBoost, LightGBM, CatBoost for CO2 and CH4)
    and each reservoir, ``predict_parts`` is called with ``type='shap'``. Per
    explainer, three files are written to ``output_path``: a pickle with the
    raw explanation objects keyed by reservoir name, and a csv and an xlsx
    table with the mean contribution per variable (one row per reservoir).

    Args:
        reservoir_list: Names of the reservoirs to explain.
        input_data: Feature table from which each reservoir's row is taken.
        output_path: Folder for the result files (created if missing).
        B: Forwarded to ``predict_parts``.
        random_state: Forwarded to ``predict_parts`` so that repeated runs use
            the same seed.

    Raises:
        ValueError: If ``reservoir_list`` is empty.
    """
    def to_dataframe(shaps, reservoir_name) -> pd.DataFrame:
        """Collapse an explanation into one row of mean contributions per variable."""
        df = \
            shaps.result[['contribution', 'variable_name']]\
            .groupby('variable_name').mean().T  
        df['reservoir name'] = reservoir_name
        return df

    if not reservoir_list:
        raise ValueError("reservoir_list is empty - nothing to explain.")

    # NOTE: the keys are used verbatim in the output file names
    # ('shap_lightbm_co2' is spelled without the 'g').
    shp_conversion_config = {
        'shap_xgboost_co2': (exp_co2_xgboost, 'CO2 emissions for '),
        'shap_lightbm_co2': (exp_co2_lightgbm, 'CO2 emissions for '),
        'shap_catboost_co2': (exp_co2_catboost, 'CO2 emissions for '),
        'shap_xgboost_ch4': (exp_ch4_xgboost, 'CH4 emissions for '),
        'shap_lightgbm_ch4': (exp_ch4_lightgbm, 'CH4 emissions for '),
        'shap_catboost_ch4': (exp_ch4_catboost,'CH4 emissions for ')
    }
    num_iter = len(reservoir_list) * len(shp_conversion_config)
    progress_bar = IntProgress(min=0, max=num_iter) # instantiate the bar
    display(progress_bar)
    
    print("Calculating instance-level SHAP values using DALEX")
    print("Note that the pre-calculated SHAP values with DALEX can be found in `model_explanations_precalculated`")

    for identifier, (explainer, label_prefix) in shp_conversion_config.items():
        print(f"Calculating SHAP values for {identifier}...")
        shaps_dict = {}
        shap_rows = []
        for reservoir_name in reservoir_list:
            num_row = loc_index_to_iloc(find_index_by_name(name=reservoir_name), input_data)
            # Double brackets keep the selection as a one-row DataFrame
            input_reservoir = input_data.iloc[[num_row]]
            shaps = explainer.predict_parts(
                input_reservoir, type='shap', 
                shap_explainer_type="TreeExplainer",
                keep_distributions=True,
                processes=4,
                label=f'{label_prefix}{reservoir_name}',
                B=B,
                random_state=random_state)
            shaps_dict[reservoir_name] = shaps
            shap_rows.append(to_dataframe(shaps, reservoir_name))
            progress_bar.value += 1
        # Build the table once instead of re-concatenating on every iteration
        shaps_df = pd.concat(shap_rows).set_index('reservoir name', drop=True)
        # Save the results: pickle (raw objects), csv and xlsx (tables)
        os.makedirs(output_path, exist_ok=True)
        pickle_file_path = os.path.join(output_path, identifier+'_dalex.pkl')
        with open(pickle_file_path, 'wb') as fp:
            pickle.dump(shaps_dict, fp)
        shaps_df.to_csv(os.path.join(output_path, identifier+'_dalex.csv'))
        shaps_df.to_excel(os.path.join(output_path, identifier+'_dalex.xlsx'))

In [None]:
# The SHAP values are already pre-saved; flip the flag only to rerun all of them
recalculate_shaps = False
if not recalculate_shaps:
    print("SHAPS values have not been recalculated")
    print("You can find pre-calculated values in `bin/model_explanations_precalculated/dalex`")
else:
    run_and_save_shaps_via_dalex(reservoir_names())

In [None]:
# Display the single-reservoir input row currently held in `input_reservoir`
input_reservoir

## Make plots for visual abstract

In [None]:
def _apply_black_gridless_axes(fig) -> None:
    """Hide the x/y grid lines of a plotly figure and set its fonts to black."""
    fig.update_layout(
        xaxis=dict(showgrid=False, tickfont=dict(color='black')),
        yaxis=dict(showgrid=False, tickfont=dict(color='black')),
        font=dict(color='black'))


def plot_shaps(
        reservoir_name: str, model: str = 'lightgbm', 
        file_location: str = "figures/model_explanation/") -> None:
    """Plot and save SHAP explanations of the CO2 and CH4 predictions for one reservoir.

    The reservoir's row is taken from ``X_co2_train_test`` and passed to both
    the CO2 and the CH4 explainer of the chosen model type. Each figure is
    shown and then written as ``shap_<reservoir_name>_<co2|ch4>.<svg|png>``
    into ``file_location`` (created if missing).

    NOTE(review): the explainers are re-created on a renamed feature table in
    another cell of this file, while the row used here comes from the
    un-renamed ``X_co2_train_test`` - confirm the column names match at run time.

    Args:
        reservoir_name: Name of the reservoir to explain.
        model: One of ``'lightgbm'``, ``'xgboost'`` or ``'catboost'``.
        file_location: Output folder for the image files.

    Raises:
        ValueError: If ``model`` is not one of the supported names.
    """
    if model == "lightgbm":
        co2_explainer = exp_co2_lightgbm
        ch4_explainer = exp_ch4_lightgbm
    elif model == "xgboost":
        co2_explainer = exp_co2_xgboost
        ch4_explainer = exp_ch4_xgboost
    elif model == "catboost":
        co2_explainer = exp_co2_catboost
        ch4_explainer = exp_ch4_catboost
    else:
        raise ValueError(f"Model {model} not recognized.")
    res_location = loc_index_to_iloc(find_index_by_name(name=reservoir_name))
    input_reservoir = X_co2_train_test.iloc[[res_location]]

    # (file tag, explainer, explanation label) for the two gases, CO2 first
    jobs = (
        ('co2', co2_explainer, f'CO2 emissions for {reservoir_name}'),
        ('ch4', ch4_explainer, f'CH4 Emission Intensity for {reservoir_name}'),
    )
    figures = []
    for gas, explainer, label in jobs:
        explanation = explainer.predict_parts(
            input_reservoir, type='shap',
            keep_distributions=True,
            label=label,
            B=50,
            processes=4,
            random_state=42)
        fig = explanation.plot(
            max_vars=6, title="", bar_width=15, vertical_spacing=0.05,
            vcolors=("#799ed9", '#89b38a', '#c7644c'), show=False)
        _apply_black_gridless_axes(fig)
        fig.show()
        figures.append((gas, fig))

    # Make sure the target folder exists before exporting the images
    out_dir = pathlib.Path(file_location)
    out_dir.mkdir(parents=True, exist_ok=True)
    for gas, fig in figures:
        for suffix in ("svg", "png"):
            fig.write_image(out_dir / f"shap_{reservoir_name}_{gas}.{suffix}")

In [None]:
# Rename columns for presentation purposes 

In [None]:
# Mapping from the long feature (column) names to the short labels used for presentation
col_rename = {'catchment runoff': 'runoff',
 'catchment area': 'Ac',
 'population': 'pop',
 'catchment bare soil fraction': 'fBS_c',
 'catchment snow and ice fraction': 'fSI_c',
 'catchment urban area fraction': 'fU_c',
 'catchment water area fraction': 'fW_c',
 'catchment wetland area fraction': 'fWt_c',
 'catchment crop area fraction': 'fC_c',
 'catchment shrub area fraction': 'fS_c',
 'catchment forest area fraction': 'fF_c',
 'catchment slope': 'slope',
 'evapotranspiration': 'ET',
 'catchment soil wetness': 'SWet',
 'catchment mean olsen': 'OlsenP',
 'reservoir volume': 'V',
 'reservoir area': 'Ar',
 'mean depth': 'hmean',
 'reservoir soil carbon': 'soilC',
 'reservoir mean radiance': 'Le',
 'reservoir mean radiance may-sept': 'Le_May-Sept',
 'reservoir mean radiance nov-mar': 'Le_Nov-Mar',
 'reservoir mean monthly windspeed': 'vspeed',
 'reservoir urban area fraction': 'fU_r',
 'reservoir water area fraction': 'fW_r',
 'reservoir wetland area fraction': 'fWt_r',
 'reservoir crop area fraction': 'fC_r',
 'reservoir shrub area fraction': 'fS_r',
 'reservoir forest area fraction': 'fF_r',
 'air temperature': 'Tair'}

# Create copies of the feature tables with the short column names, to be used
# by the models and explainers for visualisation purposes
X_co2_train_test_renamed = X_co2_train_test.rename(columns=col_rename)
X_ch4_train_test_renamed = X_ch4_train_test.rename(columns=col_rename)

# Divide the 'V' column of both renamed tables by 1e6
for _renamed_frame in (X_co2_train_test_renamed, X_ch4_train_test_renamed):
    _renamed_frame['V'] = _renamed_frame['V'] / 1e6

# Refit every model on the renamed tables (same order as the explicit calls it replaces)
for _model, _features, _target in (
        (model_co2_xgboost, X_co2_train_test_renamed, y_co2_train_test),
        (model_ch4_xgboost, X_ch4_train_test_renamed, y_ch4_train_test),
        (model_co2_lightgbm, X_co2_train_test_renamed, y_co2_train_test),
        (model_ch4_lightgbm, X_ch4_train_test_renamed, y_ch4_train_test),
        (model_co2_catboost, X_co2_train_test_renamed, y_co2_train_test),
        (model_ch4_catboost, X_ch4_train_test_renamed, y_ch4_train_test)):
    _model.fit(_features, _target)

# Re-create the explainers on the renamed data; this rebinds the module-level
# `exp_*` names used by the explanation functions.
# LightGBM explainers
exp_co2_lightgbm = dx.Explainer(
    model_co2_lightgbm,
    X_co2_train_test_renamed,
    y_co2_train_test,
    label='CO2 net emission - LightGBM')
exp_ch4_lightgbm = dx.Explainer(
    model_ch4_lightgbm,
    X_ch4_train_test_renamed,
    y_ch4_train_test,
    label='CH4 net emission - LightGBM')
# CATBoost explainers
exp_co2_catboost = dx.Explainer(
    model_co2_catboost,
    X_co2_train_test_renamed,
    y_co2_train_test,
    label='CO2 net emission - CATBoost')
exp_ch4_catboost = dx.Explainer(
    model_ch4_catboost,
    X_ch4_train_test_renamed,
    y_ch4_train_test,
    label='CH4 net emission - CATBoost')
# XGBoost explainers
exp_co2_xgboost = dx.Explainer(
    model_co2_xgboost,
    X_co2_train_test_renamed,
    y_co2_train_test,
    label='CO2 net emission - XGBoost')
exp_ch4_xgboost = dx.Explainer(
    model_ch4_xgboost,
    X_ch4_train_test_renamed,
    y_ch4_train_test,
    label='CH4 net emission - XGBoost')

In [None]:
def _style_breakdown_figure(fig) -> None:
    """Apply the transparent-background, black-font styling used for break-down plots."""
    fig.update_layout(
        paper_bgcolor='rgba(0,0,0,0)',
        plot_bgcolor='rgba(0,0,0,0)',
        xaxis=dict(showgrid=False, tickfont=dict(color='black')),
        yaxis=dict(showgrid=False, tickfont=dict(color='black')),
        font=dict(color='black'))
    fig.update_traces(opacity=0.90)
    fig.data[0].connector.line.color = 'black'
    # Restyle the line shapes drawn on the figure
    for shape in fig.layout.shapes:
        if shape.type == 'line':
            shape.line.color = '#424345'
            shape.line.width = 2


def plot_breakdowns(
        reservoir_name: str, model: str = 'lightgbm', 
        file_location: str = "figures/model_explanation/", interaction_preference: int = 1,
        max_vars = 6,
        rounding_digits: int = 3,
        print_titles: bool = False,
        input_data=X_co2_train_test_renamed, save_to_fig: bool = True) -> None:
    """Plot (and optionally save) break-down-interaction explanations for one reservoir.

    The reservoir's row is taken from ``input_data`` and passed to both the
    CO2 and the CH4 explainer of the chosen model type. Each figure is styled
    and shown; when ``save_to_fig`` is true the figures are written as
    ``<reservoir_name>_breakdown_interactions_<co2|ch4>.<svg|png>`` into
    ``file_location`` (created if missing).

    Args:
        reservoir_name: Name of the reservoir to explain.
        model: One of ``'lightgbm'``, ``'xgboost'`` or ``'catboost'``.
        file_location: Output folder for the image files.
        interaction_preference: Forwarded to ``predict_parts``.
        max_vars: Maximum number of variables shown in each plot.
        rounding_digits: Forwarded to the plot call as ``digits``.
        print_titles: Use descriptive titles/labels instead of blank ones.
        input_data: Feature table from which the reservoir's row is taken.
        save_to_fig: Whether to write the figures to disk.

    Raises:
        ValueError: If ``model`` is not one of the supported names.
    """
    if model == "lightgbm":
        co2_explainer = exp_co2_lightgbm
        ch4_explainer = exp_ch4_lightgbm
    elif model == "xgboost":
        co2_explainer = exp_co2_xgboost
        ch4_explainer = exp_ch4_xgboost
    elif model == "catboost":
        co2_explainer = exp_co2_catboost
        ch4_explainer = exp_ch4_catboost
    else:
        raise ValueError(f"Model {model} not recognized.")
    if print_titles:
        title_1 = 'Unit carbon dioxide emission'
        title_2 = "Unit methane emission"
    else:
        title_1, title_2 = " ", " "
    # Resolve the row position against the table the row is actually taken from
    res_location = loc_index_to_iloc(find_index_by_name(name=reservoir_name), input_data)
    input_reservoir = input_data.iloc[[res_location]]

    # (file tag, explainer, title) for the two gases, CO2 first
    jobs = (
        ('co2', co2_explainer, title_1),
        ('ch4', ch4_explainer, title_2),
    )
    figures = []
    for gas, explainer, title in jobs:
        explanation = explainer.predict_parts(
            input_reservoir, type='break_down_interactions',
            interaction_preference=interaction_preference,
            keep_distributions=True,
            label=title,
            B=50,
            processes=4,
            random_state=42)
        # `max_vars` is honoured for both plots (the CO2 plot used to hard-code 6)
        fig = explanation.plot(
            max_vars=max_vars, title=title, bar_width=15, vertical_spacing=0.05,
            digits=rounding_digits,
            vcolors=("#2471a3", '#89b38a', '#c7644c'), show=False)
        _style_breakdown_figure(fig)
        fig.show()
        figures.append((gas, fig))

    if not save_to_fig:
        return
    # Make sure the target folder exists before exporting the images
    out_dir = pathlib.Path(file_location)
    out_dir.mkdir(parents=True, exist_ok=True)
    for gas, fig in figures:
        for suffix in ("svg", "png"):
            fig.write_image(
                out_dir / f"{reservoir_name}_breakdown_interactions_{gas}.{suffix}")

In [None]:
# Print the distinct values of the Name column in sorted order
print(" -- ".join(sorted(set(merged_df['Name']))))

In [None]:
# Reservoirs for which break-down plots are generated (each name listed once).
# NOTE: this assignment rebinds `reservoir_names`, which is also the name of a helper
# function defined earlier in this file; once this cell has run, `reservoir_names()`
# can no longer be called until the function definition is executed again.
reservoir_names = [
    "Thapanzeik", "Sedawgyi", "Zawgyi II", "Belin", "Laza",
    "Mone Chaung", "Yeywa (upper)",
    "Kyee Ohn Kyee Wa", "Hawkham (upper)", "Myitsone"]

In [None]:
# Produce the LightGBM break-down plots for every reservoir in the `reservoir_names` list.
# NOTE: the loop variable rebinds the module-level `reservoir_name`.
for reservoir_name in reservoir_names:
    plot_breakdowns(
        reservoir_name=reservoir_name,
        interaction_preference = 1,
        model = 'lightgbm')

### This section requires all SVG files to have been generated, including the emission intensity explanations that are calculated later on. TODO: move this code to a new notebook

In [None]:
import pprint
# Collect plots for merging into composite figures
directory = pathlib.Path("figures/model_explanation/")
# Mapping: reservoir name -> list of matching SVG file names
reservoir_files = {}

# File names have the form "<reservoir name>_breakdown_<...>.svg"; the
# non-greedy group captures everything before the first "_breakdown_".
pattern = r'(.+?)_breakdown_.*\.svg'

# Iterate over the directory entries in sorted order so that the per-reservoir
# file lists do not depend on the (unspecified) order returned by os.listdir
for filename in sorted(os.listdir(directory)):
    filepath = os.path.join(directory, filename)
    if not os.path.isfile(filepath):
        continue
    match = re.match(pattern, filename)
    if not match:
        continue
    reservoir_name = match.group(1)
    # Fold the alternative spelling into the one used elsewhere
    if reservoir_name == 'Thaphanseik':
        reservoir_name = 'Thapanzeik'
    reservoir_files.setdefault(reservoir_name, []).append(filename)

# Print the dictionary
pprint.pprint(reservoir_files)

In [None]:
def combine_breakdown_plots(
        reservoir_name: str, input_folder: str, input_files: List[str], output_file: pathlib.Path,
        plot_offsets: Tuple[float, float, float], text_offsets: Tuple[float, float, float]) -> None:
    """Stack three SVG break-down plots into one titled composite SVG file.

    The three files are placed in the order given; they are captioned
    "Emission intensity", "Unit carbon dioxide emissions" and
    "Unit methane emissions" respectively, with ``reservoir_name`` as the
    figure title.

    Args:
        reservoir_name: Text of the figure title.
        input_folder: Folder containing the input SVG files.
        input_files: Exactly three SVG file names, in stacking order.
        output_file: Path of the composite SVG to write.
        plot_offsets: Vertical offset of each plot (a fixed margin is added).
        text_offsets: Vertical offset of each caption (a fixed margin is added).

    Raises:
        ValueError: If ``input_files`` does not contain exactly three entries.
    """
    # Validate with an explicit check (an `assert` would be stripped under -O)
    # and do so before any figure object is created.
    if len(input_files) != 3:
        raise ValueError(f"Only three subplots supported. Entered {len(input_files)} files.")
    # TODO: Move all configs to here and later expose it to the caller
    y_margin: int = 40
    x_margin: int = 0
    title_font_size: int = 19
    subtitle_font_size: int = 16
    subtitles = (
        "Emission intensity",
        "Unit carbon dioxide emissions",
        "Unit methane emissions",
    )
    # Create the new SVG figure
    fig = sg.SVGFigure("10cm", "10cm")
    # Load each input figure, take its root element and move it to its vertical slot
    plots = []
    for position, input_file in enumerate(input_files):
        plot = sg.fromfile(pathlib.Path(input_folder) / input_file).getroot()
        plot.moveto(0, plot_offsets[position] + y_margin)
        plots.append(plot)
    # Title followed by one caption per plot
    labels = [sg.TextElement(25, 30, reservoir_name, size=title_font_size, weight="bold")]
    for position, subtitle in enumerate(subtitles):
        labels.append(sg.TextElement(
            25 + x_margin, text_offsets[position] + y_margin,
            subtitle, size=subtitle_font_size, weight="bold"))
    # Append plots and labels to the figure and save the composite SVG
    fig.append(plots)
    fig.append(labels)
    fig.save(output_file)

In [None]:
# Per-reservoir configuration of the composite figures. For every entry:
#   "files"        - the three input SVG file names (emission intensity, CO2, CH4)
#   "plot_offsets" - vertical offsets of the three plots
#   "text_offsets" - vertical offsets of the three captions
# The dictionary key is used as the figure title and in the output file name.
# NOTE(review): some keys are spelled differently from the file names they group
# (e.g. 'Thapanseik' vs 'Thaphanseik_...' / 'Thapanzeik_...', '(Upper)' vs '(upper)')
# - presumably intentional to match files on disk; confirm.
res_file_config = {
    'Belin': {
        "files" :
        [
             'Belin_breakdown_interactions_em_intensity.svg',
             'Belin_breakdown_interactions_co2.svg',
             'Belin_breakdown_interactions_ch4.svg'
        ],
        "plot_offsets": [-40, 160, 420],
        "text_offsets": [30, 220, 480]
    },
    'Hawkham (Upper)': {
        "files" : [
            'Hawkham (upper)_breakdown_interactions_em_intensity.svg',
            'Hawkham (upper)_breakdown_interactions_co2.svg',
            'Hawkham (upper)_breakdown_interactions_ch4.svg'
         ],
        "plot_offsets": [-40, 160, 420],
        "text_offsets": [30, 220, 480]
    },
     'Kyee Ohn Kyee Wa': {
        "files" : [
             'Kyee Ohn Kyee Wa_breakdown_interactions_em_intensity.svg',
             'Kyee Ohn Kyee Wa_breakdown_interactions_co2.svg',
             'Kyee Ohn Kyee Wa_breakdown_interactions_ch4.svg'
        ],
        "plot_offsets": [-40, 180, 440],
        "text_offsets": [30, 240, 500]
     },
     'Laza': {
        "files" : [
            'Laza_breakdown_interactions_em_intensity.svg',
            'Laza_breakdown_interactions_co2.svg',
            'Laza_breakdown_interactions_ch4.svg'
         ],
        "plot_offsets": [-40, 165, 425],
        "text_offsets": [30, 225, 485]
     },
     'Mone Chaung': {
        "files" : [
             'Mone Chaung_breakdown_interactions_em_intensity.svg',
             'Mone Chaung_breakdown_interactions_co2.svg',
             'Mone Chaung_breakdown_interactions_ch4.svg'
        ],
        "plot_offsets": [-40, 180, 440],
        "text_offsets": [30, 240, 500]
     },
     'Sedawgyi': {
        "files" : [
            'Sedawgyi_breakdown_interactions_em_intensity.svg',
            'Sedawgyi_breakdown_interactions_co2.svg',
            'Sedawgyi_breakdown_interactions_ch4.svg'
         ],
        "plot_offsets": [-40, 165, 425],
        "text_offsets": [30, 225, 485]
     },
     'Thapanseik': {
        "files" : [
            'Thaphanseik_breakdown_interactions_em_intensity.svg',
            'Thapanzeik_breakdown_interactions_co2.svg',
            'Thapanzeik_breakdown_interactions_ch4.svg'
         ],
        "plot_offsets": [-30, 150, 410],
        "text_offsets": [30, 210, 470]
     },
     'Yeywa (Upper)': {
        "files" : [
            'Yeywa (upper)_breakdown_interactions_em_intensity.svg',
            'Yeywa (upper)_breakdown_interactions_co2.svg',
            'Yeywa (upper)_breakdown_interactions_ch4.svg'
         ],
        "plot_offsets": [-40, 180, 440],
        "text_offsets": [30, 240, 500]
     },
     'Zawgyi II': {
        "files" : [
            'Zawgyi II_breakdown_interactions_em_intensity.svg',
            'Zawgyi II_breakdown_interactions_co2.svg',
            'Zawgyi II_breakdown_interactions_ch4.svg'
         ],
        "plot_offsets": [-25, 170, 430],
        "text_offsets": [30, 230, 490]
     }}

In [None]:
# Folder holding the individual break-down SVG files
input_folder="figures/model_explanation"

# Build one composite SVG per configured reservoir and write it to
# <input_folder>/combined_breakdowns/<reservoir name>_breakdowns.svg
for reservoir_name, config_data in res_file_config.items():
    file_list = config_data['files']
    # The offsets are stored as lists; the combining function is annotated with tuples
    plot_offsets = tuple(config_data["plot_offsets"])
    text_offsets = tuple(config_data["text_offsets"])
    file_name = reservoir_name + "_breakdowns.svg" 
    output_folder = pathlib.Path(input_folder, "combined_breakdowns")
    # Create the output folder (and parents) if it does not exist yet
    output_folder.mkdir(parents=True, exist_ok=True)
    output_file = output_folder / file_name
    combine_breakdown_plots(
        reservoir_name = reservoir_name,
        input_folder = input_folder,
        input_files = file_list,
        output_file = output_file,
        plot_offsets = plot_offsets,
        text_offsets = text_offsets)

In [None]:
#plot_shaps(reservoir_name = "Yeywa")

## We haven't used any of the code or figures beyond this point...

In [None]:
# Shapley-value attribution (`predict_parts` with type='shap') of the XGBoost
# CO2 explainer for a single reservoir
reservoir_name = "Lemro 2"
res_location = loc_index_to_iloc(find_index_by_name(name=reservoir_name))
# Double brackets keep the selected row as a one-row DataFrame
input_reservoir = X_co2_train_test.iloc[[res_location]]
shap_settings = dict(
    type='shap',
    keep_distributions=True,
    processes=4,
    label=f'CO2 Emission Intensity for {reservoir_name}',
    B=50,
    random_state=42,
)
explanation_sample_shap = exp_co2_xgboost.predict_parts(input_reservoir, **shap_settings)
explanation_sample_shap.plot(max_vars=8)
# Other options: shap_explainer_type="TreeExplainer", type="shap_wrapper", type="break_down", keep_distributions = True

In [None]:
# Shapley-value attribution of the XGBoost CH4 explainer for the reservoir
# currently held in `input_reservoir`
shap_settings = dict(
    type='shap',
    processes=4,
    keep_distributions=True,
    label=f'CH4 Emission Intensity for {reservoir_name}',
    B=50,
    random_state=42,
)
explanation_sample_shap = exp_ch4_xgboost.predict_parts(input_reservoir, **shap_settings)
explanation_sample_shap.plot(max_vars=8)

## Back to the rest of the script...

In [None]:
# Open question: how can the box plots be added using the Python version of DALEX?
# XGBoost CO2 explainer, showing only the five largest contributions
shap_settings = dict(
    type='shap',
    processes=4,
    keep_distributions=True,
    label=f'CO2 emissions for {reservoir_name}',
    B=50,
    random_state=42,
)
explanation_sample_shap = exp_co2_xgboost.predict_parts(input_reservoir, **shap_settings)
explanation_sample_shap.plot(max_vars=5)
# Other options: shap_explainer_type="TreeExplainer", type="shap_wrapper", type="break_down", keep_distributions = True

In [None]:
# Open question: how can the box plots be added using the Python version of DALEX?
# CatBoost CO2 explainer; B=25 and no random_state is passed here
shap_settings = dict(
    type='shap',
    processes=4,
    keep_distributions=True,
    label=f'CO2 emissions for {reservoir_name}',
    B=25,
)
explanation_sample_shap = exp_co2_catboost.predict_parts(input_reservoir, **shap_settings)
explanation_sample_shap.plot(max_vars=5)
# Other options: shap_explainer_type="TreeExplainer", type="shap_wrapper", type="break_down", keep_distributions = True

In [None]:
# Open question: how can the box plots be added using the Python version of DALEX?
# LightGBM CH4 explainer; no random_state is passed here
# NOTE(review): `input_reservoir` is a row of X_co2_train_test - confirm the CH4
# model expects the same feature columns
shap_settings = dict(
    type='shap',
    processes=4,
    keep_distributions=True,
    label=f'CH4 emissions for {reservoir_name}',
    B=50,
)
explanation_sample_shap = exp_ch4_lightgbm.predict_parts(input_reservoir, **shap_settings)
explanation_sample_shap.plot(max_vars=5)
# Other options: shap_explainer_type="TreeExplainer", type="shap_wrapper", type="break_down", keep_distributions = True

## Ceteris Paribus plots on an instance

In [None]:
# Ceteris-paribus profiles of the three CO2 explainers for the selected
# reservoir, computed in the order LightGBM, CatBoost, XGBoost
cp_lightgbm_co2, cp_catboost_co2, cp_xgboost_co2 = (
    dalex_explainer.predict_profile(input_reservoir, variables=selected_variables_co2)
    for dalex_explainer in (exp_co2_lightgbm, exp_co2_catboost, exp_co2_xgboost)
)
# The CatBoost profile is the base plot; the other two are overlaid on it
cp_catboost_co2.plot([cp_xgboost_co2, cp_lightgbm_co2])

In [None]:
# Ceteris-paribus profiles of the three CH4 explainers for the selected
# reservoir. All three profiles use the CH4 variable selection so that the
# overlaid plot compares the models on the same variables (the CatBoost
# profile previously used the CO2 selection).
cp_lightgbm_ch4 = exp_ch4_lightgbm.predict_profile(
    input_reservoir,
    variables=selected_variables_ch4)
cp_catboost_ch4 = exp_ch4_catboost.predict_profile(
    input_reservoir,
    variables=selected_variables_ch4)
cp_xgboost_ch4 = exp_ch4_xgboost.predict_profile(
    input_reservoir,
    variables=selected_variables_ch4)
# The CatBoost profile is the base plot; the other two are overlaid on it
cp_catboost_ch4.plot([cp_xgboost_ch4, cp_lightgbm_ch4])

# Model explanation with SHAP using the `SHAP` package

In [None]:
# Load shap's JavaScript into the notebook; needed by the interactive force plots
shap.plots.initjs()

In [None]:
# One shap TreeExplainer per fitted model, created in the order
# XGBoost, LightGBM, CatBoost.
# CO2 explainers
explainer_co2_xgboost, explainer_co2_lightgbm, explainer_co2_catboost = (
    shap.TreeExplainer(fitted_model)
    for fitted_model in (model_co2_xgboost, model_co2_lightgbm, model_co2_catboost)
)
# CH4 explainers
explainer_ch4_xgboost, explainer_ch4_lightgbm, explainer_ch4_catboost = (
    shap.TreeExplainer(fitted_model)
    for fitted_model in (model_ch4_xgboost, model_ch4_lightgbm, model_ch4_catboost)
)

# SHAP VALUES - CO2 (the target values are passed along with the features)
shaps_co2_xgboost, shaps_co2_lightgbm, shaps_co2_catboost = (
    tree_explainer(X_co2_train_test, y_co2_train_test, check_additivity=True)
    for tree_explainer in (
        explainer_co2_xgboost, explainer_co2_lightgbm, explainer_co2_catboost)
)
# SHAP VALUES - CH4 (features only)
shaps_ch4_xgboost, shaps_ch4_lightgbm, shaps_ch4_catboost = (
    tree_explainer(X_ch4_train_test, check_additivity=True)
    for tree_explainer in (
        explainer_ch4_xgboost, explainer_ch4_lightgbm, explainer_ch4_catboost)
)

In [None]:
def validate_shaps(shaps, model, raw_score: bool = True):
    """Check that SHAP values are consistent with the model's predictions.

    Three checks are run in turn; each raises ``AssertionError`` on failure:

    1. the mean model output over ``shaps.data`` equals the first SHAP base
       value (absolute tolerance 1e-6),
    2. base value + row-wise sum of SHAP values reproduces the model output,
    3. ``exp(base value + sum of SHAP values)`` reproduces the prediction
       returned by ``model.predict(..., raw_score=False)``.

    Check 3 assumes that the raw score is the natural logarithm of the
    real-scale prediction (log link). Under that assumption the multiplicative
    baseline is ``exp(base value)``, not the arithmetic mean of the real-scale
    predictions.

    Args:
        shaps: Object exposing ``values`` (one row per sample, one column per
            feature), ``base_values`` and ``data``, e.g. a ``shap.Explanation``.
        model: Fitted model whose ``predict`` accepts a ``raw_score`` keyword.
        raw_score: Forwarded to ``model.predict`` for checks 1 and 2.

    Raises:
        AssertionError: If any of the three checks fails.
    """
    model_prediction = model.predict(shaps.data, raw_score=raw_score)
    # Flattened so that both a scalar and a per-row base value broadcast below
    base_values = np.ravel(shaps.base_values)

    # 1. Mean model prediction is equal to the base value
    mean_model_prediction = np.mean(model_prediction)
    np.testing.assert_allclose(
        mean_model_prediction, base_values[0], rtol=0, atol=1e-6)

    # 2. Shap values add up to the raw prediction
    shap_predictions = np.sum(shaps.values, axis=1) + base_values
    np.testing.assert_array_almost_equal(
        shap_predictions, model_prediction, decimal=6)

    # 3. Shap values converted to the real scale match the actual prediction
    model_prediction_actual = model.predict(shaps.data, raw_score=False)
    y_shap_actual = np.exp(shap_predictions)
    np.testing.assert_array_almost_equal(
        y_shap_actual, model_prediction_actual
    )

In [None]:
# validate_shaps(shaps_co2_lightgbm, model_co2_lightgbm)

In [None]:
# Waterfall plot of the LightGBM CO2 SHAP values for the row at position 0
shap.plots.waterfall(shaps_co2_lightgbm[0])

In [None]:
# Check shap values from dalex and lightgbm shap output
# Look the reservoir up by name, then convert the resulting index label into an
# integer row position (helper semantics inferred from their names and from
# how `iloc_ix` is used with `.iloc` below - TODO confirm)
ix = find_index_by_name(reservoir_name)
iloc_ix = loc_index_to_iloc(ix)

In [None]:
def get_model_input(data: pd.DataFrame, iloc_ix: int) -> pd.DataFrame:
    """Return the row at integer position ``iloc_ix`` as a one-row DataFrame.

    The row is selected with a list indexer so that it never passes through a
    Series: every column keeps its original dtype, whereas
    ``iloc[i].to_frame().transpose()`` collapses mixed column types to a single
    common dtype (typically ``object``).

    Args:
        data: Table to take the row from.
        iloc_ix: Zero-based row position.

    Returns:
        A DataFrame with the columns of ``data`` and exactly one row, indexed
        by that row's original index label.
    """
    return data.iloc[[iloc_ix]]

In [None]:
# One-row DataFrame with the CO2 model inputs of the reservoir at position `iloc_ix`
input_data = get_model_input(X_co2_train_test, iloc_ix)

In [None]:
# Reservoir explained in the per-reservoir waterfall figures below
reservoir_name = 'Baingda Dam'

In [None]:
# Three stacked waterfall plots (XGBoost, LightGBM, CatBoost) of the CO2 SHAP
# values for the selected reservoir, saved as a single PNG.
iloc_ix = loc_index_to_iloc(loc_index = find_index_by_name(reservoir_name), data = X_co2_train_test)

fig, (ax1, ax2, ax3) = plt.subplots(3, 1, sharex=False, sharey=False)
fig.suptitle(f'SHAP values for CO$_2$ regression models for {reservoir_name} reservoir')
# show=False makes shap draw into the current axes and return, instead of
# calling plt.show() after every panel. With the default show=True the figure
# is shown (and, depending on the backend, closed) after the first panel,
# before the remaining panels are drawn and the figure is saved.
plt.sca(ax1)
shap.plots.waterfall(shaps_co2_xgboost[iloc_ix], show=False)
ax1.title.set_text("XGBoost Regression Model")
plt.sca(ax2)
shap.plots.waterfall(shaps_co2_lightgbm[iloc_ix], show=False)
ax2.title.set_text("LightGBM Regression Model")
plt.sca(ax3)
shap.plots.waterfall(shaps_co2_catboost[iloc_ix], show=False)
ax3.title.set_text("CATBoost Regression Model")
# The figure size is set after plotting because shap resizes the current figure
fig.subplots_adjust(hspace=0)
fig.set_figheight(10)
fig.set_figwidth(14)
fig.tight_layout()
fig.savefig(pathlib.Path('figures/model_explanation/shap_values_per_reservoir_co2.png'))
# Display the finished figure once, after it has been written to disk
plt.show()

In [None]:
# Three stacked waterfall plots (XGBoost, LightGBM, CatBoost) of the CH4 SHAP
# values for the selected reservoir.
iloc_ix = loc_index_to_iloc(loc_index = find_index_by_name(reservoir_name), data = X_ch4_train_test)
fig = plt.figure()
fig.suptitle(f'SHAP values for CH$_4$ regression model for reservoir {reservoir_name}')
# show=False keeps shap from calling plt.show() after each panel, so that all
# three waterfalls are drawn into this figure before it is displayed.
plt.subplot(311)
shap.plots.waterfall(shaps_ch4_xgboost[iloc_ix], show=False)
plt.subplot(312)
shap.plots.waterfall(shaps_ch4_lightgbm[iloc_ix], show=False)
plt.subplot(313)
shap.plots.waterfall(shaps_ch4_catboost[iloc_ix], show=False)
plt.tight_layout()
plt.show()

In [None]:
# Display the expected (base) value of the LightGBM CO2 TreeExplainer
explainer_co2_lightgbm.expected_value

In [None]:
# Force plot of the LightGBM CO2 SHAP values for one row.
# NOTE(review): `iloc_ix` was last computed against X_ch4_train_test - confirm
# that it addresses the same reservoir in the CO2 data.
shap.plots.force(shaps_co2_lightgbm[iloc_ix, :])

In [None]:
# Force plot over all rows of the CatBoost CO2 SHAP values (`[:]` selects every row)
shap.plots.force(shaps_co2_catboost[:])

### Plot beeswarm plots

In [None]:
# Beeswarm plot of the XGBoost CO2 SHAP values (up to 15 features), saved as PNG
fig = plt.figure()
# show=False: with the default show=True shap calls plt.show() itself, i.e.
# before the layout is tightened and the figure is saved below.
shap.plots.beeswarm(
    shaps_co2_xgboost,
    max_display=15,
    axis_color='black',
    color=plt.get_cmap("viridis"),
    show=False)
fig.tight_layout()
fig.savefig(pathlib.Path('figures/model_explanation/shap_beeswarm_co2_xgboost.png'),dpi=700)
# Display the finished figure once, after it has been written to disk
plt.show()

In [None]:
# Summary plot of the XGBoost CO2 SHAP values against the CO2 feature table
shap.summary_plot(shaps_co2_xgboost, X_co2_train_test )

In [None]:
# Bar plot of the CatBoost CO2 SHAP values
shap.plots.bar(shaps_co2_catboost)

In [None]:
# Heatmap of the CatBoost CO2 SHAP values over all rows, displaying up to 42 features
shap.plots.heatmap(shaps_co2_catboost[:], max_display = 42, plot_width=25)

In [None]:
# Scatter plot of the CatBoost CH4 SHAP values of the "retention coefficient"
# feature; the colouring argument is left commented out
shap.plots.scatter(
    shaps_ch4_catboost[:, "retention coefficient"])#,
    #color=shaps_co2_catboost[:, :])

## Create dataframes with shap values and save them to files

In [None]:
def convert_shp_to_dataframe(
        shp_data: np.ndarray,
        train_data: pd.DataFrame,
        full_data: pd.DataFrame,
        model = None,
        relative: bool = False) -> pd.DataFrame:
    """Turn a matrix of shap values into a DataFrame indexed by reservoir name.

    Row and column labels are taken from ``train_data`` and the names from
    ``full_data`` - i.e. from the arguments of this call, not from notebook
    globals - so the same function labels CO2 and CH4 results correctly.

    Args:
        shp_data: Shap values, one row per row of ``train_data`` and one
            column per column of ``train_data``.
        train_data: Feature table the shap values were computed for; supplies
            the index and the column names of the result.
        full_data: Table with a ``'Name'`` column that is aligned with
            ``train_data`` on the index.
        model: Object with a ``predict`` method; only used when ``relative``
            is True.
        relative: If True, each row of shap values is divided by the model
            prediction for that row and multiplied by 100.

    Returns:
        DataFrame of shap values with ``'Name'`` as its index.

    Raises:
        AttributeError: If ``relative`` is True and ``model`` is missing or
            has no ``predict`` method.
    """
    if relative:
        # Only the attribute lookup is guarded, so that an AttributeError
        # raised inside `predict` itself is not mislabelled
        try:
            predict = model.predict
        except AttributeError as err:
            raise AttributeError(
                "Model not provided or does not contain the predict method"
            ) from err
        y_hat = predict(train_data)
        # Transposing lets the per-row predictions broadcast across features
        shp_data = (shp_data.T / y_hat * 100).T

    shaps_df = pd.DataFrame(
        data=shp_data, index=train_data.index, columns=train_data.columns)
    # Attach the names by index alignment, then use them as the new index
    shaps_with_names = pd.concat(
        [shaps_df, full_data['Name']], axis=1).set_index('Name', drop=True)

    return shaps_with_names

## Convert all shap matrices to dataframes and store them in files

In [None]:
# Maps an output-file identifier to a 4-tuple:
#   (shap explanation, feature table, table holding the reservoir names, fitted model)
# The identifier becomes the stem of the saved CSV / Excel file names.
# NOTE(review): 'shap_lightbm_co2' is spelled without the 'g' (cf.
# 'shap_lightgbm_ch4'); confirm that nothing reads the existing file names
# before renaming it.
shp_conversion_config = {
    'shap_xgboost_co2': (
        shaps_co2_xgboost, X_co2_train_test, merged_df_min_prim_low, model_co2_xgboost),
    'shap_lightbm_co2': (
        shaps_co2_lightgbm, X_co2_train_test, merged_df_min_prim_low, model_co2_lightgbm),
    'shap_catboost_co2': (
        shaps_co2_catboost, X_co2_train_test, merged_df_min_prim_low, model_co2_catboost),
    'shap_xgboost_ch4': (
        shaps_ch4_xgboost, X_ch4_train_test, merged_df_min_prim_low, model_ch4_xgboost),
    'shap_lightgbm_ch4': (
        shaps_ch4_lightgbm, X_ch4_train_test, merged_df_min_prim_low, model_ch4_lightgbm),
    'shap_catboost_ch4': (
        shaps_ch4_catboost, X_ch4_train_test, merged_df_min_prim_low, model_ch4_catboost)
}

## Save shap values calculated in the shap package

In [None]:
# Write every configured set of shap values to
# outputs/model_explanations/shap/<identifier>_{absolute,relative}.{csv,xlsx}
output_dir = os.path.join('outputs', 'model_explanations', 'shap')
# Created once up front; exist_ok avoids the separate existence check
os.makedirs(output_dir, exist_ok=True)

for identifier, data in shp_conversion_config.items():
    shap_explanation, train_data, full_data, model = data
    # Absolute shap values first, then values in percent of the prediction
    for suffix, relative in (('absolute', False), ('relative', True)):
        shp_df = convert_shp_to_dataframe(
            shp_data=shap_explanation.values,
            train_data=train_data,
            full_data=full_data,
            model=model,
            relative=relative)
        file_stem = os.path.join(output_dir, f'{identifier}_{suffix}')
        shp_df.to_csv(file_stem + '.csv')
        shp_df.to_excel(file_stem + '.xlsx')

## Save shap values calculated in DALEX

Left for later, if required...

# The END