# Reduce dimensionality of feature-space and feature-rank-space, find clusters and visualise the processed data
## Works on SHAP values and breakdown interactions from DALEX
## Author: Tomasz Janus, Mui Ne, 10/11/2023
### email: tomasz.janus@manchester.ac.uk ; tomasz.k.janus@gmail.com

The notebook imports feature importances or SHAP values for either CO2 or CH4 emissions, reduces dimensionality in the feature importance / SHAP space using PCA, then clusters the data using an algorithm of choice and visualises the clusters in PCA-reduced space, t-SNE-reduced space and on maps.

Second, a rank matrix is developed in the feature importance / shap space which is used to cluster the data based on ranks using some similarity measure

## Flow-chart
1. Import the feature importance dataset(s) for CO$_2$ and CH$_4$ emissions
2. Import the input-output data from re-emission
3. Find outliers in the feature-importance dataset(s)
4. Scale the feature importance data
5. Perform dimensionality reduction
6. Cluster the data in old or new (only PCA allowed) coordinate system and visualise clusters in lower dimensional space
7. Plot the clusters on maps

### Dimensionality reduction(s) used: 
* PCA
* FA
* LDA (not implemented yet) - in ToDo's if time allows
* t-SNE
* UMAP
* PCoA
* NMDS

### Clustering methods used: 
* K-Means
* K-Medoids
* DBScan
* HDBScan
* GMM (Gaussian Mixture Model)

## General logic
1. Find Clusters in data and present them on 2D plots and maps
2. We have three types of data that can be used for clustering
    * a) - Feature Importances
    * b) - Ranking of features, i.e. which features are the most important for predicting the output
    * c) - emission intensities - let's do this next by showing co2 and ch4 emission intensities as a pair
    
## Perform two different types of analyses for feature space and rank space
    * Feature Space - Use K-Means, HDBSCAN, BGMM and OPTICS clustering algorithms
    * Rank Space - Use K-Medoids, HDBSCAN, OPTICS, sklearn.cluster.AgglomerativeClustering
    
    * Feature Space - Use PCA, t-SNE, UMAP and NMDS embeddings
    * Rank Space - Use PCoA and NMDS embeddings

In [None]:
from typing import Tuple, Dict, Literal, List, Sequence, Set, Callable
import os
import pathlib
import numpy as np
import pandas as pd
import pickle

import seaborn as sns

import dalex

import matplotlib
import matplotlib.pyplot as plt
import mpl_toolkits.mplot3d  # noqa: F401
from mpl_toolkits.mplot3d import Axes3D

import plotly_express as px
import plotly.graph_objs as go
import chart_studio.plotly as py

import sklearn
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, RobustScaler, Normalizer, MinMaxScaler
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from umap import UMAP
from sklearn.metrics.pairwise import euclidean_distances

import lib.dim_reduction as dr
import lib.clustering as clust
from lib.data_loaders import load_feature_importances, load_input_output
from lib.ranks import (
    get_ranks, plot_rank_heatmap, rank_distance_matrix, FeatureCorrelationMap, 
    exp_scaling_fun, rank_distance)
from lib.utils import remove_outliers
import lib.mapping as custom_maps

import pickle

pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
%matplotlib inline

import pca as pca_custom

DALEXBreakDown = dalex.predict_explanations._break_down.object.BreakDown
DALEXShap = dalex.predict_explanations._shap.object.Shap

## Define notebook functions

In [None]:
def get_predictions(data: Dict[str, DALEXBreakDown | DALEXShap]) -> pd.DataFrame | None:
    """Collect the regression-model prediction held by each dalex explanation.

    Args:
        data: Mapping of reservoir name to a dalex explanation object. All
            values must be of one kind (all break-down or all SHAP).

    Returns:
        A dataframe with columns 'Reservoir' and 'Prediction' (one row per key
        of ``data``), or None when the values are not all of one supported kind.
    """
    explanations = list(data.values())
    if all(isinstance(item, DALEXBreakDown) for item in explanations):
        # The last entry of the 'cumulative' column is taken as the prediction
        values = [item.result['cumulative'].iloc[-1] for item in explanations]
    elif all(isinstance(item, DALEXShap) for item in explanations):
        values = [item.prediction for item in explanations]
    else:
        return None
    return pd.DataFrame({'Reservoir': list(data), 'Prediction': values})

def process_features(
        features: pd.DataFrame, predictions: pd.DataFrame, proportional: bool = False,
        zero_mean: bool = True, unit_std: bool = False, abs_values: bool = False,
        ) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Join features with predictions and pre-process the feature columns.

    Args:
        features: Feature table with a 'reservoir name' column. The first
            column of the merged table is skipped, so 'reservoir name' is
            expected to be the first column.
        predictions: Table with 'Reservoir' and 'Prediction' columns.
        proportional: Divide every feature value by the row's prediction.
        zero_mean: Subtract the column mean from every column.
        unit_std: Divide every column by its standard deviation.
        abs_values: Replace the values with their absolute values.

    Returns:
        Tuple ``(reservoirs, processed)`` where ``reservoirs`` is a one-column
        dataframe ('Reservoir') and ``processed`` holds the processed features
        without the 'Prediction', 'intercept' and 'Unnamed: 1' columns.
    """
    # Merge the `features` argument; the previous version read the notebook
    # global `features_df` here and ignored the argument.
    merged = pd.merge(
        features, predictions, left_on='reservoir name', right_on='Reservoir')
    reservoirs = merged[['Reservoir']]
    merged = merged.drop(columns='Reservoir')

    # Skip the first (name) column; 'Prediction' stays in until the final drop
    features_num = merged.iloc[:, 1:]
    if proportional:
        features_num = features_num.div(merged['Prediction'], axis=0)

    if abs_values:
        features_num = features_num.abs()

    # Remove columns that contain only zeros
    features_num = features_num.loc[:, (features_num != 0).any(axis=0)]

    if zero_mean:
        features_num = features_num.sub(features_num.mean())

    if unit_std:
        features_num = features_num.div(features_num.std())

    return reservoirs, features_num.drop(
        columns=['Prediction', 'intercept', 'Unnamed: 1'], errors='ignore')

## Define the analysis options

In [None]:
# What gas is the analysis for?
gas_name: Literal['co2', 'ch4'] = 'ch4'
# Which feature importances are we using? - shap values or dalex breakdown feature importances
feature_type: Literal['shap', 'breakdown'] = 'breakdown'
# On which model are the feature importances based?
model_name: Literal['xgboost', 'lightgbm', 'catboost'] = 'xgboost'
# Name of the file with inputs, outputs and internal variables (re-emission output excel file)
input_output_file: str = 'outputs_MIN_LOW_PRIM.xlsx'
# Should the outliers be removed before attempting data scaling and PCA?
run_outlier_removal = False
## PCA OPTIONS
# If True, the PCA / PCoA coordinates are rescaled with `pca_scaling_strategy` before clustering
scale_pca: bool = True
pca_scaling_strategy = Normalizer
## DISTANCE-BASED OPTIONS
# If True, the rank distance matrix is rescaled with `dist_scaling_strategy`
scale_dist: bool = False
dist_scaling_strategy = MinMaxScaler
# Use FA decomposition (enables the FA component-to-feature map and its plot)
use_fa: bool = False

## Load the required data

In [None]:
# 1. Load the input/output/internal_val data from reemission output EXCEL spreadsheet
inputs_outputs_df = load_input_output(
    filename=os.path.join('outputs','reemission', input_output_file),
    sheets=('inputs', 'outputs', 'internals'))
# 2. Load the features data, i.e. the features dataframe and the reservoir:explainer dictionary
# First check if the values were recalculated in the previous notebook. If not, use precalculated values in
# the bin/ folder
try:
    features_df, features_full = load_feature_importances(
        gas_name, feature_type=feature_type, model_name=model_name, 
        folder = "outputs/model_explanations")
except FileNotFoundError:
    features_df, features_full = load_feature_importances(
        gas_name, feature_type=feature_type, model_name=model_name, 
        folder = "bin/model_explanations_precalculated")    
# 3. Find predictions from regression model (given in 'model_name') from dalex explainers
predictions: pd.DataFrame = get_predictions(features_full)
# 4. Pre-processes the features data, e.g. drops all-zero columns, scales to zero mean, unit_variance
#    converts values to absolute and/or makes the values proportional to prediction
#    Return two dataframes : one with reservoirs and the other one with processed features
reservoirs, processed_features = process_features(
    features_df, predictions, proportional = True, zero_mean=True, 
    unit_std = False, abs_values = False)
# 5. Load input output data (only the 'inputs' sheet - same file as in step 1)
input_output_data = load_input_output(
    filename=os.path.join('outputs','reemission', input_output_file),
    sheets=('inputs',))
# 6. Load models
# NOTE(review): pickle.load can execute arbitrary code - only load files produced by this project
saved_model_folder = pathlib.Path('intermediate/shap_values/model_avg_feat_importances')
with open(saved_model_folder / 'model_feats.pkl', 'rb') as fp:
    model_feat_importances = pickle.load(fp)

## I. Calculate and plot ranks

In [None]:
# Look up the model-level feature importances stored for the (model, gas) pair
model_importances, col_names = model_feat_importances[(model_name, gas_name)]
importances_df = pd.DataFrame(data=model_importances, index=col_names)
importances_df = importances_df.rename(columns={0: "value"})
# Most important feature first
importances_df = importances_df.sort_values(by="value", axis=0, ascending=False)
# Normalise so that the importances sum to one
importances_df['value'] = importances_df['value'] / importances_df['value'].sum()

In [None]:
importances_df.head()

In [None]:
features_df.head()

In [None]:
# Index the features by reservoir name so that rows map to reservoirs unambiguously
features_df_res_name = features_df.set_index('reservoir name')

# Extra columns are dropped only for break-down data; nothing is dropped otherwise
cols_to_drop = ['intercept', 'Unnamed: 1'] if feature_type == 'breakdown' else []

# Columns are ordered as in importances_df so that ranks and importances line up
rank_df = get_ranks(
    feature_data=features_df_res_name,
    cols_to_drop=cols_to_drop,
    column_order=list(importances_df.index),
    sort_rows=True)
plot_rank_heatmap(rank_data=rank_df, yticklabels=rank_df.index)

In [None]:
rank_df.head()

## Calculate ranks distances

In [None]:
# Make sure that the columns of rank_df align with the order of importances.
# Lists are compared so that a length mismatch is reported as a misalignment
# instead of failing inside the element-wise comparison.
if list(rank_df.columns) != list(importances_df.index):
    raise ValueError(
        "Columns of rank_df are not aligned with the index of importances_df")

In [None]:
# Correlation map between items, keyed by pairs of 0-based indices
_item_pair_correlations = {(2, 5): 0.7, (6, 9): 0.7, (9, 11): 0.6}
feature_corr_map = FeatureCorrelationMap(cmap=_item_pair_correlations)

### Calculate non-euclidean custom distance matrix using function `rank_distance_matrix`

The custom distance metric is used to find similarities between reservoirs in the feature importance space based on the order of feature importances not the values of feature importances. The idea is that if reservoirs have the same feature importance ranks at the top they are similar.
**NOTE:** May take a while to compute

In [None]:
# Pairwise distances between reservoirs computed from their feature ranks,
# the model importances and the correlation map defined above
dist_matrix_df = rank_distance_matrix(
    rank_df = rank_df, 
    rank_importances = importances_df,
    corr = feature_corr_map)

In [None]:
# Label both axes of the distance matrix with the reservoir names (index of rank_df)
dist_matrix_df = pd.DataFrame(dist_matrix_df, index=rank_df.index, columns = rank_df.index)

In [None]:
dist_matrix_df.head()

In [None]:
# Heatmap of the reservoir-to-reservoir rank distance matrix
fig, ax = plt.subplots(figsize=(12, 6))
sns.heatmap(dist_matrix_df, ax=ax)
# Small tick labels on both axes; axis titles are removed
for _axis_name in ('x', 'y'):
    ax.tick_params(axis=_axis_name, labelsize=7)
ax.set_xlabel(None)
ax.set_ylabel(None)

## Check the loaded and pre-processed data

In [None]:
processed_features.describe()

In [None]:
input_output_data.head()

In [None]:
reservoirs.head()

In [None]:
predictions.head()

In [None]:
# Histogram of the model predictions. The label follows the gas selected in
# `gas_name` instead of always reading CH4.
_gas_labels = {'co2': 'CO$_2$', 'ch4': 'CH$_4$'}
predictions\
    .rename(columns={
        'Prediction': f"{_gas_labels.get(gas_name, gas_name.upper())} model predictions"})\
    .hist(figsize=(4,3))

## Visualise reservoir's contributions using DALEX

In [None]:
import ipywidgets as widgets
import plotly.graph_objects as go


def _show_explanation(reservoir_name):
    """Open the dalex explanation plot of one reservoir in the browser."""
    features_full[reservoir_name].plot(show=False).show(renderer="browser")


_reservoir_options = list(reservoirs['Reservoir'])
# Use the preferred reservoir when present; otherwise fall back to the first
# option so that the widget can still be built for other datasets.
_preferred_reservoir = 'A Laing Ni Dam'
_initial_reservoir = (
    _preferred_reservoir if _preferred_reservoir in _reservoir_options
    else _reservoir_options[0])

dropdown = widgets.Dropdown(
    options=_reservoir_options,
    value=_initial_reservoir,
    # rows=10,
    description='Reservoirs:',
    disabled=False
)
display(dropdown)
_show_explanation(dropdown.value)

selected_reservoir = dropdown.value
def on_change(change):
    """Store the newly selected reservoir and plot its explanation."""
    global selected_reservoir
    if change['type'] == 'change' and change['name'] == 'value':
        selected_reservoir = change['new']
        _show_explanation(selected_reservoir)

dropdown.observe(on_change, names='value')

In [None]:
# Working copies of the data used below; the originals are left untouched.
# Each X_* holds the data and the matching y_* holds the reservoir labels.
# Dataframe of features for dimensionality reduction and clustering
X_feat = processed_features.copy()
y_feat = reservoirs.copy()
# Dataframe of feature ranks per reservoir - also for dimensionality reduction and clustering
X_ranks = rank_df.copy()
y_ranks = rank_df.index.copy()
# Dataframe of reservoir distances with respect to their feature rank orders
X_dist = dist_matrix_df.copy()
y_dist = dist_matrix_df.index.copy()

## Remove outliers

In [None]:
if run_outlier_removal:
    # Filter the feature data; the previous version passed the undefined
    # names `X` and `y`, which raised NameError when this option was enabled.
    X_feat, y_feat = remove_outliers(X_feat, y_feat, num_neighbours = 200)
    X_ranks, y_ranks = remove_outliers(X_ranks, y_ranks, num_neighbours = 200)
    # NOTE(review): X_dist / y_dist are not filtered here, so the distance
    # matrix still covers all reservoirs - confirm this is intended.

## Run dimensionality reduction of the feature space

In [None]:
# Set scaling strategy
# NOTE(review): `scaling_strategy` is assigned but not used in this cell; the calls
# below use `scaling_strategy_features`. The annotation lists the scaler types while
# the value is the scaler class itself.
scaling_strategy: None | Normalizer | StandardScaler | RobustScaler = Normalizer
scaling_strategy_features = Normalizer

# Dimensionality reduction in the feature importance space
# Each helper returns the embedded data, the labels and the fitted model
print("Running dimensionality reduction in the feature importance space\n")
X_pca, y_pca, pca = dr.run_pca(
    X_feat, y_feat, scaling_strategy = scaling_strategy_features)
X_fa, y_fa, fa = dr.run_fa(
    X_feat, y_feat, scaling_strategy = scaling_strategy_features)
X_tsne, y_tsne, tsne = dr.run_tsne(
    X_feat, y_feat, scaling_strategy = scaling_strategy_features)
X_umap, y_umap, umap = dr.run_umap(
    X_feat, y_feat, scaling_strategy = scaling_strategy_features, metric = 'euclidean',
    min_dist = 0.05, n_components=3) # Alternatively choose metric = 'correlation'
# Get the euclidean distance of X_feat
# MDS runs on this precomputed distance matrix, with no scaling strategy applied
X_feat_dist = euclidean_distances(X_feat)
X_mds, y_mds, mds = dr.run_mds(
    X_feat_dist, y_feat, scaling_strategy = None, n_components = 3,
    dissimilarity = 'precomputed', metric = True)

## Run dimensionality reduction of the feature rank space using PCoA and MDS

In [None]:
# No scaling is applied to the rank distance matrix before ordination
scaling_strategy_ranks = None
print("Running dimensionality reduction in the feature rank space\n")
print("Dimensionality reduction is performed on a custom matrix of distances between points")
# Note: X_pcoa_ranks is a dataframe but X_mds_ranks is a numpy.ndarray
# Both ordinations take the reservoir distance matrix (X_dist) as input
X_pcoa_ranks, y_pcoa_ranks, pcoa_model, pcoa_ranks = dr.run_pcoa(
    X_dist, y_dist, scaling_strategy = scaling_strategy_ranks)
X_mds_ranks, y_mds_ranks, mds_ranks = dr.run_mds(
    X_dist, y_dist, scaling_strategy = scaling_strategy_ranks)

In [None]:
# Run custom PCA model
# pca_custom_model = pca_custom.pca(n_components=len(list(processed_features.columns)))
# pca_custom_results = pca_custom_model.fit_transform(X=X, row_labels=list(processed_features.columns))
# pca_custom_results['explained_var']
# fig, ax = pca_custom_model.biplot(n_feat=5)

## Explained and Cumulative Explained Variances from PCA on feature space and PCoA on feature rank space

In [None]:
# Side-by-side explained-variance plots: PCA (features) and PCoA (feature ranks)
fig, ax = plt.subplots(1, 2, figsize=(8, 4))
fig.suptitle('Explained and Cumulative Explained Variances - PCA and PCoA')
_variance_panels = (
    dict(ordination_model=pca, title='Feature Space'),
    dict(ordination_model=pcoa_model, xlabel="Number of coordinates",
         title='Feature Rank Space'),
)
for _panel_ax, _panel_kwargs in zip(ax, _variance_panels):
    dr.explained_cumulative_var_plot(
        ax=_panel_ax, num_components=30, **_panel_kwargs)
fig.tight_layout()

## Map directions (vectors) from dim reduction to features

In [None]:
# Map dimensionality reduction model directions (components) to data columns (features)
# (DOES NOT WORK ON FEATURE RANKS AS THEY'RE COMPUTED USING A DISTANCE MATRIX)
# PCA
pca_to_feature_feats = dr.features_to_vars(
    dim_red_model = pca, column_names = X_feat.columns)
# FA
# Only built when `use_fa` is set; the FA plot cell depends on this variable
if use_fa:
    fa_to_feature_feats = dr.features_to_vars(
        dim_red_model = fa, column_names = X_feat.columns)

## Visualise the maps between dim reduction components (vectors) and features

In [None]:
# Plot the map between the PCA components and the original features
dr.plot_component_feature_map(
    pca_to_feature_feats, num_dims = 20, num_feats = None, xtick_rotation = 90)

In [None]:
if use_fa:
    # Plot the FA component-to-feature map built above (`fa_to_feature_feats`);
    # the previous version referenced the undefined name `fa_to_feature_ranks`.
    dr.plot_component_feature_map(
        fa_to_feature_feats, num_dims = 20, num_feats = None, figsize=(14,4),
        xtick_rotation = 0)

## Find clusters

### Plot point distances using K nearest neighbours

In [None]:
# K-neighbours distance graph of the PCA-reduced feature data, drawn without
# extra scaling and with a cut-off line at 0.4
fig, axs = plt.subplots(1,1, figsize=(6,4))
clust.plot_kneighbours_dist_graph(
    X_pca, n_dim = 20, scaler=None, cutoff_line_value = 0.4, 
    title = 'K-Neighbours distances for features in PCA-reduced space', ax=axs)

## Cluster feature data in the PCA reduced space

In [None]:
# Optionally rescale the PCA coordinates before clustering
X_pca_feat_clustering = (
    pca_scaling_strategy().fit_transform(X_pca) if scale_pca else X_pca)

n_clusters_feat: int = 3

print("CLUSTERING FEATURES.....\n")
# K-Means and BGMM share the requested number of clusters
kmeans_labels_feat = clust.run_kmeans(
    X_pca_feat_clustering,
    n_dim=None,
    n_clusters=n_clusters_feat)

hdbscan_labels_feat, hdbscan_probabilities_feat = clust.run_hdbscan(
    X_pca_feat_clustering,
    n_dim=None,
    min_samples=50)

bgmm_labels_feat = clust.run_bgmm(
    X_pca_feat_clustering,
    n_dim=None,
    n_clusters=n_clusters_feat)

optics_labels_feat = clust.run_optics(
    X_pca_feat_clustering,
    n_dim=None,
    min_samples=0.02,
    xi=0.05,
    metric="minkowski",
    cluster_method='xi',
    min_cluster_size=0.05)

In [None]:
# Clustering methods not used in the analysis
# Reasons: DBSCAN is a bit fiddly with parameterization. HDBSCAN used instead as it seems to be alleviating
#          the shortcomings of DBSCAN whilst being a method superseding the original DBSCAN
#          GMM gives the same results as BGMM and thus BGMM is used instead
# dbscan_labels_feat = clust.run_dbscan(
#    X_pca_feat_clustering, min_samples = 30, n_dim = None, eps=0.4)
#gmm_labels_feat = clust.run_gmm(
#    X_pca_feat_clustering, n_dim = None, n_clusters = n_clusters_feat)

## Cluster rank data in the PCoA reduced space

In [None]:
# Optionally rescale the PCoA coordinates (same switch and scaler as for PCA)
X_pcoa_rank_clustering = (
    pca_scaling_strategy().fit_transform(X_pcoa_ranks) if scale_pca
    else X_pcoa_ranks)

n_clusters_rank: int = 6

print("CLUSTERING RANKS...\n")
kmeans_labels_rank = clust.run_kmeans(
    X_pcoa_rank_clustering,
    n_dim=None,
    n_clusters=n_clusters_rank,
    col_name='cluster')

hdbscan_labels_rank, hdbscan_probabilities_rank = clust.run_hdbscan(
    X_pcoa_rank_clustering,
    n_dim=None,
    min_samples=10)

bgmm_labels_rank = clust.run_bgmm(
    X_pcoa_rank_clustering,
    n_dim=None,
    n_clusters=n_clusters_rank,
    init_params='kmeans')

optics_labels_rank = clust.run_optics(
    X_pcoa_rank_clustering,
    n_dim=None,
    min_samples=0.02,
    xi=0.05,
    metric="minkowski",
    cluster_method='xi',
    min_cluster_size=0.05)

In [None]:
# Clustering methods not used in the analysis
# Reasons: DBSCAN is a bit fiddly with parameterization. HDBSCAN used instead as it seems to be alleviating
#          the shortcomings of DBSCAN whilst being a method superseding the original DBSCAN
#          GMM gives the same results as BGMM and thus BGMM is used instead
#dbscan_labels_rank = clust.run_dbscan(
#    X_pcoa_rank_clustering, min_samples = 30, n_dim = None, eps=0.4)
#gmm_labels_rank = clust.run_gmm(
#    X_pcoa_rank_clustering, n_dim = None, n_clusters = n_clusters_rank)

## Cluster rank data using the distance matrix

In [None]:
# Optionally rescale the distance matrix. All clusterers below now receive
# X_dist_clustering; previously they were given X_dist, so `scale_dist` had
# no effect.
# NOTE(review): column-wise scalers such as MinMaxScaler may not keep a distance
# matrix symmetric - confirm the scaler suits metric="precomputed" before
# enabling `scale_dist`.
if scale_dist:
    X_dist_clustering = dist_scaling_strategy().fit_transform(X_dist)
else:
    X_dist_clustering = X_dist

n_clusters_rank: int = 6

kmedoids_labels_rank_dist = clust.run_kmedoids(
    X_dist_clustering, n_clusters = n_clusters_rank, n_dim = None, metric="precomputed",
    method='alternate', random_state = 42)

hdbscan_labels_rank_dist, hdbscan_probabilities_rank_dist = clust.run_hdbscan(
    X_dist_clustering, n_dim = None, min_samples = 10, metric="precomputed", alpha=0.001,
    min_cluster_size = 3, cluster_selection_epsilon=0.01)

optics_labels_rank_dist = clust.run_optics(
    X_dist_clustering, n_dim = None, min_samples = 0.08, xi = 0.05, metric="precomputed",
    cluster_method = 'xi',
    min_cluster_size = 0.5)

agg_labels_rank_dist = clust.run_agglomerative(
        X_dist_clustering, n_dim = None, n_clusters = n_clusters_rank,
        metric = 'precomputed',
        distance_threshold = None, col_name = 'cluster') # 1.0

## Visualise feature data in reduced space
*K-Means*, *HDBSCAN*, *BGMM* and *OPTICS* clusterings of features in 2x2 subplots in *t-SNE*, *FA*, *PCA*, *MDS* and *UMAP* projected spaces (5 plots)

In [None]:
# Marker opacity shared by all cluster plots
default_alpha = 0.65

# Panel titles, one per clustering method, in plotting order
title_1, title_2, title_3, title_4 = (
    f"{_method} clusters" for _method in ("K-Means", "HDBSCAN", "BGMM", "OPTICS"))

### t-SNE reduced feature space - features

In [None]:
# 2x2 grid: one panel per clustering method, all drawn in the t-SNE embedding
s_multiplier = 30
fig, axs = plt.subplots(2, 2, figsize=(8, 6))
fig.suptitle(f"Visualisation of clusters of features in t-SNE projected space - {gas_name.upper()} emissions")
# (labels, title, membership probabilities) per panel; only HDBSCAN has probabilities
_panels = (
    (kmeans_labels_feat, title_1, None),
    (hdbscan_labels_feat, title_2, hdbscan_probabilities_feat),
    (bgmm_labels_feat, title_3, None),
    (optics_labels_feat, title_4, None),
)
for _ax, (_labels, _title, _probs) in zip(axs.flat, _panels):
    clust.visualise_clusters_2D(
        data=X_tsne, labels=_labels, s_multiplier=s_multiplier, title=_title,
        probabilities=_probs, ax=_ax, default_alpha=default_alpha)

### FA reduced feature space - features

In [None]:
# 2x2 grid: one panel per clustering method, all drawn in the FA embedding
s_multiplier = 75
fig, axs = plt.subplots(2, 2, figsize=(8, 6))
fig.suptitle(f"Visualisation of clusters of features in FA projected space - {gas_name.upper()} emissions")
# (labels, title, membership probabilities) per panel; only HDBSCAN has probabilities
_panels = (
    (kmeans_labels_feat, title_1, None),
    (hdbscan_labels_feat, title_2, hdbscan_probabilities_feat),
    (bgmm_labels_feat, title_3, None),
    (optics_labels_feat, title_4, None),
)
for _ax, (_labels, _title, _probs) in zip(axs.flat, _panels):
    clust.visualise_clusters_2D(
        data=X_fa, labels=_labels, s_multiplier=s_multiplier, title=_title,
        probabilities=_probs, ax=_ax, default_alpha=default_alpha)

### PCA reduced feature space - features

In [None]:
# 2x2 grid: one panel per clustering method, all drawn in the PCA embedding
s_multiplier = 300
fig, axs = plt.subplots(2, 2, figsize=(8, 6))
fig.suptitle(f"Visualisation of clusters of features in PCA projected space - {gas_name.upper()} emissions")
# (labels, title, membership probabilities) per panel; only HDBSCAN has probabilities
_panels = (
    (kmeans_labels_feat, title_1, None),
    (hdbscan_labels_feat, title_2, hdbscan_probabilities_feat),
    (bgmm_labels_feat, title_3, None),
    (optics_labels_feat, title_4, None),
)
for _ax, (_labels, _title, _probs) in zip(axs.flat, _panels):
    clust.visualise_clusters_2D(
        data=X_pca, labels=_labels, s_multiplier=s_multiplier, title=_title,
        probabilities=_probs, ax=_ax, default_alpha=default_alpha)

### MDS reduced feature space - features

In [None]:
# 2x2 grid: one panel per clustering method, all drawn in the MDS embedding
s_multiplier = 50
fig, axs = plt.subplots(2, 2, figsize=(8, 6))
fig.suptitle(f"Visualisation of clusters of features in MDS projected space - {gas_name.upper()} emissions")
# (labels, title, membership probabilities) per panel; only HDBSCAN has probabilities
_panels = (
    (kmeans_labels_feat, title_1, None),
    (hdbscan_labels_feat, title_2, hdbscan_probabilities_feat),
    (bgmm_labels_feat, title_3, None),
    (optics_labels_feat, title_4, None),
)
for _ax, (_labels, _title, _probs) in zip(axs.flat, _panels):
    clust.visualise_clusters_2D(
        data=X_mds, labels=_labels, s_multiplier=s_multiplier, title=_title,
        probabilities=_probs, ax=_ax, default_alpha=default_alpha)

### UMAP reduced feature space - features

In [None]:
# 2x2 grid: one panel per clustering method, all drawn in the UMAP embedding
s_multiplier = 10
fig, axs = plt.subplots(2, 2, figsize=(8, 6))
fig.suptitle(f"Visualisation of clusters of features in UMAP projected space - {gas_name.upper()} emissions")
# (labels, title, membership probabilities) per panel; only HDBSCAN has probabilities
_panels = (
    (kmeans_labels_feat, title_1, None),
    (hdbscan_labels_feat, title_2, hdbscan_probabilities_feat),
    (bgmm_labels_feat, title_3, None),
    (optics_labels_feat, title_4, None),
)
for _ax, (_labels, _title, _probs) in zip(axs.flat, _panels):
    clust.visualise_clusters_2D(
        data=X_umap, labels=_labels, s_multiplier=s_multiplier, title=_title,
        probabilities=_probs, ax=_ax, default_alpha=default_alpha)

## Ranks
*K-Means*, *HDBSCAN*, *BGMM* and *OPTICS* clusterings of ranks in 2x2 subplots in *PCoA* and *MDS* projected spaces (2 plots)

### A. Clustering made in PCoA space projected onto PCoA

In [None]:
# 2x2 grid: rank clusters found in PCoA space, drawn in the PCoA embedding
s_multiplier = 500
fig, axs = plt.subplots(2, 2, figsize=(8, 6))
fig.suptitle(f"Visualisation of clusters of ranks in PCoA projected space - {gas_name.upper()} emissions")
# (labels, title, membership probabilities) per panel; only HDBSCAN has probabilities
_panels = (
    (kmeans_labels_rank, title_1, None),
    (hdbscan_labels_rank, title_2, hdbscan_probabilities_rank),
    (bgmm_labels_rank, title_3, None),
    (optics_labels_rank, title_4, None),
)
for _ax, (_labels, _title, _probs) in zip(axs.flat, _panels):
    # X_pcoa_ranks is a dataframe; the plotting helper is given a numpy array
    clust.visualise_clusters_2D(
        data=X_pcoa_ranks.to_numpy(), labels=_labels, s_multiplier=s_multiplier,
        title=_title, probabilities=_probs, ax=_ax, default_alpha=default_alpha)

### B. Clustering made in PCoA space projected onto MDS space

In [None]:
# 2x2 grid: rank clusters found in PCoA space, drawn in the MDS embedding
s_multiplier = 80
fig, axs = plt.subplots(2, 2, figsize=(8, 6))
fig.suptitle(f"Visualisation of clusters of ranks in MDS projected space - {gas_name.upper()} emissions")
# (labels, title, membership probabilities) per panel; only HDBSCAN has probabilities
_panels = (
    (kmeans_labels_rank, title_1, None),
    (hdbscan_labels_rank, title_2, hdbscan_probabilities_rank),
    (bgmm_labels_rank, title_3, None),
    (optics_labels_rank, title_4, None),
)
for _ax, (_labels, _title, _probs) in zip(axs.flat, _panels):
    clust.visualise_clusters_2D(
        data=X_mds_ranks, labels=_labels, s_multiplier=s_multiplier, title=_title,
        probabilities=_probs, ax=_ax, default_alpha=default_alpha)

## CLUSTERING ON A DISTANCE MATRIX

In [None]:
# Panel titles for the clusterings computed directly on the distance matrix,
# in plotting order. The fourth method is agglomerative clustering
# (clust.run_agglomerative), so the title names it accordingly.
title_1: str = "K-Medoids clusters"
title_2: str = "HDBSCAN clusters"
title_3: str = "OPTICS clusters"
title_4: str = "Agglomerative Clustering clusters"

### PCoA space

In [None]:
# 2x2 grid: clusters found on the distance matrix, drawn in the PCoA embedding
s_multiplier = 500
fig, axs = plt.subplots(2, 2, figsize=(8, 6))
fig.suptitle(f"Visualisation of clusters of ranks in PCoA projected space - {gas_name.upper()} emissions")
# (labels, title, membership probabilities) per panel; only HDBSCAN has probabilities
_panels = (
    (kmedoids_labels_rank_dist, title_1, None),
    (hdbscan_labels_rank_dist, title_2, hdbscan_probabilities_rank_dist),
    (optics_labels_rank_dist, title_3, None),
    (agg_labels_rank_dist, title_4, None),
)
for _ax, (_labels, _title, _probs) in zip(axs.flat, _panels):
    # X_pcoa_ranks is a dataframe; the plotting helper is given a numpy array
    clust.visualise_clusters_2D(
        data=X_pcoa_ranks.to_numpy(), labels=_labels, s_multiplier=s_multiplier,
        title=_title, probabilities=_probs, ax=_ax, default_alpha=default_alpha)

### MDS space

In [None]:
# 2x2 grid: clusters found on the distance matrix, drawn in the MDS embedding
s_multiplier = 100
fig, axs = plt.subplots(2, 2, figsize=(8, 6))
fig.suptitle(f"Visualisation of clusters of ranks in MDS projected space - {gas_name.upper()} emissions")
# (labels, title, membership probabilities) per panel; only HDBSCAN has probabilities
_panels = (
    (kmedoids_labels_rank_dist, title_1, None),
    (hdbscan_labels_rank_dist, title_2, hdbscan_probabilities_rank_dist),
    (optics_labels_rank_dist, title_3, None),
    (agg_labels_rank_dist, title_4, None),
)
for _ax, (_labels, _title, _probs) in zip(axs.flat, _panels):
    clust.visualise_clusters_2D(
        data=X_mds_ranks, labels=_labels, s_multiplier=s_multiplier, title=_title,
        probabilities=_probs, ax=_ax, default_alpha=default_alpha)

In [None]:
# Build the label / probability frames from the HDBSCAN run on the distance
# matrix. The previous version read them from `clusterer_ranks_dist`, which is
# not defined anywhere in this notebook.
# NOTE(review): assumes clust.run_hdbscan returns one label and one probability
# per reservoir (array-like or single-column frame) - confirm against lib.clustering.
hdbscan_rank_dist_labels = np.asarray(hdbscan_labels_rank_dist).ravel()
hdbscan_rank_dist_labels_df = pd.DataFrame(hdbscan_rank_dist_labels, columns=['cluster'])
hdbscan_rank_dist_probabilities_df = pd.DataFrame(
    np.asarray(hdbscan_probabilities_rank_dist).ravel(), columns=['probability'])

# MAKE DETAILED PCA VISUALISATIONS

In [None]:
import importlib
# Pick up edits made to lib.clustering without restarting the kernel
importlib.reload(clust)
figure_output_folder = pathlib.Path("figures/clustering")
# Also create the parent "figures" folder when it is missing; a plain mkdir()
# raises FileNotFoundError in that case.
figure_output_folder.mkdir(parents=True, exist_ok=True)

In [None]:
# K-Means clusters side by side: features in PCA space (left) and
# feature ranks in PCoA space (right)
fig, axs = plt.subplots(1,2, figsize = (12,5))
fig.suptitle(
    f"K-Means derived clusters in PCA / PCoA reduced feature space - {gas_name.upper()} emissions",
    fontsize=16)
clust.visualise_pca_2D(
    data=X_pca, labels=kmeans_labels_feat, pca_model=pca, s_multiplier = 400,
    label_fontsize = 14,
    arrow_width = 0.0225,
    tick_fontsize = 12,
    probabilities =None,
    default_alpha = 0.6,
    num_components = 4,
    title = 'Features',
    legend_location = 'lower left',
    var_names = list(X_feat.columns), ax=axs.flat[0])
clust.visualise_pca_2D(
    data=X_pcoa_ranks.to_numpy(), labels=kmeans_labels_rank, pca_model=pcoa_model, 
    s_multiplier = 400,
    label_fontsize = 14,
    tick_fontsize = 12,
    title = 'Feature ranks',
    probabilities =None,
    default_alpha = 0.6,
    legend_location = 'lower right',
    var_names = list(X_ranks.columns), ax=axs.flat[1])
# Name the file after the analysed gas so that a CO2 run does not overwrite
# (or get mislabelled as) the CH4 figure
fig.savefig(figure_output_folder / f"Kmeans_clusters_{gas_name}.png")

In [None]:
#fig, axs = plt.subplots(1,2, figsize = (12,5))
#fig.suptitle(
#    f"HDBSCAN derived clusters in PCA reduced rank space - {gas_name.upper()} emissions",
#    fontsize=16)
#clust.visualise_pca_2D(
#    data=X_pca, labels=hdbscan_labels_feat, pca_model=pca, s_multiplier = 300, 
#    probabilities = hdbscan_probabilities_feat,
#    default_alpha = 0.8,
#    title = 'Features',
#    var_names = list(X_feat.columns), ax=axs.flat[0])
#clust.visualise_pca_2D(
#    data=X_pca_ranks, labels=hdbscan_labels_rank, pca_model=pca_ranks, s_multiplier = 300, #hdbscan
#    title = 'Ranks',
#    probabilities = hdbscan_probabilities_rank,
#    default_alpha = 0.8,
#    var_names = list(X_ranks.columns), ax=axs.flat[1])

## Save results for mapping in R
Make sure to run without outlier removal so that all reservoirs are being processed
Choose best number of clusters for 
Information to save (for each gas)
1. Ranks with reservoir names
2. K-Means clusters with features
3. K-Means clusters with ranks
4. HDBSCAN clusters for features
5. HDBSCAN clusters for ranks
6. BGMM clusters for features
7. BGMM clusters for ranks

## CURRENTLY ONLY SAVES DATA FROM THE PCA/PCoA PLOT

In [None]:
# Write the K-Means cluster assignments (feature space and rank space) to Excel
# files.
FOLDER_NAME = pathlib.Path("intermediate/density_mapping")

# Put the identifiers and their K-Means labels side by side, column-wise.
feat_kmeans_clusters = pd.concat([y_feat, kmeans_labels_feat], axis=1)
rank_kmeans_clusters = pd.concat([pd.Series(y_dist), kmeans_labels_rank], axis=1)

# NOTE(review): the sub-folder is hard-coded to 'ch4' - confirm this cell is
# only run for CH4, otherwise results for another gas end up in this folder.
ch4_output_folder = FOLDER_NAME / 'ch4'
if not ch4_output_folder.exists():
    ch4_output_folder.mkdir(parents=True, exist_ok=True)

excel_exports = {
    'k_means_clusters_feat.xlsx': feat_kmeans_clusters,
    'k_means_clusters_rank.xlsx': rank_kmeans_clusters,
}
for excel_name, cluster_table in excel_exports.items():
    cluster_table.to_excel(ch4_output_folder / excel_name)

# MAPPING RESERVOIR CLUSTER DATA

In [None]:
# Clustering result to put on the maps. The allowed values are exactly the keys
# of `cluster_labels` below. The annotation previously listed 'pca' and 'gmm',
# which are not keys, and omitted 'kmeans', the value actually assigned.
cluster_option: Literal['kmeans', 'hdbscan', 'bgmm'] = 'kmeans'

# (feature-space labels, rank-space labels) for every clustering method.
cluster_labels = {
    'kmeans': (kmeans_labels_feat, kmeans_labels_rank),
    'hdbscan': (hdbscan_labels_feat, hdbscan_labels_rank),
    'bgmm': (bgmm_labels_feat, bgmm_labels_rank)}

# Identifier column, data columns and the chosen labels side by side.
cluster_data_feat = pd.concat([y_feat, X_feat, cluster_labels[cluster_option][0]], axis=1)
cluster_data_rank = pd.concat([pd.Series(y_dist), X_ranks, cluster_labels[cluster_option][1]], axis=1)

# For HDBSCAN the membership probabilities are appended as an extra column.
if cluster_option == 'hdbscan':
    cluster_data_feat = pd.concat([cluster_data_feat, hdbscan_probabilities_feat], axis=1)
    cluster_data_rank = pd.concat([cluster_data_rank, hdbscan_probabilities_rank], axis=1)

# Currently only works with data that has not undergone any prior outlier removal
# Attach the input/output data by reservoir name. merge() defaults to an inner
# join, so rows without a matching 'Name' are dropped. Rows with a missing
# cluster label get -1.
data_map_feat = cluster_data_feat.merge(
    inputs_outputs_df, left_on='Reservoir', right_on='Name').drop(columns='Name')
data_map_feat['cluster'] = data_map_feat['cluster'].fillna(-1)
data_map_rank = cluster_data_rank.merge(
    inputs_outputs_df, left_on='reservoir name', right_on='Name').drop(columns='Name')
data_map_rank['cluster'] = data_map_rank['cluster'].fillna(-1)

# Column -> dtype conversions applied to both mapping tables.
dtype_conversion_map = {'res_mean_depth': float,
                'cluster': 'category',
                'catch_area_fractions_1': float,
                'res_max_depth': float}

data_map_feat = data_map_feat.astype(dtype_conversion_map)
data_map_rank = data_map_rank.astype(dtype_conversion_map)

### Plot a folium map

In [None]:
# Draw the map from the feature-space mapping table (`data_map_feat`).
custom_maps.plot_mya_reservoirs_gdf(data_map_feat)

### Plot static maps with clustering results both in feature and rank space

In [None]:
import warnings

# Installs a process-wide "ignore" filter: every warning raised from here on is
# silenced, in this cell and in any cell run afterwards.
warnings.filterwarnings('ignore')

fig, axs = plt.subplots(1, 2, figsize=(8, 8))
fig.suptitle(f"Reservoir clusters obtained with {cluster_option.upper()} clustering method")

# One static map per axis: the feature-space table first, the rank-space table
# second. Both size the markers by the 'res_max_depth' column.
map_panels = (
    (data_map_feat, "Clusters in feature importance space"),
    (data_map_rank, "Clusters in rank space"),
)
for panel_ax, (panel_data, panel_title) in zip(axs.flat, map_panels):
    custom_maps.plot_mya_reservoirs_static(
        data=panel_data, ax=panel_ax, title=panel_title,
        marker_size='res_max_depth')

fig.tight_layout()
fig.show()
fig.savefig(figure_output_folder / "reservoir_cluster_map.png")

### Plot dynamic maps in Bokeh with clustering results both in feature and rank space

In [None]:
# (label, column name) pairs handed to the plot as its tooltips.
bokeh_tooltips = [
    ('Cluster', 'cluster'),
    ('Name', 'Reservoir'),
    ('Type', 'type'),
    ('Volume', 'res_volume'),
    ('Net CO2 emission', 'co2_net'),
    ('Net CH4 emission', 'ch4_net'),
]
bokeh_title = f"Clusters in the feature space derived with {cluster_option.upper()}"

# Bokeh plot of the feature-space mapping table; markers are sized by the
# 'res_max_depth' column with a multiplier of 2.
custom_maps.plot_with_bokeh(
    data_map_feat,
    marker_size='res_max_depth',
    marker_size_multiplier=2,
    title=bokeh_title,
    tooltips=bokeh_tooltips)

# UNUSED CODE BELOW... 

In [None]:
# NOTE(review): a wildcard import can rebind names that already exist in the
# notebook namespace; only HeatmapAnnotation is used in this cell.
from PyComplexHeatmap import *

plt.figure(figsize=(8, 4))
# Annotation built from the first 20 rows of `pca_to_feature`.
col_ha = HeatmapAnnotation(
    df=pca_to_feature.iloc[:20, :],
    plot=True,
    legend=True,
    legend_gap=5,
    hgap=0.5,
    axis=1,
    cmap='RdYlBu_r',
)
plt.show()

In [None]:
# 3D scatter of the first three columns of `X_pca`. Marker size follows the
# magnitude of the fourth column and colour follows pca_labels['cluster'].
#
# NOTE: with matplotlib < 3.2 the "3d" projection is only registered after
# `from mpl_toolkits.mplot3d import Axes3D` has been executed (the import is
# needed for its side effect even though the name is unused).
fig = plt.figure(1, figsize=(8, 6))
plt.clf()
ax = fig.add_subplot(111, projection="3d", elev=20, azim=60)
ax.set_position([0.5, 0.5, 1, 1])
plt.cla()

ax.scatter(
    X_pca[:, 0], X_pca[:, 1], X_pca[:, 2],
    # Marker areas must be non-negative. Component scores are signed, so the
    # raw values produced NaN sizes and left those points undrawn. The absolute
    # value is used instead, as in the plotly version of this plot.
    s=200 * np.abs(X_pca[:, 3]),
    c=pca_labels['cluster'],
    edgecolor='grey', alpha=0.7)

# Hide the tick labels on all three axes; only the axis titles are shown.
ax.xaxis.set_ticklabels([])
ax.yaxis.set_ticklabels([])
ax.zaxis.set_ticklabels([])

ax.set_xlabel('Principal component 1')
ax.set_ylabel('Principal component 2')
ax.set_zlabel('Principal component 3', fontsize=10, rotation=0)
plt.show()



In [None]:
# Put `X_pca` into a DataFrame whose columns are named PC_0, PC_1, ... (zero-based).
pc_column_names = [f'PC_{idx}' for idx in range(X_pca.shape[1])]
X_pca_df = pd.DataFrame(X_pca, columns=pc_column_names)

In [None]:
# Add the absolute value of PC_4 as its own column, then the cluster numbers
# taken from `pca_labels_df`.
X_pca_df['PC_4_abs'] = abs(X_pca_df['PC_4'])
X_pca_df['cluster'] = pca_labels_df.loc[:, 'cluster_number']

In [None]:
# Preview the first rows of the table.
X_pca_df.head()

In [None]:
import plotly.express as px

# Keep only the rows whose PC_1 value is below 150.
X_pca_df_filt = X_pca_df[X_pca_df['PC_1'] < 150]

# Position comes from PC_0..PC_2, colour from PC_3, marker size from |PC_4| and
# marker symbol from the cluster column.
fig = px.scatter_3d(
    X_pca_df_filt,
    x='PC_0',
    y='PC_1',
    z='PC_2',
    color='PC_3',
    size='PC_4_abs',
    symbol='cluster',
    opacity=0.7,
)
fig.show()


In [None]:
from adjustText import adjust_text
from skbio.stats.ordination import pcoa
from sklearn.preprocessing import MinMaxScaler

# PCoA of the distance matrix, limited to four dimensions.
pcoa_ranks = pcoa(dist_matrix_df, number_of_dimensions=4)
# Bare attribute accesses retained as-is; mid-cell they display nothing.
pcoa_ranks.proportion_explained
dist_matrix_df.index

# First four coordinate columns, re-indexed like the distance matrix.
pc_axes = ['PC1', 'PC2', 'PC3', 'PC4']
df_pcoa_ranks = pcoa_ranks.samples[pc_axes]
df_pcoa_ranks.index = dist_matrix_df.index
# PC3 rescaled with MinMaxScaler (default settings); drives the marker sizes.
point_sizes = MinMaxScaler().fit_transform(df_pcoa_ranks[['PC3']])
# Attach the reservoir names by index.
df_pcoa_ranks = df_pcoa_ranks.join(features_df[['reservoir name']])

fig, ax = plt.subplots(figsize=(8, 6))
df_pcoa_ranks.plot(
    'PC1', 'PC2',
    kind='scatter',
    ax=ax,
    c='lightblue',
    s=500*point_sizes,
    alpha=0.6,
    edgecolor='k')
res_names = features_df.loc[rank_df.index]['reservoir name']
# Fixed axis limits, applied after plotting so they override autoscaling.
ax.set_xlim([-0.8, 1])
ax.set_ylim([-0.6, 0.8])

# One text label per row at its (PC1, PC2) position; adjust_text then moves the
# labels and draws thin connector lines.
texts = [
    plt.text(x_pos, y_pos, s=res_label, alpha=0.7, fontsize=8)
    for x_pos, y_pos, res_label in zip(
        df_pcoa_ranks['PC1'], df_pcoa_ranks['PC2'], df_pcoa_ranks['reservoir name'])
]
adjust_text(
    texts,
    ax=ax,
    arrowprops=dict(arrowstyle="-", color='k', lw=0.3, alpha=0.3),
    expand_objects=(1.2, 1.2),
    expand_text=(1.2, 1.2))

In [None]:
# Pair the integers 0-7 with the entries of `symbols`, in order.
# NOTE(review): in the visible part of the notebook `symbols` is only assigned
# in a later cell - confirm it is defined before this cell runs.
symbols_dict = {number: symbol for number, symbol in zip(range(8), symbols)}
symbols_dict

In [None]:
# Translate each cluster number through `symbols_dict`; numbers that are not
# keys of the dict become NaN (pandas Series.map with a dict).
pca_labels_num = pca_labels_df['cluster_number'].map(symbols_dict)

In [None]:
# Display the mapped values for inspection.
pca_labels_num

In [None]:
import plotly.graph_objects as go

# Marker symbol names, keyed by their position in the list (0-7).
symbols = [
    'circle', 'cross', 'diamond', 'square', 'x',
    'circle-open', 'diamond-open', 'square-open',
]
symbols_dict = {position: name for position, name in enumerate(symbols)}
symbols_list_full = []

# Marker styling: size from |column 3| of X_pca, colour from column 4 on the
# Viridis scale, symbol from `pca_labels_num`.
marker_style = {
    'size': 2*np.abs(X_pca[:, 3]),
    'color': X_pca[:, 4],
    'colorscale': 'Viridis',
    'symbol': pca_labels_num,
    'opacity': 0.8,
    'line': {'width': 1, 'color': 'black'},
}
pca_trace = go.Scatter3d(
    x=X_pca[:, 0],
    y=X_pca[:, 1],
    z=X_pca[:, 2],
    mode='markers',
    marker=marker_style,
)
fig = go.Figure(data=[pca_trace])

# Remove all figure margins.
fig.update_layout(margin={'l': 0, 'r': 0, 'b': 0, 't': 0})
fig.show()