## Run multiobjective optimization of dam portfolios and process the optimization outputs
### T. Janus
### Created: 15/01/2024
### Modified: 06/11/2024

In [None]:
from typing import ClassVar, Dict, List, Any, Tuple, Set, Tuple, Sequence
from typing import TypeAlias, TypeVar, Generic
import subprocess
import pathlib
import numpy as np
import pandas as pd
import string
from datetime import datetime
from parse import parse
import json
import gc
import pprint
import re
import ast
import plotly.express as px
import plotly.graph_objects as go
import networkx as nx
import pygmo as pg
# DEAP is used for nondominated sorting
from deap import base, tools, creator
# Define the fitness and individual classes
from tqdm import tqdm
import seaborn as sns # for Data visualization
import matplotlib
import matplotlib.pyplot as plt # for Data visualization
from IPython.display import display, HTML
from jinja2 import Template
from lib.notebook12 import (
    reduce_mem_usage, read_id_ifc_map, set_remap, get_every_n_row, 
    SolutionFileParser, OutputVisualiser, ObjectiveCalculator, 
    map_ids, find_solution_by_dam_numbers, return_row_by_criterion
)
%matplotlib inline

## Custom functions and classes
### Functions
* `reduce_mem_usage` - Reduces the memory occupied by a dataframe by reducing column data types based on their content and range of values.
* `read_id_ifc_map` - Reads a dictionary that maps optimization dam IDs to IFC dam IDs.
* `set_remap` - Maps values in a set to new values based on a provided mapping dictionary.
* `get_every_n_row` - Gets every n-th row of a dataframe and returns a reduced dataframe.
### Classes
* `SolutionFileParser`
* `OutputVisualiser`
* `ObjectiveCalculator`

### Define file names

In [None]:
# Names of the optimization solution (.sol) files. Every optimization was run for two scenarios:
# * nobuilt - all dams are candidates, as if none of the currently existing dams had been built
# * built - the current state, with the already built dams in place
#-------------------------------------------------------
# 5 objectives, GHG emissions calculated using the G-res methodology
mya_nobuilt_5obj_filename = 'mya_5_obj_nobuilt.sol'
mya_built_5obj_filename = 'mya_5_obj_built.sol'
# 5 objectives, GHG emissions calculated using emission factors
mya_nobuilt_5obj_soued_filename = 'mya_5_obj_nobuilt_soued.sol'
mya_built_5obj_soued_filename = 'mya_5_obj_built_soued.sol'
# 3 objectives, GHG emissions calculated using emission factors
mya_nobuilt_3obj_soued_filename = 'mya_3_obj_nobuilt_soued.sol'
mya_built_3obj_soued_filename = 'mya_3_obj_built_soued.sol'
#-------------------------------------------------------
# Folders with the outputs of the algorithm with expansion / compression
sol_file_folder_5obj = pathlib.Path('moo_solver_CPAIOR/outputs/epsilon2_5obj')
sol_file_folder_3obj = pathlib.Path('moo_solver_CPAIOR/outputs/epsilon2_3obj_soued')
#-------------------------------------------------------
# Full paths to the solution files (`Path / str` already yields a `Path`)
mya_nobuilt_5obj_path_cpaior = sol_file_folder_5obj / mya_nobuilt_5obj_filename
mya_built_5obj_path_cpaior = sol_file_folder_5obj / mya_built_5obj_filename
mya_nobuilt_soued_5obj_path_cpaior = sol_file_folder_5obj / mya_nobuilt_5obj_soued_filename
mya_built_soued5obj_path_cpaior = sol_file_folder_5obj / mya_built_5obj_soued_filename
mya_nobuilt_soud_3obj_path_cpaior = sol_file_folder_3obj / mya_nobuilt_3obj_soued_filename
mya_built_soued_3obj_path_cpaior = sol_file_folder_3obj / mya_built_3obj_soued_filename
#-------------------------------------------------------
# CSV file with dam ids and the objective values of each dam
dam_data_filename = pathlib.Path("outputs/moo/all_hp.csv")
# JSON file mapping the ids used in the MOO algorithm to the IDs in the IFC database
map_file_path = pathlib.Path('outputs/moo/id_to_ifc.json')

### Read data about dams and dam identifiers (different for the water model and for the IFC database)

In [None]:
# Read file with all hydroelectric plants for optimization and their objective values
dam_data: pd.DataFrame = pd.read_csv(dam_data_filename)
# IFC ids of the built dams, i.e. the rows whose status is 'Existing'
built_dam_ifc_ids: Set[int] = set(
    dam_data.loc[dam_data['status'] == 'Existing', 'ifc_id'].to_list())
# Read the ID (water model identifiers) to IFC map
id_ifc_map: Dict[int, int] = read_id_ifc_map(map_file_path)
# Sorted IFC IDs of the dams included in the analysis. The values are taken directly from
# the map so that the result does not depend on the optimization ids being exactly 1..N
# (looking them up by `position + 1` raises KeyError for any other key set).
ifc_ids: List[int] = sorted(id_ifc_map.values())
# The map file is read a second time here, on top of `read_id_ifc_map` above. The repetition
# is kept on purpose so that every name defined in this cell stays available.
with open(map_file_path, 'r', encoding='utf-8') as file:
    raw_id_map = json.load(file)
# JSON object keys are always strings - convert them to int. Maps optim ids to ifc ids
id_map: Dict[int, int] = {int(key): value for key, value in raw_id_map.items()}
# Reverse mapping: ifc ids to optim ids
ifc_to_id_map: Dict[int, int] = {value: key for key, value in id_map.items()}
# ----------------------------------------------------------------------------------------------------------------
# Same CSV again, this time indexed by IFC id
dam_df = pd.read_csv(dam_data_filename, index_col=0).set_index('ifc_id')
# IFC ids of the dams flagged with `status_int == 1`
built_dam_ids_ifc: Set[int] = set(dam_df[dam_df['status_int'] == 1].index.to_list())
# The same set of dams expressed with the ids used for optimization
built_dam_ids_opt: Set[int] = set_remap(built_dam_ids_ifc, ifc_to_id_map)

### Find the number of existing dams

In [None]:
# Rows of `dam_df` whose status is 'Existing'
existing_dams = dam_df.loc[dam_df['status'] == 'Existing']
# Number of existing dams, counted as the non-null entries of the 'name' column
num_existing_dams = existing_dams['name'].count()
print(f"Number of existing dams: {num_existing_dams}")
# Ids of the existing dams, taken from the dataframe index
existing_dam_ids = set(existing_dams.index.to_list())
print("Existing dam ids:", existing_dam_ids)

## Call the optimizer by calling external script using subprocess. Alternatively, run the script `.sh` files (Linux/MacOS X), or `.bat` files (Windows), directly from Terminal/Console.

### RUNS optimization as a subprocess from within this notebook
#### Switch the `rerun_CPAIOR` flag to True if you want to run the optimization yourself (may take 3hr+ to execute). 
#### Otherwise keep it as False and read pre-saved optimization results in .sol files.

In [None]:
# SET THIS TO TRUE TO RUN THE OPTIMIZATION
rerun_CPAIOR: bool = False

# Shell scripts that launch the optimization runs
script_paths = [
    'moo_solver_CPAIOR/run_myanmar_dam_selection.sh',
    'moo_solver_CPAIOR/run_myanmar_dam_selection_soued.sh'
]

# Execute every script with bash via subprocess. A failing script is reported and the
# remaining scripts are still run.
if rerun_CPAIOR:
    for script_path in script_paths:
        try:
            subprocess.run(['bash', script_path], check=True)
        except subprocess.CalledProcessError as err:
            print(f"Error executing script: {err}")
        else:
            print("Optimization runs successful.")

## Parse optimization outputs - fetches the results from `.sol` text files and saves in `.csv` and `.json` formats
### Processes the following optimization scenarios:
1. 5-objective optimization with emissions calculated with ReEmission
2. 5-objective optimization with emissions calculated using emission factors (Soued et al.)
3. 3-objective optimization with emissions calculated using emission factors (Soued et al.)

In [None]:
# Paths and arguments for processing the results of each optimization scenario, keyed by
# option name. Every entry holds:
# * nobj - number of objectives of the optimization
# * row_fraction - used when filtering: every int(1 / row_fraction)-th solution row is kept
# * sol_path_* - input .sol files of the "nobuilt" and "built" scenarios
# * output_json_file_* / output_csv_file_* - optional exports of the parsed .sol files
# * merged_csv_file / nondom_csv_file - outputs with the merged and the nondominated solutions
exec_options = {
    '5obj': {
        'nobj': 5,
        'row_fraction': 1/3,
        'sol_path_nobuilt': mya_nobuilt_5obj_path_cpaior,
        'sol_path_built': mya_built_5obj_path_cpaior,
        'output_json_file_nobuilt': sol_file_folder_5obj / 'mya_5_obj_nobuilt.json',
        'output_csv_file_nobuilt': sol_file_folder_5obj / 'mya_5_obj_nobuilt.csv',
        'output_json_file_built': sol_file_folder_5obj / 'mya_5_obj_built.json',
        'output_csv_file_built': sol_file_folder_5obj / 'mya_5_obj_built.csv',
        'merged_csv_file': sol_file_folder_5obj / 'merged_df_5obj.csv',
        'nondom_csv_file': sol_file_folder_5obj / 'em_int_nondom_df_5obj.csv',
    },
    '5obj_soued': {
        'nobj': 5,
        'row_fraction': 1/3,
        'sol_path_nobuilt': mya_nobuilt_soued_5obj_path_cpaior,
        'sol_path_built': mya_built_soued5obj_path_cpaior,
        'output_json_file_nobuilt': sol_file_folder_5obj / 'mya_5_obj_nobuilt_soued.json',
        'output_csv_file_nobuilt': sol_file_folder_5obj / 'mya_5_obj_nobuilt_soued.csv',
        'output_json_file_built': sol_file_folder_5obj / 'mya_5_obj_built_soued.json',
        'output_csv_file_built': sol_file_folder_5obj / 'mya_5_obj_built_soued.csv',
        'merged_csv_file': sol_file_folder_5obj / 'merged_df_5obj_soued.csv',
        'nondom_csv_file': sol_file_folder_5obj / 'em_int_nondom_df_5obj_soued.csv',
    },
    '3obj_soued': {
        'nobj': 3,
        'row_fraction': 1,
        'sol_path_nobuilt': mya_nobuilt_soud_3obj_path_cpaior,
        'sol_path_built': mya_built_soued_3obj_path_cpaior,
        'output_json_file_nobuilt': sol_file_folder_3obj / 'mya_3_obj_nobuilt_soued.json',
        'output_csv_file_nobuilt': sol_file_folder_3obj / 'mya_3_obj_nobuilt_soued.csv',
        'output_json_file_built': sol_file_folder_3obj / 'mya_3_obj_built_soued.json',
        'output_csv_file_built': sol_file_folder_3obj / 'mya_3_obj_built_soued.csv',
        'merged_csv_file': sol_file_folder_3obj / 'merged_df_3obj_soued.csv',
        'nondom_csv_file': sol_file_folder_3obj / 'em_int_nondom_df_3obj_soued.csv',
    },
}

In [None]:
# Options (keys of `exec_options`) to process in this run. Process them one-by-one: on smaller
# laptops, running all three options in a row can lead to memory overflow followed by a
# kernel crash. The other available keys are '5obj_soued' and '3obj_soued'.
options_batch = ['5obj']
# Remove some solution rows to reduce the amount of data for saving and visualising
filter_dataframe: bool = True
# Export the data parsed from the .sol files to json and/or csv when the respective flag is True
convert_to_json = False
convert_to_csv = False
# Run the processing loop at all
process_results: bool = True
# Write the merged and the nondominated dataframes to csv files
save_to_file: bool = True

In [None]:
def _parse_scenario_solutions(
        scenario_options: Dict[str, Any],
        scenario: str,
        convert_to_json: bool,
        convert_to_csv: bool) -> pd.DataFrame:
    """Parse the `.sol` file of one scenario and return its memory-reduced solutions.

    `scenario` is the key suffix used in the options dictionary: 'nobuilt' or 'built'.
    """
    sol_parser = SolutionFileParser(scenario_options[f'sol_path_{scenario}'])
    sol_parser.parse()
    if convert_to_json:
        sol_parser.to_json(scenario_options[f'output_json_file_{scenario}'])
    if convert_to_csv:
        sol_parser.to_csv(scenario_options[f'output_csv_file_{scenario}'])
    # The parser is local to this helper, so it is released as soon as the helper returns
    return reduce_mem_usage(sol_parser.solutions_df)


def parse_sol_and_reduce_mem_usage(
        exec_options: Dict[str, Dict[str, str]], 
        option: str, 
        convert_to_json: bool, 
        convert_to_csv: bool) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Parse the "nobuilt" and "built" `.sol` files of one optimization option.

    Args:
        exec_options: Processing settings keyed by option name. The selected entry must
            provide `sol_path_nobuilt` and `sol_path_built`, plus the
            `output_json_file_*` / `output_csv_file_*` paths when the matching
            conversion flag is set.
        option: Key of the entry in `exec_options` to process.
        convert_to_json: If True, also export each parsed solution file to json.
        convert_to_csv: If True, also export each parsed solution file to csv.

    Returns:
        Tuple `(df_built, df_nobuilt)` with the solutions of each scenario after
        `reduce_mem_usage`. Note the order: the built scenario comes first.
    """
    scenario_options = exec_options[option]
    # The scenarios are parsed one after another (nobuilt first, as before) so that only
    # one parser object is alive at any time
    df_nobuilt = _parse_scenario_solutions(
        scenario_options, 'nobuilt', convert_to_json, convert_to_csv)
    df_built = _parse_scenario_solutions(
        scenario_options, 'built', convert_to_json, convert_to_csv)
    return df_built, df_nobuilt

def filter_parsed_dataframes(
        df_built: pd.DataFrame, 
        df_nobuilt: pd.DataFrame,
        exec_options: Dict[str, Dict[str, str]], 
        option: str) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Sort both solution dataframes by energy and thin them out with `get_every_n_row`.

    The row step passed to `get_every_n_row` is `int(1 / row_fraction)`, where
    `row_fraction` is read from `exec_options[option]`.

    Returns:
        Tuple `(df_built_filt, df_nobuilt_filt)`.
    """
    # Order the solutions by increasing energy before thinning them out
    built_sorted = df_built.sort_values(by=['energy'], ascending=True)
    nobuilt_sorted = df_nobuilt.sort_values(by=['energy'], ascending=True)
    # The same step applies to both scenarios
    row_step = int(1/exec_options[option]['row_fraction'])
    return (
        get_every_n_row(built_sorted, row_step),
        get_every_n_row(nobuilt_sorted, row_step))

def make_merged_df(
        df_built_filt: pd.DataFrame, 
        df_nobuilt_filt: pd.DataFrame, 
        exec_options: Dict[str, Dict[str, str]], 
        option: str) -> pd.DataFrame:
    """Merge the "Built" and "Not Built" solutions into one dataframe with derived columns.

    The input dataframes are not modified. Columns are renamed to their presentation
    names, each row is labelled with its scenario, and the firm power ratio and the
    energy-production columns are added. For 5-objective options the land loss is also
    binned into classes. The result is sorted by scenario and mean annual hydropower.

    Args:
        df_built_filt: Solutions of the scenario with built dams in place.
        df_nobuilt_filt: Solutions of the scenario without any built dams.
        exec_options: Processing settings keyed by option name; `nobj` of the selected
            entry decides whether the land-loss classes are added.
        option: Key of the entry in `exec_options` to process.

    Returns:
        The merged dataframe after `reduce_mem_usage`.
    """
    nobj = exec_options[option]['nobj']
    # Concatenate by the row dimension in order "Built" -> "Not Built"
    merged_df = pd.concat([df_built_filt, df_nobuilt_filt], ignore_index=True)
    # Label the rows on the merged frame instead of on the inputs: the callers' dataframes
    # stay untouched and no column is assigned on what may be a slice of another dataframe
    merged_df['Scenario'] = (
        ["Built"] * len(df_built_filt) + ["Not Built"] * len(df_nobuilt_filt))
    old_new_col_map = {
        'energy': "Mean annual HP, [MW]",
        'ghg': 'GHG emissions [tonne CO<sub>2,eq</sub>/year]', 
        'firm_energy': 'Firm HP, [MW]',
        'loss_agri': 'Agricultural land loss, [km<sup>2</sup>]',
        'loss_forest': 'Deforestation, [km<sup>2</sup>]',
        'num_dams': 'No. of selected dams',
        'dam_ids': 'Dam IDs',
        'land_loss': 'Land loss, [km<sup>2</sup>]',
        'ghg_intensity': 'GHG intensity [gCO<sub>2,eq</sub>/kWh]'}
    merged_df.rename(columns=old_new_col_map, inplace=True)
    merged_df['Dam IDs'] = merged_df['Dam IDs'].apply(set)
    merged_df['Firm Power Ratio, [%]'] = \
        merged_df['Firm HP, [MW]'] / merged_df['Mean annual HP, [MW]'] * 100
    merged_df['Scenario, [1/0]'] = merged_df['Scenario'].map({'Built': 1, 'Not Built': 0})

    if nobj == 5:
        # Bin edges for land loss. With right=False each class is [lower, upper) and
        # values outside [0, 2000) are left without a class (NaN).
        bins = [0, 300, 600, 1000, 1500, 2000]
        # NOTE(review): the 2nd and 3rd labels say 500 while the bin edge above is 600.
        # Both are kept unchanged because downstream outputs may depend on them -
        # confirm which of the two is intended.
        labels = ['0-300 km2', '300-500 km2', '500-1000 km2', '1000-1500 km2', '1500-2000 km2']
        merged_df["Loss of Land [km<sup>2</sup>]"] = pd.cut(
            merged_df['Land loss, [km<sup>2</sup>]'], bins=bins, labels=labels, right=False)
    # Arrange by scenario and energy in ascending order
    merged_df = merged_df.sort_values(by=['Scenario', 'Mean annual HP, [MW]'], ascending=True)
    # Introduce new columns: mean power in MW converted to energy per year / per day
    merged_df['HP Production [GWh/year]'] = merged_df["Mean annual HP, [MW]"] * 365.25 * 24 / 1_000
    merged_df['Mean HP [GWh/d]'] = merged_df["Mean annual HP, [MW]"] * 24 / 1_000
    merged_df['Firm HP [GWh/d]'] = merged_df['Firm HP, [MW]'] * 24 / 1_000

    # Reduce size of some data in merged_df
    merged_df['Scenario, [1/0]'] = merged_df['Scenario, [1/0]'].astype('uint8')
    if nobj == 5:
        merged_df["Loss of Land [km<sup>2</sup>]"] = merged_df["Loss of Land [km<sup>2</sup>]"].astype('category')
    # int8 cannot hold more than 127 and would silently wrap around for larger
    # portfolios, so fall back to int16 when the data require it
    dam_counts = merged_df['No. of selected dams']
    dam_count_dtype = 'int16' if dam_counts.max() > np.iinfo(np.int8).max else 'int8'
    merged_df['No. of selected dams'] = dam_counts.astype(dam_count_dtype)
    # Use an automated method
    merged_df = reduce_mem_usage(merged_df)
    return merged_df

def _nondom_sort_columns(nobj: int) -> List[str]:
    """Return the `merged_df` columns carried through the sorting for `nobj` objectives."""
    if nobj not in (3, 5):
        raise ValueError(f"Unsupported number of objectives: {nobj!r}. Expected 3 or 5.")
    col_names = [
        "Mean annual HP, [MW]", 'GHG emissions [tonne CO<sub>2,eq</sub>/year]', 
        'GHG intensity [gCO<sub>2,eq</sub>/kWh]', 'Firm Power Ratio, [%]']
    if nobj == 5:
        col_names.append('Land loss, [km<sup>2</sup>]')
    col_names.append('Dam IDs')
    return col_names


def _first_front_indices(objectives: np.ndarray, method: str) -> np.ndarray:
    """Return the row indices of the first nondominated front of a two-column minimisation."""
    if method == "pygmo":
        # Reversed so that the points are listed in the order of increasing HP production
        # (this relies on the order in which pygmo returns the front)
        return pg.non_dominated_front_2d(points=objectives)[::-1]
    # method == "deap": the creator classes are registered by the caller
    individuals = [creator.Individual(point) for point in objectives]
    for position, individual in enumerate(individuals):
        individual.fitness.values = individual  # Set the point as the fitness value
        individual.index = position  # Store the original index
    fronts = tools.sortNondominated(individuals, len(individuals), first_front_only=True)
    # Explicit integer dtype keeps the result usable as an index array
    return np.array([individual.index for individual in fronts[0]], dtype=int)


def nondom_sort_in_2_objectives(
        merged_df: pd.DataFrame, 
        exec_options: Dict[str, Dict[str, str]], 
        option: str, method: str = "pygmo") -> Tuple[np.ndarray, np.ndarray, np.ndarray]:
    """Find the 2D nondominated solutions of the "Built" and "Not Built" scenarios.

    For each scenario the first two selected columns - mean annual HP (maximised) and
    GHG emissions (minimised) - are sorted and the rows on the first nondominated front
    are returned together with the remaining selected columns.

    Args:
        merged_df: Merged solutions with a 'Scenario' column holding "Built" /
            "Not Built" and the presentation column names.
        exec_options: Processing settings keyed by option name; `nobj` of the selected
            entry (3 or 5) decides which columns are carried along.
        option: Key of the entry in `exec_options` to process.
        method: Sorting backend, either "pygmo" (default) or "deap".

    Returns:
        Tuple `(built, not_built, all)` of object arrays with one row per nondominated
        solution; `all` is the built rows followed by the not-built rows.

    Raises:
        ValueError: If `method` or the number of objectives is not supported.
    """
    # Fail early with a clear message instead of an UnboundLocalError further down
    if method not in ("pygmo", "deap"):
        raise ValueError(f"Unknown sorting method: {method!r}. Expected 'pygmo' or 'deap'.")
    col_names = _nondom_sort_columns(exec_options[option]['nobj'])

    if method == "deap":
        # Registered once per call; `creator.create` replaces existing classes of the same name
        creator.create("FitnessMin", base.Fitness, weights=(-1.0, -1.0))  # Minimize both objectives
        creator.create("Individual", list, fitness=creator.FitnessMin)

    nondom_by_scenario = []
    for scenario in ('Built', 'Not Built'):
        # Object array because the 'Dam IDs' column holds sets next to the numeric columns
        points = merged_df.loc[merged_df['Scenario'] == scenario, col_names].to_numpy(dtype=object)
        # Separate float matrix for the two sorted objectives. Energy is negated there so
        # that both columns are minimised; `points` itself is never altered.
        objectives = points[:, :2].astype(float)
        objectives[:, 0] *= -1
        nondom_by_scenario.append(points[_first_front_indices(objectives, method)])

    xy_nondom_built_np, xy_nondom_nobuilt_np = nondom_by_scenario
    xy_nondom_all = np.concatenate((xy_nondom_built_np, xy_nondom_nobuilt_np), axis=0)
    return xy_nondom_built_np, xy_nondom_nobuilt_np, xy_nondom_all

def convert_nondom_to_dfs(
        xy_nondom_built_np: np.ndarray,
        xy_nondom_nobuilt_np: np.ndarray,
        exec_options: Dict[str, Dict[str, str]],
        option: str) -> pd.DataFrame:
    """Combine the nondominated points of both scenarios into one labelled dataframe.

    Args:
        xy_nondom_built_np: Nondominated points of the "Built" scenario, one row per
            solution, with the columns in the order listed in `col_names` below.
        xy_nondom_nobuilt_np: The same for the "Not Built" scenario.
        exec_options: Processing settings keyed by option name; `nobj` of the selected
            entry (3 or 5) fixes the expected columns.
        option: Key of the entry in `exec_options` to process.

    Returns:
        Dataframe with the built rows followed by the not-built rows, a 'Scenario'
        column, the yearly HP production and, for 5 objectives, the land-loss class.

    Raises:
        ValueError: If the number of objectives is neither 3 nor 5.
    """
    nobj = exec_options[option]['nobj']
    # Fail with a clear message instead of an UnboundLocalError on `col_names`
    if nobj not in (3, 5):
        raise ValueError(f"Unsupported number of objectives: {nobj!r}. Expected 3 or 5.")
    col_names = [
        "Mean annual HP, [MW]", 'GHG emissions [tonne CO<sub>2,eq</sub>/year]', 
        'GHG intensity [gCO<sub>2,eq</sub>/kWh]', 'Firm Power Ratio, [%]']
    if nobj == 5:
        col_names.append('Land loss, [km<sup>2</sup>]')
    col_names.append('Dam IDs')

    scenario_frames = []
    for scenario, points in (
            ("Built", xy_nondom_built_np), ("Not Built", xy_nondom_nobuilt_np)):
        scenario_df = pd.DataFrame(points, columns=col_names)
        scenario_df['Scenario'] = scenario
        scenario_frames.append(scenario_df)
    xy_nondom_df = pd.concat(scenario_frames, ignore_index=True)
    # Mean power in MW converted to energy per year
    xy_nondom_df['HP Production [GWh/year]'] = xy_nondom_df["Mean annual HP, [MW]"] * 365.25 * 24 / 1_000
    if nobj == 5:
        # Bin edges for land loss. With right=False each class is [lower, upper) and
        # values outside [0, 2000) are left without a class (NaN).
        bins = [0, 300, 600, 1000, 1500, 2000]
        # NOTE(review): the 2nd and 3rd labels say 500 while the bin edge above is 600.
        # Both are kept unchanged because downstream outputs may depend on them -
        # confirm which of the two is intended.
        labels = ['0-300 km2', '300-500 km2', '500-1000 km2', '1000-1500 km2', '1500-2000 km2']
        xy_nondom_df["Loss of Land [km<sup>2</sup>]"] = pd.cut(
            xy_nondom_df['Land loss, [km<sup>2</sup>]'], bins=bins, labels=labels, right=False)
    return xy_nondom_df

In [None]:
# Process every selected option: parse the .sol files, optionally thin out the solutions,
# merge both scenarios, find the 2D nondominated solutions and save the results.
if process_results:
    for option in options_batch:
        # Parse and reduce size of solutions from .sol files
        df_built, df_nobuilt = parse_sol_and_reduce_mem_usage(
            exec_options, 
            option,
            convert_to_json=convert_to_json, 
            convert_to_csv=convert_to_csv)
        # Filter the results by removing some solutions, but only when requested with the
        # `filter_dataframe` flag (previously the flag was defined but never consulted)
        if filter_dataframe:
            df_built_filt, df_nobuilt_filt = filter_parsed_dataframes(
                df_built, 
                df_nobuilt,
                exec_options,
                option)
        else:
            df_built_filt, df_nobuilt_filt = df_built, df_nobuilt
        # Merge data from two optimization scenarios (with build status and without build statuses)
        merged_df = make_merged_df(df_built_filt, df_nobuilt_filt, exec_options, option)
        # Perform nondominated sorting
        print("Performing nondominated sorting")
        xy_nondom_built_np, xy_nondom_nobuilt_np, xy_nondom_all = nondom_sort_in_2_objectives(merged_df, exec_options, option)
        print("Finished nondominated sorting")
        em_int_nondom_df = convert_nondom_to_dfs(xy_nondom_built_np, xy_nondom_nobuilt_np, exec_options, option)
        if save_to_file:
            em_int_nondom_df.to_csv(exec_options[option]['nondom_csv_file'], index=False)
            merged_df.to_csv(exec_options[option]['merged_csv_file'], index = False)
        # Statistics - the counts refer to the parsed solutions, i.e. before any filtering
        number_of_solutions = len(df_nobuilt) + len(df_built)
        print(f"Processed the option {option}")
        print(f"Total number of solutions : {number_of_solutions}")
        print(f"Scenario with built constructed dams {len(df_built)} solutions")
        print(f"Scenario with zero constructed dams {len(df_nobuilt)} solutions")

## The End