## Global Sensitivity and Uncertainty Analysis

The GSUA approach is based on the work of A. Carmona-Cabrero and R. Muñoz-Carpena, University of Florida. The paper can be found in JASSS: [(Carmona-Cabrero et al., 2024)](https://doi.org/10.18564/jasss.5174)



In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from SALib.sample import sobol
import json
from pathlib import Path

# Folder containing config files
# SCENARIO is appended to both the input and the output base path. Its trailing slash
# matters: SAVE_DIR is later used as a plain string prefix for the output file names.
SCENARIO = "shopping/gsua_test/"
CONFIG_DIR = Path("/blue/carpena/haasehelen/ifwaste/input/gsua_based_configuration/" + SCENARIO)
# True -> Latin hypercube sampling (SALib latin.sample); False -> Sobol sampling (SALib sobol.sample)
LATINHYPERCUBE_SAMPLING = False
# NOTE(review): SALib's Sobol sampler recommends a power of two here (e.g. 128) — TODO confirm
l = 100  # Sampling intensity
# Number of times each sampled parameter set is repeated in the written sample files
N = 5
# Output location for the generated sample files (a str, not a Path; concatenated with file names)
SAVE_DIR = "/blue/carpena/haasehelen/ifwaste/input/gsua_based_configuration/samples/" + SCENARIO


In [2]:
# Show the resolved output location as a sanity check.
# NOTE(review): the stored output of this cell ends in ".../gsua_test/gsua", which is not
# what the current SAVE_DIR definition produces — the output looks stale; re-run to confirm.
SAVE_DIR

'/blue/carpena/haasehelen/ifwaste/input/gsua_based_configuration/samples/shopping/gsua_test/gsua'

In [3]:
def extract_parameters(d, path=""):
    """Flatten a nested configuration dict into ``{"a:b:c": spec}`` entries.

    A dict node is treated as a parameter leaf when it contains both
    ``"value"`` and ``"layer"``, or when it contains ``"distribution"``.
    Every other dict is descended into, joining the keys with ``":"``.
    Values that are not dicts are ignored.

    Each leaf is reduced to the fields value, distribution, bounds, layer,
    dtype, length, options and decimals (in that order); absent fields are
    filled with the string ``"NAN"``. A leaf without ``"layer"`` uses its
    ``"distribution"`` as the layer.
    """
    spec_fields = ("value", "distribution", "bounds", "layer",
                   "dtype", "length", "options", "decimals")
    flat = {}
    for key, node in d.items():
        if not isinstance(node, dict):
            continue
        qualified = key if not path else f"{path}:{key}"
        is_leaf = ("value" in node and "layer" in node) or "distribution" in node
        if not is_leaf:
            # Plain container: recurse and merge the flattened children.
            flat.update(extract_parameters(node, qualified))
            continue
        spec = {field: node.get(field, "NAN") for field in spec_fields}
        if "layer" not in node:
            # Only reachable for distribution-only leaves.
            spec["layer"] = node["distribution"]
        flat[qualified] = spec
    return flat

## 1. First read all parameterization values from the JSON files in CONFIG_DIR
Generate 4 DataFrames:
- constants: all parameters that are constants
- parameter: all parameters that shall be sampled using Sobol, split into:
  - simple_distributions: all parameters that are either triangular or uniform
  - other_distributions: parameters that require extra attention, as they have to be converted to a "simple" distribution


In [4]:
# Show the resolved input folder that the JSON configuration files are read from.
CONFIG_DIR

PosixPath('/blue/carpena/haasehelen/ifwaste/input/gsua_based_configuration/shopping/gsua_test')

In [5]:
# List the JSON configuration files found directly inside CONFIG_DIR.
list(CONFIG_DIR.glob("*.json"))

[PosixPath('/blue/carpena/haasehelen/ifwaste/input/gsua_based_configuration/shopping/gsua_test/household.json'),
 PosixPath('/blue/carpena/haasehelen/ifwaste/input/gsua_based_configuration/shopping/gsua_test/convenience_store.json'),
 PosixPath('/blue/carpena/haasehelen/ifwaste/input/gsua_based_configuration/shopping/gsua_test/simulation.json'),
 PosixPath('/blue/carpena/haasehelen/ifwaste/input/gsua_based_configuration/shopping/gsua_test/neighborhood.json'),
 PosixPath('/blue/carpena/haasehelen/ifwaste/input/gsua_based_configuration/shopping/gsua_test/discount_store.json'),
 PosixPath('/blue/carpena/haasehelen/ifwaste/input/gsua_based_configuration/shopping/gsua_test/premium_retail.json')]

In [6]:
# Flatten every JSON config in CONFIG_DIR into one row per parameter.
all_json_files = list(CONFIG_DIR.glob("*.json"))
df_list = []
for json_file in all_json_files:
    with open(json_file) as f:
        config = json.load(f)
    flat_params = extract_parameters(config)
    df = (
        pd.DataFrame.from_dict(flat_params, orient="index")
        .rename_axis("name")
        .reset_index()
        .replace("", "NAN")  # empty strings count as "not set"
    )
    df_list.append(df)

df = pd.concat(df_list, ignore_index=True)

# Entries carrying an explicit value are constants; everything else gets sampled.
has_value = df["value"] != "NAN"
constants = df[has_value]
parameter = df[~has_value]

# triang/unif can be sampled directly; all other distributions need converting first.
mask = parameter["distribution"].eq("triang") | parameter["distribution"].eq("unif")
simple_distributions = parameter[mask]
other_distributions = parameter[~mask]



In [7]:
# Inspect the parameters that must be converted before sampling (at most the first 100 rows).
other_distributions.head(100)

Unnamed: 0,name,value,distribution,bounds,layer,dtype,length,options,decimals
3,Household:hh_max_avail_time_per_day,NAN,vector,"[15, 180]",household,int,7,NAN,NAN
6,Household:hh_shopping_frequency,NAN,subgroup,"{'1-2': 0.08, '3-6': 0.12, '7-10': 0.8}",household,int,NAN,NAN,NAN
8,Household:hh_pay_day_interval,NAN,subgroup,"{'14': 0.628, '30': 0.103, '7': 0.269}",household,int,NAN,NAN,NAN
19,Adult:adult_preference_vector,NAN,vector,"[0, 1]",household,float,7,NAN,2
35,Child:child_preference_vector,NAN,vector,"[0, 1]",household,float,7,NAN,2
75,Neighborhood:nh_store_amounts,NAN,vector,"[0, 1]",neighborhood,int,3,NAN,NAN


In [8]:
#sim = df[df["layer"] == "simulation"]
#sim = sim.drop(columns=["bounds", "distribution", "layer", "length", "options"])
#sim.to_latex(index=False)

In [9]:
#nh = df[df["layer"] == "neighborhood"]
#nh = nh.drop(columns=["value", "layer", "length", "dtype", "decimals"])
#nh.to_latex(index=False)

In [10]:
#hh = df[df["layer"] == "household"]
#hh = hh.drop(columns=["value", "layer", "length", "options"])
#hh.to_latex()

In [11]:
# Inspect the first ten directly sampleable (triang/unif) parameters.
simple_distributions.head(10)

Unnamed: 0,name,value,distribution,bounds,layer,dtype,length,options,decimals
0,Household:hh_amount_children,NAN,unif,"[0, 6]",household,int,NAN,NAN,NAN
1,Household:hh_amount_adults,NAN,unif,"[1, 2]",household,int,NAN,NAN,NAN
2,Household:hh_level_of_concern,NAN,unif,"[0, 1]",household,float,NAN,NAN,2
4,Household:hh_impulse_buy_likelihood,NAN,unif,"[0, 1.6]",household,float,NAN,NAN,2
5,Household:hh_daily_budget,NAN,unif,"[8.36, 79.5]",household,float,NAN,NAN,2
7,Household:hh_min_time_to_cook,NAN,unif,"[16, 24]",household,int,NAN,NAN,NAN
9,Household:hh_time_per_store,NAN,unif,"[10, 45]",household,int,NAN,NAN,NAN
10,Household:hh_price_sensitivity,NAN,unif,"[0, 1]",household,float,NAN,NAN,2
11,Household:hh_brand_sensitivity,NAN,unif,"[0, 1]",household,float,NAN,NAN,2
12,Household:hh_quality_sensitivity,NAN,unif,"[0, 1]",household,float,NAN,NAN,2


## 2. Convert all parameters from "other_distributions" into "simple_distributions"

In [12]:
#convert other distributions to sobol-sampleable values
for _,row in other_distributions.iterrows(): 
    if row["distribution"] == "subgroup": 
        new_row = pd.DataFrame({"name":row["name"], "distribution":"unif", "bounds":[[0,1]], "dtype":row["dtype"],
                            "decimals":row["decimals"], "layer": row["layer"], "value":"NAN", "options":"NAN"})
        simple_distributions = pd.concat([simple_distributions, new_row], ignore_index=True)
        
    if row["distribution"] == "vector": 
        for i in range(row["length"]): 
            new_row = pd.DataFrame({"name":row["name"]+"_"+str(i), "distribution":"unif", "bounds":[row["bounds"]], "dtype":row["dtype"],
                            "decimals":2, "layer": row["layer"], "value":"NAN", "options":"NAN"})
            simple_distributions = pd.concat([simple_distributions, new_row], ignore_index=True)
            
    if row["distribution"] == "sales": 
        new_row = pd.DataFrame({"name":row["name"], "distribution":"unif", "bounds":[[0,1]], "dtype":"float",
                            "decimals":2, "layer": row["layer"], "value":"NAN", "options":"NAN"})
        #sales werden dann von vorne nach hinten ausgewählt - nicht noch randomisiert - zack 
        simple_distributions = pd.concat([simple_distributions, new_row], ignore_index=True)        
    

## 3. Read all parameters (now simple and other) into a problem definition for sobol sampling
Then round all values according to the json files (dtype + decimals)

## 4. Prepare for HPC
- add sens_class line for HPC
- check if at least one store would be in the neighborhood, otherwise remove entire sample

## 5. Add replications of samples, write everything to the file
- save the values by level
- also save the parameter names in an extra file for later usage

## 6. Convert other_distributions back to original types for simulation

In [None]:
import csv
import os
from pandas.api.types import is_numeric_dtype
from SALib.sample import latin

# Create the output folder if it doesn't exist. exist_ok=True makes this a single call
# without the separate existence check, so it cannot fail when the folder appears in between.
os.makedirs(SAVE_DIR, exist_ok=True)
    
def convert_p_to_value_in_subgroup(p, bounds):
    """Map a probability ``p`` onto an integer drawn from weighted integer groups.

    ``bounds`` maps group labels to probabilities. A label is either a single
    integer (``"14"``) or an inclusive range (``"3-6"``). Groups are laid out
    on the cumulative probability axis in dict order; inside the group that
    contains ``p`` every integer of the range gets an equal share.

    Args:
        p: Sampled probability, expected in ``[0, 1]``.
        bounds: Dict ``{label: probability}``.

    Returns:
        The integer selected by ``p``. When ``p`` lies beyond the accumulated
        probability (weights that sum to slightly less than 1 through floating
        point error, or that are not normalised), the largest value of the
        last group with positive probability is returned rather than ``None``.

    Raises:
        ValueError: If ``bounds`` contains no group with positive probability.
    """
    cumulative = 0.0
    fallback = None
    for key, prob in bounds.items():
        if prob <= 0:
            # A zero-weight group can never be selected and would make width == 0.
            continue
        if '-' in key:
            start, end = map(int, key.split('-'))
        else:
            start = end = int(key)
        fallback = end
        cumulative += prob
        if p <= cumulative:
            n_values = end - start + 1
            width = prob / n_values
            relative_p = p - (cumulative - prob)
            # Clamp on both sides: float error (or p outside [0, 1]) must not leave the group.
            index_in_group = min(max(int(relative_p / width), 0), n_values - 1)
            return start + index_in_group
    if fallback is None:
        raise ValueError("bounds must contain at least one group with a positive probability")
    return fallback


problems = {}  # currently unused; kept because other cells may expect the name

# Name lookups used inside the loop; neither frame is modified by this cell.
other_names = set(other_distributions["name"])
simple_names = simple_distributions["name"].to_list()

# NOTE(review): neither the SALib samplers nor np.random are seeded in this cell, so the
# generated sample files differ between runs — consider adding a seed.
for layer in simple_distributions["layer"].unique():
    print(layer)

    # --- problem definition for this layer -------------------------------------------
    # A dedicated name is used so the combined `df` of the loading cell is not overwritten.
    layer_params = simple_distributions[simple_distributions["layer"] == layer]
    names = layer_params["name"].to_list()
    distributions = layer_params["distribution"].to_list()
    bounds = layer_params["bounds"].to_list()
    problem = {'num_vars': len(names),
               'names': names,
               'bounds': bounds,
               'dists': distributions
               }

    # --- sampling --------------------------------------------------------------------
    if LATINHYPERCUBE_SAMPLING:
        param_values = pd.DataFrame(latin.sample(problem, l), columns=names)
    else:
        # calc_second_order=True would be needed for second-order interactions.
        param_values = pd.DataFrame(data=sobol.sample(problem, l, calc_second_order=False), columns=names)

    # --- apply dtype/decimals to every column that is not a subgroup/sales stand-in ----
    for param_name in param_values.columns:
        if param_name not in other_names:
            spec = simple_distributions[simple_distributions["name"] == param_name]
            if spec["dtype"].values[0] == "int":
                param_values[param_name] = param_values[param_name].round().astype("int")
            elif spec["dtype"].values[0] == "float":
                param_values[param_name] = param_values[param_name].astype("float").round(int(spec["decimals"].values[0]))

    # --- filter out unperformable sets (no stores in simulation) ------------------------
    if layer == "neighborhood":
        print(len(param_values))
        store_total = (param_values['Neighborhood:nh_store_amounts_0']
                       + param_values['Neighborhood:nh_store_amounts_1']
                       + param_values['Neighborhood:nh_store_amounts_2'])
        # Explicit copy so the column assignments below operate on an independent frame.
        param_values = param_values[store_total != 0].copy()
        print(len(param_values))

    # --- transform the uniform stand-ins back into their original representation ------
    for _, row in other_distributions.iterrows():
        orig_name = row["name"]

        if row["distribution"] == "subgroup" and orig_name in param_values.columns:
            # The sampled value is a probability; map it onto the weighted integer groups.
            group_probs = row["bounds"]
            param_values[orig_name] = param_values[orig_name].apply(
                lambda p: convert_p_to_value_in_subgroup(p, group_probs))

        # Single quotes / plain names inside the f-strings keep this valid before Python 3.12.
        if row["distribution"] == "vector" and f"{orig_name}_0" in param_values.columns:
            col_names = [f"{orig_name}_{i}" for i in range(row["length"])]
            # Merge the per-element columns into one column holding a list per row ...
            param_values[orig_name] = param_values[col_names].values.tolist()
            # ... and drop the per-element columns.
            param_values.drop(columns=col_names, inplace=True)

        if row["distribution"] == "sales" and orig_name in param_values.columns:
            percentage = param_values[orig_name]
            options = row["options"]
            # Number of sales to select: sampled share of the options, at least one.
            number_of_sales = round(percentage * len(options)).astype(int).clip(lower=1)
            param_values[orig_name] = number_of_sales.apply(lambda n: options[:n])

    # NOTE(review): this pass only fires for numeric columns whose name is NOT in
    # simple_distributions, yet it then looks that name up in simple_distributions, so
    # `spec` would be empty and `.values[0]` would raise IndexError. With the columns
    # produced above (merged vector columns hold lists; subgroup/sales names were added to
    # simple_distributions) the condition appears never to be true — confirm which table
    # was meant before relying on this rounding.
    for param_name in param_values.columns:
        if param_name not in simple_names and is_numeric_dtype(param_values[param_name]):
            spec = simple_distributions[simple_distributions["name"] == param_name]
            if spec["dtype"].values[0] == "int":
                #param_values[param_name] = param_values[param_name].round().astype("int")
                # floor(x + U(0, 1)): rounds up with probability equal to the fractional part.
                param_values[param_name] = param_values[param_name].apply(lambda x: int(np.floor(x + np.random.uniform(0, 1))))

            elif spec["dtype"].values[0] == "float":
                param_values[param_name] = param_values[param_name].astype("float").round(int(spec["decimals"].values[0]))

    print("GSA sample matrix size: {}".format(param_values.shape))
    # Column names without the sens_class marker column that is inserted further down.
    header = param_values.columns.to_list()
    #param_values['sens_class'] = 'sens_class'

    # for shopping clustering: every sample repeated N times, written with a header row
    sample_matrix_realizations = pd.DataFrame(np.repeat(param_values.to_numpy(), repeats=N, axis=0))
    sample_matrix_realizations.to_csv( f"{SAVE_DIR}{layer}_ifwaste_sample_df.txt", header=header, index=False, sep='\t')

    # for HPC: prepend the constant "sens_class" column, then repeat every sample N times
    param_values.insert(0, 'sens_class', 'sens_class')
    sample_matrix_realizations = pd.DataFrame(np.repeat(param_values.to_numpy(), repeats=N, axis=0))

    print("GSA sample matrix size: {}".format(sample_matrix_realizations.shape))
    # write the sample matrix (no header row) to configure the model
    sample_matrix_realizations.to_csv( f"{SAVE_DIR}{layer}_ifwaste_sample.txt", header=False, index=False, sep='\t')
    # parameter names go into a separate file, one name per line
    with open(f"{SAVE_DIR}{layer}_ifwaste_sample_header.txt", "w", newline="") as f:
        writer = csv.writer(f)
        for item in header:
            writer.writerow([item])

    
    
    
# All constant values as a single-row table, plus the matching parameter names.
constants_row = constants["value"].to_frame().T
constant_names = constants["name"]

# Values only (no header row); the names go into a separate header file.
constants_row.to_csv(SAVE_DIR + 'simulation_ifwaste_sample.txt', header=False, index=False, sep='\t')
constant_names.to_csv(SAVE_DIR + 'simulation_ifwaste_sample_header.txt', index=False, header=False)

# for shopping clustering: the same row again, this time with the names as header
constants_row.to_csv(
    SAVE_DIR + 'simulation_ifwaste_sample_df.txt',
    header=constant_names.tolist(),
    index=False,
    sep='\t',
)


household


  sample = self._random(n, workers=workers)


GSA sample matrix size: (7200, 52)
GSA sample matrix size: (36000, 53)
neighborhood
2200
1927
GSA sample matrix size: (1927, 18)
GSA sample matrix size: (9635, 19)


  sample = self._random(n, workers=workers)


In [14]:
# Column names of the last processed layer (`header` is reassigned on every loop iteration).
header

['Neighborhood:Grid:travel_time_per_cell',
 'Neighborhood:BasketCurator:increment_likelihood',
 'Neighborhood:BasketCurator:max_items_quickshop',
 'Neighborhood:Food:FGMeat:expiration',
 'Neighborhood:Food:FGMeat:impulse_buy_likelihood',
 'Neighborhood:Food:FGDairy:expiration',
 'Neighborhood:Food:FGDairy:impulse_buy_likelihood',
 'Neighborhood:Food:FGBaked:expiration',
 'Neighborhood:Food:FGBaked:impulse_buy_likelihood',
 'Neighborhood:Food:FGVegetable:expiration',
 'Neighborhood:Food:FGVegetable:impulse_buy_likelihood',
 'Neighborhood:Food:FGDryFood:expiration',
 'Neighborhood:Food:FGDryFood:impulse_buy_likelihood',
 'Neighborhood:Food:FGSnacks:expiration',
 'Neighborhood:Food:FGSnacks:impulse_buy_likelihood',
 'Neighborhood:Food:FGStorePrepared:expiration',
 'Neighborhood:Food:FGStorePrepared:impulse_buy_likelihood',
 'Neighborhood:nh_store_amounts']

In [15]:
# Inspect simple_distributions after the derived uniform rows were appended.
simple_distributions

Unnamed: 0,name,value,distribution,bounds,layer,dtype,length,options,decimals
0,Household:hh_amount_children,NAN,unif,"[0, 6]",household,int,NAN,NAN,NAN
1,Household:hh_amount_adults,NAN,unif,"[1, 2]",household,int,NAN,NAN,NAN
2,Household:hh_level_of_concern,NAN,unif,"[0, 1]",household,float,NAN,NAN,2
3,Household:hh_impulse_buy_likelihood,NAN,unif,"[0, 1.6]",household,float,NAN,NAN,2
4,Household:hh_daily_budget,NAN,unif,"[8.36, 79.5]",household,float,NAN,NAN,2
...,...,...,...,...,...,...,...,...,...
85,Child:child_preference_vector_5,NAN,unif,"[0, 1]",household,float,,NAN,2
86,Child:child_preference_vector_6,NAN,unif,"[0, 1]",household,float,,NAN,2
87,Neighborhood:nh_store_amounts_0,NAN,unif,"[0, 1]",neighborhood,int,,NAN,2
88,Neighborhood:nh_store_amounts_1,NAN,unif,"[0, 1]",neighborhood,int,,NAN,2


In [16]:
# Sample matrix of the last processed layer, including the prepended sens_class column.
param_values    

Unnamed: 0,sens_class,Neighborhood:Grid:travel_time_per_cell,Neighborhood:BasketCurator:increment_likelihood,Neighborhood:BasketCurator:max_items_quickshop,Neighborhood:Food:FGMeat:expiration,Neighborhood:Food:FGMeat:impulse_buy_likelihood,Neighborhood:Food:FGDairy:expiration,Neighborhood:Food:FGDairy:impulse_buy_likelihood,Neighborhood:Food:FGBaked:expiration,Neighborhood:Food:FGBaked:impulse_buy_likelihood,Neighborhood:Food:FGVegetable:expiration,Neighborhood:Food:FGVegetable:impulse_buy_likelihood,Neighborhood:Food:FGDryFood:expiration,Neighborhood:Food:FGDryFood:impulse_buy_likelihood,Neighborhood:Food:FGSnacks:expiration,Neighborhood:Food:FGSnacks:impulse_buy_likelihood,Neighborhood:Food:FGStorePrepared:expiration,Neighborhood:Food:FGStorePrepared:impulse_buy_likelihood,Neighborhood:nh_store_amounts
0,sens_class,6.2,0.14,6,8,0.46,7,0.64,10,0.31,6,0.25,217,0.22,307,0.18,4,0.92,"[1, 0, 0]"
1,sens_class,4.0,0.14,6,8,0.46,7,0.64,10,0.31,6,0.25,217,0.22,307,0.18,4,0.92,"[1, 0, 0]"
2,sens_class,6.2,0.53,6,8,0.46,7,0.64,10,0.31,6,0.25,217,0.22,307,0.18,4,0.92,"[1, 0, 0]"
3,sens_class,6.2,0.14,6,8,0.46,7,0.64,10,0.31,6,0.25,217,0.22,307,0.18,4,0.92,"[1, 0, 0]"
4,sens_class,6.2,0.14,6,8,0.46,7,0.64,10,0.31,6,0.25,217,0.22,307,0.18,4,0.92,"[1, 0, 0]"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2195,sens_class,5.4,0.80,4,6,0.18,33,0.13,5,0.31,5,0.37,117,0.39,95,0.57,5,0.80,"[0, 0, 1]"
2196,sens_class,5.4,0.80,4,6,0.18,33,0.13,5,0.31,5,0.37,117,0.39,95,0.57,5,0.26,"[1, 0, 1]"
2197,sens_class,5.4,0.80,4,6,0.18,33,0.13,5,0.31,5,0.37,117,0.39,95,0.57,5,0.26,"[0, 0, 1]"
2198,sens_class,5.4,0.80,4,6,0.18,33,0.13,5,0.31,5,0.37,117,0.39,95,0.57,5,0.26,"[0, 0, 1]"


In [17]:
# Show the original non-simple parameter definitions again (at most the first 100 rows).
other_distributions.head(100)

Unnamed: 0,name,value,distribution,bounds,layer,dtype,length,options,decimals
3,Household:hh_max_avail_time_per_day,NAN,vector,"[15, 180]",household,int,7,NAN,NAN
6,Household:hh_shopping_frequency,NAN,subgroup,"{'1-2': 0.08, '3-6': 0.12, '7-10': 0.8}",household,int,NAN,NAN,NAN
8,Household:hh_pay_day_interval,NAN,subgroup,"{'14': 0.628, '30': 0.103, '7': 0.269}",household,int,NAN,NAN,NAN
19,Adult:adult_preference_vector,NAN,vector,"[0, 1]",household,float,7,NAN,2
35,Child:child_preference_vector,NAN,vector,"[0, 1]",household,float,7,NAN,2
75,Neighborhood:nh_store_amounts,NAN,vector,"[0, 1]",neighborhood,int,3,NAN,NAN


In [18]:
# Save name, distribution, bounds, dtype and decimals of every parameter
# (sampled uniform stand-ins first, then the original non-simple definitions).
combined_bounds = (
    pd.concat([simple_distributions, other_distributions], ignore_index=True)
    .drop(columns=["value", "layer", "length", "options"])
)
combined_bounds.to_csv(SAVE_DIR + "bounds_df.csv", sep="\t")
