In [1]:
import numpy as np
import pandas as pd

from star import star_vars
from itertools import combinations, chain, repeat

In [2]:
param_names = ['x1', 'x2', 'x3']
ivars_scales = [0.1, 0.3, 0.5]

In [3]:
def ishigami(x, a=7, b=0.05):
    '''Ishigami test function'''
    # check whether the input x is a dataframe
    
    if not isinstance(x, (pd.core.frame.DataFrame, pd.core.series.Series, np.ndarray, list)):
        raise TypeError('`x` must be of type pandas.DataFrame, numpy.ndarray, pd.Series, or list')
    
    if x.shape[0] > 3:
        raise ValueError('`x` must have only three arguments at a time')
    
    return np.sin(x[0]) + a*(np.sin(x[1])**2) + b*(x[2]**4)*np.sin(x[0])

In [4]:
def factor_ranking(factors):
    # gather indices for sorting factor
    temp = np.argsort(factors)[::-1]
    # create an array the same shape and type as temp
    ranks = np.empty_like(temp)
    # rank factors
    ranks[temp] = np.arange(len(factors))

    return ranks
    

In [5]:
# helper functions
def apply_unique(func, df, axis=1, *args, **kwargs):
    '''Apply a function to unique rows of a DataFrame
    for efficiency.'''

    applied_df = df.merge(df.drop_duplicates()
                         .assign(**{func.__name__: lambda x: x.apply(func, axis=axis)}), 
                         how='left')
    applied_df.index = df.index
    
    return applied_df
    
    
def scale(df, bounds, axis=1, *args, **kwargs):
    '''scale the sampled matrix
    bounds is a dict with ['ub', 'lb'] keys
    the values are lists of the upper and lower bounds
    of the parameters/variables/factors'''
    
    # numpy equivalent for math operations
    bounds_np = {key:np.array(value) for key,value in bounds.items()}
    
    if axis:
        return df * (bounds_np['ub'] - bounds_np['lb']) + bounds_np['lb']
    else:
        return df.T * (bounds_np['ub'] - bounds_np['lb']) + bounds_np['lb']
    
    
def pairs_h(iterable):
    '''gives the pairs of numbers considering their differences'''
    interval = range(min(iterable), max(iterable)-min(iterable))
    pairs  = {key+1:[j for j in combinations(iterable, 2) if np.abs(j[0]-j[1])==key+1] for key in interval}
    return pairs
    
    
def section_df(df, delta_h): # ***delta_h here is newly added*** July 6th, 2021 - Saman's comment
    '''gets the paired values of each section based on index'''
    pairs = pairs_h(df.index.get_level_values(-1))
    df_values = df.to_numpy()
    sample = pd.concat({h*delta_h:
                    pd.DataFrame.from_dict({str(idx_tup): [df_values[idx_tup[0]], df_values[idx_tup[1]]] for idx_tup in idx}, 'index') \
                      for h, idx in pairs.items()}) 

    return sample
    
    
# lambda functions
'''covariogram of each section'''
cov_section = lambda pair_cols, mu_star: (pair_cols.sub(mu_star, axis=0)[0] * pair_cols.sub(mu_star, axis=0)[1]).groupby(level=[0,1,2]).mean()

'''variogram over all sections'''
variogram = lambda pair_cols: 0.5*(pair_cols[0] - pair_cols[1]).pow(2).groupby(level=[1,2]).mean()

'''morris sensitivity measure equivalent evaluated over all sections'''
morris_eq = lambda pair_cols: ((pair_cols[1] - pair_cols[0]).abs().groupby(level=[1,2]).mean(), \
                               (pair_cols[1] - pair_cols[0]).groupby(level=[1,2]).mean())

'''covariogram over all sections'''
covariogram = lambda pair_cols, mu_overall: ((pair_cols - mu_overall)[0] * (pair_cols - mu_overall)[1]).groupby(level=[1,2]).mean()

'''expected covariogram over all sections'''
e_covariogram = lambda cov_section_all: cov_section_all.groupby(level=[1,2]).mean()

'''sobol (total order) sensitivity measure equivalent evaluated over all sections''' # new sobol added *** 6 July 2021
# sobol_eq = lambda gamma, ecov, variance: ((gamma + ecov) / variance).loc[:,1]
sobol_eq = lambda gamma, ecov, variance, delta_h: ((gamma + ecov) / variance)[:, delta_h] # new July 6, 2021



# ivars function
def ivars(variogram_array, scale, delta_h):
    '''generate Integrated Variogram Across a Range of Scales (IVARS)
    by approximating area using right trapezoids having width of `delta_h`
    and hights of variogram values'''
    num_h  = len(variogram_value.index.levels[-1].to_list())
    x_bench= np.arange(start=0, stop=delta_h*(num_h+1), step=delta_h)
    x_int  = np.arange(start=0, stop=(scale*10+1)/10, step=delta_h)

    # calculate interpolated values for both x (h) and y (variogram)
    if x_int[-1] < scale:
        x_int.append(scale)
    y_bench= [0] + variogram_array.to_list()

    y_int  = np.interp(x=x_int, xp=x_bench, fp=y_bench)
    
    # for loop for each step size to caluclate the area
    ivars = 0
    for i in range(len(x_int)-1):
        ivars += 0.5*(y_int[i+1] + y_int[i]) * (x_int[i+1] - x_int[i])

    return ivars

# alias
idx = pd.IndexSlice

In [6]:
delta_h = 0.17
rng = np.random.default_rng(seed=100)
star_centres = rng.random((10, 3))
star_points  = star_vars(star_centres, delta_h=delta_h, parameters=['x1', 'x2', 'x3'], rettype='DataFrame')

In [7]:
# bounds = {'ub':[3,4,5], 'lb':[-3,-4,-5]}
# star_points_scaled = scale(star_points, bounds)
star_points

Unnamed: 0,Unnamed: 1,Unnamed: 2,0,1,2
0,x1,0,0.154982,0.596554,0.288863
0,x1,1,0.324982,0.596554,0.288863
0,x1,2,0.494982,0.596554,0.288863
0,x1,3,0.664982,0.596554,0.288863
0,x1,4,0.834982,0.596554,0.288863
...,...,...,...,...,...
9,x3,1,0.408518,0.389765,0.301648
9,x3,2,0.408518,0.389765,0.471648
9,x3,3,0.408518,0.389765,0.641648
9,x3,4,0.408518,0.389765,0.811648


In [8]:
# df = apply_unique(ishigami, star_points_scaled, axis=1)
df = apply_unique(ishigami, star_points, axis=1)
df.index.names=['centre', 'param', 'points']
df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,0,1,2,ishigami
centre,param,points,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,x1,0,0.154982,0.596554,0.288863,2.363711
0,x1,1,0.324982,0.596554,0.288863,2.528698
0,x1,2,0.494982,0.596554,0.288863,2.684477
0,x1,3,0.664982,0.596554,0.288863,2.826555
0,x1,4,0.834982,0.596554,0.288863,2.950838
...,...,...,...,...,...,...
9,x3,1,0.408518,0.389765,0.301648,1.408061
9,x3,2,0.408518,0.389765,0.471648,1.408879
9,x3,3,0.408518,0.389765,0.641648,1.411263
9,x3,4,0.408518,0.389765,0.811648,1.416516


In [10]:
# getting the paired values of each section based on `h`
pair_df = df[ishigami.__name__].groupby(level=[0,1]).apply(section_df, delta_h)
pair_df.index.names = ['centre', 'param', 'h', 'pair_ind']
pair_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,0,1
centre,param,h,pair_ind,Unnamed: 4_level_1,Unnamed: 5_level_1
0,x1,0.17,"(0, 1)",2.363711,2.528698
0,x1,0.17,"(1, 2)",2.528698,2.684477
0,x1,0.17,"(2, 3)",2.684477,2.826555
0,x1,0.17,"(3, 4)",2.826555,2.950838
0,x1,0.34,"(0, 2)",2.363711,2.684477
...,...,...,...,...,...
9,x3,0.51,"(1, 4)",1.408061,1.416516
9,x3,0.51,"(2, 5)",1.408879,1.426340
9,x3,0.68,"(0, 4)",1.407902,1.416516
9,x3,0.68,"(1, 5)",1.408061,1.426340


In [11]:
# mu_star calculation
mu_star_df = df[ishigami.__name__].groupby(level=[0,1]).mean()
mu_star_df.index.names = ['centre', 'param']
mu_star_df.unstack(level=1)

param,x1,x2,x3
centre,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,2.670856,2.711665,2.959823
1,5.221893,2.203489,4.830274
2,4.859827,2.559283,5.081112
3,5.250317,2.390273,5.028813
4,2.608418,2.485396,2.7039
5,5.347976,2.788918,5.44363
6,2.070988,2.521456,2.281507
7,1.533582,2.638269,1.865307
8,4.00787,2.905628,4.362239
9,1.464656,2.18713,1.41316


In [12]:
# overall mu (mean) of the unique evaluated function values over all stars points
mu_overall = df[ishigami.__name__].unique().mean()
mu_overall

3.166272392518096

In [13]:
# overall var (variance) of the unique evaluated function values over all stars points
var_overall = df[ishigami.__name__].unique().var(ddof=1)
var_overall

2.6112830283767723

In [14]:
# sectional covariogram calculation - content matches MATLAB code style!!
cov_section_all = cov_section(pair_df, mu_star_df)
cov_section_all.unstack(level=1)

Unnamed: 0_level_0,param,x1,x2,x3
centre,h,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,0.17,0.02186,1.566406,5.886643e-05
0,0.34,-0.007501,0.163242,-6.004829e-06
0,0.51,-0.043812,-1.522677,-6.623759e-05
0,0.68,-0.085995,-3.294446,-0.0001322232
0,0.85,,-4.93247,-0.0002162665
1,0.17,0.039359,1.698195,1.4713e-07
1,0.34,0.005332,0.188747,-1.782973e-08
1,0.51,-0.036175,-1.636557,-1.678103e-07
1,0.68,-0.083988,-3.573827,-3.323045e-07
1,0.85,-0.136678,-5.394484,-5.458826e-07


In [15]:
# variogram calculation
variogram_value = variogram(pair_df)
variogram_value.unstack(level=0)

param,x1,x2,x3
h,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0.17,0.010931,0.450358,1.9e-05
0.34,0.044055,1.837818,5.3e-05
0.51,0.09908,4.099778,9.3e-05
0.68,0.174655,7.009718,0.000148
0.85,0.268556,10.215821,0.00025


In [16]:
# morris calculation
morris_values = morris_eq(pair_df)
morris_values[0].unstack(level=0)

param,x1,x2,x3
h,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0.17,0.146551,0.90455,0.003983
0.34,0.295199,1.86371,0.007214
0.51,0.443808,2.822004,0.010482
0.68,0.590258,3.722907,0.014539
0.85,0.73265,4.516354,0.01982


In [17]:
morris_values[1].unstack(level=0)

param,x1,x2,x3
h,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0.17,0.146551,0.90455,0.003983
0.34,0.295199,1.86371,0.007214
0.51,0.443808,2.822004,0.010482
0.68,0.590258,3.722907,0.014539
0.85,0.73265,4.516354,0.01982


In [18]:
# overall covariogram calculation
covariogram_value = covariogram(pair_df, mu_overall)
covariogram_value.unstack(level=0)

param,x1,x2,x3
h,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0.17,2.506471,2.053916,2.1823
0.34,2.485032,0.651051,2.169058
0.51,2.461518,-1.120069,2.146429
0.68,2.449105,-3.063161,2.098655
0.85,2.514648,-4.961458,1.931184


In [19]:
# expected value of the overall covariogram calculation
e_covariogram_value = e_covariogram(cov_section_all)
e_covariogram_value.unstack(level=0)

param,x1,x2,x3
h,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0.17,0.036876,1.517933,2.4e-05
0.34,0.00393,0.116156,-4e-06
0.51,-0.036289,-1.56785,-3.1e-05
0.68,-0.082623,-3.337643,-6.1e-05
0.85,-0.133762,-5.004852,-0.000101


In [30]:
((variogram_value + e_covariogram_value) / var_overall)[:, 1*delta_h]

param
x1    0.018308
x2    0.753764
x3    0.000017
dtype: float64

In [33]:
# sobol calculation
sobol_value = sobol_eq(variogram_value, e_covariogram_value, var_overall, delta_h)
sobol_value

param
x1    0.018308
x2    0.753764
x3    0.000017
dtype: float64

In [34]:
# IVARS calculation
ivars_values = [0.1, 0.3, 0.5]
ivars_df = pd.DataFrame.from_dict({scale: variogram_value.groupby(level=0).apply(ivars, scale=scale, delta_h=delta_h) \
                      for scale in ivars_values}, 'index')
ivars_df

Unnamed: 0,x1,x2,x3
0.1,0.000929,0.03828,2e-06
0.3,0.005603,0.232775,8e-06
0.5,0.017769,0.737471,2e-05


In [35]:
# calculating rankings of sobol and ivars
sobol_ranking = factor_ranking(sobol_value)
sobol_ranking_df = pd.DataFrame(data=[sobol_ranking], columns=param_names)
sobol_ranking_df

Unnamed: 0,x1,x2,x3
0,1,0,2


In [36]:
ivars_ranking = factor_ranking(ivars_df)
ivars_ranking_df = pd.DataFrame(data=ivars_ranking, columns=param_names, index=ivars_scales)
ivars_ranking_df

Unnamed: 0,x1,x2,x3
0.1,0,1,2
0.3,0,1,2
0.5,0,1,2


In [None]:
# bootstrapping to get CIs
bootstrap_size = 1000

# create result dataframes/series if bootstrapping is chosen to be done
result_bs_variogram = pd.DataFrame()
result_bs_sobol = pd.DataFrame()
result_bs_ivars_df = pd.DataFrame()
result_bs_sobol_ranking = pd.DataFrame()
result_bs_ivars_ranking = pd.DataFrame()

for _ in range(0, bootstrap_size):
    ## specify random sequence by sampling with replacement
    bootstrap_rand = np.random.choice(list(range(0,10)), size=len(range(0,10)), replace=True).tolist()
    bootstrapped_pairdf = pd.concat([pair_df.loc[idx[i, :, :, :], :] for i in bootstrap_rand])
    bootstrapped_df     = pd.concat([df.loc[idx[i, :, :], :] for i in bootstrap_rand])
    #display(bootstrapped_pairdf)
    #display(bootstrap_rand)

    ## calculating sectional covariograms
    bootstrapped_cov_section_all = pd.concat([cov_section_all.loc[idx[i, :]] for i in bootstrap_rand])
    #display('sectional variogram:')
    #display(bootstrapped_cov_section_all)
    #display(bootstrap_rand)

    ## calculating variogram, ecovariogram, variance, mean, Sobol, and IVARS values
    bootstrapped_variogram = variogram(bootstrapped_pairdf)
    #display('variogram:')
    #display(bootstrapped_variogram.unstack(level=0))

    bootstrapped_ecovariogram = e_covariogram(bootstrapped_cov_section_all)
    #display('E(covariogram):')
    #display(bootstrapped_ecovariogram.unstack(level=0))

    bootstrapped_var = bootstrapped_df[ishigami.__name__].unique().var(ddof=1)
    #display('variance:', bootstrapped_var)

    bootstrapped_sobol = sobol_eq(bootstrapped_variogram, bootstrapped_ecovariogram, bootstrapped_var)
    #display('sobol:', bootstrapped_sobol)
    
    bootstrapped_sobol_ranking = factor_ranking(bootstrapped_sobol)
    bootstrapped_sobol_ranking_df = pd.DataFrame(data=[bootstrapped_sobol_ranking], columns=param_names)

    ivars_values = [0.1, 0.3, 0.5]
    delta_h = 0.1
    bootstrapped_ivars_df = pd.DataFrame.from_dict({scale: bootstrapped_variogram.groupby(level=0).apply(ivars, scale=scale, delta_h=delta_h) \
                                                    for scale in ivars_values}, 'index')
    
    bootstrapped_ivars_ranking = factor_ranking(bootstrapped_ivars_df)
    bootstrapped_ivars_ranking_df = pd.DataFrame(data=bootstrapped_ivars_ranking, columns=param_names, index=ivars_scales)
    
    #display('ivars:', boostrapped_ivars_df)
    
    # unstack variogram
    bootstrapped_variogram_df = bootstrapped_variogram.unstack(level=0)
    
    # transpose sobol values for stacking of results
    bootstrapped_sobol_df = bootstrapped_sobol.to_frame().transpose()
    
    # attatch new results to previous results (order does not matter here)
    result_bs_variogram = pd.concat([bootstrapped_variogram_df, result_bs_variogram])
    result_bs_sobol = pd.concat([bootstrapped_sobol_df, result_bs_sobol])
    result_bs_ivars_df = pd.concat([bootstrapped_ivars_df, result_bs_ivars_df])
    result_bs_sobol_ranking = pd.concat([bootstrapped_sobol_ranking_df, result_bs_sobol_ranking])
    result_bs_ivars_ranking = pd.concat([bootstrapped_ivars_ranking_df, result_bs_ivars_ranking])

___
new bootstrapping - efficient but might result in memory leak

In [None]:
%%timeit
bootstrap_rand = np.random.choice(list(range(0,10)), size=len(range(0,10*1000)), replace=True).tolist()
b = pd.concat([pair_df.loc[idx[i, :, :, :], :] for i in bootstrap_rand])

In [None]:
%%timeit
# large memory usage though just below this line
a = pair_df.unstack(['param', 'h', 'pair_ind']).\
            sample(10*1000, replace=True).\
            stack(['param', 'h', 'pair_ind'])

In [None]:
# large memory usage though just below this line
a = pair_df.unstack(['param', 'h', 'pair_ind']).\
            sample(10*1000, replace=True).\
            stack(['param', 'h', 'pair_ind'])

# this one is efficient and does not result 

num_stars = 10
num_bstrap = 1000
bstrap_index = list(chain.from_iterable(repeat(e, pair_df.shape[0]) for e in range(int(num_bstrap))))

In [None]:
#taken from https://stackoverflow.com/a/57979836/5188208

def _handle_insert_loc(loc, n):
    """
    Computes the insert index from the right if loc is negative for a given size of n.
    """
    return n + loc + 1 if loc < 0 else loc


def add_index_level(old_index, value, name = None, loc = 0):
    """
    Expand a (multi)index by adding a level to it.

    :param old_index: The index to expand
    :param name: The name of the new index level
    :param value: Scalar or list-like, the values of the new index level
    :param loc: Where to insert the level in the index, 0 is at the front, negative values count back from the rear end
    :return: A new multi-index with the new level added
    """
    loc = _handle_insert_loc(loc, len(old_index.names))
    old_index_df = old_index.to_frame()
    old_index_df.insert(loc, name, value)
    new_index_names = list(old_index.names)  # sometimes new index level names are invented when converting to a df,
    new_index_names.insert(loc, name)        # here the original names are reconstructed
    new_index = pd.MultiIndex.from_frame(old_index_df, names=new_index_names)
    return new_index

In [None]:
a.index = add_index_level(a.index, bstrap_index, 'bootstrap', loc=0)
d = a.groupby('bootstrap')

In [None]:
d.apply(variogram)

In [None]:
# calculate upper and lower confidence interval limits of the ivars values
ivars_low = pd.DataFrame()
ivars_upp = pd.DataFrame()
for scale in ivars_scales:
    ivars_low = pd.concat([ivars_low, result_bs_ivars_df.loc[scale].quantile((1-0.9)/2).rename(scale).to_frame()], axis=1)
    ivars_upp = pd.concat([ivars_upp, result_bs_ivars_df.loc[scale].quantile(1-((1-0.9)/2)).rename(scale).to_frame()], axis=1)

ivars_low = ivars_low.transpose()
ivars_upp = ivars_upp.transpose()
display(ivars_low)
display(ivars_upp)

In [None]:
variogram_low = pd.DataFrame()
variogram_upp = pd.DataFrame()
for h in np.unique(result_bs_variogram.index.values).tolist():
    variogram_low = pd.concat([variogram_low, result_bs_variogram.loc[h].quantile((1-0.9)/2).rename(h).to_frame()], axis=1)
    variogram_upp = pd.concat([variogram_upp, result_bs_variogram.loc[h].quantile(1-((1-0.9)/2)).rename(h).to_frame()], axis=1)
    
variogram_low = variogram_low.transpose()
variogram_upp = variogram_upp.transpose()

variogram_low.index.names = ['h']
variogram_upp.index.names = ['h']

display(variogram_low)
display(variogram_upp)

In [None]:
sobol_low = result_bs_sobol.quantile((1-0.9)/2).rename('').to_frame().transpose()
sobol_upp = result_bs_sobol.quantile(1-((1-0.9)/2)).rename('').to_frame().transpose()
                            
display(sobol_low)
display(sobol_upp)

In [None]:
rel_sobol_results = []
for param in param_names:
    rel_sobol_results.append(result_bs_sobol_ranking.eq(sobol_ranking_df)[param].sum()/bootstrap_size)

rel_sobol = pd.DataFrame([rel_sobol_results],  columns=param_names)
rel_sobol

In [None]:
# small test to see if this works properly
df = pd.DataFrame({'x1' : [2], 'x2' : [0], 'x3' : [1]}, index = [0])
df
df2 = pd.DataFrame({'x1' : [1, 0, 2, 2], 'x2' : [0, 0, 0, 0], 'x3' : [0, 3, 4, 1]}, index=[0, 0, 0, 0])
df2.eq(df)['x3'].sum()

In [None]:
# calculate relibability estimate based on ivars factor rankings
rel_ivars_results = []
for param in param_names:
    rel_ivars_results_scale = []
    for scale in ivars_scales:
        rel_ivars_results_scale.append(result_bs_ivars_ranking.eq(ivars_ranking_df)[param].loc[scale].sum()/bootstrap_size)
    
    rel_ivars_results.append(rel_ivars_results_scale)

rel_ivars = pd.DataFrame(rel_ivars_results,  columns=param_names, index=ivars_scales)
rel_ivars

In [None]:
import scipy.stats as stat
import scipy.cluster.hierarchy as hchy
from matplotlib import pyplot as plt
from itertools import compress

In [None]:
def factor_grouping(sens_idx, num_grp=None):
    [m, n] = sens_idx.shape
    
    # make data 1d
    R = sens_idx.stack()
    # remove zero elements to improve numerical reasoning
    R = R[R!=0]
    
    # do a box-cox transformation
    [TRANSDAT, LAMBDA] = stat.boxcox(R)
    if LAMBDA <= 0.0099:
        TRANSDAT = np.log(R)
    
    indices = np.argwhere(np.isinf(TRANSDAT))
    if indices.shape == (2, 1):
        TRANSDAT[indices[0], indices[1]] = np.log(R[R>0])
        
    # reshape data for the linkage calculation
    S = np.reshape(TRANSDAT, [n, m])
    
    # Agglomerative hierarchical cluster
    Z = hchy.linkage(S, method='ward', metric='euclidean')
    
    # Optimal group number
    Clusters = []
    for i in range(2, n+1):
        Clusters.append(hchy.fcluster(Z, criterion='maxclust', t=i))
    # if user gives the group number preform calculations
    if num_grp:
        rank_grp = hchy.fcluster(Z, criterion='maxclust', t=num_grp)
        optm_num_grp = num_grp
        nn = 1
        id = len(Z)
        while nn != optm_num_grp:
            cutoff = Z[id-1][2]
            rank_grp = hchy.fcluster(Z, criterion='distance', t=cutoff)
            nn = np.amax(rank_grp)
            id = id - 1

        clrThrshl = 0.5*(Z[id][2] + Z[id+1][2])
    # if user does not give optimal group number use elbow method
    else:
        cutoff, clrThrshl = elbow_method(Z)
        rank_grp = hchy.fcluster(Z, criterion='distance', t=cutoff)
        optm_num_grp = max(rank_grp)
        
    
    #*** this part can be edited once we start working on plots
    #fig = plt.figure(figsize=(25,10))
    #dn = hchy.dendrogram(Z)
    #plt.show()
    
    return optm_num_grp, rank_grp, Clusters
    

In [None]:
def elbow_method(Z):
    Q1 = np.array([1, Z[0][2]])
    Q2 = np.array([len(Z), Z[-1][2]])
    
    d = []
    for i in range(0, len(Z) - 2):
        P = [i+1, Z[i][2]]
        d.append(np.abs(np.linalg.det(np.array([[Q2 - Q1], [P-Q1]])))/np.linalg.norm(Q2-Q1))
    id = d.index(max(d))
    cutoff = Z[id][2]
    clrThrshl = 0.5*(Z[id][2] + Z[id+1][2])
    
    return cutoff, clrThrshl

In [None]:
# grouping
num_grp_ivars50, ivars50_grp_array, ClustersIvars50 = factor_grouping(result_bs_ivars_df.loc[0.5], num_grp=2)
num_grp_sobol, sobol_grp_array, ClustersSobol = factor_grouping(result_bs_sobol, num_grp=2)

display(num_grp_ivars50)
display(num_grp_sobol)

display(ivars50_grp_array)
display(sobol_grp_array)

display(ClustersIvars50)
display(ClustersSobol)

In [None]:
ivars50_grps = pd.DataFrame([ivars50_grp_array], columns=param_names)
sobol_grps = pd.DataFrame([sobol_grp_array], columns=param_names)

display(ivars50_grps)
display(sobol_grps)

In [None]:
# Reliability estimates of group
cluster_sobol = []
cluster_rank_sobol = []
for g in range(0, num_grp_sobol):
    cluster_sobol.append(np.argwhere(sobol_grp_array==g + 1).flatten())
    cluster_rank_sobol.append(sobol_ranking[cluster_sobol[g]])
    cluster_rank_sobol[g] = np.sort(cluster_rank_sobol[g], axis=0)

display(cluster_sobol)
display(cluster_rank_sobol)

In [None]:
cluster_ivars50 = []
cluster_rank_ivars50 = []
for g in range(0, num_grp_ivars50):
    cluster_ivars50.append(np.argwhere(ivars50_grp_array==g + 1).flatten())
    cluster_rank_ivars50.append(ivars_ranking_df.loc[0.5].to_numpy()[cluster_ivars50[g]])
    cluster_rank_ivars50[g] = np.sort(cluster_rank_ivars50[g], axis=0)

display(cluster_ivars50)
display(cluster_rank_ivars50)

In [None]:

reli_sobol_grp_array = np.zeros(len(param_names))
reli_ivars50_grp_array = np.zeros(len(param_names))
for D in range (0, len(param_names)):
    match = [np.argwhere(cluster_sobol[x]==D).flatten() for x in range(0, len(cluster_sobol))]
    rank_range_sobol = [(match[x].size != 0) for x in range(0, len(match))]
    rank_sobol_benchmark = list(compress(cluster_rank_sobol, rank_range_sobol))
    rank_sobol_benchmark = rank_sobol_benchmark[0]
    
    match = [np.argwhere(cluster_ivars50[x]==D).flatten() for x in range(0, len(cluster_ivars50))]
    rank_range_ivars50 = [(match[x].size != 0) for x in range(0, len(match))]
    rank_ivars50_benchmark = list(compress(cluster_rank_ivars50, rank_range_ivars50))
    rank_ivars50_benchmark = rank_ivars50_benchmark[0]
    
    # calculate the reliability of paramter number D
    reli_sobol = 0
    reli_ivars50 = 0
    for i in range(0, bootstrap_size):
        reli_sobol += len(np.argwhere(result_bs_sobol_ranking.iloc[i, D] == rank_sobol_benchmark))/bootstrap_size
        reli_ivars50 += len(np.argwhere(result_bs_ivars_ranking.loc[0.5].iloc[i, D] == rank_ivars50_benchmark))/bootstrap_size
        
    reli_sobol_grp_array[D] = reli_sobol
    reli_ivars50_grp_array[D] = reli_ivars50

reli_sobol_grp = pd.DataFrame([reli_sobol_grp_array], columns=param_names)
reli_ivars50_grp = pd.DataFrame([reli_ivars50_grp_array], columns=param_names)
    
display(reli_sobol_grp)
display(reli_ivars50_grp)