In [1]:
import numpy as np
import pandas as pd

from star import star_vars
from itertools import combinations, chain, repeat

In [2]:
param_names = ['x1', 'x2', 'x3']
ivars_scales = [0.1, 0.3, 0.5]

In [3]:
def ishigami(x, a=7, b=0.05):
    '''Ishigami test function'''
    # check whether the input x is a dataframe
    
    if not isinstance(x, (pd.core.frame.DataFrame, pd.core.series.Series, np.ndarray, list)):
        raise TypeError('`x` must be of type pandas.DataFrame, numpy.ndarray, pd.Series, or list')
    
    if x.shape[0] > 3:
        raise ValueError('`x` must have only three arguments at a time')
    
    return np.sin(x[0]) + a*(np.sin(x[1])**2) + b*(x[2]**4)*np.sin(x[0])

In [4]:
def factor_ranking(factors):
    # gather indices for sorting factor
    temp = np.argsort(factors)[::-1]
    # create an array the same shape and type as temp
    ranks = np.empty_like(temp)
    # rank factors
    ranks[temp] = np.arange(len(factors))

    return ranks
    

In [5]:
# helper functions
def apply_unique(func, df, axis=1, *args, **kwargs):
    '''Apply a function to unique rows of a DataFrame
    for efficiency.'''

    applied_df = df.merge(df.drop_duplicates()
                         .assign(**{func.__name__: lambda x: x.apply(func, axis=axis)}), 
                         how='left')
    applied_df.index = df.index
    
    return applied_df
    
    
def scale(df, bounds, axis=1, *args, **kwargs):
    '''scale the sampled matrix
    bounds is a dict with ['ub', 'lb'] keys
    the values are lists of the upper and lower bounds
    of the parameters/variables/factors'''
    
    # numpy equivalent for math operations
    bounds_np = {key:np.array(value) for key,value in bounds.items()}
    
    if axis:
        return df * (bounds_np['ub'] - bounds_np['lb']) + bounds_np['lb']
    else:
        return df.T * (bounds_np['ub'] - bounds_np['lb']) + bounds_np['lb']
    
    
def pairs_h(iterable):
    '''gives the pairs of numbers considering their differences'''
    interval = range(min(iterable), max(iterable)-min(iterable))
    pairs  = {key+1:[j for j in combinations(iterable, 2) if np.abs(j[0]-j[1])==key+1] for key in interval}
    return pairs
    
    
def section_df(df):
    '''gets the paired values of each section based on index'''
    pairs = pairs_h(df.index.get_level_values(-1))
    df_values = df.to_numpy()
    sample = pd.concat({h:
                    pd.DataFrame.from_dict({str(idx_tup): [df_values[idx_tup[0]], df_values[idx_tup[1]]] for idx_tup in idx}, 'index') \
                      for h, idx in pairs.items()})

    return sample
    
    
# lambda functions
'''covariogram of each section'''
cov_section = lambda pair_cols, mu_star: (pair_cols.sub(mu_star, axis=0)[0] * pair_cols.sub(mu_star, axis=0)[1]).groupby(level=[0,1,2]).mean()

'''variogram over all sections'''
variogram = lambda pair_cols: 0.5*(pair_cols[0] - pair_cols[1]).pow(2).groupby(level=[1,2]).mean()

'''morris sensitivity measure equivalent evaluated over all sections'''
morris_eq = lambda pair_cols: ((pair_cols[1] - pair_cols[0]).abs().groupby(level=[1,2]).mean(), \
                               (pair_cols[1] - pair_cols[0]).groupby(level=[1,2]).mean())

'''covariogram over all sections'''
covariogram = lambda pair_cols, mu_overall: ((pair_cols - mu_overall)[0] * (pair_cols - mu_overall)[1]).groupby(level=[1,2]).mean()

'''expected covariogram over all sections'''
e_covariogram = lambda cov_section_all: cov_section_all.groupby(level=[1,2]).mean()

'''sobol (total order) sensitivity measure equivalent evaluated over all sections'''
sobol_eq = lambda gamma, ecov, variance: ((gamma + ecov) / variance).loc[:,1]

# ivars function
def ivars(variogram_array, scale, delta_h):
    '''generate Integrated Variogram Across a Range of Scales (IVARS)
    by approximating area using right trapezoids having width of `delta_h`
    and hights of variogram values'''
    num_h  = len(variogram_value.index.levels[-1].to_list())
    x_bench= np.arange(start=0, stop=delta_h*(num_h+1), step=delta_h)
    x_int  = np.arange(start=0, stop=(scale*10+1)/10, step=delta_h)

    # calculate interpolated values for both x (h) and y (variogram)
    if x_int[-1] < scale:
        x_int.append(scale)
    y_bench= [0] + variogram_array.to_list()

    y_int  = np.interp(x=x_int, xp=x_bench, fp=y_bench)
    
    # for loop for each step size to caluclate the area
    ivars = 0
    for i in range(len(x_int)-1):
        ivars += 0.5*(y_int[i+1] + y_int[i]) * (x_int[i+1] - x_int[i])

    return ivars

# alias
idx = pd.IndexSlice

In [6]:
delta_h = 0.1
rng = np.random.default_rng(seed=100)
star_centres = rng.random((10, 3))
star_points  = star_vars(star_centres, delta_h=delta_h, parameters=['x1', 'x2', 'x3'], rettype='DataFrame')

In [7]:
# bounds = {'ub':[3,4,5], 'lb':[-3,-4,-5]}
# star_points_scaled = scale(star_points, bounds)
star_points

Unnamed: 0,Unnamed: 1,Unnamed: 2,0,1,2
0,x1,0,0.034982,0.596554,0.288863
0,x1,1,0.134982,0.596554,0.288863
0,x1,2,0.234982,0.596554,0.288863
0,x1,3,0.334982,0.596554,0.288863
0,x1,4,0.434982,0.596554,0.288863
...,...,...,...,...,...
9,x3,5,0.408518,0.389765,0.531648
9,x3,6,0.408518,0.389765,0.631648
9,x3,7,0.408518,0.389765,0.731648
9,x3,8,0.408518,0.389765,0.831648


In [8]:
# df = apply_unique(ishigami, star_points_scaled, axis=1)
df = apply_unique(ishigami, star_points, axis=1)
df.index.names=['centre', 'param', 'points']
df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,0,1,2,ishigami
centre,param,points,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,x1,0,0.034982,0.596554,0.288863,2.244282
0,x1,1,0.134982,0.596554,0.288863,2.343915
0,x1,2,0.234982,0.596554,0.288863,2.442202
0,x1,3,0.334982,0.596554,0.288863,2.538162
0,x1,4,0.434982,0.596554,0.288863,2.630836
...,...,...,...,...,...,...
9,x3,5,0.408518,0.389765,0.531648,1.409483
9,x3,6,0.408518,0.389765,0.631648,1.411058
9,x3,7,0.408518,0.389765,0.731648,1.413588
9,x3,8,0.408518,0.389765,0.831648,1.417398


In [9]:
df['ishigami'].groupby(level=[0,1]).apply(print)

centre  param  points
0       x1     0         2.244282
               1         2.343915
               2         2.442202
               3         2.538162
               4         2.630836
               5         2.719298
               6         2.802665
               7         2.880103
               8         2.950838
               9         3.014164
Name: (0, x1), dtype: float64
centre  param  points
0       x2     0         0.806598
               1         1.008512
               2         1.339316
               3         1.785823
               4         2.330231
               5         2.950838
               6         3.622901
               7         4.319627
               8         5.013240
               9         5.676088
Name: (0, x2), dtype: float64
centre  param  points
0       x3     0         2.950582
               1         2.950627
               2         2.950838
               3         2.951427
               4         2.952697
               5        

centre  param
0       x1       None
        x2       None
        x3       None
1       x1       None
        x2       None
        x3       None
2       x1       None
        x2       None
        x3       None
3       x1       None
        x2       None
        x3       None
4       x1       None
        x2       None
        x3       None
5       x1       None
        x2       None
        x3       None
6       x1       None
        x2       None
        x3       None
7       x1       None
        x2       None
        x3       None
8       x1       None
        x2       None
        x3       None
9       x1       None
        x2       None
        x3       None
Name: ishigami, dtype: object

In [10]:
# getting the paired values of each section based on `h`
pair_df = df[ishigami.__name__].groupby(level=[0,1]).apply(section_df)
pair_df.index.names = ['centre', 'param', 'h', 'pair_ind']
pair_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,0,1
centre,param,h,pair_ind,Unnamed: 4_level_1,Unnamed: 5_level_1
0,x1,1,"(0, 1)",2.244282,2.343915
0,x1,1,"(1, 2)",2.343915,2.442202
0,x1,1,"(2, 3)",2.442202,2.538162
0,x1,1,"(3, 4)",2.538162,2.630836
0,x1,1,"(4, 5)",2.630836,2.719298
...,...,...,...,...,...
9,x3,7,"(1, 8)",1.407902,1.417398
9,x3,7,"(2, 9)",1.407953,1.422860
9,x3,8,"(0, 8)",1.407896,1.417398
9,x3,8,"(1, 9)",1.407902,1.422860


In [11]:
# mu_star calculation
mu_star_df = df[ishigami.__name__].groupby(level=[0,1]).mean()
mu_star_df.index.names = ['centre', 'param']
mu_star_df.unstack(level=1)

param,x1,x2,x3
centre,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,2.656646,2.885317,2.95948
1,5.243696,2.068463,4.830351
2,4.864449,2.432892,5.084556
3,5.330994,2.254328,5.028727
4,2.554175,2.655503,2.704976
5,5.378425,2.651379,5.443588
6,2.075819,2.862646,2.280281
7,1.513483,2.958328,1.866293
8,4.072956,2.926255,4.365968
9,1.435236,2.505585,1.411486


In [12]:
# overall mu (mean) of the unique evaluated function values over all stars points
mu_overall = df[ishigami.__name__].unique().mean()
mu_overall

3.2181307939105364

In [13]:
# overall var (variance) of the unique evaluated function values over all stars points
var_overall = df[ishigami.__name__].unique().var(ddof=1)
var_overall

2.625633323232234

In [14]:
# sectional covariogram calculation - content matches MATLAB code style!!
cov_section_all = cov_section(pair_df, mu_star_df)
cov_section_all.unstack(level=1)

Unnamed: 0_level_0,param,x1,x2,x3
centre,h,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,1,0.048396,2.130395,0.000085
0,2,0.032084,1.415033,0.000045
0,3,0.013009,0.545250,0.000009
0,4,-0.008655,-0.447458,-0.000026
0,5,-0.032701,-1.525340,-0.000059
...,...,...,...,...
9,5,-0.033542,-1.506603,-0.000010
9,6,-0.060431,-2.610690,-0.000016
9,7,-0.089252,-3.710385,-0.000023
9,8,-0.119703,-4.758991,-0.000031


In [15]:
# variogram calculation
variogram_value = variogram(pair_df)
variogram_value.unstack(level=0)

param,x1,x2,x3
h,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,0.003723,0.157006,9e-06
2,0.014996,0.643154,3e-05
3,0.033887,1.467836,5.5e-05
4,0.060336,2.620288,8.1e-05
5,0.094159,4.06793,0.00011
6,0.135042,5.756585,0.000144
7,0.182547,7.612715,0.000191
8,0.23612,9.547591,0.000259
9,0.295096,11.463071,0.000365


In [16]:
# morris calculation
morris_values = morris_eq(pair_df)
morris_values[0].unstack(level=0)

param,x1,x2,x3
h,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,0.085347,0.531766,0.002697
2,0.171708,1.090234,0.004997
3,0.258653,1.664431,0.00707
4,0.345745,2.242759,0.009086
5,0.432546,2.813301,0.011216
6,0.518617,3.364139,0.013629
7,0.603523,3.883672,0.016496
8,0.686833,4.360934,0.019987
9,0.768126,4.785892,0.024272


In [17]:
morris_values[1].unstack(level=0)

param,x1,x2,x3
h,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,0.085347,0.531766,0.002697
2,0.171708,1.090234,0.004997
3,0.258653,1.664431,0.00707
4,0.345745,2.242759,0.009086
5,0.432546,2.813301,0.011216
6,0.518617,3.364139,0.013629
7,0.603523,3.883672,0.016496
8,0.686833,4.360934,0.019987
9,0.768126,4.785892,0.024272


In [18]:
# overall covariogram calculation
covariogram_value = covariogram(pair_df, mu_overall)
covariogram_value.unstack(level=0)

param,x1,x2,x3
h,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,2.532258,2.555273,2.193087
2,2.517009,1.886561,2.192966
3,2.49801,1.042883,2.192949
4,2.475428,0.054558,2.193032
5,2.449472,-1.04133,2.193213
6,2.420388,-2.202352,2.193492
7,2.388463,-3.382391,2.193875
8,2.354013,-4.533451,2.19437
9,2.317387,-5.607577,2.194989


In [19]:
# expected value of the overall covariogram calculation
e_covariogram_value = e_covariogram(cov_section_all)
e_covariogram_value.unstack(level=0)

param,x1,x2,x3
h,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,0.048206,2.064049,4.9e-05
2,0.03195,1.368743,2.6e-05
3,0.012943,0.524264,5e-06
4,-0.008639,-0.438263,-1.5e-05
5,-0.032586,-1.481585,-3.4e-05
6,-0.058656,-2.56376,-5.5e-05
7,-0.086574,-3.639797,-7.8e-05
8,-0.116039,-4.663456,-0.000104
9,-0.146724,-5.589107,-0.000136


In [30]:
((variogram_value + e_covariogram_value) / var_overall).loc[:, 1]

param
x1    0.019777
x2    0.845912
x3    0.000022
dtype: float64

In [20]:
# sobol calculation
sobol_value = sobol_eq(variogram_value, e_covariogram_value, var_overall)
sobol_value

param
x1    0.019777
x2    0.845912
x3    0.000022
dtype: float64

In [21]:
# IVARS calculation
ivars_values = [0.1, 0.3, 0.5]
ivars_df = pd.DataFrame.from_dict({scale: variogram_value.groupby(level=0).apply(ivars, scale=scale, delta_h=delta_h) \
                      for scale in ivars_values}, 'index')
ivars_df

Unnamed: 0,x1,x2,x3
0.1,0.000186,0.00785,4.686609e-07
0.3,0.003566,0.153408,6.656532e-06
0.5,0.016002,0.692225,2.297963e-05


In [31]:
# calculating rankings of sobol and ivars
sobol_ranking = factor_ranking(sobol_value)
sobol_ranking_df = pd.DataFrame(data=[sobol_ranking], columns=param_names)
sobol_ranking_df

Unnamed: 0,x1,x2,x3
0,1,0,2


In [32]:
ivars_ranking = factor_ranking(ivars_df)
ivars_ranking_df = pd.DataFrame(data=ivars_ranking, columns=param_names, index=ivars_scales)
ivars_ranking_df

Unnamed: 0,x1,x2,x3
0.1,0,1,2
0.3,0,1,2
0.5,0,1,2


In [None]:
# bootstrapping to get CIs
bootstrap_size = 1000

# create result dataframes/series if bootstrapping is chosen to be done
result_bs_variogram = pd.DataFrame()
result_bs_sobol = pd.DataFrame()
result_bs_ivars_df = pd.DataFrame()
result_bs_sobol_ranking = pd.DataFrame()
result_bs_ivars_ranking = pd.DataFrame()

for _ in range(0, bootstrap_size):
    ## specify random sequence by sampling with replacement
    bootstrap_rand = np.random.choice(list(range(0,10)), size=len(range(0,10)), replace=True).tolist()
    bootstrapped_pairdf = pd.concat([pair_df.loc[idx[i, :, :, :], :] for i in bootstrap_rand])
    bootstrapped_df     = pd.concat([df.loc[idx[i, :, :], :] for i in bootstrap_rand])
    #display(bootstrapped_pairdf)
    #display(bootstrap_rand)

    ## calculating sectional covariograms
    bootstrapped_cov_section_all = pd.concat([cov_section_all.loc[idx[i, :]] for i in bootstrap_rand])
    #display('sectional variogram:')
    #display(bootstrapped_cov_section_all)
    #display(bootstrap_rand)

    ## calculating variogram, ecovariogram, variance, mean, Sobol, and IVARS values
    bootstrapped_variogram = variogram(bootstrapped_pairdf)
    #display('variogram:')
    #display(bootstrapped_variogram.unstack(level=0))

    bootstrapped_ecovariogram = e_covariogram(bootstrapped_cov_section_all)
    #display('E(covariogram):')
    #display(bootstrapped_ecovariogram.unstack(level=0))

    bootstrapped_var = bootstrapped_df[ishigami.__name__].unique().var(ddof=1)
    #display('variance:', bootstrapped_var)

    bootstrapped_sobol = sobol_eq(bootstrapped_variogram, bootstrapped_ecovariogram, bootstrapped_var)
    #display('sobol:', bootstrapped_sobol)
    
    bootstrapped_sobol_ranking = factor_ranking(bootstrapped_sobol)
    bootstrapped_sobol_ranking_df = pd.DataFrame(data=[bootstrapped_sobol_ranking], columns=param_names)

    ivars_values = [0.1, 0.3, 0.5]
    delta_h = 0.1
    bootstrapped_ivars_df = pd.DataFrame.from_dict({scale: bootstrapped_variogram.groupby(level=0).apply(ivars, scale=scale, delta_h=delta_h) \
                                                    for scale in ivars_values}, 'index')
    
    bootstrapped_ivars_ranking = factor_ranking(bootstrapped_ivars_df)
    bootstrapped_ivars_ranking_df = pd.DataFrame(data=bootstrapped_ivars_ranking, columns=param_names, index=ivars_scales)
    
    #display('ivars:', boostrapped_ivars_df)
    
    # unstack variogram
    bootstrapped_variogram_df = bootstrapped_variogram.unstack(level=0)
    
    # transpose sobol values for stacking of results
    bootstrapped_sobol_df = bootstrapped_sobol.to_frame().transpose()
    
    # attatch new results to previous results (order does not matter here)
    result_bs_variogram = pd.concat([bootstrapped_variogram_df, result_bs_variogram])
    result_bs_sobol = pd.concat([bootstrapped_sobol_df, result_bs_sobol])
    result_bs_ivars_df = pd.concat([bootstrapped_ivars_df, result_bs_ivars_df])
    result_bs_sobol_ranking = pd.concat([bootstrapped_sobol_ranking_df, result_bs_sobol_ranking])
    result_bs_ivars_ranking = pd.concat([bootstrapped_ivars_ranking_df, result_bs_ivars_ranking])

___
new bootstrapping - efficient but might result in memory leak

In [None]:
%%timeit
bootstrap_rand = np.random.choice(list(range(0,10)), size=len(range(0,10*1000)), replace=True).tolist()
b = pd.concat([pair_df.loc[idx[i, :, :, :], :] for i in bootstrap_rand])

In [None]:
%%timeit
# large memory usage though just below this line
a = pair_df.unstack(['param', 'h', 'pair_ind']).\
            sample(10*1000, replace=True).\
            stack(['param', 'h', 'pair_ind'])

In [None]:
# large memory usage though just below this line
a = pair_df.unstack(['param', 'h', 'pair_ind']).\
            sample(10*1000, replace=True).\
            stack(['param', 'h', 'pair_ind'])

# this one is efficient and does not result 

num_stars = 10
num_bstrap = 1000
bstrap_index = list(chain.from_iterable(repeat(e, pair_df.shape[0]) for e in range(int(num_bstrap))))

In [None]:
#taken from https://stackoverflow.com/a/57979836/5188208

def _handle_insert_loc(loc, n):
    """
    Computes the insert index from the right if loc is negative for a given size of n.
    """
    return n + loc + 1 if loc < 0 else loc


def add_index_level(old_index, value, name = None, loc = 0):
    """
    Expand a (multi)index by adding a level to it.

    :param old_index: The index to expand
    :param name: The name of the new index level
    :param value: Scalar or list-like, the values of the new index level
    :param loc: Where to insert the level in the index, 0 is at the front, negative values count back from the rear end
    :return: A new multi-index with the new level added
    """
    loc = _handle_insert_loc(loc, len(old_index.names))
    old_index_df = old_index.to_frame()
    old_index_df.insert(loc, name, value)
    new_index_names = list(old_index.names)  # sometimes new index level names are invented when converting to a df,
    new_index_names.insert(loc, name)        # here the original names are reconstructed
    new_index = pd.MultiIndex.from_frame(old_index_df, names=new_index_names)
    return new_index

In [None]:
a.index = add_index_level(a.index, bstrap_index, 'bootstrap', loc=0)
d = a.groupby('bootstrap')

In [None]:
d.apply(variogram)

In [None]:
# calculate upper and lower confidence interval limits of the ivars values
ivars_low = pd.DataFrame()
ivars_upp = pd.DataFrame()
for scale in ivars_scales:
    ivars_low = pd.concat([ivars_low, result_bs_ivars_df.loc[scale].quantile((1-0.9)/2).rename(scale).to_frame()], axis=1)
    ivars_upp = pd.concat([ivars_upp, result_bs_ivars_df.loc[scale].quantile(1-((1-0.9)/2)).rename(scale).to_frame()], axis=1)

ivars_low = ivars_low.transpose()
ivars_upp = ivars_upp.transpose()
display(ivars_low)
display(ivars_upp)

In [None]:
variogram_low = pd.DataFrame()
variogram_upp = pd.DataFrame()
for h in np.unique(result_bs_variogram.index.values).tolist():
    variogram_low = pd.concat([variogram_low, result_bs_variogram.loc[h].quantile((1-0.9)/2).rename(h).to_frame()], axis=1)
    variogram_upp = pd.concat([variogram_upp, result_bs_variogram.loc[h].quantile(1-((1-0.9)/2)).rename(h).to_frame()], axis=1)
    
variogram_low = variogram_low.transpose()
variogram_upp = variogram_upp.transpose()

variogram_low.index.names = ['h']
variogram_upp.index.names = ['h']

display(variogram_low)
display(variogram_upp)

In [None]:
sobol_low = result_bs_sobol.quantile((1-0.9)/2).rename('').to_frame().transpose()
sobol_upp = result_bs_sobol.quantile(1-((1-0.9)/2)).rename('').to_frame().transpose()
                            
display(sobol_low)
display(sobol_upp)

In [None]:
rel_sobol_results = []
for param in param_names:
    rel_sobol_results.append(result_bs_sobol_ranking.eq(sobol_ranking_df)[param].sum()/bootstrap_size)

rel_sobol = pd.DataFrame([rel_sobol_results],  columns=param_names)
rel_sobol

In [None]:
# small test to see if this works properly
df = pd.DataFrame({'x1' : [2], 'x2' : [0], 'x3' : [1]}, index = [0])
df
df2 = pd.DataFrame({'x1' : [1, 0, 2, 2], 'x2' : [0, 0, 0, 0], 'x3' : [0, 3, 4, 1]}, index=[0, 0, 0, 0])
df2.eq(df)['x3'].sum()

In [None]:
# calculate relibability estimate based on ivars factor rankings
rel_ivars_results = []
for param in param_names:
    rel_ivars_results_scale = []
    for scale in ivars_scales:
        rel_ivars_results_scale.append(result_bs_ivars_ranking.eq(ivars_ranking_df)[param].loc[scale].sum()/bootstrap_size)
    
    rel_ivars_results.append(rel_ivars_results_scale)

rel_ivars = pd.DataFrame(rel_ivars_results,  columns=param_names, index=ivars_scales)
rel_ivars

In [None]:
import scipy.stats as stat
import scipy.cluster.hierarchy as hchy
from matplotlib import pyplot as plt

In [None]:
def factor_grouping(sens_idx, num_grp=None):
    [m, n] = sens_idx.shape
    
    # drop zero elements in sens_idx
    #***not sure about this part I think it would be best for Kasra to decide
    
    # make data 1d
    R = sens_idx.stack()
    
    # do a box-cox transformation
    [TRANSDAT, LAMBDA] = stat.boxcox(R)
    if LAMBDA <= 0.0099:
        TRANSDAT = np.log(R)
    
    indices = np.argwhere(np.isinf(TRANSDAT))
    if indices.shape == (2, 1):
        TRANSDAT[indices[0], indices[1]] = np.log(R[R>0])
        
    # reshape data for the linkage calculation
    S = np.reshape(TRANSDAT, [n, m])
    
    # Agglomerative hierarchical cluster
    Z = hchy.linkage(S, method='ward', metric='euclidean')
    
    # Optimal group number
    Clusters = hchy.fcluster(Z, criterion='maxclust', t=np.arange(start=2, stop=n))
    # if user gives the group number preform calculations
    if num_grp:
        rank_grp = hchy.fcluster(Z, criterion='maxclust', t=num_grp)
        optm_num_grp = num_grp
        nn = 1
        id = len(Z)
        while nn != optm_num_grp:
            cutoff = Z[id-1][2]
            rank_grp = hchy.fcluster(Z, criterion='distance', t=cutoff)
            nn = np.amax(rank_grp)
            id = id - 1

        clrThrshl = 0.5*(Z[id][2] + Z[id+1][2])
    # if user does not give optimal group number use elbow method
    else:
        cutoff, clrThrshl = elbow_method(Z)
        rank_grp = hchy.fcluster(Z, criterion='distance', t=cutoff)
        optm_num_grp = max(rank_grp)
        
    
    #*** this part can be edited once we start working on plots
    #fig = plt.figure(figsize=(25,10))
    #dn = hchy.dendrogram(Z)
    #plt.show()
    
    return optm_num_grp, rank_grp, Clusters, S
    

In [None]:
def elbow_method(Z):
    Q1 = np.array([1, Z[0][2]])
    Q2 = np.array([len(Z), Z[-1][2]])
    
    d = []
    for i in range(0, len(Z) - 2):
        P = [i+1, Z[i][2]]
        d.append(np.abs(np.linalg.det(np.array([[Q2 - Q1], [P-Q1]])))/np.linalg.norm(Q2-Q1))
    display(Z)
    id = d.index(max(d))
    cutoff = Z[id][2]
    clrThrshl = 0.5*(Z[id][2] + Z[id+1][2])
    
    return cutoff, clrThrshl

In [None]:
# grouping
sens_idx = result_bs_ivars_df.loc[0.5]
optm_num_grp, rank_grp, Clusters, S = factor_grouping(sens_idx)