In [1]:
import numpy as np
import pandas as pd
import itertools
import os

In [2]:
# Directory that holds the per-configuration result CSV files read by the loading cells.
FILE_DIR = "testResultSmall"
# Number of experiment repetitions; it is embedded verbatim in every result file name.
nExperimentReptition = 10
# Base random seed; it is embedded verbatim in every result file name.
randomSeed = 20220222
# Width of the tail-probability interval: the loading cells use
# percentageRHS = percentageLHS + trueValue and store it as the "trueValue" column.
trueValue = 0.005
# Data sources and sample sizes swept by the loading cells.
dataDistributions = ['gamma', 'lognorm', 'pareto']
dataSizes = [500, 800]

### Tail probability estimation with single threshold

In [3]:
# Template of the experiment settings for tail-probability estimation.
# The loading cells overwrite "dataSize", "percentageLHS", "percentageRHS" and
# "thresholdPercentage" for every configuration, and serialise every key=value pair
# (in insertion order) into the result file name, so the key order matters.
# NOTE(review): "alpha" and "gEllipsoidalDimension" are never changed in this notebook;
# their meaning is defined by the experiment code, not here.
metaDataDict = {"dataSize": 500,
                "percentageLHS": 0.99,
                "percentageRHS": 0.995,
                "thresholdPercentage": 0.7,
                "alpha": 0.05,
                "gEllipsoidalDimension": 3}

In [4]:
# Grid of single thresholds and of objective-interval left endpoints to load.
thresholdPercentages = np.linspace(0.6, 0.85, 11).tolist()
# Left endpoint (as a probability level) of the objective function 1_{lhs<=x<=rhs};
# the right endpoint is lhs + trueValue.
percentageLHSs = np.linspace(0.9, 0.99, 10).tolist()
columnNames = ['dataDistribution','dataSize','percentageLHS', 'percentageRHS', "thresholdPercentage", "trueValue", "nrepIndex", "(0,KS)","(1,KS)","(2,KS)","(0,CHI2)","(1,CHI2)","(2,CHI2)"]

# Gather one frame per configuration and concatenate once at the end.
# DataFrame.append was removed in pandas 2.0 and re-copied the accumulated frame on
# every call. The empty frame stays first so the column order follows columnNames,
# exactly as the former append-based accumulation did.
singleThresholdFrames = [pd.DataFrame(columns=columnNames)]
for dataDistribution, dataSize, percentageLHS, thresholdPercentage in itertools.product(
        dataDistributions, dataSizes, percentageLHSs, thresholdPercentages):
    percentageRHS = percentageLHS+trueValue
    metaDataDict["dataSize"] = dataSize
    metaDataDict["percentageLHS"] = percentageLHS
    metaDataDict["percentageRHS"] = percentageRHS
    metaDataDict["thresholdPercentage"] = thresholdPercentage
    assert "random_state" not in metaDataDict
    # NOTE(review): poolParamList is not read anywhere in this cell; it is kept only
    # in case another cell still relies on it.
    poolParamList = [(dataDistribution, metaDataDict, random_state+randomSeed)
                     for random_state in range(nExperimentReptition)]
    # Rebuild the result file name: prefix, distribution, every metaDataDict entry in
    # insertion order, seed and number of repetitions, joined by underscores.
    FILE_NAME = ["tailProbabilityEstimation"]
    FILE_NAME += ["dataDistribution="+dataDistribution]
    FILE_NAME += [key+"="+str(metaDataDict[key])
                  for key in metaDataDict]
    FILE_NAME += ["randomSeed="+str(randomSeed)]
    FILE_NAME += ["nExperimentReptition="+str(nExperimentReptition)]
    FILE_NAME = '_'.join(FILE_NAME)+".csv"
    # Strip float-repr noise such as 0.9500000000000001 -> 0.95 (presumably to match
    # the names the result files were saved under).
    FILE_NAME = FILE_NAME.replace("00000000000001","").replace("0000000000001","")
    df = pd.read_csv(os.path.join(FILE_DIR, FILE_NAME),
                     index_col="Experiment Repetition Index")
    # Turn the repetition index back into an ordinary column named "nrepIndex".
    df = df.reset_index().rename(columns={"Experiment Repetition Index":"nrepIndex"})
    # Tag every row with the configuration it belongs to.
    df["dataDistribution"] = dataDistribution
    df["dataSize"] = dataSize
    df["percentageLHS"] = percentageLHS
    df["percentageRHS"] = percentageRHS
    df["thresholdPercentage"] = thresholdPercentage
    df["trueValue"] = trueValue
    singleThresholdFrames.append(df)
cumDf1 = pd.concat(singleThresholdFrames)

### Tail probability estimation with multiple thresholds

In [5]:
# Smallest threshold of each five-threshold bundle, and the objective left endpoints.
thresholdPercentages = [0.6, 0.65, 0.70, 0.75, 0.8]
# Left endpoint (as a probability level) of the objective function 1_{lhs<=x<=rhs};
# the right endpoint is lhs + trueValue.
percentageLHSs = np.linspace(0.9, 0.99, 10).tolist()
dataSizes = [500, 800]
# Defined here as well so this cell does not depend on which other cell last bound
# columnNames (the quantile cells rebind it to a different list).
columnNames = ['dataDistribution','dataSize','percentageLHS', 'percentageRHS', "thresholdPercentage", "trueValue", "nrepIndex", "(0,KS)","(1,KS)","(2,KS)","(0,CHI2)","(1,CHI2)","(2,CHI2)"]

# Gather one frame per configuration and concatenate once at the end.
# DataFrame.append was removed in pandas 2.0 and re-copied the accumulated frame on
# every call. The empty frame stays first so the column order follows columnNames.
multiThresholdFrames = [pd.DataFrame(columns=columnNames)]
for dataDistribution, dataSize, percentageLHS, thresholdPercentage in itertools.product(
        dataDistributions, dataSizes, percentageLHSs, thresholdPercentages):
    percentageRHS = percentageLHS+trueValue
    metaDataDict["dataSize"] = dataSize
    metaDataDict["percentageLHS"] = percentageLHS
    metaDataDict["percentageRHS"] = percentageRHS
    # The multi-threshold runs use five thresholds starting at thresholdPercentage;
    # the list's str() form is what appears in the file name.
    metaDataDict["thresholdPercentage"] = [thresholdPercentage +
                                           increment for increment in [0, 0.01, 0.02, 0.03, 0.04]]
    assert "random_state" not in metaDataDict
    # NOTE(review): poolParamList is not read anywhere in this cell; it is kept only
    # in case another cell still relies on it.
    poolParamList = [(dataDistribution, metaDataDict, random_state+randomSeed)
                     for random_state in range(nExperimentReptition)]
    # Rebuild the result file name: prefix, distribution, every metaDataDict entry in
    # insertion order, seed and number of repetitions, joined by underscores.
    FILE_NAME = ["tailProbabilityEstimation"]
    FILE_NAME += ["dataDistribution="+dataDistribution]
    FILE_NAME += [key+"="+str(metaDataDict[key])
                  for key in metaDataDict]
    FILE_NAME += ["randomSeed="+str(randomSeed)]
    FILE_NAME += ["nExperimentReptition="+str(nExperimentReptition)]
    FILE_NAME = '_'.join(FILE_NAME)+".csv"
    # Strip float-repr noise such as 0.6900000000000001 -> 0.69 (presumably to match
    # the names the result files were saved under).
    FILE_NAME = FILE_NAME.replace("00000000000001","").replace("0000000000001","")
    df = pd.read_csv(os.path.join(FILE_DIR, FILE_NAME),
                     index_col="Experiment Repetition Index")
    # Turn the repetition index back into an ordinary column named "nrepIndex".
    df = df.reset_index().rename(columns={"Experiment Repetition Index":"nrepIndex"})
    # Tag every row with its configuration; the bundle is identified by its smallest
    # threshold only.
    df["dataDistribution"] = dataDistribution
    df["dataSize"] = dataSize
    df["percentageLHS"] = percentageLHS
    df["percentageRHS"] = percentageRHS
    df["thresholdPercentage"] = thresholdPercentage
    df["trueValue"] = trueValue
    multiThresholdFrames.append(df)
cumDf2 = pd.concat(multiThresholdFrames)

### Quantile estimation with single threshold

In [6]:
import dataPreparationUtils as dpu
from scipy.stats import gamma, lognorm, pareto, genpareto
# Maps the distribution names used in file names and group keys to the scipy.stats
# distribution objects; the quantile cells pass these to dpu.endPointGeneration.
stringToDataModule = {"gamma": gamma,
                      "lognorm": lognorm,
                      "pareto": pareto,
                      "genpareto": genpareto}
# Template of the experiment settings for quantile estimation. This REBINDS the
# metaDataDict used by the tail-probability cells, so those cells cannot be re-run
# after this one without first re-running the cell that defines their template.
# Every key=value pair is serialised (in insertion order) into the result file name;
# the key spelling "quantitleValue" is part of those names and must not be changed.
metaDataDict = {"dataSize": 500,
                "quantitleValue": 0.99,
                "thresholdPercentage": 0.7,
                "alpha": 0.05,
                "gEllipsoidalDimension": 3}

In [7]:
# Grid of single thresholds and target quantile levels to load.
thresholdPercentages = np.linspace(0.6, 0.85, 11).tolist()
quantitleValues = np.linspace(0.9, 0.99, 10).tolist()
dataSizes = [500, 800]
columnNames = ['dataDistribution','dataSize','quantitleValue', "thresholdPercentage", "trueValue", "nrepIndex",
               "(0,CHI2)","(1,CHI2)","(2,CHI2)"]

# Gather one frame per configuration and concatenate once at the end.
# DataFrame.append was removed in pandas 2.0 and re-copied the accumulated frame on
# every call. The empty frame stays first so the column order follows columnNames.
singleThresholdQuantileFrames = [pd.DataFrame(columns=columnNames)]
for dataDistribution, dataSize, quantitleValue, thresholdPercentage in itertools.product(
        dataDistributions, dataSizes, quantitleValues, thresholdPercentages):
    metaDataDict["dataSize"] = dataSize
    metaDataDict["quantitleValue"] = quantitleValue
    metaDataDict["thresholdPercentage"] = thresholdPercentage
    assert "random_state" not in metaDataDict
    # NOTE(review): poolParamList is not read anywhere in this cell; it is kept only
    # in case another cell still relies on it.
    poolParamList = [(dataDistribution, metaDataDict, random_state+randomSeed)
                     for random_state in range(nExperimentReptition)]
    # Rebuild the result file name: prefix, distribution, every metaDataDict entry in
    # insertion order, seed and number of repetitions, joined by underscores.
    FILE_NAME = ["quantileEstimation"]
    FILE_NAME += ["dataDistribution="+dataDistribution]
    FILE_NAME += [key+"="+str(metaDataDict[key])
                  for key in metaDataDict]
    FILE_NAME += ["randomSeed="+str(randomSeed)]
    FILE_NAME += ["nExperimentReptition="+str(nExperimentReptition)]
    FILE_NAME = '_'.join(FILE_NAME)+".csv"
    # Strip float-repr noise such as 0.9500000000000001 -> 0.95 (presumably to match
    # the names the result files were saved under).
    FILE_NAME = FILE_NAME.replace("00000000000001","").replace("0000000000001","")
    df = pd.read_csv(os.path.join(FILE_DIR, FILE_NAME),
                     index_col="Experiment Repetition Index")
    # Turn the repetition index back into an ordinary column named "nrepIndex".
    df = df.reset_index().rename(columns={"Experiment Repetition Index":"nrepIndex"})
    df["dataDistribution"] = dataDistribution
    df["dataSize"] = dataSize
    df["quantitleValue"] = quantitleValue
    df["thresholdPercentage"] = thresholdPercentage
    # Reference value of the target quantile, obtained from the project helper with the
    # distribution's default parameters. A separate name is used so the notebook-level
    # `trueValue` (the tail-probability interval width) is not overwritten; overwriting
    # it made the tail-probability loading cells build wrong file names when re-run.
    dataModule = stringToDataModule[dataDistribution]
    quantileTrueValue = dpu.endPointGeneration(
        dataModule, quantitleValue, dpu.dataModuleToDefaultParamDict[dataModule])
    df["trueValue"] = quantileTrueValue

    singleThresholdQuantileFrames.append(df)
cumDf3 = pd.concat(singleThresholdQuantileFrames)

### Quantile estimation with multiple thresholds

In [8]:
# Smallest threshold of each five-threshold bundle, and the target quantile levels.
thresholdPercentages = [0.6, 0.65, 0.70, 0.75, 0.8]
quantitleValues = np.linspace(0.9, 0.99, 10).tolist()
dataSizes = [500, 800]
columnNames = ['dataDistribution','dataSize','quantitleValue', "thresholdPercentage", "trueValue", "nrepIndex",
               "(0,CHI2)","(1,CHI2)","(2,CHI2)"]

# Gather one frame per configuration and concatenate once at the end.
# DataFrame.append was removed in pandas 2.0 and re-copied the accumulated frame on
# every call. The empty frame stays first so the column order follows columnNames.
multiThresholdQuantileFrames = [pd.DataFrame(columns=columnNames)]
for dataDistribution, dataSize, quantitleValue, thresholdPercentage in itertools.product(
        dataDistributions, dataSizes, quantitleValues, thresholdPercentages):
    metaDataDict["dataSize"] = dataSize
    metaDataDict["quantitleValue"] = quantitleValue
    # The multi-threshold runs use five thresholds starting at thresholdPercentage;
    # the list's str() form is what appears in the file name.
    metaDataDict["thresholdPercentage"] = [thresholdPercentage +
                                           increment for increment in [0, 0.01, 0.02, 0.03, 0.04]]
    assert "random_state" not in metaDataDict
    # NOTE(review): poolParamList is not read anywhere in this cell; it is kept only
    # in case another cell still relies on it.
    poolParamList = [(dataDistribution, metaDataDict, random_state+randomSeed)
                     for random_state in range(nExperimentReptition)]
    # Rebuild the result file name: prefix, distribution, every metaDataDict entry in
    # insertion order, seed and number of repetitions, joined by underscores.
    FILE_NAME = ["quantileEstimation"]
    FILE_NAME += ["dataDistribution="+dataDistribution]
    FILE_NAME += [key+"="+str(metaDataDict[key])
                  for key in metaDataDict]
    FILE_NAME += ["randomSeed="+str(randomSeed)]
    FILE_NAME += ["nExperimentReptition="+str(nExperimentReptition)]
    FILE_NAME = '_'.join(FILE_NAME)+".csv"
    # Strip float-repr noise such as 0.6900000000000001 -> 0.69 (presumably to match
    # the names the result files were saved under).
    FILE_NAME = FILE_NAME.replace("00000000000001","").replace("0000000000001","")
    df = pd.read_csv(os.path.join(FILE_DIR, FILE_NAME),
                     index_col="Experiment Repetition Index")
    # Turn the repetition index back into an ordinary column named "nrepIndex".
    df = df.reset_index().rename(columns={"Experiment Repetition Index":"nrepIndex"})
    # Tag every row with its configuration; the bundle is identified by its smallest
    # threshold only.
    df["dataDistribution"] = dataDistribution
    df["dataSize"] = dataSize
    df["quantitleValue"] = quantitleValue
    df["thresholdPercentage"] = thresholdPercentage
    # Reference value of the target quantile, obtained from the project helper with the
    # distribution's default parameters. A separate name is used so the notebook-level
    # `trueValue` (the tail-probability interval width) is not overwritten; overwriting
    # it made the tail-probability loading cells build wrong file names when re-run.
    dataModule = stringToDataModule[dataDistribution]
    quantileTrueValue = dpu.endPointGeneration(
        dataModule, quantitleValue, dpu.dataModuleToDefaultParamDict[dataModule])
    df["trueValue"] = quantileTrueValue

    multiThresholdQuantileFrames.append(df)
cumDf4 = pd.concat(multiThresholdQuantileFrames)

In [186]:
# Group every result table by its full configuration (distribution, sample size,
# objective level, threshold). tableFiveOne / tableFiveThree look groups up with
# get_group using four-element keys in exactly this order, e.g. ('gamma', 500, 0.99, 0.7).
# Later cells rebind go1..go4 with three-key groupings, so this cell has to be re-run
# before calling the table helpers again.
go1 = cumDf1.groupby(by=['dataDistribution','dataSize','percentageLHS','thresholdPercentage'])
go2 = cumDf2.groupby(by=['dataDistribution','dataSize','percentageLHS','thresholdPercentage'])
go3 = cumDf3.groupby(by=['dataDistribution','dataSize','quantitleValue','thresholdPercentage'])
go4 = cumDf4.groupby(by=['dataDistribution','dataSize','quantitleValue','thresholdPercentage'])
# go1/go2 hold tail-probability results (single / multiple thresholds),
# go3/go4 hold quantile-estimation results (single / multiple thresholds).

In [30]:
def getSignificanceNExponent (value: float):
    """Split ``value`` into a base-10 significand and exponent.

    Returns a tuple ``(significand, exponent)`` with
    ``value == significand * 10**exponent`` and ``1 <= abs(significand) < 10``.
    The exponent is returned as a float (callers convert it with ``int``).

    ``np.log10`` replaces the former ``np.log(value)/np.log(10)``, whose rounding
    error put exact powers of ten one decade too low (1000 became 10.0 x 10^2).
    Zero is returned as ``(0.0, 0.0)`` and negative values keep their sign on the
    significand; both previously produced nan/inf.
    """
    if value == 0:
        return (0.0, 0.0)
    exponent = np.floor(np.log10(abs(value)))
    significance = value/10.0**exponent
    # Guard against a residual rounding error right at a decade boundary.
    if abs(significance) >= 10:
        significance, exponent = significance/10, exponent+1
    elif abs(significance) < 1:
        significance, exponent = significance*10, exponent-1
    return (significance, exponent)
    
def tableFiveOneUnit(targetColumns, EstimatedUpperBound, RelativeRatio, CoverageProbability):
    """Render the body rows of one sub-table as a LaTeX string.

    One row is produced per entry of ``targetColumns``; the three value sequences are
    indexed in parallel with it. Each row reads
    ``<setting> & <relative ratio>/<upper bound as a x 10^b> & <coverage probability>``
    and ends with a LaTeX line break followed by a newline and indentation.

    Raises:
        AssertionError: if a target column is not one of the six supported settings.
            The exception is raised explicitly because a bare ``assert False`` is
            stripped under ``python -O`` and would silently drop the row.
    """
    # LaTeX label for every supported constraint setting.
    columnLabels = {
        '(0,CHI2)': r"$(0,\chi^2)$",
        '(1,CHI2)': r"$(1,\chi^2)$",
        '(2,CHI2)': r"$(2,\chi^2)$",
        '(0,KS)': r"$(0,\text{KS})$",
        '(1,KS)': r"$(1,\text{KS})$",
        '(2,KS)': r"$(2,\text{KS})$",
    }
    rowTemplate = r"{:} & ${:.2f}/{:.2f}\times 10^{{{:d}}}$ & ${:2g}$"
    # LaTeX row terminator, then a newline and the indentation of the next row.
    rowEnd = "\\\\\n        "
    content = r""
    for i, targetColumn in enumerate(targetColumns):
        if targetColumn not in columnLabels:
            raise AssertionError("Unsupported target column: {!r}".format(targetColumn))
        significance, exponent = getSignificanceNExponent(EstimatedUpperBound[i])
        content += rowTemplate.format(columnLabels[targetColumn], RelativeRatio[i],
                                      significance, int(exponent), CoverageProbability[i]) + rowEnd
    return content

def tableFiveOneSubTable(content, subtableTitle, subtableLabel, textwidth):
    """Wrap already-rendered table rows in a LaTeX ``subtable`` environment.

    Args:
        content: LaTeX rows for the tabular body (e.g. from ``tableFiveOneUnit``).
        subtableTitle: text placed in the sub-table's ``\\caption``.
        subtableLabel: text placed in the sub-table's ``\\label``.
        textwidth: width of the sub-table as a fraction of ``\\textwidth``.

    Returns:
        The complete ``subtable`` block as a string.

    The third column header previously read "Converage probability"; the typo is
    corrected to "Coverage probability".
    """
    header = r"\begin{subtable}"+r"{{{:}\textwidth}}".format(textwidth)+\
        r'''\begin{tabular}{ccc}
        \toprule
        \hline
        \multicolumn{1}{p{2.8cm}}{Constraint setting} &
        \multicolumn{1}{p{3.9cm}}{Relative ratio/Estimated UpperBound}  &
        \multicolumn{1}{p{2cm}}{Coverage probability}  \\\hline
        '''
    footer = r'''\hline
    \bottomrule
    \end{tabular}
    '''+\
    r'''\caption{{{:}}}
    '''.format(subtableTitle)+\
    r'''\label{{{:}}}
    '''.format(subtableLabel)+\
    r'''\end{subtable}
    '''
    return header + content + footer

# tableFiveOneUnit(targetColumns, EstimatedUpperBound, RelativeRatio, CoverageProbability)

In [31]:
# targetColumns = ['(0,CHI2)','(1,CHI2)','(2,CHI2)']
# #                  '(1,KS)','(2,KS)']
# keyChoice1 = ('gamma', 500, 0.99, 0.7)
# keyChoice2 = ('gamma', 800, 0.99, 0.7)
# go3.get_group(keyChoice1)


In [39]:

def tableFiveOne(groupby_object:pd.core.groupby.generic.DataFrameGroupBy, targetColumns:list,
                 keyChoice1:tuple, keyChoice2:tuple, 
                 subTableTitle1:str, subTableLabel1:str, subTableTitle2:str, subTableLabel2:str,
                 tableTitle:str, tableLabel:str, scalebox:float = 0.7, textwidth:float=0.7):
    """Build a LaTeX ``table`` made of two side-by-side sub-tables.

    For each of the two group keys the function looks the group up in
    ``groupby_object`` and summarises ``targetColumns`` by their mean (the estimated
    upper bound), that mean divided by the group's true value (the relative ratio)
    and the fraction of rows strictly above the true value (the coverage probability).

    The caption is ``tableTitle`` followed by the true value of the FIRST group only;
    both groups are assumed to share that value, which is not checked here.

    Returns:
        The LaTeX source of the whole table as a string.
    """
    def summarise(keyChoice):
        # One lookup per key; every statistic below derives from the same group.
        group = groupby_object.get_group(keyChoice)
        trueValues = group['trueValue'].unique()
        columnMeans = group[targetColumns].mean()
        relativeRatio = (columnMeans/trueValues).values
        estimatedUpperBound = columnMeans.values
        coverageProbability = (group[targetColumns]>trueValues[0]).mean().values
        return trueValues, relativeRatio, estimatedUpperBound, coverageProbability

    trueValue1, RelativeRatio1, EstimatedUpperBound1, CoverageProbability1 = summarise(keyChoice1)
    _, RelativeRatio2, EstimatedUpperBound2, CoverageProbability2 = summarise(keyChoice2)

    firstSubTable = tableFiveOneSubTable(
        tableFiveOneUnit(targetColumns, EstimatedUpperBound1, RelativeRatio1, CoverageProbability1),
        subTableTitle1, subTableLabel1, textwidth)
    secondSubTable = tableFiveOneSubTable(
        tableFiveOneUnit(targetColumns, EstimatedUpperBound2, RelativeRatio2, CoverageProbability2),
        subTableTitle2, subTableLabel2, textwidth)

    opening = r'''
    \begin{table}[ht]
    \centering'''
    scaleboxOpen = r'''\scalebox{{{:}}}{{
    '''.format(scalebox)
    separator = r'''\quad\quad\quad\quad
    '''
    caption = r'''\caption{{{:}}}
    '''.format(tableTitle+"True value is {:g}.".format(trueValue1[0]))
    label = r'''\label{{{:}}}
    '''.format(tableLabel)
    closing = r'''\end{table}'''
    return "".join([opening, scaleboxOpen, firstSubTable, separator, secondSubTable,
                    r"}", caption, label, closing])

In [104]:
# Print one LaTeX table per (task, data source) pair, comparing sample sizes 500 and 800
# at objective level 0.99 and threshold 0.7.
# go1 / go3 must be the four-key groupings (distribution, size, level, threshold); the
# later sections rebind them with three-key groupings.
# The six former copy-pasted stanzas are driven from one specification list. The caption
# typos "probablity" and "Quantitle" are corrected; every \label value is kept exactly as
# it was (including the mixed "stb1_"/"tb1_" prefixes) so existing \ref's keep working.
tpeColumns = ['(0,CHI2)','(1,CHI2)','(2,CHI2)','(0,KS)','(1,KS)','(2,KS)']
qeColumns = ['(0,CHI2)','(1,CHI2)','(2,CHI2)']
tableSpecs = [
    # (groupby object, target columns, label tag, caption prefix,
    #  distribution key, display name, label suffix, table label)
    (go1, tpeColumns, "tpe", "Tail probability estimation", 'gamma', "Gamma", "gamma", "stb1_tpe_gamma"),
    (go1, tpeColumns, "tpe", "Tail probability estimation", 'lognorm', "Lognormal", "lognormal", "tb1_tpe_lognormal"),
    (go1, tpeColumns, "tpe", "Tail probability estimation", 'pareto', "Pareto", "pareto", "tb1_tpe_pareto"),
    (go3, qeColumns, "qe", "Quantile estimation", 'gamma', "Gamma", "gamma", "stb1_qe_gamma"),
    (go3, qeColumns, "qe", "Quantile estimation", 'lognorm', "Lognormal", "lognormal", "stb1_qe_lognormal"),
    (go3, qeColumns, "qe", "Quantile estimation", 'pareto', "Pareto", "pareto", "stb1_qe_pareto"),
]
for (groupbyObject, targetColumns, labelTag, captionPrefix,
     distributionKey, displayName, labelSuffix, tableLabel) in tableSpecs:
    keyChoice1 = (distributionKey, 500, 0.99, 0.7)
    keyChoice2 = (distributionKey, 800, 0.99, 0.7)
    subTableTitle1 = r"Data sample size = $500$. "
    subTableLabel1 = "stb11_{}_{}".format(labelTag, labelSuffix)
    subTableTitle2 = r"Data sample size = $800$. "
    subTableLabel2 = "stb12_{}_{}".format(labelTag, labelSuffix)
    tableTitle = "{} from {} data source. ".format(captionPrefix, displayName)
    print(tableFiveOne(groupbyObject, targetColumns, keyChoice1, keyChoice2,
                       subTableTitle1, subTableLabel1, subTableTitle2, subTableLabel2,
                       tableTitle, tableLabel))



    \begin{table}[ht]
    \centering\scalebox{0.7}{
    \begin{subtable}{0.7\textwidth}\begin{tabular}{ccc}
        \toprule
        \hline
        \multicolumn{1}{p{2.8cm}}{Constraint setting} &
        \multicolumn{1}{p{3.9cm}}{Relative ratio/Estimated UpperBound}  &
        \multicolumn{1}{p{2cm}}{Converage probability}  \\\hline
        $(0,\chi^2)$ & $16.08/8.04\times 10^{-2}$ & $ 1$\\
        $(1,\chi^2)$ & $5.46/2.73\times 10^{-2}$ & $ 1$\\
        $(2,\chi^2)$ & $2.99/1.50\times 10^{-2}$ & $ 1$\\
        $(0,\text{KS})$ & $14.23/7.11\times 10^{-2}$ & $ 1$\\
        $(1,\text{KS})$ & $6.64/3.32\times 10^{-2}$ & $ 1$\\
        $(2,\text{KS})$ & $3.95/1.98\times 10^{-2}$ & $ 1$\\
        \hline
    \bottomrule
    \end{tabular}
    \caption{Data sample size = $500$. }
    \label{stb11_tpe_gamma}
    \end{subtable}
    \quad\quad\quad\quad
    \begin{subtable}{0.7\textwidth}\begin{tabular}{ccc}
        \toprule
        \hline
        \multicolumn{1}{p{2.8cm}}{Constraint setting} &

In [286]:
def tableFiveThreeUnit(thresholds, EstimatedUpperBound, RelativeRatio, CoverageProbability):
    """Render the body rows of a threshold-comparison sub-table as a LaTeX string.

    One row is produced per entry of ``thresholds`` and reads
    ``<threshold> & <upper bound as a x 10^b>/<relative ratio>/<coverage probability>``.
    A final row, labelled with the list of all thresholds rounded to three decimals,
    is built from the LAST entry of each value sequence.

    Only element ``[0]`` of every per-row entry is printed, i.e. the first target
    column the caller summarised. Every row ends with a LaTeX line break followed by
    a newline and indentation.
    """
    # LaTeX row terminator, then a newline and the indentation of the next row.
    rowEnd = "\\\\\n        "
    rows = []
    for position, threshold in enumerate(thresholds):
        mantissa, power = getSignificanceNExponent(EstimatedUpperBound[position][0])
        rows.append(r"${:.3f}$ & ${:.2f}\times 10^{{{:d}}}/{:.3f}/{:2g}$ ".format(
            threshold, mantissa, int(power),
            RelativeRatio[position][0], CoverageProbability[position][0]))
    # Summary row: taken from the last entry of each sequence.
    mantissa, power = getSignificanceNExponent(EstimatedUpperBound[-1][0])
    roundedThresholds = [round(threshold, 3) for threshold in thresholds]
    rows.append(r"${:}$ & ${:.2f}\times 10^{{{:d}}}/{:.3f}/{:3g}$ ".format(
        roundedThresholds, mantissa, int(power),
        RelativeRatio[-1][0], CoverageProbability[-1][0]))
    return "".join(row + rowEnd for row in rows)

def tableFiveThreeSubTable(content, subtableTitle, subtableLabel, textwidth):
    """Wrap already-rendered threshold rows in a LaTeX ``subtable`` environment.

    Args:
        content: LaTeX rows for the tabular body (e.g. from ``tableFiveThreeUnit``).
        subtableTitle: text placed in the sub-table's ``\\caption``.
        subtableLabel: text placed in the sub-table's ``\\label``.
        textwidth: width of the sub-table as a fraction of ``\\textwidth``.

    Returns:
        The complete ``subtable`` block as a string.
    """
    opening = r"\begin{subtable}" + r"{{{:}\textwidth}}".format(textwidth)
    tabularHead = r'''\begin{tabular}{ccc}
        \toprule
        \hline
        \multicolumn{1}{c}{Thresholds} &
        \multicolumn{1}{c}{$(2,\chi^2)$}   \\\hline
        '''
    tabularTail = r'''\hline
    \bottomrule
    \end{tabular}
    '''
    caption = r'''\caption{{{:}}}
    '''.format(subtableTitle)
    label = r'''\label{{{:}}}
    '''.format(subtableLabel)
    closing = r'''\end{subtable}
    '''
    return "".join([opening, tabularHead, content, tabularTail, caption, label, closing])

In [287]:
def tableFiveThree(groupby_object1:pd.core.groupby.generic.DataFrameGroupBy, 
                   groupby_object2:pd.core.groupby.generic.DataFrameGroupBy,
                   targetColumns:list,
                 keyChoice1:tuple, keyChoice2:tuple, 
                 subTableTitle1:str, subTableLabel1:str, subTableTitle2:str, subTableLabel2:str,
                 tableTitle:str, tableLabel:str, scalebox:float = 0.7, textwidth:float=0.7):
    """Build a LaTeX ``table`` comparing five individual thresholds with one combined run.

    For each of the two keys ``(distribution, size, level, threshold)`` the function
    reads five groups from ``groupby_object1`` (the key's threshold plus 0, 0.025, ...,
    0.1, rounded to three decimals) and one group from ``groupby_object2`` (the key
    itself). ``groupby_object1`` presumably holds the single-threshold results and
    ``groupby_object2`` the multi-threshold ones - confirm against the caller.
    Per group it records the mean of ``targetColumns`` (estimated upper bound), that
    mean divided by the true value (relative ratio) and the fraction of rows strictly
    above the true value (coverage probability). The six groups become the six rows
    that ``tableFiveThreeUnit`` renders for one sub-table.

    Two defects of the earlier version are fixed:
      * the second sub-table's combined-run coverage was read from ``keyChoice1``
        instead of ``keyChoice2``;
      * the combined run's upper bound was never recorded, so the last row printed the
        fifth single threshold's bound next to the combined run's ratio and coverage.

    Raises:
        AssertionError: if the true value differs between the groups of one key or
            between the two keys.

    Returns:
        The LaTeX source of the whole table as a string.
    """
    numMultiThreshold = 5
    thresholdStep = 0.025

    def collectRows(keyChoice):
        # Five single-threshold lookups followed by the combined-threshold lookup.
        lookups = [(groupby_object1, (*keyChoice[:3], round(keyChoice[3]+thresholdStep*idx, 3)))
                   for idx in range(numMultiThreshold)]
        lookups.append((groupby_object2, keyChoice))
        upperBounds, ratios, coverages, trueValues = [], [], [], []
        for groupbyObject, groupKey in lookups:
            group = groupbyObject.get_group(groupKey)
            groupTrueValue = group['trueValue'].unique()[0]
            columnMeans = group[targetColumns].mean()
            trueValues.append(groupTrueValue)
            upperBounds.append(columnMeans.tolist())
            ratios.append((columnMeans.values/groupTrueValue).tolist())
            coverages.append((group[targetColumns].values>groupTrueValue).mean(axis=0).tolist())
        # The true value must not depend on the threshold choice.
        assert np.unique(trueValues).size == 1
        thresholds = [keyChoice[3]+thresholdStep*idx for idx in range(numMultiThreshold)]
        return thresholds, upperBounds, ratios, coverages, trueValues[0]

    thresholds1, EstimatedUpperBound1, RelativeRatio1, CoverageProbability1, trueValue1 = collectRows(keyChoice1)
    thresholds2, EstimatedUpperBound2, RelativeRatio2, CoverageProbability2, trueValue2 = collectRows(keyChoice2)
    # Both sub-tables share one caption, so they must share one true value.
    assert trueValue2==trueValue1
    trueValue = trueValue1

    latexTable = r'''
    \begin{table}[ht]
    \centering'''+\
    r'''\scalebox{{{:}}}{{
    '''.format(scalebox)+\
    tableFiveThreeSubTable(
        tableFiveThreeUnit(thresholds1, EstimatedUpperBound1, RelativeRatio1, CoverageProbability1),
        subTableTitle1, subTableLabel1, textwidth)+\
    r'''\quad\quad\quad\quad
    '''+\
    tableFiveThreeSubTable(
        tableFiveThreeUnit(thresholds2, EstimatedUpperBound2, RelativeRatio2, CoverageProbability2),
        subTableTitle2, subTableLabel2, textwidth)+\
    r"}"+\
    r'''\caption{{{:}}}
    '''.format(tableTitle+"True value is {:g}.".format(trueValue))+\
    r'''\label{{{:}}}
    '''.format(tableLabel)+\
    r'''\end{table}'''
    return latexTable

### 5.1 Selection of shape constraints. i.e. D=0, 1, 2

In [11]:
import matplotlib.pyplot as plt

In [1]:
# Relative ratio (mean estimated bound / true value) for D = 0, 1, 2: one line per
# group of go1..go4, one panel per (distance, task, threshold mode) combination.
# Changes to the earlier version of this cell:
#   * the leftover `assert False` that aborted the cell on the first go1 group is gone;
#   * the result columns are selected BEFORE averaging, because DataFrame.mean over the
#     string column 'dataDistribution' raises on pandas >= 2.0.
fig, axs = plt.subplots(2, 3,figsize=(24,12))
shapeLabels = ["D=0","D=1","D=2"]
ksColumns = ['(0,KS)','(1,KS)','(2,KS)']
chi2Columns = ['(0,CHI2)','(1,CHI2)','(2,CHI2)']


def meanRelativeRatios(groupbyObject, columns):
    """Return, for every group, the mean of `columns` divided by the group's true value."""
    return [(eachDF[columns].astype(float).mean(axis=0)/eachDF['trueValue'].unique()).tolist()
            for _, eachDF in groupbyObject]


def plotRelativeRatios(ax, ratios, title):
    """Draw one dashed line per group across D=0, D=1, D=2 and set the panel title."""
    ax.plot(pd.DataFrame(data=ratios,columns=shapeLabels).transpose().values.tolist(),'--x')
    ax.set_title(title)


ks1 = meanRelativeRatios(go1, ksColumns)
chi1 = meanRelativeRatios(go1, chi2Columns)
plotRelativeRatios(axs[0][0], ks1, "KS, tail probability, single threshold, Relative Ratio (RE)")
plotRelativeRatios(axs[0][1], chi1, "Chi2, tail probability, single threshold, RE")

ks2 = meanRelativeRatios(go2, ksColumns)
chi2 = meanRelativeRatios(go2, chi2Columns)
plotRelativeRatios(axs[1][0], ks2, "KS, tail probability, multiple thresholds, RE")
plotRelativeRatios(axs[1][1], chi2, "Chi2, tail probability, multiple thresholds, RE")

# Same tick positions and labels on all six panels.
for i in range(2):
    for j in range(3):
        axs[i][j].set_xticks([0, 1, 2])     
        axs[i][j].set_xticklabels(['D=0', 'D=1', 'D=2'])

chi3 = meanRelativeRatios(go3, chi2Columns)
chi4 = meanRelativeRatios(go4, chi2Columns)
plotRelativeRatios(axs[0][2], chi3, "Chi2, Quantitle Estimation, single threshold, RE")
plotRelativeRatios(axs[1][2], chi4, "Chi2, Quantitle Estimation, multiple thresholds, RE")


### 5.2 Selection of shape constraints. i.e.  KS vs Chi2

In [4]:
# Ratio of the mean Chi2 bound to the mean KS bound for D = 0, 1, 2: one line per group,
# single-threshold results on the left panel and multi-threshold results on the right.
# The result columns are selected BEFORE averaging, because DataFrame.mean over the
# string column 'dataDistribution' raises on pandas >= 2.0.
fig, axs = plt.subplots(1, 2,figsize=(16,6))
chi2KsPairs = [('(0,CHI2)','(0,KS)'), ('(1,CHI2)','(1,KS)'), ('(2,CHI2)','(2,KS)')]
pairedColumns = [column for pair in chi2KsPairs for column in pair]


def chi2OverKsRatios(groupbyObject):
    """Return, for every group, mean(Chi2 bound) / mean(KS bound) for D = 0, 1, 2."""
    ratios = []
    for _, eachDF in groupbyObject:
        eachDF_mean = eachDF[pairedColumns].astype(float).mean(axis=0)
        ratios.append([eachDF_mean[chi2Column]/eachDF_mean[ksColumn]
                       for chi2Column, ksColumn in chi2KsPairs])
    return ratios


ksvschi1 = chi2OverKsRatios(go1)
ksvschi2 = chi2OverKsRatios(go2)

axs[0].plot(pd.DataFrame(data=ksvschi1,columns=["D=0","D=1","D=2"]).transpose().values.tolist(),'--x')
axs[0].set_title("Chi2vsKS, mean tail probability ratio, single threshold")
axs[1].plot(pd.DataFrame(data=ksvschi2,columns=["D=0","D=1","D=2"]).transpose().values.tolist(),'--x')
axs[1].set_title("Chi2vsKS, mean tail probability ratio, multiple thresholds")
for i in range(2):
        axs[i].set_xticks([0, 1, 2])     
        axs[i].set_xticklabels(['D=0', 'D=1', 'D=2'])

### 5.3 Selection of Threshold

In [13]:
# Regroup without the threshold so the next cell can compare thresholds inside each
# (distribution, sample size, objective level) group.
# This REBINDS go1..go4: get_group calls with four-element keys (as made by the table
# helpers) no longer match until the four-key grouping cell is run again.
go1 = cumDf1.groupby(by=['dataDistribution','dataSize','percentageLHS'])
go2 = cumDf2.groupby(by=['dataDistribution','dataSize','percentageLHS'])
go3 = cumDf3.groupby(by=['dataDistribution','dataSize','quantitleValue'])
go4 = cumDf4.groupby(by=['dataDistribution','dataSize','quantitleValue'])

In [1]:
# For every (distribution, sample size, objective level) group, tabulate the relative
# ratio (mean estimated bound / true value) per threshold.
# Changes to the earlier four copy-pasted loops:
#   * the true value is read positionally (.values[0]) instead of by index label 0,
#     which only works while label 0 is present and unique in the sub-frame;
#   * the result columns are selected BEFORE averaging, because DataFrame.mean over the
#     string column 'dataDistribution' raises on pandas >= 2.0.
ksAndChi2Columns = ['(0,KS)','(1,KS)','(2,KS)','(0,CHI2)','(1,CHI2)','(2,CHI2)']
chi2OnlyColumns = ['(0,CHI2)','(1,CHI2)','(2,CHI2)']
thresholdReports = [
    # (groupby object, threshold mode shown in the heading, level name, value columns)
    (go1, "SingleThreshold", "percentageLHS", ksAndChi2Columns),
    (go2, "MultiThresholds", "percentageLHS", ksAndChi2Columns),
    (go3, "SingleThreshold", "quantitleValue", chi2OnlyColumns),
    (go4, "MultiThresholds", "quantitleValue", chi2OnlyColumns),
]
for groupbyObject, thresholdMode, levelName, valueColumns in thresholdReports:
    for key, eachDF in groupbyObject:
        resultPerthreshold = []
        for thresholdPercentage, eachDFwThreshold in eachDF.groupby(by='thresholdPercentage'):
            groupTrueValue = eachDFwThreshold['trueValue'].values[0]
            eachDF_mean = eachDFwThreshold[valueColumns].astype(float).mean(axis=0)/groupTrueValue
            resultPerthreshold.append([thresholdPercentage]+list(eachDF_mean))
        print("{:} data source is {:}, dataSize is {:}, {:} is {:}".format(
            thresholdMode, key[0], key[1], levelName, key[2]))
        display(pd.DataFrame(resultPerthreshold,columns=['thresholdPercentage']+valueColumns))

### 5.4 Selection of objective functions

In [15]:
# Regroup without the objective level so the next cell can compare objective levels
# inside each (distribution, sample size, threshold) group.
# This REBINDS go1..go4 again; cells that expect another grouping must re-run their
# own grouping cell first.
go1 = cumDf1.groupby(by=['dataDistribution','dataSize','thresholdPercentage'])
go2 = cumDf2.groupby(by=['dataDistribution','dataSize','thresholdPercentage'])
go3 = cumDf3.groupby(by=['dataDistribution','dataSize','thresholdPercentage'])
go4 = cumDf4.groupby(by=['dataDistribution','dataSize','thresholdPercentage'])

In [2]:
for key, eachDF in go1:
    resultPercentage = []
    for percentage, eachDFwP in eachDF.groupby(by='percentageLHS'):
        eachDF_mean = eachDFwP.mean(axis=0)[['(0,KS)','(1,KS)','(2,KS)','(0,CHI2)','(1,CHI2)','(2,CHI2)']]/eachDFwP['trueValue'][0]
        resultPercentage.append([percentage]+list(eachDF_mean))
        del eachDF_mean
    print("SingleThreshold data source is {:}, dataSize is {:}, thresholdPercentage is {:}".format(key[0],key[1],key[2]))        
    display(pd.DataFrame(resultPercentage,columns=['ObjectiveLHS','(0,KS)','(1,KS)','(2,KS)','(0,CHI2)','(1,CHI2)','(2,CHI2)']))

# MultiThresholds summary over go2: for each (data source, dataSize,
# thresholdPercentage) group, tabulate the mean KS and CHI2 estimates per
# percentageLHS, expressed as a ratio to the group's true value.
estimateColumns = ['(0,KS)','(1,KS)','(2,KS)','(0,CHI2)','(1,CHI2)','(2,CHI2)']
for key, eachDF in go2:
    resultPercentage = []
    for percentage, eachDFwP in eachDF.groupby(by='percentageLHS'):
        # Select the estimate columns *before* averaging so the string column
        # dataDistribution never enters the mean, and read the true value
        # positionally: a group slice keeps the parent frame's index labels, so
        # the label lookup ['trueValue'][0] is not guaranteed to find one row.
        eachDF_mean = eachDFwP[estimateColumns].mean(axis=0)/eachDFwP['trueValue'].values[0]
        resultPercentage.append([percentage]+list(eachDF_mean))
    print("MultiThresholds data source is {:}, dataSize is {:}, thresholdPercentage is {:}".format(key[0],key[1],key[2]))
    display(pd.DataFrame(resultPercentage,columns=['ObjectiveLHS']+estimateColumns))

# SingleThreshold summary over go3: for each (data source, dataSize,
# thresholdPercentage) group, tabulate the mean CHI2 estimates per
# quantitleValue, expressed as a ratio to the group's true value.
estimateColumns = ['(0,CHI2)','(1,CHI2)','(2,CHI2)']
for key, eachDF in go3:
    resultPercentage = []
    for percentage, eachDFwP in eachDF.groupby(by='quantitleValue'):
        # Select the estimate columns *before* averaging so the string column
        # dataDistribution never enters the mean, and read the true value
        # positionally: a group slice keeps the parent frame's index labels, so
        # the label lookup ['trueValue'][0] is not guaranteed to find one row.
        eachDF_mean = eachDFwP[estimateColumns].mean(axis=0)/eachDFwP['trueValue'].values[0]
        resultPercentage.append([percentage]+list(eachDF_mean))
    print("SingleThreshold data source is {:}, dataSize is {:}, thresholdPercentage is {:}".format(key[0],key[1],key[2]))
    display(pd.DataFrame(resultPercentage,columns=['targetQuantile']+estimateColumns))

# MultiThresholds summary over go4: for each (data source, dataSize,
# thresholdPercentage) group, tabulate the mean CHI2 estimates per
# quantitleValue, expressed as a ratio to the group's true value.
estimateColumns = ['(0,CHI2)','(1,CHI2)','(2,CHI2)']
for key, eachDF in go4:
    resultPercentage = []
    for percentage, eachDFwP in eachDF.groupby(by='quantitleValue'):
        # Select the estimate columns *before* averaging so the string column
        # dataDistribution never enters the mean, and read the true value
        # positionally: a group slice keeps the parent frame's index labels, so
        # the label lookup ['trueValue'][0] is not guaranteed to find one row.
        eachDF_mean = eachDFwP[estimateColumns].mean(axis=0)/eachDFwP['trueValue'].values[0]
        resultPercentage.append([percentage]+list(eachDF_mean))
    print("MultiThresholds data source is {:}, dataSize is {:}, thresholdPercentage is {:}".format(key[0],key[1],key[2]))
    display(pd.DataFrame(resultPercentage,columns=['targetQuantile']+estimateColumns))

### 5.5 Selection of data source

In [17]:
# Section 5.5 grouping: hold the sample size, threshold percentage and objective
# column fixed, so the following cell can compare data sources within each group.
# cumDf1/cumDf2 are keyed by percentageLHS, cumDf3/cumDf4 by quantitleValue.
go1, go2 = (
    resultFrame.groupby(by=['dataSize','thresholdPercentage','percentageLHS'])
    for resultFrame in (cumDf1, cumDf2)
)
go3, go4 = (
    resultFrame.groupby(by=['dataSize','thresholdPercentage','quantitleValue'])
    for resultFrame in (cumDf3, cumDf4)
)

In [3]:
# SingleThreshold summary over go1: for each (dataSize, thresholdPercentage,
# percentageLHS) group, tabulate the mean KS and CHI2 estimates per data
# distribution, expressed as a ratio to that distribution's true value.
estimateColumns = ['(0,KS)','(1,KS)','(2,KS)','(0,CHI2)','(1,CHI2)','(2,CHI2)']
for key, eachDF in go1:
    result = []
    for dataDistribution, eachDFwP in eachDF.groupby(by='dataDistribution'):
        # Select the estimate columns *before* averaging so the string column
        # dataDistribution never enters the mean.
        eachDF_mean = eachDFwP[estimateColumns].mean(axis=0)/eachDFwP['trueValue'].values[0]
        result.append([dataDistribution]+list(eachDF_mean))
    # key[0] is dataSize here (go1 is grouped by dataSize first), so label it
    # as such rather than as the data source.
    print("SingleThreshold dataSize is {:}, thresholdPercentage is {:}, percentageLHS is {:}".format(key[0],key[1],key[2]))
    display(pd.DataFrame(result,columns=['dataDistribution']+estimateColumns))

# MultiThresholds summary over go2: for each (dataSize, thresholdPercentage,
# percentageLHS) group, tabulate the mean KS and CHI2 estimates per data
# distribution, expressed as a ratio to that distribution's true value.
estimateColumns = ['(0,KS)','(1,KS)','(2,KS)','(0,CHI2)','(1,CHI2)','(2,CHI2)']
for key, eachDF in go2:
    result = []
    for dataDistribution, eachDFwP in eachDF.groupby(by='dataDistribution'):
        # Select the estimate columns *before* averaging so the string column
        # dataDistribution never enters the mean.
        eachDF_mean = eachDFwP[estimateColumns].mean(axis=0)/eachDFwP['trueValue'].values[0]
        result.append([dataDistribution]+list(eachDF_mean))
    # key[0] is dataSize here (go2 is grouped by dataSize first), so label it
    # as such rather than as the data source.
    print("MultiThresholds dataSize is {:}, thresholdPercentage is {:}, percentageLHS is {:}".format(key[0],key[1],key[2]))
    display(pd.DataFrame(result,columns=['dataDistribution']+estimateColumns))
# SingleThreshold summary over go3: for each (dataSize, thresholdPercentage,
# quantitleValue) group, tabulate the mean CHI2 estimates per data
# distribution, expressed as a ratio to that distribution's true value.
estimateColumns = ['(0,CHI2)','(1,CHI2)','(2,CHI2)']
for key, eachDF in go3:
    result = []
    for dataDistribution, eachDFwP in eachDF.groupby(by='dataDistribution'):
        # Select the estimate columns *before* averaging so the string column
        # dataDistribution never enters the mean.
        eachDF_mean = eachDFwP[estimateColumns].mean(axis=0)/eachDFwP['trueValue'].values[0]
        result.append([dataDistribution]+list(eachDF_mean))
    # key[0] is dataSize here (go3 is grouped by dataSize first), so label it
    # as such rather than as the data source.
    print("SingleThreshold dataSize is {:}, thresholdPercentage is {:}, quantitleValue is {:}".format(key[0],key[1],key[2]))
    display(pd.DataFrame(result,columns=['dataDistribution']+estimateColumns))
# MultiThresholds summary over go4: for each (dataSize, thresholdPercentage,
# quantitleValue) group, tabulate the mean CHI2 estimates per data
# distribution, expressed as a ratio to that distribution's true value.
estimateColumns = ['(0,CHI2)','(1,CHI2)','(2,CHI2)']
for key, eachDF in go4:
    result = []
    for dataDistribution, eachDFwP in eachDF.groupby(by='dataDistribution'):
        # Select the estimate columns *before* averaging so the string column
        # dataDistribution never enters the mean.
        eachDF_mean = eachDFwP[estimateColumns].mean(axis=0)/eachDFwP['trueValue'].values[0]
        result.append([dataDistribution]+list(eachDF_mean))
    # key[0] is dataSize here (go4 is grouped by dataSize first), so label it
    # as such rather than as the data source.
    print("MultiThresholds dataSize is {:}, thresholdPercentage is {:}, quantitleValue is {:}".format(key[0],key[1],key[2]))
    display(pd.DataFrame(result,columns=['dataDistribution']+estimateColumns))