# Wilcoxon Signed Rank Test

In [27]:
import json
import pandas as pd
import numpy as np
import scipy
from scipy.stats import wilcoxon


### Try Wilcoxon

In [28]:
# Sanity check of scipy's Wilcoxon signed-rank test on a small sample of
# paired differences. All 13 differences are positive, and the default
# (two-sided) alternative is used.
# The earlier draft assigned `d` and `(w, p)` twice in a row; only the last
# assignment of each ever took effect, so the overwritten ones were removed.
d = [6, 8, 14, 16, 23, 24, 28, 29, 41, 49, 56, 60, 75]
w, p = wilcoxon(d)
p

0.000244140625

## Analyze

In [29]:
# Read the RQ2 result index from disk. Later cells look it up as
# data[asr][dataset][tool] and hand each leaf value to pd.read_csv.
with open('result/RQ2.json') as f:
    data = json.loads(f.read())


In [30]:
# Full experiment configuration (currently disabled in favour of the smaller
# subset defined right below):
# asrs = ["quartznet", "hubert", "wav2vec-base"]
# datasets = ["YBAA", "ZHAA", "ASI", "TNI", "NCC",
#             "TXHC", "EBVS", "ERMS", "YDCK", "YKWK", "THV", "TLV"]
# tools = ["random", "triphone_rich", "pure_diversity", "icassp_without_diversity_enhancing_real_mix", "icassp_real_mix",
#          "asrevolve_error_model_real", "word_error_real_mix/no_word_enhance", "word_error_real_mix/word_enhance"]

# Subset of models / datasets / tools that is actually loaded and analysed.
asrs = ["quartznet"]
datasets = ["YBAA", "ZHAA", "ASI", "TNI"]
tools = ["random", "triphone_rich", "pure_diversity"]

# Build res[asr][dataset][tool] -> DataFrame, reading one CSV per combination
# from the location recorded in the `data` index.
res = {}
for asr in asrs:
    per_asr = res[asr] = {}
    for dataset in datasets:
        per_dataset = per_asr[dataset] = {}
        for tool in tools:
            per_dataset[tool] = pd.read_csv(data[asr][dataset][tool])

# res


In [34]:
# Abbreviated labels for each tool identifier; used as the row/column labels
# of the pairwise comparison tables.
short_names = dict([
    ("random", "RND"),
    ("triphone_rich", "TR"),
    ("pure_diversity", "PD"),
    ("icassp_without_diversity_enhancing_real_mix", "IC-WDE"),
    ("icassp_real_mix", "IC"),
    ("asrevolve_error_model_real", "ASR-EV"),
    ("word_error_real_mix/no_word_enhance", "NWE"),
    ("word_error_real_mix/word_enhance", "WE"),
])

### 1. Analyze the experiment grouped by \<ASR Model\>

In [35]:
def get_wer_result(df):
    """Flatten the three per-seed WER columns of ``df`` into a single list.

    The values are concatenated in the order ``WER_Seed1``, ``WER_Seed2``,
    ``WER_Seed3``.

    Args:
        df: Object indexable by the three column names above, each entry being
            an iterable with a length (e.g. a pandas DataFrame).

    Returns:
        A list holding every value of the three columns.

    Raises:
        AssertionError: If the combined length is not exactly three times the
            length of each individual column.
    """
    seed_columns = ("WER_Seed1", "WER_Seed2", "WER_Seed3")
    arr = [wer for column in seed_columns for wer in df[column]]
    for column in seed_columns:
        assert len(arr) == 3 * len(df[column])
    return arr

# Pairwise Wilcoxon signed-rank comparison of the tools, one 0/1 matrix per
# ASR model. A cell is 1 when the two tools' WER samples differ significantly
# (two-sided test, p below SIGNIFICANCE_LEVEL) and 0 otherwise; the diagonal
# is fixed to 0 because a tool is never tested against itself.
#
# NOTE(review): the signed-rank test pairs the two samples positionally, so
# this assumes every tool's CSV lists the same items in the same order for a
# given dataset — TODO confirm.
# NOTE(review): no multiple-comparison correction is applied across the pairs.
SIGNIFICANCE_LEVEL = 0.05

wer_p_value = {}
for asr in asrs:
    # Pool each tool's WER values over all datasets once, instead of
    # rebuilding the same lists for every pair of tools.
    pooled_wer = {}
    for tool in tools:
        pooled_wer[tool] = []
        for dataset in datasets:
            pooled_wer[tool] += get_wer_result(res[asr][dataset][tool])

    values = {}
    for tool_1 in tools:
        values[short_names[tool_1]] = {}
        for tool_2 in tools:
            if tool_1 == tool_2:
                values[short_names[tool_1]][short_names[tool_2]] = 0
            else:
                w, p = scipy.stats.wilcoxon(pooled_wer[tool_1], pooled_wer[tool_2])
                # Reject the null hypothesis at a significance level of 5%.
                values[short_names[tool_1]][short_names[tool_2]] = 1 if p < SIGNIFICANCE_LEVEL else 0

    # Outer keys of `values` become the columns, inner keys the row index.
    df = pd.DataFrame(data=values)
    # Keep the matrix so it can be inspected later; previously only an empty
    # DataFrame was stored here and the computed table was just printed.
    wer_p_value[asr] = df
    print(df)
            


        RND  TR  PD  IC-WDE  IC  ASR-EV  NWE  WE
RND       0   0   1       0   0       0    0   1
TR        0   0   0       1   0       0    1   1
PD        1   0   0       1   1       1    1   1
IC-WDE    0   1   1       0   0       0    0   0
IC        0   0   1       0   0       0    0   1
ASR-EV    0   0   1       0   0       0    0   1
NWE       0   1   1       0   0       0    0   0
WE        1   1   1       0   1       1    0   0
        RND  TR  PD  IC-WDE  IC  ASR-EV  NWE  WE
RND       0   0   0       0   1       0    1   0
TR        0   0   0       0   1       0    0   0
PD        0   0   0       0   1       0    1   0
IC-WDE    0   0   0       0   1       0    1   0
IC        1   1   1       1   0       1    1   1
ASR-EV    0   0   0       0   1       0    1   0
NWE       1   0   1       1   1       1    0   1
WE        0   0   0       0   1       0    1   0
        RND  TR  PD  IC-WDE  IC  ASR-EV  NWE  WE
RND       0   1   1       1   1       0    0   1
TR        1   0   0 