# Correlation Analysis

In [1]:
import os
import json
import pandas as pd
import numpy as np
import scipy
from scipy.stats import wilcoxon
from scipy.spatial import distance
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# Display floats in pandas output with a thousands separator and three decimals.
pd.options.display.float_format = '{:,.3f}'.format


In [2]:
# Error-model tool identifier -> short label used as a key in the result tables.
shorten_error_model_name = {
    "random": "RND",
    "error_model_triphone_rich": "TR",
    "error_model_pure_diversity": "PD",
    "error_model_without_diversity_enhancing": "IC-WDE",
    "error_model": "IC",
    "asrevolve_error_model_real": "ASR-EV",
    "word_error_predictor_real/no_word_enhance": "NWE",
    "word_error_predictor_real/word_enhance": "WE",
}

# Fine-tuned-model tool identifier -> the same short labels as above, so that
# both naming schemes resolve to one common key per tool.
shorten_finetuned_model_name = {
    "random": "RND",
    "triphone_rich": "TR",
    "pure_diversity": "PD",
    "icassp_without_diversity_enhancing_real_mix": "IC-WDE",
    "icassp_real_mix": "IC",
    "asrevolve_error_model_real": "ASR-EV",
    "word_error_real_mix/no_word_enhance": "NWE",
    "word_error_real_mix/word_enhance": "WE",
}

def shorten_em_name(tools):
    """Translate error-model tool identifiers into their short labels.

    Args:
        tools: iterable of keys of ``shorten_error_model_name``.

    Returns:
        A list of short labels, in the same order as ``tools``.

    Raises:
        KeyError: if an identifier is not a key of ``shorten_error_model_name``.
    """
    short_names = []
    for full_name in tools:
        short_names.append(shorten_error_model_name[full_name])
    return short_names

def shorten_ft_name(tools):
    """Translate fine-tuned-model tool identifiers into their short labels.

    Args:
        tools: iterable of keys of ``shorten_finetuned_model_name``.

    Returns:
        A list of short labels, in the same order as ``tools``.

    Raises:
        KeyError: if an identifier is not a key of ``shorten_finetuned_model_name``.
    """
    short_names = []
    for full_name in tools:
        short_names.append(shorten_finetuned_model_name[full_name])
    return short_names

# Short tool labels, in the order used throughout the analysis.
tool_short_names = [
    "RND",
    "TR",
    "PD",
    "IC-WDE",
    "IC",
    "ASR-EV",
    "NWE",
    "WE",
]

# Fine-tuned-model tool identifiers, listed in the same order as the labels above.
finetuned_model_tool_names = [
    "random",
    "triphone_rich",
    "pure_diversity",
    "icassp_without_diversity_enhancing_real_mix",
    "icassp_real_mix",
    "asrevolve_error_model_real",
    "word_error_real_mix/no_word_enhance",
    "word_error_real_mix/word_enhance",
]

### The relative improvement of WER after fine-tuning

In [3]:
# The RQ2 manifest is indexed as data[asr][dataset][tool]; each leaf is handed
# to pd.read_csv below.
with open('result/RQ2.json', 'r') as manifest_file:
    data = json.load(manifest_file)

asrs = ["quartznet", "hubert", "wav2vec-base"]
datasets = [
    "YBAA", "ZHAA", "ASI", "TNI", "NCC", "TXHC",
    "EBVS", "ERMS", "YDCK", "YKWK", "THV", "TLV",
]
tools = [
    "random",
    "triphone_rich",
    "pure_diversity",
    "icassp_without_diversity_enhancing_real_mix",
    "icassp_real_mix",
    "asrevolve_error_model_real",
    "word_error_real_mix/no_word_enhance",
    "word_error_real_mix/word_enhance",
]

# One DataFrame per (asr, dataset, tool), keyed by the tool's short label.
finetuned_model_performance_on_test_set = {
    asr: {
        dataset: {
            shorten_finetuned_model_name[tool]: pd.read_csv(data[asr][dataset][tool])
            for tool in tools
        }
        for dataset in datasets
    }
    for asr in asrs
}

# finetuned_model_performance_on_test_set


In [4]:
with open('result/original.json', 'r') as baseline_file:
    original_data = json.load(baseline_file)

# Keep only the test-set WER of the original model for every (asr, dataset).
original_model_performance_on_test_set = {
    asr: {
        dataset: original_data[asr][dataset]["test"]["wer"]
        for dataset in datasets
    }
    for asr in asrs
}

# original_model_performance_on_test_set



In [5]:
# Relative WER improvement of every fine-tuned model over the original model:
#     (original_wer - finetuned_wer) / original_wer
# stored as relative_improvement_of_finetuned_model[asr][dataset][tool][metric],
# one list per metric column.
relative_improvement_of_finetuned_model = {}
for asr in asrs:
    relative_improvement_of_finetuned_model[asr] = {}
    for dataset in datasets:
        baseline_wer = original_model_performance_on_test_set[asr][dataset]
        # Iterate over the short labels that were actually loaded instead of
        # re-deriving them from the global `tools`: later cells rebind `tools`
        # to the error-model identifiers, which made re-running this cell fail
        # with a KeyError inside shorten_ft_name().
        finetuned_by_tool = finetuned_model_performance_on_test_set[asr][dataset]
        relative_improvement_of_finetuned_model[asr][dataset] = {
            tool: {
                metric: ((baseline_wer - finetuned_wer[metric]) / baseline_wer).to_list()
                for metric in ["WER_Seed1", "WER_Seed2", "WER_Seed3", "WER_Avg"]
            }
            for tool, finetuned_wer in finetuned_by_tool.items()
        }

# relative_improvement_of_finetuned_model


In [6]:
def save_to_file(fpath, printed, significant_count, non_significant_count):
    """Write a significance summary followed by the detailed report lines.

    Args:
        fpath: destination path; missing parent directories are created.
        printed: iterable of strings, each written on its own line after the
            summary header.
        significant_count: number of tests counted as significant.
        non_significant_count: number of tests counted as not significant.

    The header contains both counts, their total and the percentage of
    significant tests. When the total is zero the percentage is written as
    ``N/A`` instead of raising ``ZeroDivisionError``.
    """
    # os.path.dirname() returns "" for a bare filename and os.makedirs("")
    # raises, so only create the directory when there is one.
    parent_dir = os.path.dirname(fpath)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    total = significant_count + non_significant_count
    if total:
        percentage = f"{100*significant_count / total:.2f}%"
    else:
        percentage = "N/A"

    with open(fpath, 'w') as f:
        f.write(f"Significant: \t\t{significant_count}\n")
        f.write(f"Non-significant: \t{non_significant_count}\n")
        f.write(f"Total: \t\t\t\t{total}\n")
        f.write(f"Percentage: \t\t{percentage}\n\n")
        for line in printed:
            f.write(line)
            f.write('\n')


# Threshold compared against p-values: a test is reported as SIGNIFICANT when
# p_value < CONFIDENCE_LEVEL.
# NOTE(review): 0.05 is a significance level (alpha) rather than a confidence
# level; the name is kept because other cells reference it.
CONFIDENCE_LEVEL = 0.05


### 1. Correlation between the original model's WER on the selected samples and the relative improvement of WER after fine-tuning

In [7]:
with open('result/RQ1.json', 'r') as f:
    rq1_data = json.load(f)

asrs = ["quartznet", "hubert", "wav2vec-base"]
datasets = ["YBAA", "ZHAA", "ASI", "TNI", "NCC",
            "TXHC", "EBVS", "ERMS", "YDCK", "YKWK", "THV", "TLV"]
# From here on `tools` holds the error-model identifiers (keys of
# shorten_error_model_name), not the fine-tuned-model identifiers.
tools = ["random", "error_model_triphone_rich", "error_model_pure_diversity", "error_model_without_diversity_enhancing", "error_model",
         "asrevolve_error_model_real", "word_error_predictor_real/no_word_enhance", "word_error_predictor_real/word_enhance"]

# original_model_performance_on_selected_samples[asr][dataset][short_tool][metric]
# is the metric column of the corresponding RQ1 CSV, as a plain list.
original_model_performance_on_selected_samples = {}
for asr in asrs:
    original_model_performance_on_selected_samples[asr] = {}
    for dataset in datasets:
        original_model_performance_on_selected_samples[asr][dataset] = {}
        for tool in tools:
            # Parse each CSV once and slice all metric columns from the same
            # frame, instead of re-reading the file for every metric.
            selected_samples_wer = pd.read_csv(rq1_data[asr][dataset][tool])
            original_model_performance_on_selected_samples[asr][dataset][shorten_error_model_name[tool]] = {
                metric: selected_samples_wer[metric].tolist()
                for metric in ["WER_Seed1", "WER_Seed2", "WER_Seed3", "WER_Avg"]
            }

# original_model_performance_on_selected_samples


In [8]:
# Pool every (asr, dataset, tool, seed) series into two flat, index-aligned
# lists and compute a single Spearman correlation over all of them.
arr_1 = []  # original model's WER on the selected samples
arr_2 = []  # relative WER improvement after fine-tuning

for asr in asrs:
    for dataset in datasets:
        selected_wer = original_model_performance_on_selected_samples[asr][dataset]
        improvement = relative_improvement_of_finetuned_model[asr][dataset]
        for tool in tool_short_names:
            for metric in ("WER_Seed1", "WER_Seed2", "WER_Seed3"):
                arr_1.extend(selected_wer[tool][metric])
                arr_2.extend(improvement[tool][metric])

assert len(arr_1) == len(arr_2)
scipy.stats.spearmanr(arr_1, arr_2)

            


SpearmanrResult(correlation=0.021986090955998173, pvalue=0.19628736520635923)

In [9]:
# Ad-hoc Spearman correlation between two hand-entered series of six values.
# NOTE(review): the provenance of these numbers is not recorded in the
# notebook - TODO document what each series represents and where it came from.
arr_1 = [-3.49, 10.18, 8.98, 6.62, 27.33, 21.52]
arr_2 = [4.0375, 4.3025, 4.2475, 4.2, 4.3275, 4.4175]
scipy.stats.spearmanr(arr_1, arr_2)


SpearmanrResult(correlation=0.942857142857143, pvalue=0.004804664723032055)

In [10]:
# Spearman correlation with one point per tool: each point is the mean, over
# all three seeds, of the tool's values for the current (dataset, asr).
# NOTE(review): the two trailing `break`s stop after the first (dataset, asr)
# pair, so only that single combination is reported.
for dataset in datasets:
    for asr in asrs:
        selected_wer = original_model_performance_on_selected_samples[asr][dataset]
        improvement = relative_improvement_of_finetuned_model[asr][dataset]

        arr_1 = []
        arr_2 = []
        # Subset of tool_short_names: RND, TR and WE are left out here.
        for tool in ["PD", "IC-WDE", "IC", "ASR-EV", "NWE"]:
            curr_arr_1 = [
                value
                for metric in ("WER_Seed1", "WER_Seed2", "WER_Seed3")
                for value in selected_wer[tool][metric]
            ]
            curr_arr_2 = [
                value
                for metric in ("WER_Seed1", "WER_Seed2", "WER_Seed3")
                for value in improvement[tool][metric]
            ]
            arr_1.append(np.mean(curr_arr_1))
            arr_2.append(np.mean(curr_arr_2))

        assert len(arr_1) == len(arr_2)

        correlation_coefficient, p_value = scipy.stats.spearmanr(arr_1, arr_2)

        print()
        print("Dataset: ", dataset)
        print("ASR: ", asr)
        print("Spearman's corr coeff:\t ", correlation_coefficient)
        print("p-value:\t\t ", p_value)

        break
    break



Dataset:  YBAA
ASR:  quartznet
Spearman's corr coeff:	  0.9999999999999999
p-value:		  1.4042654220543672e-24


In [11]:
# Spearman correlation with one point per (tool, seed): each point is the mean
# of that seed's values for the current (dataset, asr).
# NOTE(review): the two trailing `break`s stop after the first (dataset, asr)
# pair, so only that single combination is reported.
for dataset in datasets:
    for asr in asrs:
        selected_wer = original_model_performance_on_selected_samples[asr][dataset]
        improvement = relative_improvement_of_finetuned_model[asr][dataset]

        arr_1 = []
        arr_2 = []
        for tool in tool_short_names:
            for metric in ("WER_Seed1", "WER_Seed2", "WER_Seed3"):
                arr_1.append(np.mean(selected_wer[tool][metric]))
                arr_2.append(np.mean(improvement[tool][metric]))

        assert len(arr_1) == len(arr_2)

        correlation_coefficient, p_value = scipy.stats.spearmanr(arr_1, arr_2)

        print()
        print("Dataset: ", dataset)
        print("ASR: ", asr)
        print("Spearman's corr coeff:\t ", correlation_coefficient)
        print("p-value:\t\t ", p_value)

        break
    break



Dataset:  YBAA
ASR:  quartznet
Spearman's corr coeff:	  0.38869565217391305
p-value:		  0.060498404649825734


### 3. Correlation between the distance from the triphone-rich distribution to the ideal distribution on the selected samples and the relative improvement of WER after fine-tuning

In [12]:
# Indexed below as triphone_rich_data[asr][dataset][seed], with seeds "1".."3".
with open('result/triphone_rich.json', 'r') as triphone_file:
    triphone_rich_data = json.load(triphone_file)


In [13]:
## without any grouping mechanism

# Pool the triphone-rich tool's values over every (asr, dataset, seed) and
# correlate them with the matching relative WER improvements.
arr_1 = []  # values from result/triphone_rich.json (presumably the distances named in the section title)
arr_2 = []  # relative WER improvement after fine-tuning

for asr in asrs:
    for dataset in datasets:
        tool = "error_model_triphone_rich"
        per_seed_values = triphone_rich_data[asr][dataset]
        per_metric_improvement = relative_improvement_of_finetuned_model[asr][dataset][shorten_error_model_name[tool]]
        for seed in ("1", "2", "3"):
            arr_1.extend(per_seed_values[seed])
        for metric in ("WER_Seed1", "WER_Seed2", "WER_Seed3"):
            arr_2.extend(per_metric_improvement[metric])

assert len(arr_1) == len(arr_2)

correlation_coefficient, p_value = scipy.stats.spearmanr(arr_1, arr_2)
print()
print("Spearman's corr coeff:\t ", correlation_coefficient)
print("p-value:\t\t ", p_value)



Spearman's corr coeff:	  0.3096296252398027
p-value:		  4.728069257623937e-11


In [14]:
## grouping by dataset

# Same correlation as the pooled variant, computed separately per dataset
# (values from all ASRs and seeds are pooled within each dataset).
for dataset in datasets:
    arr_1 = []
    arr_2 = []
    for asr in asrs:
        tool = "error_model_triphone_rich"
        per_seed_values = triphone_rich_data[asr][dataset]
        per_metric_improvement = relative_improvement_of_finetuned_model[asr][dataset][shorten_error_model_name[tool]]
        for seed in ("1", "2", "3"):
            arr_1.extend(per_seed_values[seed])
        for metric in ("WER_Seed1", "WER_Seed2", "WER_Seed3"):
            arr_2.extend(per_metric_improvement[metric])

    assert len(arr_1) == len(arr_2)

    correlation_coefficient, p_value = scipy.stats.spearmanr(arr_1, arr_2)

    print()
    print("Dataset: ", dataset)
    print("Spearman's corr coeff:\t ", correlation_coefficient)
    print("p-value:\t\t ", p_value)



Dataset:  YBAA
Spearman's corr coeff:	  0.29872252445264824
p-value:		  0.07677066890825572

Dataset:  ZHAA
Spearman's corr coeff:	  0.39171008070249985
p-value:		  0.018147182140365984

Dataset:  ASI
Spearman's corr coeff:	  0.36745807989066254
p-value:		  0.02747414148369915

Dataset:  TNI
Spearman's corr coeff:	  -0.032172703215137745
p-value:		  0.8522317119652019

Dataset:  NCC
Spearman's corr coeff:	  0.6248733893766553
p-value:		  4.623396426138484e-05

Dataset:  TXHC
Spearman's corr coeff:	  0.427992415964119
p-value:		  0.009215235980734586

Dataset:  EBVS
Spearman's corr coeff:	  0.3130928912099224
p-value:		  0.06298250889255443

Dataset:  ERMS
Spearman's corr coeff:	  0.21620999260824883
p-value:		  0.20532655747387657

Dataset:  YDCK
Spearman's corr coeff:	  0.4356320174220398
p-value:		  0.007915519401666004

Dataset:  YKWK
Spearman's corr coeff:	  0.15354817534862975
p-value:		  0.37127176401256157

Dataset:  THV
Spearman's corr coeff:	  0.5276364267396184
p-value:		  0

### 5. Correlation between the number of test cases on the selected samples and the relative improvement of WER after fine-tuning

In [15]:
# Indexed below as number_of_test_cases[asr][dataset][tool][seed].
with open('result/number_of_test_cases.json', 'r') as counts_file:
    number_of_test_cases = json.load(counts_file)

# Error-model tool identifiers, in the same order as `tool_short_names`, so the
# two lists can be zipped together.
tools = [
    "random",
    "error_model_triphone_rich",
    "error_model_pure_diversity",
    "error_model_without_diversity_enhancing",
    "error_model",
    "asrevolve_error_model_real",
    "word_error_predictor_real/no_word_enhance",
    "word_error_predictor_real/word_enhance",
]


In [16]:
## without any grouping mechanism

printed = []
significant_count = 0
non_significant_count = 0

# Pool every (dataset, asr, tool, seed) series into two index-aligned lists.
arr_1 = [
    value
    for dataset in datasets
    for asr in asrs
    for _, tool in zip(tool_short_names, tools)
    for seed in ("1", "2", "3")
    for value in number_of_test_cases[asr][dataset][tool][seed]
]
arr_2 = [
    value
    for dataset in datasets
    for asr in asrs
    for tool_short_name, _ in zip(tool_short_names, tools)
    for metric in ("WER_Seed1", "WER_Seed2", "WER_Seed3")
    for value in relative_improvement_of_finetuned_model[asr][dataset][tool_short_name][metric]
]

assert len(arr_1) == len(arr_2)
correlation_coefficient, p_value = scipy.stats.spearmanr(arr_1, arr_2)

is_significant = p_value < CONFIDENCE_LEVEL
if is_significant:
    significant_count += 1
else:
    non_significant_count += 1

printed += [
    f"Corr coef:\t{correlation_coefficient}",
    f"p-value:\t{p_value}",
    "SIGNIFICANT" if is_significant else "NO",
]

fpath = "result/analyze/5/number_of_test_cases_correlation_without_any_grouping.txt"
save_to_file(fpath, printed, significant_count, non_significant_count)


In [17]:
## group by dataset

printed = []
significant_count = 0
non_significant_count = 0

# One Spearman test per dataset, pooling all ASRs, tools and seeds.
for dataset in datasets:
    arr_1 = []
    arr_2 = []
    for asr in asrs:
        counts_by_tool = number_of_test_cases[asr][dataset]
        improvement_by_tool = relative_improvement_of_finetuned_model[asr][dataset]
        for tool_short_name, tool in zip(tool_short_names, tools):
            for seed in ("1", "2", "3"):
                arr_1.extend(counts_by_tool[tool][seed])
            for metric in ("WER_Seed1", "WER_Seed2", "WER_Seed3"):
                arr_2.extend(improvement_by_tool[tool_short_name][metric])

    assert len(arr_1) == len(arr_2)
    correlation_coefficient, p_value = scipy.stats.spearmanr(arr_1, arr_2)

    is_significant = p_value < CONFIDENCE_LEVEL
    if is_significant:
        significant_count += 1
    else:
        non_significant_count += 1

    printed += [
        f"Dataset: {dataset}",
        f"Corr coef:\t{correlation_coefficient}",
        f"p-value:\t{p_value}",
        "SIGNIFICANT" if is_significant else "NO",
        "",
    ]


fpath = "result/analyze/5/number_of_test_cases_correlation_group_by_dataset.txt"
save_to_file(fpath, printed, significant_count, non_significant_count)


In [18]:
## group by dataset and ASR

printed = []
significant_count = 0
non_significant_count = 0

# One Spearman test per (dataset, asr), pooling all tools and seeds.
for dataset in datasets:
    for asr in asrs:
        counts_by_tool = number_of_test_cases[asr][dataset]
        improvement_by_tool = relative_improvement_of_finetuned_model[asr][dataset]

        arr_1 = []
        arr_2 = []
        for tool_short_name, tool in zip(tool_short_names, tools):
            for seed in ("1", "2", "3"):
                arr_1.extend(counts_by_tool[tool][seed])
            for metric in ("WER_Seed1", "WER_Seed2", "WER_Seed3"):
                arr_2.extend(improvement_by_tool[tool_short_name][metric])

        assert len(arr_1) == len(arr_2)
        correlation_coefficient, p_value = scipy.stats.spearmanr(arr_1, arr_2)

        is_significant = p_value < CONFIDENCE_LEVEL
        if is_significant:
            significant_count += 1
        else:
            non_significant_count += 1

        printed += [
            f"Dataset: {dataset}",
            f"ASR: {asr}",
            f"Corr coef:\t{correlation_coefficient}",
            f"p-value:\t{p_value}",
            "SIGNIFICANT" if is_significant else "NO",
            "",
        ]


fpath = "result/analyze/5/number_of_test_cases_correlation_group_by_dataset_and_asr.txt"
save_to_file(fpath, printed, significant_count, non_significant_count)


In [19]:
## group by dataset and ASR then averaging budget

printed = []
significant_count = 0
non_significant_count = 0

# One Spearman test per (dataset, asr); every per-seed list is first collapsed
# to its mean, giving one point per (tool, seed).
for dataset in datasets:
    for asr in asrs:
        counts_by_tool = number_of_test_cases[asr][dataset]
        improvement_by_tool = relative_improvement_of_finetuned_model[asr][dataset]

        arr_1 = []
        arr_2 = []
        for tool_short_name, tool in zip(tool_short_names, tools):
            arr_1.extend(
                np.mean(counts_by_tool[tool][seed])
                for seed in ("1", "2", "3")
            )
            arr_2.extend(
                np.mean(improvement_by_tool[tool_short_name][metric])
                for metric in ("WER_Seed1", "WER_Seed2", "WER_Seed3")
            )

        assert len(arr_1) == len(arr_2)
        correlation_coefficient, p_value = scipy.stats.spearmanr(arr_1, arr_2)

        is_significant = p_value < CONFIDENCE_LEVEL
        if is_significant:
            significant_count += 1
        else:
            non_significant_count += 1

        printed += [
            f"Dataset: {dataset}",
            f"ASR: {asr}",
            f"Corr coef:\t{correlation_coefficient}",
            f"p-value:\t{p_value}",
            "SIGNIFICANT" if is_significant else "NO",
            "",
        ]


fpath = "result/analyze/5/number_of_test_cases_correlation_group_by_dataset_and_asr_then_averaging_budget.txt"
save_to_file(fpath, printed, significant_count, non_significant_count)


In [20]:
## group by dataset and ASR and budget

printed = []
significant_count = 0
non_significant_count = 0

# Budget labels, paired positionally with the entries of every per-seed list.
# zip() stops at the shorter input, so entries beyond the fourth are ignored.
budgets = [100, 200, 300, 400]

# One Spearman test per (dataset, asr, budget), with one point per (tool, seed).
for dataset in datasets:
    for asr in asrs:
        counts_by_tool = number_of_test_cases[asr][dataset]
        improvement_by_tool = relative_improvement_of_finetuned_model[asr][dataset]

        # budget -> values collected across all tools and seeds
        temp_1 = {}
        temp_2 = {}

        for tool_short_name, tool in zip(tool_short_names, tools):
            for seed in ["1", "2", "3"]:
                for budget, val in zip(budgets, counts_by_tool[tool][seed]):
                    temp_1.setdefault(budget, []).append(val)

            for metric in ["WER_Seed1", "WER_Seed2", "WER_Seed3"]:
                for budget, val in zip(budgets, improvement_by_tool[tool_short_name][metric]):
                    temp_2.setdefault(budget, []).append(val)

        for budget in budgets:
            arr_1 = temp_1[budget]
            arr_2 = temp_2[budget]

            assert len(arr_1) == len(arr_2)
            correlation_coefficient, p_value = scipy.stats.spearmanr(arr_1, arr_2)

            printed.append(f"Dataset: {dataset}")
            printed.append(f"ASR: {asr}")
            # Label the record with its budget; without it the four records of
            # a (dataset, ASR) pair are indistinguishable in the report.
            printed.append(f"Budget: {budget}")
            printed.append(f"Corr coef:\t{correlation_coefficient}")
            printed.append(f"p-value:\t{p_value}")
            if p_value < CONFIDENCE_LEVEL:
                printed.append("SIGNIFICANT")
                significant_count += 1
            else:
                printed.append("NO")
                non_significant_count += 1
            printed.append("")


fpath = "result/analyze/5/number_of_test_cases_correlation_group_by_dataset_and_asr_and_budget.txt"
save_to_file(fpath, printed, significant_count, non_significant_count)
