# Correlation Analysis

In [31]:
import os
import json
import pandas as pd
import numpy as np
import scipy
from scipy.stats import wilcoxon
from scipy.spatial import distance
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

pd.options.display.float_format = '{:,.3f}'.format


In [32]:
# Short display labels for the eight tools, in the order used for reporting.
tool_short_names = ["RND", "TR", "PD", "IC-WDE", "IC", "ASR-EV", "NWE", "WE"]

# Tool identifiers used for the fine-tuned models, aligned index-by-index
# with `tool_short_names`.
finetuned_model_tool_names = [
    "random",
    "triphone_rich",
    "pure_diversity",
    "icassp_without_diversity_enhancing_real_mix",
    "icassp_real_mix",
    "asrevolve_error_model_real",
    "word_error_real_mix/no_word_enhance",
    "word_error_real_mix/word_enhance",
]

# Error-model tool identifier -> short label.
shorten_error_model_name = dict([
    ("random", "RND"),
    ("error_model_triphone_rich", "TR"),
    ("error_model_pure_diversity", "PD"),
    ("error_model_without_diversity_enhancing", "IC-WDE"),
    ("error_model", "IC"),
    ("asrevolve_error_model_real", "ASR-EV"),
    ("word_error_predictor_real/no_word_enhance", "NWE"),
    ("word_error_predictor_real/word_enhance", "WE"),
])

# Fine-tuned-model tool identifier -> short label.
shorten_finetuned_model_name = dict(zip(finetuned_model_tool_names, tool_short_names))


def shorten_em_name(tools):
    """Return the short labels for a sequence of error-model tool identifiers.

    Raises KeyError if an identifier is not a key of `shorten_error_model_name`.
    """
    return list(map(shorten_error_model_name.__getitem__, tools))


def shorten_ft_name(tools):
    """Return the short labels for a sequence of fine-tuned-model tool identifiers.

    Raises KeyError if an identifier is not a key of `shorten_finetuned_model_name`.
    """
    return list(map(shorten_finetuned_model_name.__getitem__, tools))





### The relative improvement of WER after fine-tuning

In [33]:
# Load the RQ2 index: asr -> dataset -> tool -> location of a CSV file.
with open('result/RQ2.json', 'r') as f:
    data = json.load(f)

asrs = ["quartznet", "hubert", "wav2vec-base"]
datasets = [
    "YBAA", "ZHAA", "ASI", "TNI", "NCC", "TXHC",
    "EBVS", "ERMS", "YDCK", "YKWK", "THV", "TLV",
]
tools = [
    "random",
    "triphone_rich",
    "pure_diversity",
    "icassp_without_diversity_enhancing_real_mix",
    "icassp_real_mix",
    "asrevolve_error_model_real",
    "word_error_real_mix/no_word_enhance",
    "word_error_real_mix/word_enhance",
]

# asr -> dataset -> short tool label -> DataFrame read from the indexed CSV.
finetuned_model_performance_on_test_set = {
    asr: {
        dataset: {
            shorten_finetuned_model_name[tool]: pd.read_csv(data[asr][dataset][tool])
            for tool in tools
        }
        for dataset in datasets
    }
    for asr in asrs
}

# finetuned_model_performance_on_test_set


In [34]:
# Load the results recorded for the original models.
with open('result/original.json', 'r') as f:
    original_data = json.load(f)

# asr -> dataset -> the "wer" value stored under the "test" entry.
original_model_performance_on_test_set = {
    asr: {
        dataset: original_data[asr][dataset]["test"]["wer"]
        for dataset in datasets
    }
    for asr in asrs
}

# original_model_performance_on_test_set



In [35]:
# Relative WER improvement of each fine-tuned model over the original model:
#     (original_wer - finetuned_wer) / original_wer
# stored as asr -> dataset -> short tool label -> metric -> list of values.
#
# The tool loop uses `tool_short_names` instead of `shorten_ft_name(tools)`:
# `tools` is rebound to the error-model identifiers further down the notebook,
# which made this cell raise KeyError when re-run after that cell. The two
# sequences contain the same labels in the same order.
relative_improvement_of_finetuned_model = {}
for asr in asrs:
    relative_improvement_of_finetuned_model[asr] = {}
    for dataset in datasets:
        original_wer = original_model_performance_on_test_set[asr][dataset]
        finetuned_frames = finetuned_model_performance_on_test_set[asr][dataset]
        relative_improvement_of_finetuned_model[asr][dataset] = {
            tool: {
                metric: ((original_wer - finetuned_frames[tool][metric]) / original_wer).to_list()
                for metric in ["WER_Seed1", "WER_Seed2", "WER_Seed3", "WER_Avg"]
            }
            for tool in tool_short_names
        }

# relative_improvement_of_finetuned_model


### 1. Correlation between the original model's WER on the selected samples and the relative improvement of WER after fine-tuning

In [36]:
# Load the RQ1 index: asr -> dataset -> tool -> location of a CSV file.
with open('result/RQ1.json', 'r') as f:
    rq1_data = json.load(f)

asrs = ["quartznet", "hubert", "wav2vec-base"]
datasets = ["YBAA", "ZHAA", "ASI", "TNI", "NCC",
            "TXHC", "EBVS", "ERMS", "YDCK", "YKWK", "THV", "TLV"]
# NOTE: from here on `tools` holds the error-model identifiers (the keys of
# `shorten_error_model_name`), not the fine-tuned-model identifiers.
tools = ["random", "error_model_triphone_rich", "error_model_pure_diversity", "error_model_without_diversity_enhancing", "error_model",
         "asrevolve_error_model_real", "word_error_predictor_real/no_word_enhance", "word_error_predictor_real/word_enhance"]

# asr -> dataset -> short tool label -> metric -> list of column values.
original_model_performance_on_selected_samples = {}
for asr in asrs:
    original_model_performance_on_selected_samples[asr] = {}
    for dataset in datasets:
        original_model_performance_on_selected_samples[asr][dataset] = {}
        for tool in tools:
            # Read each CSV once and take every metric column from the same
            # frame, instead of re-reading the file for each metric.
            selected_samples_frame = pd.read_csv(rq1_data[asr][dataset][tool])
            original_model_performance_on_selected_samples[asr][dataset][shorten_error_model_name[tool]] = {
                metric: selected_samples_frame[metric].tolist()
                for metric in ["WER_Seed1", "WER_Seed2", "WER_Seed3", "WER_Avg"]
            }

# original_model_performance_on_selected_samples


In [37]:
# Pool the per-seed values of every (asr, dataset, tool) combination into two
# parallel lists and compute a single Spearman correlation over all of them.
arr_1, arr_2 = [], []

for asr in asrs:
    for dataset in datasets:
        selected_wer = original_model_performance_on_selected_samples[asr][dataset]
        improvement = relative_improvement_of_finetuned_model[asr][dataset]
        for tool in tool_short_names:
            for metric in ["WER_Seed1", "WER_Seed2", "WER_Seed3"]:
                arr_1.extend(selected_wer[tool][metric])
                arr_2.extend(improvement[tool][metric])

# Both sides must contribute the same number of observations.
assert len(arr_1) == len(arr_2)
scipy.stats.spearmanr(arr_1, arr_2)

            


In [45]:
# Ad-hoc check on six hand-entered value pairs.
# NOTE(review): the provenance of these numbers is not recorded in this notebook.
arr_1 = [
    -3.49, 10.18, 8.98,
    6.62, 27.33, 21.52,
]
arr_2 = [
    4.0375, 4.3025, 4.2475,
    4.2, 4.3275, 4.4175,
]
scipy.stats.spearmanr(arr_1, arr_2)


SpearmanrResult(correlation=0.942857142857143, pvalue=0.004804664723032055)

In [49]:
# For every (dataset, asr) pair: Spearman correlation across tools between
#   - the mean of the original model's values on the selected samples, and
#   - the mean relative improvement after fine-tuning,
# where each mean pools the values of the three seeds.
# Only five tools are used here; "RND", "TR" and "WE" are left out.
for dataset in datasets:
    for asr in asrs:
        selected_wer = original_model_performance_on_selected_samples[asr][dataset]
        improvement = relative_improvement_of_finetuned_model[asr][dataset]

        arr_1 = []
        arr_2 = []
        for tool in ["PD", "IC-WDE", "IC", "ASR-EV", "NWE"]:
            pooled_wer = [
                value
                for metric in ["WER_Seed1", "WER_Seed2", "WER_Seed3"]
                for value in selected_wer[tool][metric]
            ]
            pooled_improvement = [
                value
                for metric in ["WER_Seed1", "WER_Seed2", "WER_Seed3"]
                for value in improvement[tool][metric]
            ]
            arr_1.append(np.mean(pooled_wer))
            arr_2.append(np.mean(pooled_improvement))

        # One point per tool on each side.
        assert len(arr_1) == len(arr_2)

        correlation_coefficient, p_value = scipy.stats.spearmanr(arr_1, arr_2)

        print()
        print("Dataset: ", dataset)
        print("ASR: ", asr)
        print("Spearman's corr coeff:\t ", correlation_coefficient)
        print("p-value:\t\t ", p_value)



Dataset:  YBAA
ASR:  quartznet
Spearman's corr coeff:	  0.9999999999999999
p-value:		  1.4042654220543672e-24

Dataset:  YBAA
ASR:  hubert
Spearman's corr coeff:	  0.8999999999999998
p-value:		  0.03738607346849874

Dataset:  YBAA
ASR:  wav2vec-base
Spearman's corr coeff:	  -0.3
p-value:		  0.6238376647810728

Dataset:  ZHAA
ASR:  quartznet
Spearman's corr coeff:	  0.19999999999999998
p-value:		  0.747060078104662

Dataset:  ZHAA
ASR:  hubert
Spearman's corr coeff:	  0.49999999999999994
p-value:		  0.39100221895577053

Dataset:  ZHAA
ASR:  wav2vec-base
Spearman's corr coeff:	  0.3
p-value:		  0.6238376647810728

Dataset:  ASI
ASR:  quartznet
Spearman's corr coeff:	  0.09999999999999999
p-value:		  0.8728885715695383

Dataset:  ASI
ASR:  hubert
Spearman's corr coeff:	  0.6
p-value:		  0.28475697986529375

Dataset:  ASI
ASR:  wav2vec-base
Spearman's corr coeff:	  0.3
p-value:		  0.6238376647810728

Dataset:  TNI
ASR:  quartznet
Spearman's corr coeff:	  0.3
p-value:		  0.6238376647810728

In [None]:
# For every (dataset, asr) pair: Spearman correlation over one point per
# (tool, seed) combination, where each point is the mean of that seed's values.
for dataset in datasets:
    for asr in asrs:
        selected_wer = original_model_performance_on_selected_samples[asr][dataset]
        improvement = relative_improvement_of_finetuned_model[asr][dataset]

        tool_seed_pairs = [
            (tool, metric)
            for tool in tool_short_names
            for metric in ["WER_Seed1", "WER_Seed2", "WER_Seed3"]
        ]
        arr_1 = [np.mean(selected_wer[tool][metric]) for tool, metric in tool_seed_pairs]
        arr_2 = [np.mean(improvement[tool][metric]) for tool, metric in tool_seed_pairs]

        assert len(arr_1) == len(arr_2)

        correlation_coefficient, p_value = scipy.stats.spearmanr(arr_1, arr_2)

        print()
        print("Dataset: ", dataset)
        print("ASR: ", asr)
        print("Spearman's corr coeff:\t ", correlation_coefficient)
        print("p-value:\t\t ", p_value)


### 3. Correlation between the distance from the triphone-rich distribution to the ideal distribution (on the selected samples) and the relative improvement of WER after fine-tuning

In [39]:
# Load the triphone-rich data; later cells index it as asr -> dataset -> seed ("1"/"2"/"3").
with open('result/triphone_rich.json', 'r') as f:
    triphone_rich_data = json.load(f)


In [40]:
## without any grouping mechanism

# Pool all asrs and datasets: values from `triphone_rich_data` on one side,
# relative improvement of the triphone-rich tool on the other.
tool = "error_model_triphone_rich"
tool_label = shorten_error_model_name[tool]

arr_1, arr_2 = [], []
for asr in asrs:
    for dataset in datasets:
        per_seed_values = triphone_rich_data[asr][dataset]
        improvement = relative_improvement_of_finetuned_model[asr][dataset][tool_label]
        for seed in ["1", "2", "3"]:
            arr_1.extend(per_seed_values[seed])
        for metric in ["WER_Seed1", "WER_Seed2", "WER_Seed3"]:
            arr_2.extend(improvement[metric])

# Both sides must contribute the same number of observations.
assert len(arr_1) == len(arr_2)

correlation_coefficient, p_value = scipy.stats.spearmanr(arr_1, arr_2)
print()
print("Spearman's corr coeff:\t ", correlation_coefficient)
print("p-value:\t\t ", p_value)



Spearman's corr coeff:	  0.3096296252398027
p-value:		  4.728069257623937e-11


In [41]:
## grouping by dataset

# Same correlation as the ungrouped version, computed separately per dataset
# (values from all asrs are pooled within each dataset).
tool = "error_model_triphone_rich"
tool_label = shorten_error_model_name[tool]

for dataset in datasets:
    arr_1, arr_2 = [], []
    for asr in asrs:
        per_seed_values = triphone_rich_data[asr][dataset]
        improvement = relative_improvement_of_finetuned_model[asr][dataset][tool_label]
        for seed in ["1", "2", "3"]:
            arr_1.extend(per_seed_values[seed])
        for metric in ["WER_Seed1", "WER_Seed2", "WER_Seed3"]:
            arr_2.extend(improvement[metric])

    assert len(arr_1) == len(arr_2)

    correlation_coefficient, p_value = scipy.stats.spearmanr(arr_1, arr_2)

    print()
    print("Dataset: ", dataset)
    print("Spearman's corr coeff:\t ", correlation_coefficient)
    print("p-value:\t\t ", p_value)



Dataset:  YBAA
Spearman's corr coeff:	  0.29872252445264824
p-value:		  0.07677066890825572

Dataset:  ZHAA
Spearman's corr coeff:	  0.39171008070249985
p-value:		  0.018147182140365984

Dataset:  ASI
Spearman's corr coeff:	  0.36745807989066254
p-value:		  0.02747414148369915

Dataset:  TNI
Spearman's corr coeff:	  -0.032172703215137745
p-value:		  0.8522317119652019

Dataset:  NCC
Spearman's corr coeff:	  0.6248733893766553
p-value:		  4.623396426138484e-05

Dataset:  TXHC
Spearman's corr coeff:	  0.427992415964119
p-value:		  0.009215235980734586

Dataset:  EBVS
Spearman's corr coeff:	  0.3130928912099224
p-value:		  0.06298250889255443

Dataset:  ERMS
Spearman's corr coeff:	  0.21620999260824883
p-value:		  0.20532655747387657

Dataset:  YDCK
Spearman's corr coeff:	  0.4356320174220398
p-value:		  0.007915519401666004

Dataset:  YKWK
Spearman's corr coeff:	  0.15354817534862975
p-value:		  0.37127176401256157

Dataset:  THV
Spearman's corr coeff:	  0.5276364267396184
p-value:		  0