# Correlations


In [34]:
import pandas as pd
from scipy.stats import pearsonr, kendalltau

In [25]:
# Read the tab-separated score file into a DataFrame and show it.
file = 'MTIR-scores-all-corrected.txt'
table = pd.read_csv(file, sep='\t')
table

Unnamed: 0,Lang,Dataset,Model,BPEsize,TrainSize,MAP,MAP.gm,P5,P10,R10,RBO,NDCG,BLEU,CHRF1,MacroF1,MicroF1,BLEURTMean,BLEURTMedian
0,cs,europarl,bm25,16000,25000,0.2536,0.1597,0.6152,0.5966,0.0625,0.316368,0.4288,17.26,0.458,13.79,42.54,-0.233231,-0.199350
1,cs,europarl,bm25,16000,50000,0.3373,0.2299,0.6690,0.6607,0.0693,0.394681,0.5096,22.32,0.514,20.45,48.66,-0.060558,-0.018857
2,cs,europarl,bm25,16000,75000,0.3568,0.2479,0.6993,0.6862,0.0725,0.415811,0.5269,23.88,0.529,22.48,50.26,-0.017226,0.032206
3,cs,europarl,bm25,16000,100000,0.3762,0.2720,0.7448,0.7097,0.0765,0.433156,0.5481,25.56,0.544,24.18,51.72,0.018455,0.068034
4,cs,europarl,bm25,16000,125000,0.3914,0.2916,0.7572,0.7324,0.0809,0.456725,0.5633,26.88,0.556,25.93,53.13,0.059481,0.106634
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
91,de,wiki,neural,32000,100000,0.0957,0.0017,0.0265,0.0182,0.1822,0.107964,0.1840,26.33,0.545,25.99,52.38,0.054870,0.090689
92,de,wiki,neural,32000,125000,0.0980,0.0020,0.0270,0.0190,0.1902,0.110684,0.1904,27.57,0.557,27.59,53.68,0.086427,0.128155
93,de,wiki,neural,32000,150000,0.0948,0.0021,0.0264,0.0185,0.1848,0.115407,0.1887,27.79,0.559,28.00,53.85,0.092212,0.133802
94,de,wiki,neural,32000,175000,0.1049,0.0023,0.0287,0.0200,0.1998,0.117848,0.1986,28.68,0.566,29.01,54.63,0.118354,0.156946


In [26]:
# Join Lang, Dataset and Model into a single space-separated 'Group' label,
# append it as the last column, and remove the three source columns.
group_label = table['Lang'].str.cat([table['Dataset'], table['Model']], sep=' ')
table = table.assign(Group=group_label).drop(columns=['Lang', 'Dataset', 'Model'])
table

Unnamed: 0,BPEsize,TrainSize,MAP,MAP.gm,P5,P10,R10,RBO,NDCG,BLEU,CHRF1,MacroF1,MicroF1,BLEURTMean,BLEURTMedian,Group
0,16000,25000,0.2536,0.1597,0.6152,0.5966,0.0625,0.316368,0.4288,17.26,0.458,13.79,42.54,-0.233231,-0.199350,cs europarl bm25
1,16000,50000,0.3373,0.2299,0.6690,0.6607,0.0693,0.394681,0.5096,22.32,0.514,20.45,48.66,-0.060558,-0.018857,cs europarl bm25
2,16000,75000,0.3568,0.2479,0.6993,0.6862,0.0725,0.415811,0.5269,23.88,0.529,22.48,50.26,-0.017226,0.032206,cs europarl bm25
3,16000,100000,0.3762,0.2720,0.7448,0.7097,0.0765,0.433156,0.5481,25.56,0.544,24.18,51.72,0.018455,0.068034,cs europarl bm25
4,16000,125000,0.3914,0.2916,0.7572,0.7324,0.0809,0.456725,0.5633,26.88,0.556,25.93,53.13,0.059481,0.106634,cs europarl bm25
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
91,32000,100000,0.0957,0.0017,0.0265,0.0182,0.1822,0.107964,0.1840,26.33,0.545,25.99,52.38,0.054870,0.090689,de wiki neural
92,32000,125000,0.0980,0.0020,0.0270,0.0190,0.1902,0.110684,0.1904,27.57,0.557,27.59,53.68,0.086427,0.128155,de wiki neural
93,32000,150000,0.0948,0.0021,0.0264,0.0185,0.1848,0.115407,0.1887,27.79,0.559,28.00,53.85,0.092212,0.133802,de wiki neural
94,32000,175000,0.1049,0.0023,0.0287,0.0200,0.1998,0.117848,0.1986,28.68,0.566,29.01,54.63,0.118354,0.156946,de wiki neural


In [None]:
# Number of rows per distinct 'Group' label, indexed by the label.
group_col = table['Group']
groups = group_col.groupby(group_col).count()

In [32]:
# List the group labels, one per line (the per-group counts are not shown).
for group_label in groups.index:
    print(group_label)

cs europarl bm25
cs wiki bm25
cs wiki neural
de europarl bm25
de wiki bm25
de wiki neural


In [44]:

# Correlate every IR metric with every MT metric inside each group and write
# one tab-separated file per correlation method (MTIR-<method>.csv).
ir_mets = 'MAP MAP.gm P5 P10 R10 RBO NDCG'.split()
mt_mets = 'BLEU CHRF1 MacroF1 MicroF1 BLEURTMean BLEURTMedian'.split()

# (label used in the output file name, function returning (statistic, p-value))
corr_methods = [('kendall', kendalltau), ('pearson', pearsonr)]
alpha = 0.05  # p-value threshold for the '*' marker below

# Output layout: one row per (group, IR metric), one column per MT metric.
header = ['Group', 'IR'] + mt_mets

for corr_name, corr_func in corr_methods:
    # Start with an empty list for each method. Sharing a single list across
    # methods would put the previous method's rows into this method's file.
    result = []
    for group_name in groups.index:
        group = table[table['Group'] == group_name]
        assert len(group) == 16, f'{group_name}: expected 16 rows, got {len(group)}'
        for ir_met in ir_mets:
            row = [group_name, ir_met]
            for mt_met in mt_mets:
                corr_val, p_val = corr_func(group[mt_met], group[ir_met])
                # Only the magnitude is reported; the sign of the correlation
                # is dropped. NOTE(review): confirm this is intended.
                corr_val = f'{abs(corr_val):.3f}'
                # The '*' prefix marks values with p >= alpha, i.e. the ones
                # that are NOT significant at the chosen level.
                row.append(('*' if p_val >= alpha else '') + corr_val)
            result.append(row)

    res = pd.DataFrame(result, columns=header)
    res.to_csv(f'MTIR-{corr_name}.csv', sep='\t')