In [1]:
from scipy import stats
from pathlib import Path
import pandas as pd
import numpy as np
import csv

In [3]:
# Gather every experiment directory matching 'runs-*' under prec_recall/
# (order is whatever the filesystem glob yields).
exps = [run_dir for run_dir in Path('prec_recall').glob('runs-*')]
print(exps)

[PosixPath('prec_recall/runs-ende-001m'), PosixPath('prec_recall/runs-deen-500k'), PosixPath('prec_recall/runs-enhi-500k'), PosixPath('prec_recall/runs-deen-all'), PosixPath('prec_recall/runs-enhi-all'), PosixPath('prec_recall/runs-enhi-030k'), PosixPath('prec_recall/runs-deen-030k'), PosixPath('prec_recall/runs-ende-030k'), PosixPath('prec_recall/runs-ende-500k'), PosixPath('prec_recall/runs-deen-001m'), PosixPath('prec_recall/runs-ende-all')]


In [13]:
# Pick a single experiment and list its newstest2018 TSV reports
# (one per run sub-directory), ordered by path.
exp = Path('prec_recall/runs-ende-001m')
tsvs = sorted(exp.glob('*/newstest2018*.tsv'))
tsvs

[PosixPath('prec_recall/runs-ende-001m/000-ende-chars-r1/newstest2018.1gram.pr.tsv'),
 PosixPath('prec_recall/runs-ende-001m/011-ende-.5k.5k-r1/newstest2018.1gram.pr.tsv'),
 PosixPath('prec_recall/runs-ende-001m/022-ende-01k01k-r1/newstest2018.1gram.pr.tsv'),
 PosixPath('prec_recall/runs-ende-001m/033-ende-02k02k-r1/newstest2018.1gram.pr.tsv'),
 PosixPath('prec_recall/runs-ende-001m/044-ende-04k04k-r1/newstest2018.1gram.pr.tsv'),
 PosixPath('prec_recall/runs-ende-001m/055-ende-08k08k-r1/newstest2018.1gram.pr.tsv'),
 PosixPath('prec_recall/runs-ende-001m/066-ende-16k16k-r1/newstest2018.1gram.pr.tsv'),
 PosixPath('prec_recall/runs-ende-001m/077-ende-32k32k-r1/newstest2018.1gram.pr.tsv'),
 PosixPath('prec_recall/runs-ende-001m/088-ende-48k48k-r1/newstest2018.1gram.pr.tsv'),
 PosixPath('prec_recall/runs-ende-001m/099-ende-64k64k-r1/newstest2018.1gram.pr.tsv')]

In [10]:
# Load the first report as a tab-separated table. Quote handling is disabled
# (QUOTE_NONE) so quote characters in a field are read as ordinary text.
df = pd.read_csv(
    tsvs[0],
    quoting=csv.QUOTE_NONE,
    header=0,
    sep='\t',
)
df.head(5)

Unnamed: 0,Rank,Gram,Name,RankF,RefF,CandF,Recall,Precision,F1
0,1,5,▁,23285730,61037,61229,0.950816,0.941158,0.945962
1,2,6,e,20079991,50643,51129,0.925697,0.91633,0.92099
2,3,7,n,12820996,30795,31184,0.907569,0.898449,0.902986
3,4,8,i,10202092,24393,24811,0.912216,0.893239,0.902627
4,5,9,r,9068726,23127,23381,0.90772,0.8957,0.90167


In [11]:
# Spearman correlation matrix between the Rank and Precision columns.
df.loc[:, ['Rank', 'Precision']].corr(method='spearman')

Unnamed: 0,Rank,Precision
Rank,1.0,0.051445
Precision,0.051445,1.0


In [48]:
def corr_analysis(tsvs, field1, field2='Rank'):
    """Print a comma-separated table of correlations between two columns.

    Each file in ``tsvs`` is read as a tab-separated table (header on the
    first line, quoting disabled) and the Pearson, Spearman and Kendall
    coefficients between ``field1`` and ``field2`` are computed.

    Args:
        tsvs: iterable of path objects exposing ``.name``; one output row each.
        field1: name of the first column.
        field2: name of the second column (defaults to ``'Rank'``).

    Prints a title line, a header line, then one line per file. The row name
    is the file name with the ``.1gram.pr.tsv`` suffix removed. Returns None.
    """
    print(f"Correlation of {field1} vs {field2}")
    methods = ('pearson', 'spearman', 'kendall')
    columns = ['name', *methods]

    def _as_cell(value):
        # Floats are shown with three decimals; other values (the name) pass through.
        return f'{value:.3f}' if isinstance(value, float) else value

    # All files are processed before the table header is printed.
    records = []
    for path in tsvs:
        table = pd.read_csv(path, sep='\t', header=0, quoting=csv.QUOTE_NONE)
        record = {}
        for method in methods:
            record[method] = table[field1].corr(other=table[field2], method=method)
        record['name'] = path.name.replace('.1gram.pr.tsv', '')
        records.append(record)

    print(','.join(columns))
    for record in records:
        print(','.join([_as_cell(record[col]) for col in columns]))
        
def corr(tsv, field1, field2='Rank', method='pearson'):
    """Return the correlation between two columns of a tab-separated file.

    Args:
        tsv: path of the file; read with a header row and quoting disabled.
        field1: name of the first column.
        field2: name of the second column (defaults to ``'Rank'``).
        method: correlation method passed to pandas ``Series.corr``
            (defaults to ``'pearson'``).

    Returns:
        The coefficient computed by ``Series.corr``.
    """
    table = pd.read_csv(tsv, sep='\t', header=0, quoting=csv.QUOTE_NONE)
    left = table[field1]
    right = table[field2]
    return left.corr(other=right, method=method)


# Display label for each vocabulary tag. The insertion order here is the row
# order used by print_mem().
vocab_names = {
    'chars': 'Char',
    '.5k.5k': '500',
    '01k01k': '1K',
    '02k02k': '2K',
    '04k04k': '4K',
    '08k08k': '8K',
    '16k16k': '16K',
    '32k32k': '32K',
    '48k48k': '48K',
    '64k64k': '64K',
}

# Display label for each experiment key. The insertion order here is the
# column order used by print_mem(). The first word of a label is the language
# pair, which is also the key format of `datasets`.
exper_names = {
    'deen-030k': 'DE-EN 30K',
    'deen-500k': 'DE-EN 0.5M',
    'deen-001m': 'DE-EN 1M',
    'deen-all': 'DE-EN 4.5M',
    'ende-030k': 'EN-DE 30K',
    'ende-500k': 'EN-DE 0.5M',
    'ende-001m': 'EN-DE 1M',
    'ende-all': 'EN-DE 4.5M',
    # Deliberately disabled entry (kept for reference):
    # 'enhi-030k': 'EN-HI 30K',
    'enhi-500k': 'EN-HI 0.5M',
    'enhi-all': 'EN-HI 1.3M',
    'enlt-all': 'EN-LT 0.6M',
}
# For each language pair, the data set names of the 'dev' and 'test' splits.
datasets = {
    'DE-EN': {'dev': 'newstest2018', 'test': 'newstest2019'},
    'EN-DE': {'dev': 'newstest2018', 'test': 'newstest2019'},
    'EN-LT': {'dev': 'newsdev2019', 'test': 'newstest2019'},
    'EN-HI': {'dev': 'IITBv1_5_dev', 'test': 'IITBv1_5_test'},
}
import collections as coll

def print_mem(mem, delim=',', vocab_labels=None, exper_labels=None):
    """Print a vocabulary-by-experiment table of values as delimited text.

    Args:
        mem: mapping ``{vocabulary label: {experiment label: number}}``.
        delim: separator placed between cells (defaults to ``','``).
        vocab_labels: row labels, in order. Defaults to the values of the
            module-level ``vocab_names`` table.
        exper_labels: column labels, in order. Defaults to the values of the
            module-level ``exper_names`` table.

    The first line is the header (``Vocabulary`` followed by the experiment
    labels). Each following line holds one vocabulary label and one cell per
    experiment: the value with three decimals, or ``NA`` when absent.
    Returns None.
    """
    voc_ns = list(vocab_names.values()) if vocab_labels is None else list(vocab_labels)
    exp_ns = list(exper_names.values()) if exper_labels is None else list(exper_labels)

    print(delim.join(['Vocabulary'] + exp_ns))
    for voc_name in voc_ns:
        # .get() avoids inserting an empty entry when `mem` is a defaultdict.
        # A vocabulary with no data still gets one NA per column, so every
        # row has the same number of cells as the header.
        data = mem.get(voc_name, {})
        cells = [f'{data[exp_n]:.3f}' if exp_n in data else 'NA' for exp_n in exp_ns]
        print(delim.join([voc_name] + cells))

# For every split (dev/test) and metric (Precision/Recall), compute the
# correlation of that metric with 'Rank' (corr()'s default method) for each
# experiment and vocabulary, then print one table per combination.
for split_name in ['dev', 'test']:
    for field in ['Precision', 'Recall']:
        mem = coll.defaultdict(dict)  # [vocab][exp] = val
        for exp in exps:
            exp_n = exp.name.replace('runs-', '')
            if exp_n == 'enhi-030k':
                continue  # not used for analysis
            # Translate the directory key to its display label. The table is
            # `exper_names`; the previously used `exp_names` is not defined
            # in this notebook and fails on a fresh kernel.
            exp_n = exper_names[exp_n]
            # The first word of the label is the language pair, e.g. 'DE-EN'.
            out_file_name = datasets[exp_n.split()[0]][split_name]
            tsvs = sorted(exp.glob(f'*/*{out_file_name}*.tsv'))
            # The run directory looks like '000-ende-chars-r1'; the
            # second-to-last dash-separated token is the vocabulary tag.
            # Path.parent.name avoids depending on '/' as the separator.
            voc_ns = [vocab_names[t.parent.name.split('-')[-2]] for t in tsvs]

            for voc_name, tsv in zip(voc_ns, tsvs):
                coeff = corr(tsv, field1=field, field2='Rank')
                mem[voc_name][exp_n] = coeff
        print(f'#### {split_name} {field}')
        print_mem(mem)


#### dev Precision
Vocabulary,DE-EN 30K,DE-EN 0.5M,DE-EN 1M,DE-EN 4.5M,EN-DE 30K,EN-DE 0.5M,EN-DE 1M,EN-DE 4.5M,EN-HI 0.5M,EN-HI 1.3M,EN-LT 0.6M
Char,-0.074,-0.283,-0.362,-0.364,-0.046,-0.211,-0.242,-0.188,-0.261,-0.079,NA
500,-0.094,-0.170,-0.237,-0.234,-0.033,-0.089,-0.068,-0.061,-0.121,-0.017,NA
1K,-0.066,-0.133,-0.151,-0.164,-0.086,-0.114,-0.063,-0.117,-0.080,-0.051,NA
2K,-0.063,-0.098,-0.111,-0.111,-0.010,-0.108,-0.075,-0.062,0.010,-0.002,NA
4K,0.035,-0.028,-0.033,-0.040,0.122,-0.014,-0.022,-0.026,0.087,0.075,NA
8K,0.136,0.023,-0.004,0.016,0.202,0.006,0.002,0.008,0.135,0.127,NA
16K,0.294,0.076,0.060,0.052,0.316,0.074,0.059,0.064,0.202,0.185,NA
32K,0.312,0.120,0.104,0.088,0.361,0.101,0.074,0.081,0.213,0.201,NA
48K,NA,0.143,0.118,0.108,0.348,0.131,0.106,0.097,0.230,0.215,NA
64K,NA,0.165,0.121,0.120,0.366,0.148,0.114,0.097,0.253,0.232,NA
#### dev Recall
Vocabulary,DE-EN 30K,DE-EN 0.5M,DE-EN 1M,DE-EN 4.5M,EN-DE 30K,EN-DE 0.5M,EN-DE 1M,EN-DE 4.5M,EN-HI 0.5M,EN-HI 1.3M,EN-LT 0.6M
Char,-