# Correlations

In [11]:
# Directory holding the deep-ct JSONL files. load_docs() builds paths as DIR + <file name>,
# so the trailing slash is required.
DIR = '/mnt/ceph/storage/data-in-progress/data-teaching/theses/wstud-thesis-probst/deep-ct/'

def correlation(a, b, correlation):
    """Correlate the weights that two token -> weight mappings assign to their tokens.

    The union of the keys of ``a`` and ``b`` forms the observations; a token
    that is missing from one mapping is paired with weight 0 there.

    Args:
        a: dict mapping token -> numeric weight.
        b: dict mapping token -> numeric weight.
        correlation: method forwarded to ``pandas.DataFrame.corr``
            (e.g. ``'pearson'``, ``'kendall'``, ``'spearman'``).

    Returns:
        The coefficient between the two weight columns as a ``float``. The
        value is ``nan`` when pandas cannot define it (e.g. a single token).

    Raises:
        ValueError: if both mappings are empty or pandas does not return the
            expected 2x2 correlation matrix.
    """
    import pandas as pd
    import json

    # TODO: Add all tokens with weight of 0?
    tokens = list(set(a.keys()) | set(b.keys()))
    if not tokens:
        # Nothing to correlate; fail with the same error as any other unusable input.
        raise ValueError('Could not handle \n' + json.dumps(a) + '\n' + json.dumps(b))

    # Only the two numeric columns go into the frame. A string column with the
    # tokens would make DataFrame.corr raise on pandas >= 2.0, where
    # non-numeric columns are no longer dropped silently.
    weights = pd.DataFrame({
        'weight_a': [a.get(token, 0) for token in tokens],
        'weight_b': [b.get(token, 0) for token in tokens],
    })

    ret = weights.corr(correlation)
    if len(ret) != 2:
        raise ValueError('Could not handle \n' + json.dumps(a) + '\n' + json.dumps(b))

    # First row of the matrix is weight_a vs. (weight_a, weight_b).
    ret = ret.iloc[0]
    if ret.name != 'weight_a':
        raise ValueError('Could not handle \n' + json.dumps(a) + '\n' + json.dumps(b))

    return float(ret['weight_b'])

def jaccard_similarity(list1, list2):
    """Return the Jaccard index |A & B| / |A | B| of the distinct elements of two iterables.

    Args:
        list1: iterable of hashable items (duplicates are ignored).
        list2: iterable of hashable items (duplicates are ignored).

    Returns:
        A float in [0, 1]: 1.0 for identical element sets, 0.0 for disjoint ones.

    Raises:
        ZeroDivisionError: if both inputs are empty (the index is undefined).
    """
    # Materialise each input exactly once so that one-shot iterators and
    # generators are not exhausted before their size is taken.
    first = set(list1)
    second = set(list2)
    intersection = len(first & second)
    union = len(first) + len(second) - intersection
    return float(intersection) / union

In [12]:
# Sanity check: b orders the three tokens exactly opposite to a, so Kendall's tau is -1.
correlation({"project": 0.3, "manhatten": 0.7, "success": 1.0}, {"project": 1.0, "manhatten": 0.5, "success": 0.1}, "kendall")

-1.0

In [13]:
# Sanity check: one concordant and two discordant token pairs, so tau = (1 - 2) / 3.
correlation({"project": 0.3, "manhatten": 0.7, "success": 1.0}, {"project": 0.5, "manhatten": 2, "success": 0.1}, 'kendall')

-0.33333333333333337

In [15]:
def load_docs(method):
    """Load the per-document ``term_recall`` mappings from one JSONL file under ``DIR``.

    Args:
        method: file name that is appended to ``DIR``. Every non-blank line
            of the file is parsed as JSON and must provide ``doc.id`` and
            ``term_recall``.

    Returns:
        dict mapping document id -> ``term_recall`` mapping. Documents whose
        ``term_recall`` is empty are left out.
    """
    import json
    from tqdm import tqdm

    ret = {}
    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with open(DIR + method, encoding='utf-8') as f:
        for line in tqdm(f):
            if not line.strip():
                # Tolerate blank lines (e.g. an empty last line) instead of failing in json.loads.
                continue
            parsed = json.loads(line)
            doc_id = parsed['doc']['id']
            term_recall = parsed['term_recall']
            if len(term_recall.keys()) > 0:
                ret[doc_id] = term_recall
    return ret
    
    

def calculate_correlation(method_a, method_b):
    """Compare the term weights of two JSONL files document by document.

    Both files are read with ``load_docs``. For every document id that occurs
    in both, the Kendall and Pearson correlation of the term weights and the
    Jaccard similarity of the term sets are computed.

    Args:
        method_a: file name of the first file (passed to ``load_docs``).
        method_b: file name of the second file (passed to ``load_docs``).

    Returns:
        ``pandas.DataFrame`` with one row per shared document and the columns
        ``doc``, ``kendall``, ``pearson`` and ``jaccard``.
    """
    from tqdm import tqdm
    import pandas as pd

    terms_a = load_docs(method_a)
    terms_b = load_docs(method_b)

    rows = []
    # The progress bar runs over every document of the first file; documents
    # missing from the second file are skipped.
    for doc_id in tqdm(list(terms_a.keys())):
        if doc_id in terms_b:
            weights_a = terms_a[doc_id]
            weights_b = terms_b[doc_id]
            rows.append({
                'doc': doc_id,
                'kendall': correlation(weights_a, weights_b, 'kendall'),
                'pearson': correlation(weights_a, weights_b, 'pearson'),
                'jaccard': jaccard_similarity(weights_a.keys(), weights_b.keys()),
            })

    return pd.DataFrame(rows)

# Correlation ORCAS <-> Anchor CC-2019

In [23]:
# Per-document agreement between the ORCAS file and the CC-2019 anchor file
# (the recorded run needed roughly five minutes for the comparison loop).
df_corr_orcas_anchor = calculate_correlation('deep-ct-training-data-orcas-sampled-test-overlap-removed.jsonl', 'deep-ct-training-data-cc-2019-47-sampled-test-overlap-removed.jsonl')
df_corr_orcas_anchor

2080050it [00:18, 110288.14it/s]
2079888it [00:19, 105823.89it/s]
100%|██████████| 1846874/1846874 [05:21<00:00, 5740.97it/s] 


Unnamed: 0,doc,kendall,pearson,jaccard
0,D1879027___1,,,1.000000
1,D1879027___2,0.310864,0.841273,0.350000
2,D1879027___3,1.000000,0.765942,0.666667
3,D1479993___1,,,0.285714
4,D1479993___2,-0.531055,-0.550973,0.153846
...,...,...,...,...
236668,D3026391___2,-0.200000,-0.789337,0.250000
236669,D3026391___3,0.577350,0.577350,0.250000
236670,D3026391___4,0.577350,0.577350,0.250000
236671,D3026391___5,0.612372,0.612372,0.200000


In [31]:
# describe() leaves out NaN values, so the kendall/pearson counts are lower than the jaccard count.
df_corr_orcas_anchor.describe()

Unnamed: 0,kendall,pearson,jaccard
count,192712.0,193250.0,236673.0
mean,0.391388,0.605929,0.531286
std,0.54388,0.581922,0.291548
min,-1.0,-1.0,0.0
25%,0.127385,0.483638,0.333333
50%,0.5,0.905827,0.5
75%,0.816497,0.990154,0.75
max,1.0,1.0,1.0


# Correlation Anchor CC-2019 (Sample) <-> Anchor CC-2019 (Full)

_TODO: this comparison has no cells in this notebook yet._

# Correlation ORCAS (Sample) <-> ORCAS (Full)

_TODO: this comparison has no cells in this notebook yet._

# Correlation MS-Marco <-> ORCAS

In [24]:
# Per-document agreement between the MS-Marco training-set file and the ORCAS file.
df_corr_marco_orcas = calculate_correlation('deep-ct-training-data-ms-marco-training-set-test-overlap-removed.jsonl', 'deep-ct-training-data-orcas-sampled-test-overlap-removed.jsonl')
df_corr_marco_orcas

2079858it [00:17, 121025.39it/s]
2080050it [00:19, 108371.15it/s]
100%|██████████| 1793684/1793684 [04:48<00:00, 6216.74it/s]


Unnamed: 0,doc,kendall,pearson,jaccard
0,D1683937___1,0.000000,1.474420e-01,0.500000
1,D1683937___2,-0.306186,-5.874748e-17,0.250000
2,D1683937___3,-0.612372,-5.303301e-01,0.166667
3,D1683937___4,-0.377964,-3.572173e-01,0.200000
4,D1683937___5,-0.617213,-5.833333e-01,0.200000
...,...,...,...,...
212324,D50669___8,0.000000,1.595863e-01,0.333333
212325,D50669___9,0.000000,-3.448652e-01,0.333333
212326,D50669___10,0.816497,5.475331e-01,0.666667
212327,D50669___11,,,1.000000


In [27]:
# describe() leaves out NaN values, so the kendall/pearson counts are lower than the jaccard count.
df_corr_marco_orcas.describe()

Unnamed: 0,kendall,pearson,jaccard
count,150419.0,150530.0,212329.0
mean,0.35095,0.463958,0.505818
std,0.597499,0.633905,0.309171
min,-1.0,-1.0,0.0
25%,0.0,0.094511,0.25
50%,0.549857,0.748678,0.5
75%,0.816497,0.987829,0.714286
max,1.0,1.0,1.0


# Correlation MS-Marco <-> Anchor CC-2019

In [25]:
# Per-document agreement between the MS-Marco training-set file and the CC-2019 anchor file.
df_corr_marco_anchor = calculate_correlation('deep-ct-training-data-ms-marco-training-set-test-overlap-removed.jsonl', 'deep-ct-training-data-cc-2019-47-sampled-test-overlap-removed.jsonl')
df_corr_marco_anchor

2079858it [00:17, 122249.74it/s]
2079888it [00:19, 105544.04it/s]
100%|██████████| 1793684/1793684 [04:20<00:00, 6888.32it/s] 


Unnamed: 0,doc,kendall,pearson,jaccard
0,D15500___1,,,0.400000
1,D15500___2,-0.534522,-0.534522,0.300000
2,D15500___3,,,0.500000
3,D1190641___2,-1.000000,-1.000000,0.000000
4,D431186___1,,,1.000000
...,...,...,...,...
191407,D1551511___2,,,0.500000
191408,D1551511___3,,,1.000000
191409,D3221007___1,-0.201008,-0.333333,0.500000
191410,D3221007___2,-0.022361,-0.164570,0.384615


In [29]:
# describe() leaves out NaN values, so the kendall/pearson counts are lower than the jaccard count.
df_corr_marco_anchor.describe()

Unnamed: 0,kendall,pearson,jaccard
count,139275.0,139581.0,191412.0
mean,0.258559,0.405378,0.446389
std,0.6274,0.684978,0.318363
min,-1.0,-1.0,0.0
25%,-0.154284,-0.111111,0.2
50%,0.443813,0.693402,0.357143
75%,0.774597,0.996727,0.666667
max,1.0,1.0,1.0
