In [117]:
import numpy as np
import pandas as pd
from collections import defaultdict, Counter
import json
from tqdm.notebook import tqdm

from IPython.display import display, HTML

import wandb
# Client for the W&B public API; the run-loading cell below queries it via api.runs(...).
api = wandb.Api()

In [74]:
import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns
# Select seaborn's "whitegrid" style.
sns.set(style="whitegrid")

In [75]:
# Select the "ticks" style (the previous cell had selected "whitegrid").
sns.set_style("ticks")

# Data Loading 

### Wandb loading

In [119]:
# Fetch every run of the W&B project and flatten them into one DataFrame (one row per run).
runs = api.runs("lazary/memorization")

summary_list, config_list = [], []
name_list, ids_list = [], []
for run in runs:
    # Logged output key/values of the run; `_json_dict` is used so large files are left out.
    summary_list.append(run.summary._json_dict)

    # Input settings of the run, without the special keys whose names start with '_'.
    public_config = {}
    for key, value in run.config.items():
        if key.startswith('_'):
            continue
        public_config[key] = value
    config_list.append(public_config)

    # Run name and run id.
    name_list.append(run.name)
    ids_list.append(run.id)

summary_df = pd.DataFrame.from_records(summary_list)
config_df = pd.DataFrame.from_records(config_list)
name_df = pd.DataFrame({'name': name_list})
ids_df = pd.DataFrame({'id': ids_list})

# Column order: name, id, config columns, summary columns.
# Every missing value is then replaced by the sentinel -1.0.
all_df = pd.concat([name_df, ids_df, config_df, summary_df], axis=1).fillna(-1.0)

# Disabled optional steps (integer casts and CSV export):
# all_df = all_df.astype({'cooccurrence': 'int32',
#                         'total_tuples': 'int32',
#                         'num_patterns': 'int32'})
# all_df.to_csv("project.csv")

In [77]:
# Show the combined runs table (name, id, config and summary columns).
all_df

Unnamed: 0,name,id,lm,pattern,base_acc,base_pattern,cooccurrence,_timestamp,confusing_tuples,_runtime,...,pval,_step,false_acc,num_patterns,n_subjects,total_occurrences,n_objects,spike_query,unique_objects,unique_queries
0,P176_unpattern_eval_bert-large-cased,yqwutwgq,bert-large-cased,P176,0.969178,[X] is produced by [Y].,876,1.600617e+09,"{'size': 1740, '_type': 'table-file', 'ncols':...",2.0,...,3.043624e-145,1.0,0.694825,3.0,-1.0,-1.0,-1.0,-1,-1.0,-1.0
1,P407_unpattern_eval_bert-large-cased,d16xxake,bert-large-cased,P407,0.805022,[X] was written in [Y].,677,1.600617e+09,"{'ncols': 2, 'nrows': 50, 'sha256': '7319dc249...",2.0,...,1.077273e-26,1.0,0.600443,2.0,-1.0,-1.0,-1.0,-1,-1.0,-1.0
2,P1303_unpattern_eval_bert-base-cased,cjk15scv,bert-base-cased,P1303,0.139535,[X] plays [Y].,387,1.600617e+09,{'path': 'media/table/confusing_tuples_1_7031f...,2.0,...,1.000000e+00,1.0,0.419466,3.0,-1.0,-1.0,-1.0,-1,-1.0,-1.0
3,P1412_unpattern_eval_bert-base-cased,au2csvgk,bert-base-cased,P1412,0.875000,[X] used to communicate in [Y].,720,1.600617e+09,"{'_type': 'table-file', 'ncols': 2, 'nrows': 5...",2.0,...,3.082320e-90,1.0,0.616667,3.0,-1.0,-1.0,-1.0,-1,-1.0,-1.0
4,P413_unpattern_eval_bert-base-cased,5sky97uf,bert-base-cased,P413,0.555556,[X] plays in [Y] position.,9,1.600617e+09,{'path': 'media/table/confusing_tuples_1_fa83e...,2.0,...,1.586553e-01,1.0,0.333333,2.0,-1.0,-1.0,-1.0,-1,-1.0,-1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
235,P19_lm_bert-base-cased,31tajdk1,bert-base-cased,P19,-1.000000,-1,-1,-1.000000e+00,-1,-1.0,...,-1.000000e+00,-1.0,-1.000000,-1.0,-1.0,-1.0,-1.0,-1,-1.0,-1.0
236,P17_lm_bert-base-cased,lwaapcjd,bert-base-cased,P17,-1.000000,-1,-1,-1.000000e+00,-1,-1.0,...,-1.000000e+00,-1.0,-1.000000,-1.0,-1.0,-1.0,-1.0,-1,-1.0,-1.0
237,P17_lm_bert-large-cased,3qfxr8nh,bert-large-cased,P17,-1.000000,-1,-1,-1.000000e+00,-1,-1.0,...,-1.000000e+00,-1.0,-1.000000,-1.0,-1.0,-1.0,-1.0,-1,-1.0,-1.0
238,P17_lm_roberta-base,3q38owrd,roberta-base,P17,-1.000000,-1,-1,-1.000000e+00,-1,-1.0,...,-1.000000e+00,-1.0,-1.000000,-1.0,-1.0,-1.0,-1.0,-1,-1.0,-1.0


### Patterns loading

In [78]:
# Build the relation-id -> label lookup from the relations JSON-lines file.
# The encoding is given explicitly so non-ASCII labels do not depend on the locale.
with open('../data/trex/data/relations.jsonl', 'r', encoding='utf-8') as f:
    lines = [x.strip() for x in f]

# Blank lines (e.g. a trailing empty line) are skipped; json.loads('') would raise.
lines = [x for x in lines if x]

# One JSON object per line; each is read for its 'relation' and 'label' keys.
json_lines = [json.loads(x) for x in lines]
pid2label = {x['relation']: x['label'] for x in json_lines}

In [130]:
# Spot-check the lookup for a single relation id.
pid2label['P19']

'place of birth'

## Entities Correlation (unpatterns)

In [92]:
# Keep only the runs whose name contains 'unpattern_eval_bert-large'.
is_unpattern_bert_large = all_df['name'].str.contains('unpattern_eval_bert-large')
unpatterns_df_bert_large = all_df[is_unpattern_bert_large]

In [110]:
def parse_unpatterns(df, label_map=None):
    """Build the compact per-relation summary table for the 'unpattern' runs.

    Parameters
    ----------
    df : pandas.DataFrame
        Runs table containing at least the columns 'pattern', 'cooccurrence',
        'pval', 'base_acc', 'false_acc', 'best_pattern_acc' and 'num_patterns'.
        It is not modified.
    label_map : mapping, optional
        Maps a relation id (the 'pattern' value) to its label. When omitted,
        the notebook-level ``pid2label`` dictionary is used.

    Returns
    -------
    pandas.DataFrame
        A new frame with the columns 'pattern', 'label', 'num_patterns',
        'cooccurrence', 'pval', 'base_acc', 'false_acc', 'best_false_acc'.
        'pval' is replaced by the boolean ``pval < 0.01``, 'best_pattern_acc'
        is renamed to 'best_false_acc', and rows whose 'cooccurrence' equals -1
        are removed. Relation ids missing from the mapping get a NaN label.

    Raises
    ------
    KeyError
        If one of the required columns is missing from ``df``.
    """
    if label_map is None:
        label_map = pid2label

    # Work on an explicit copy: `df` is usually itself a filtered slice, and
    # assigning columns on a slice of it triggers SettingWithCopyWarning.
    short = df[['pattern', 'cooccurrence', 'pval', 'base_acc', 'false_acc',
                'best_pattern_acc', 'num_patterns']].copy()

    short['pval'] = short['pval'] < 0.01
    short['label'] = short['pattern'].map(label_map)

    short = short.rename(columns={'best_pattern_acc': 'best_false_acc'})

    short = short[['pattern', 'label', 'num_patterns', 'cooccurrence',
                   'pval', 'base_acc', 'false_acc', 'best_false_acc']]

    # Filter with a boolean mask instead of dropping by index label, so that a
    # non-unique index cannot remove valid rows sharing a label with a -1 row.
    return short.loc[short['cooccurrence'] != -1].copy()

In [111]:
# Summary table for the BERT-large unpattern runs.
unpattern_table_bert_large = parse_unpatterns(unpatterns_df_bert_large)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [112]:
# Display the BERT-large summary table.
unpattern_table_bert_large

Unnamed: 0,pattern,label,num_patterns,cooccurrence,pval,base_acc,false_acc,best_false_acc
0,P176,manufacturer,3.0,876,True,0.969178,0.694825,0.946347
1,P407,language of work or name,2.0,677,True,0.805022,0.600443,0.611521
7,P413,position played on team / speciality,2.0,43,True,0.906977,0.081395,0.093023
10,P1303,instrument,3.0,293,False,0.286689,0.546075,0.83959
11,P1412,"languages spoken, written or signed",3.0,789,True,0.780735,0.668779,0.816223
12,P101,field of work,3.0,127,True,0.629921,0.488189,0.503937
20,P136,genre,2.0,18,False,0.666667,0.527778,0.555556
23,P276,location,3.0,555,True,0.756757,0.533333,0.623423
24,P279,subclass of,2.0,334,True,0.97006,0.398204,0.796407
25,P108,employer,5.0,16,False,0.0625,0.35,0.625


In [113]:
# Print the table as LaTeX, rounded to two decimals and without the index column.
print(unpattern_table_bert_large.round(2).to_latex(index=False))

\begin{tabular}{llrrlrrr}
\toprule
pattern &                                 label &  num\_patterns &  cooccurrence &   pval &  base\_acc &  false\_acc &  best\_false\_acc \\
\midrule
   P176 &                          manufacturer &           3.0 &           876 &   True &      0.97 &       0.69 &            0.95 \\
   P407 &              language of work or name &           2.0 &           677 &   True &      0.81 &       0.60 &            0.61 \\
   P413 &  position played on team / speciality &           2.0 &            43 &   True &      0.91 &       0.08 &            0.09 \\
  P1303 &                            instrument &           3.0 &           293 &  False &      0.29 &       0.55 &            0.84 \\
  P1412 &   languages spoken, written or signed &           3.0 &           789 &   True &      0.78 &       0.67 &            0.82 \\
   P101 &                         field of work &           3.0 &           127 &   True &      0.63 &       0.49 &            0.50 \\
   P13

### BERT-base

In [114]:
# Keep only the runs whose name contains 'unpattern_eval_bert-base', then summarise them.
is_unpattern_bert_base = all_df['name'].str.contains('unpattern_eval_bert-base')
unpatterns_df_bert_base = all_df[is_unpattern_bert_base]
unpattern_table_bert_base = parse_unpatterns(unpatterns_df_bert_base)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [115]:
# Display the BERT-base summary table.
unpattern_table_bert_base

Unnamed: 0,pattern,label,num_patterns,cooccurrence,pval,base_acc,false_acc,best_false_acc
2,P1303,instrument,3.0,387,False,0.139535,0.419466,0.527132
3,P1412,"languages spoken, written or signed",3.0,720,True,0.875,0.616667,0.940278
4,P413,position played on team / speciality,2.0,9,False,0.555556,0.333333,0.444444
6,P176,manufacturer,3.0,855,True,0.973099,0.704873,0.935673
8,P407,language of work or name,2.0,650,True,0.798462,0.726154,0.783077
14,P276,location,3.0,511,True,0.778865,0.493151,0.590998
15,P136,genre,2.0,7,False,0.857143,0.428571,0.428571
17,P279,subclass of,2.0,302,True,0.976821,0.380795,0.728477
18,P108,employer,5.0,15,True,0.866667,0.146667,0.2
19,P495,country of origin,3.0,401,False,0.374065,0.443059,0.745636


# Memorization

In [160]:
# Runs whose name contains 'paraphrase_eval_bert-large-cased'.
paraphrase_df = all_df[all_df.name.str.contains('paraphrase_eval_bert-large-cased')]

# Keep only the first occurrence of each column label (removes the duplicated
# 'pattern' column). The explicit .copy() makes the result an independent frame,
# so the column assignment below does not write into a slice of all_df
# (which would raise SettingWithCopyWarning).
paraphrase_df = paraphrase_df.iloc[:, ~paraphrase_df.columns.duplicated()].copy()

# Relation label for every relation id; an id missing from pid2label raises KeyError.
paraphrase_df['pattern_name'] = paraphrase_df['pattern'].apply(lambda x: pid2label[x])

In [164]:
# Relation label and p-value per run, rounded to two decimals.
paraphrase_df[['pattern_name', 'pval']].round(2)

Unnamed: 0,pattern_name,pval
2,shares border with,1.0
4,location of formation,0.0
7,original network,0.04
12,country of origin,0.0
16,member of,0.0
17,capital,0.0
20,location,0.92
21,position held,0.0
26,place of birth,0.0
27,continent,1.0


In [166]:
# Print the same two columns as LaTeX, without the index column.
print(paraphrase_df[['pattern_name', 'pval']].round(2).to_latex(index=False))

\begin{tabular}{lr}
\toprule
                                     pattern\_name &  pval \\
\midrule
                               shares border with &  1.00 \\
                            location of formation &  0.00 \\
                                 original network &  0.04 \\
                                country of origin &  0.00 \\
                                        member of &  0.00 \\
                                          capital &  0.00 \\
                                         location &  0.92 \\
                                    position held &  0.00 \\
                                   place of birth &  0.00 \\
                                        continent &  1.00 \\
                                   place of death &  0.00 \\
                            headquarters location &  0.01 \\
                                      named after & -1.00 \\
                      twinned administrative body & -1.00 \\
              languages spoken, written or sig

In [170]:
# Runs whose name contains 'paraphrase_eval_bert-base-cased'.
paraphrase_df = all_df[all_df.name.str.contains('paraphrase_eval_bert-base-cased')]

# Keep only the first occurrence of each column label (removes the duplicated
# 'pattern' column). The explicit .copy() makes the result an independent frame,
# so the column assignment below does not write into a slice of all_df
# (which would raise SettingWithCopyWarning).
paraphrase_df = paraphrase_df.iloc[:, ~paraphrase_df.columns.duplicated()].copy()

# Relation label for every relation id; an id missing from pid2label raises KeyError.
paraphrase_df['pattern_name'] = paraphrase_df['pattern'].apply(lambda x: pid2label[x])

# LaTeX table (index included) of relation id, label and p-value rounded to two decimals.
print(paraphrase_df[['pattern', 'pattern_name', 'pval']].round(2).to_latex())

\begin{tabular}{lllr}
\toprule
{} & pattern &                                      pattern\_name &  pval \\
\midrule
3  &     P47 &                                shares border with &  0.90 \\
5  &    P740 &                             location of formation &  0.00 \\
11 &    P449 &                                  original network &  1.00 \\
14 &    P495 &                                 country of origin &  1.00 \\
18 &    P463 &                                         member of &  0.00 \\
19 &     P36 &                                           capital &  0.00 \\
22 &    P276 &                                          location &  0.00 \\
23 &     P39 &                                     position held &  0.00 \\
28 &     P30 &                                         continent &  1.00 \\
29 &     P19 &                                    place of birth &  0.00 \\
34 &     P20 &                                    place of death &  0.00 \\
35 &    P159 &                             head

In [171]:
# Number of rows whose 'pval' differs from -1.
has_pval = paraphrase_df['pval'] != -1.
len(paraphrase_df[has_pval])

15

In [169]:
# Number of rows whose 'pval' differs from -1 and is at least 0.05.
has_pval = paraphrase_df['pval'] != -1.
at_least_threshold = paraphrase_df['pval'] >= 0.05
len(paraphrase_df[has_pval & at_least_threshold])

3