In [1]:
#file1=['../tmp/kgtk_roget_synonyms.tsv', '../tmp/kgtk_roget_antonyms.tsv']
#file1=['../tmp/kgtk_conceptnet.tsv']

files=['../tmp/kgtk_wordnet.tsv', '../tmp/kgtk_wikidata.tsv', '../tmp/kgtk_conceptnet.tsv']

In [2]:
from tqdm import tqdm
import json

In [3]:
dim_file='../consolidation/dimensions.json'

In [4]:
with open(dim_file, 'r') as f:
    dim_mapping=json.load(f)

In [5]:
dimensions=set(dim_mapping.values())

In [6]:
len(dimensions)

12

In [7]:
dimensions

{'creation',
 'desire',
 'distinctness',
 'lexical',
 'part-whole',
 'quality',
 'rel-other',
 'similarity',
 'spatial',
 'taxonomic',
 'temporal',
 'utility'}

In [8]:
def get_all_labels(l):
    if '|' not in l:
        return [l]
    else:
        return [l2 for l2 in l.split('|') if l2!='']

In [13]:
def get_triples(a_file, abstract_rel=False, dim_mapping={}, dim_filter=None):
    triples=set()
    with open(a_file, 'r') as f1:
        header=next(f1)
        for line in tqdm(f1, total=3500000):
            data=line.split('\t')
            rel_label=data[1]
            if abstract_rel:
                if rel_label not in dim_mapping.keys(): continue
                if dim_filter:
                    rel_label=dim_mapping[rel_label]
                    if rel_label!=dim_filter: continue
            for n1_label in get_all_labels(data[3]):
                for n2_label in get_all_labels(data[4]):
                    if n1_label=='' or n2_label=='': print(data)
                    triple=(n1_label, rel_label, n2_label)
                    triples.add(triple)
    return triples

## Compute overlaps between each source pair

In [14]:
def file_to_source(f):
    return f.split('/')[-1].replace('kgtk_', '').replace('.tsv', '')

In [15]:
for file1 in files:
    for file2 in files:
        if file1<file2:

            f1_triples=get_triples(file1)
            f2_triples=get_triples(file2)
            both_triples=set(f1_triples) & set(f2_triples)

            only_src2=len(set(f2_triples)-set(f1_triples))
            pct_only_src2=only_src2/(only_src2+len(both_triples))

            only_src1=len(set(f1_triples)-set(f2_triples))
            pct_only_src1=only_src1/(only_src1+len(both_triples))

            f1=file_to_source(file1)
            f2=file_to_source(file2)

            print(f1, '-', f2, 'BOTH:', len(both_triples), 'ONLY S1:', only_src1, 'ONLY S2:', only_src2)

  3%|▎         | 101771/3500000 [00:00<00:05, 570487.23it/s]
  3%|▎         | 111276/3500000 [00:00<00:12, 274180.32it/s]
  3%|▎         | 105560/3500000 [00:00<00:06, 509994.77it/s]

wikidata - wordnet BOTH: 1613 ONLY S1: 98246 ONLY S2: 419103


 98%|█████████▊| 3423004/3500000 [00:05<00:00, 620244.92it/s]
  3%|▎         | 111276/3500000 [00:00<00:12, 276722.63it/s]
  3%|▎         | 104597/3500000 [00:00<00:06, 506765.46it/s]

conceptnet - wordnet BOTH: 66102 ONLY S1: 3257219 ONLY S2: 354614


 98%|█████████▊| 3423004/3500000 [00:05<00:00, 633963.60it/s]
  3%|▎         | 101771/3500000 [00:00<00:05, 591695.38it/s]


conceptnet - wikidata BOTH: 2386 ONLY S1: 3320935 ONLY S2: 97473


## Compute mappings (with dimensions instead of relations)

In [16]:
for file1 in files:
    for file2 in files:
        if file1<file2:

            f1_triples=get_triples(file1, True, dim_mapping)
            f2_triples=get_triples(file2, True, dim_mapping)
            both_triples=set(f1_triples) & set(f2_triples)

            only_src2=len(set(f2_triples)-set(f1_triples))
            pct_only_src2=only_src2/(only_src2+len(both_triples))

            only_src1=len(set(f1_triples)-set(f2_triples))
            pct_only_src1=only_src1/(only_src1+len(both_triples))

            f1=file_to_source(file1)
            f2=file_to_source(file2)

            print(f1, '-', f2, 'BOTH:', len(both_triples), 'ONLY S1:', only_src1, 'ONLY S2:', only_src2)

  3%|▎         | 101771/3500000 [00:00<00:06, 548388.54it/s]
  3%|▎         | 111276/3500000 [00:00<00:12, 265414.85it/s]
  3%|▎         | 100923/3500000 [00:00<00:06, 491480.87it/s]

wikidata - wordnet BOTH: 1613 ONLY S1: 98246 ONLY S2: 419103


 98%|█████████▊| 3423004/3500000 [00:05<00:00, 593091.84it/s]
  3%|▎         | 111276/3500000 [00:00<00:12, 270429.36it/s]
  3%|▎         | 98437/3500000 [00:00<00:07, 475514.25it/s]

conceptnet - wordnet BOTH: 66102 ONLY S1: 3244955 ONLY S2: 354614


 98%|█████████▊| 3423004/3500000 [00:06<00:00, 555812.20it/s]
  3%|▎         | 101771/3500000 [00:00<00:06, 546796.74it/s]


conceptnet - wikidata BOTH: 2386 ONLY S1: 3308671 ONLY S2: 97473


### Compute mappings per dimension

In [18]:
for file1 in files:
    for file2 in files:
        if file1<file2:

            for d in dimensions:

                f1_triples=get_triples(file1, True, dim_mapping, d)
                f2_triples=get_triples(file2, True, dim_mapping, d)
                both_triples=set(f1_triples) & set(f2_triples)
            
                try:

                    only_src2=len(set(f2_triples)-set(f1_triples))
                    pct_only_src2=only_src2/(only_src2+len(both_triples))

                    only_src1=len(set(f1_triples)-set(f2_triples))
                    pct_only_src1=only_src1/(only_src1+len(both_triples))

                    f1=file_to_source(file1)
                    f2=file_to_source(file2)

                    print(f1, '-', f2, 'DIMENSION', d, 'BOTH:', len(both_triples), 'ONLY S1:', only_src1, 'ONLY S2:', only_src2)
                except:
                    continue

  3%|▎         | 101771/3500000 [00:00<00:03, 928354.90it/s]
  3%|▎         | 111276/3500000 [00:00<00:03, 1033884.34it/s]
  3%|▎         | 101771/3500000 [00:00<00:03, 950198.26it/s]
  3%|▎         | 111276/3500000 [00:00<00:03, 1091635.24it/s]
  3%|▎         | 101771/3500000 [00:00<00:03, 1010986.01it/s]
  3%|▎         | 111276/3500000 [00:00<00:05, 656387.12it/s]
  3%|▎         | 101771/3500000 [00:00<00:03, 964680.35it/s]
  0%|          | 0/3500000 [00:00<?, ?it/s]

wikidata - wordnet DIMENSION part-whole BOTH: 82 ONLY S1: 8121 ONLY S2: 100839


  3%|▎         | 111276/3500000 [00:00<00:03, 1046741.69it/s]
  3%|▎         | 101771/3500000 [00:00<00:03, 998870.95it/s] 
  3%|▎         | 111276/3500000 [00:00<00:03, 1009679.53it/s]
  3%|▎         | 101771/3500000 [00:00<00:03, 1095888.70it/s]
  3%|▎         | 111276/3500000 [00:00<00:03, 1120613.34it/s]
  3%|▎         | 101771/3500000 [00:00<00:03, 1079888.97it/s]
  3%|▎         | 111276/3500000 [00:00<00:03, 1101744.41it/s]
  3%|▎         | 101771/3500000 [00:00<00:03, 1074531.00it/s]
  3%|▎         | 111276/3500000 [00:00<00:03, 1055519.46it/s]
  3%|▎         | 101771/3500000 [00:00<00:03, 1053737.47it/s]
  3%|▎         | 111276/3500000 [00:00<00:03, 1099240.38it/s]
  3%|▎         | 101771/3500000 [00:00<00:03, 1113443.25it/s]
  3%|▎         | 111276/3500000 [00:00<00:03, 1087975.97it/s]
  3%|▎         | 101771/3500000 [00:00<00:04, 690478.29it/s]
  3%|▎         | 111276/3500000 [00:00<00:10, 334287.63it/s]
  3%|▎         | 101771/3500000 [00:00<00:03, 863366.00it/s]
  0%|      

wikidata - wordnet DIMENSION taxonomic BOTH: 1533 ONLY S1: 69884 ONLY S2: 318236


  3%|▎         | 111276/3500000 [00:00<00:03, 973041.87it/s]
 98%|█████████▊| 3423004/3500000 [00:02<00:00, 1168302.17it/s]
  3%|▎         | 111276/3500000 [00:00<00:03, 1051041.24it/s]
 98%|█████████▊| 3423004/3500000 [00:02<00:00, 1150422.56it/s]
  3%|▎         | 111276/3500000 [00:00<00:03, 1021359.12it/s]
 98%|█████████▊| 3423004/3500000 [00:03<00:00, 1102681.76it/s]
  3%|▎         | 111276/3500000 [00:00<00:05, 643326.11it/s]
  2%|▏         | 85499/3500000 [00:00<00:03, 854980.62it/s]

conceptnet - wordnet DIMENSION part-whole BOTH: 4710 ONLY S1: 13877 ONLY S2: 96211


 98%|█████████▊| 3423004/3500000 [00:03<00:00, 945960.85it/s] 
  3%|▎         | 111276/3500000 [00:00<00:03, 986788.64it/s]
 98%|█████████▊| 3423004/3500000 [00:03<00:00, 1065617.06it/s]
  3%|▎         | 111276/3500000 [00:00<00:03, 928692.13it/s]
 98%|█████████▊| 3423004/3500000 [00:03<00:00, 1129921.33it/s]
  3%|▎         | 111276/3500000 [00:00<00:03, 950638.28it/s]
 98%|█████████▊| 3423004/3500000 [00:03<00:00, 1090927.47it/s]
  3%|▎         | 111276/3500000 [00:00<00:03, 1025067.14it/s]
 98%|█████████▊| 3423004/3500000 [00:02<00:00, 1145490.11it/s]
  3%|▎         | 111276/3500000 [00:00<00:03, 1041774.45it/s]
 98%|█████████▊| 3423004/3500000 [00:04<00:00, 736205.75it/s]
  3%|▎         | 111276/3500000 [00:00<00:03, 888259.65it/s]
 98%|█████████▊| 3423004/3500000 [00:02<00:00, 1157884.50it/s]
  3%|▎         | 111276/3500000 [00:00<00:03, 1022240.51it/s]
 98%|█████████▊| 3423004/3500000 [00:03<00:00, 1095591.23it/s]
  3%|▎         | 111276/3500000 [00:00<00:10, 324581.65it/s]
  5%|▌

conceptnet - wordnet DIMENSION taxonomic BOTH: 73123 ONLY S1: 161584 ONLY S2: 246646


 98%|█████████▊| 3423004/3500000 [00:03<00:00, 1107281.25it/s]
  3%|▎         | 111276/3500000 [00:00<00:03, 945067.51it/s]
 98%|█████████▊| 3423004/3500000 [00:02<00:00, 1153630.11it/s]
  3%|▎         | 101771/3500000 [00:00<00:03, 1032840.48it/s]
 98%|█████████▊| 3423004/3500000 [00:02<00:00, 1165481.53it/s]
  3%|▎         | 101771/3500000 [00:00<00:03, 1018075.06it/s]
  3%|▎         | 100394/3500000 [00:00<00:03, 1003928.99it/s]

conceptnet - wikidata DIMENSION distinctness BOTH: 266 ONLY S1: 19867 ONLY S2: 6455


 98%|█████████▊| 3423004/3500000 [00:02<00:00, 1178823.44it/s]
  3%|▎         | 101771/3500000 [00:00<00:03, 962492.11it/s]
  4%|▍         | 157101/3500000 [00:00<00:04, 819306.49it/s]

conceptnet - wikidata DIMENSION part-whole BOTH: 68 ONLY S1: 18519 ONLY S2: 8135


 98%|█████████▊| 3423004/3500000 [00:03<00:00, 928317.11it/s] 
  3%|▎         | 101771/3500000 [00:00<00:03, 1024059.00it/s]
  5%|▌         | 178185/3500000 [00:00<00:03, 850688.50it/s]

conceptnet - wikidata DIMENSION lexical BOTH: 20 ONLY S1: 654991 ONLY S2: 517


 98%|█████████▊| 3423004/3500000 [00:03<00:00, 1040288.57it/s]
  3%|▎         | 101771/3500000 [00:00<00:03, 893527.24it/s]
  5%|▌         | 189323/3500000 [00:00<00:03, 888305.36it/s]

conceptnet - wikidata DIMENSION similarity BOTH: 102 ONLY S1: 243724 ONLY S2: 1223


 98%|█████████▊| 3423004/3500000 [00:03<00:00, 1134602.95it/s]
  3%|▎         | 101771/3500000 [00:00<00:03, 898210.79it/s]
  2%|▏         | 73456/3500000 [00:00<00:04, 734551.94it/s]

conceptnet - wikidata DIMENSION quality BOTH: 0 ONLY S1: 8761 ONLY S2: 1046


 98%|█████████▊| 3423004/3500000 [00:03<00:00, 1116755.07it/s]
  3%|▎         | 101771/3500000 [00:00<00:03, 1038921.38it/s]
 98%|█████████▊| 3423004/3500000 [00:02<00:00, 1147406.91it/s]
  3%|▎         | 101771/3500000 [00:00<00:03, 937034.37it/s]
  3%|▎         | 98310/3500000 [00:00<00:03, 983058.75it/s]

conceptnet - wikidata DIMENSION utility BOTH: 14 ONLY S1: 68669 ONLY S2: 2213


 98%|█████████▊| 3423004/3500000 [00:04<00:00, 730345.09it/s]
  3%|▎         | 101771/3500000 [00:00<00:03, 961481.83it/s]
  6%|▌         | 192641/3500000 [00:00<00:03, 901416.72it/s]

conceptnet - wikidata DIMENSION rel-other BOTH: 264 ONLY S1: 1915965 ONLY S2: 5267


 98%|█████████▊| 3423004/3500000 [00:02<00:00, 1161913.43it/s]
  3%|▎         | 101771/3500000 [00:00<00:03, 963778.62it/s]
  3%|▎         | 108033/3500000 [00:00<00:03, 1080318.15it/s]

conceptnet - wikidata DIMENSION creation BOTH: 0 ONLY S1: 262 ONLY S2: 187


 98%|█████████▊| 3423004/3500000 [00:03<00:00, 1090807.37it/s]
  3%|▎         | 101771/3500000 [00:00<00:05, 650572.01it/s]
  5%|▌         | 182395/3500000 [00:00<00:03, 845470.30it/s]

conceptnet - wikidata DIMENSION taxonomic BOTH: 1888 ONLY S1: 232819 ONLY S2: 69529


 98%|█████████▊| 3423004/3500000 [00:03<00:00, 1118726.06it/s]
  3%|▎         | 101771/3500000 [00:00<00:03, 885804.40it/s]

conceptnet - wikidata DIMENSION temporal BOTH: 1 ONLY S1: 69016 ONLY S2: 2600





## Thoughts/discussion

* also with WordNet based nodes?


'dog' '/r/IsA' 'mammal'
dog.n.1 '/r/IsA' mammal.n.01

(There is Kypher query for this)

Discussion in the paper: what does a 'failure of overlap' mean? Is it due to the label missing or due to the edge missing?