In [3]:
import pandas as pd
import scipy.stats as stats
import json
from tqdm.notebook import tqdm
import dask.dataframe as dd

# Other Utility Files

## Transitive Class Counts

In [2]:
derivedIsA = pd.read_csv('../data/derived.dwd_isa_class_count.compact.tsv', sep='\t')

In [4]:
derivedIsADict = {}
min1, max1 = float('inf'), 0
for _, row in tqdm(derivedIsA.iterrows(), total=len(derivedIsA)):
    for val in row['node2'].split('|'):
        qnode, count = val.split(':')
        count = int(count)
        if qnode in derivedIsADict:
            assert derivedIsADict[qnode] == count
        else:
            derivedIsADict[qnode] = count
        min1 = min(count, min1)
        max1 = max(count, max1)            

  0%|          | 0/41613030 [00:00<?, ?it/s]

In [5]:
print(f"Min branching factor: {min1}, Max branching factor: {max1}. Number of unique nodes: {len(derivedIsADict)}")

Min branching factor: 1, Max branching factor: 41575722. Number of unique nodes: 250112


In [None]:
# Alt
# !zcat ../data/derived.dwd_isa_class_count.compact.tsv.gz | cut -f 3 | tail -n +2 | sed -e 's/|/\n/g' -e 's/:/\t/g' | sort | uniq > ../data/class-counts.tsv

## Direct Class Counts

In [None]:
hierDF = pd.read_csv('../data/Master_P279_dataset/P279ChildPar.csv')
hierDF = hierDF[['id','node1','label','node2']]
hierDFCounts = hierDF.groupby('node2').count()
print(f"Found P279 counts for {len(hierDFCounts)} parents from a dataset of size {len(hierDF)}")
hierDFCounts = hierDFCounts['node1'].rename({'node1': 'child_count'})
hierDFCounts.to_csv('../data/Master_P279_dataset/hierDF_direct_P279_counts.csv')

# P279 ChildPar Dataset

Use descriptions wherever available, if not use labels, if not skip rows

[Wikidata OS File (Wikidata 2021-02-15 DWD version)](https://drive.google.com/drive/folders/168j3OfdVGXMTKcs6VyH8rq_p0n6w0GGj?usp=sharing)

In [None]:
# !kgtk filter -i ../data/wikidataos.for.text-embedding.tsv.gz -p ";P279;" -o ../data/P279_dataset/wikidata-P279.tsv

In [2]:
p279DF = pd.read_csv('../data/P279_dataset/wikidata-P279.tsv',sep='\t')
p279DFNodesSet = set(p279DF.node1.to_list() + p279DF.node2.to_list())
labelsDF = pd.read_csv('../data/labels.en.tsv', sep='\t')
descriptionsDF = pd.read_csv('../../wd-correctness/gdrive-kgtk-dump-2020-12-07/descriptions.en.tsv.gz', compression='gzip', sep='\t')
labelsDF = labelsDF[labelsDF.node1.apply(lambda p: p in p279DFNodesSet)]
descriptionsDF = descriptionsDF[descriptionsDF.node1.apply(lambda p: p in p279DFNodesSet)]

In [3]:
print(f"There are {len(p279DF)} rows in P279 dataset, {len(p279DFNodesSet)} unique number of nodes in this dataset, {len(labelsDF)} labels and {len(descriptionsDF)} descriptions")

There are 721983 rows in P279 dataset, 606996 unique number of nodes in this dataset, 511841 labels and 299844 descriptions


In [4]:
p279DFNew = p279DF
temp1 = p279DFNew.set_index('node1').join(labelsDF.set_index('node1'), rsuffix='_label')
temp1 = temp1[['id','label','node2','node2_label']]
temp2 = temp1.reset_index().set_index('node2').join(labelsDF.set_index('node1'), rsuffix='_label2')
temp3 = temp2[['id','node1','label','node2', 'node2_label']].rename(columns={'node2_label': 'node1_label', 'node2':'node2_label'}).reset_index().rename(columns={'index':'node2'})
temp3 = temp3[['id','node1','node1_label','label','node2','node2_label']]
temp3['node1_label'] = temp3['node1_label'].apply(lambda p: p[1:-4] if type(p) == str else None)
temp3['node2_label'] = temp3['node2_label'].apply(lambda p: p[1:-4] if type(p) == str else None)
hierDF = temp3.copy()

p279DFNew = hierDF
temp1 = p279DFNew.set_index('node1').join(descriptionsDF.set_index('node1'), rsuffix='_desc')
temp1 = temp1[['id','label','node2','node1_label','node2_label', 'node2_desc']]
temp2 = temp1.reset_index().set_index('node2').join(descriptionsDF.set_index('node1'), rsuffix='_desc2')
temp3 = temp2[['id','node1','label','node2', 'node1_label', 'node2_label', 'node2_desc']].rename(columns={'node2_desc': 'node1_desc', 'node2':'node2_desc'}).reset_index().rename(columns={'index':'node2'})
# temp3 = temp3[['id','node1','node1_label','label','node2','node2_label']]
# temp3 = temp3[~temp3['node1_label'].isna()]
# temp3 = temp3[~temp3['node2_label'].isna()]
temp3['node1_desc'] = temp3['node1_desc'].apply(lambda p: p[1:-4] if type(p) == str else None)
temp3['node2_desc'] = temp3['node2_desc'].apply(lambda p: p[1:-4] if type(p) == str else None)
hierDF = temp3.copy()

In [15]:
def combineLabDesc(row, nodeNum):
    if type(row['node'+str(nodeNum)+'_desc']) == str and type(row['node'+str(nodeNum)+'_label']) == str :
        return row['node'+str(nodeNum)+'_label'] + ' ' + row['node'+str(nodeNum)+'_desc']
    else:
        return None
hierDF['node1_labDesc'] = hierDF.apply(combineLabDesc, axis=1, args=(1,))
hierDF['node2_labDesc'] = hierDF.apply(combineLabDesc, axis=1, args=(2,))
hierDF = hierDF[~hierDF['node1_labDesc'].isna()]
hierDF = hierDF[~hierDF['node2_labDesc'].isna()]

In [16]:
print(f"There are {len(hierDF)} rows in P279 dataset s.t. both nodes have either a label or a description. \
    There are {len(hierDF[(~hierDF.node1_label.isna()) & (~hierDF.node2_label.isna())])} rows where both nodes have labels. \
    There are {len(hierDF[(~hierDF.node1_desc.isna()) & (~hierDF.node2_desc.isna())])} rows where both nodes have descriptions.")

There are 373463 rows in P279 dataset s.t. both nodes have either a label or a description.     There are 373463 rows where both nodes have labels.     There are 373463 rows where both nodes have descriptions.


In [None]:
hierDF.head()

In [8]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from time import time
import pandas as pd

def getSentEmbeddings(valSeries, modelName):
    model = SentenceTransformer(modelName)
    start = time()
    encodings = model.encode(valSeries.to_list())
    print(time()-start,'s')
    return encodings

In [None]:
modelName = 'sentence-transformers/all-distilroberta-v1'

In [9]:
hierDF = hierDF.reset_index()

In [10]:
hierDF.node1_labDesc.isna().sum(), hierDF.node2_labDesc.isna().sum()

(0, 0)

In [11]:
hierDF['node1_emb'] = pd.Series(list(getSentEmbeddings(hierDF.node1_labDesc, modelName)))
hierDF['node2_emb'] = pd.Series(list(getSentEmbeddings(hierDF.node2_labDesc, modelName)))

712.1184396743774 s
560.1313090324402 s


In [12]:
hierDF.head()

Unnamed: 0.1,level_0,Unnamed: 0,index,node2,id,node1,label,node2_desc,node1_label,node2_label,node1_desc,node1_labDesc,node2_labDesc,node1_emb,node2_emb,bert2SentSim
0,0,0,1,Q100026,Q17372279-P279-Q100026-beba8cd1-0,Q17372279,P279,1974 fighter aircraft family by General Dynamics,YF-16,F-16 Fighting Falcon,initial prototype of the F-16 fighter aircraft,YF-16 initial prototype of the F-16 fighter ai...,F-16 Fighting Falcon 1974 fighter aircraft fam...,"[-0.43750098, 0.04365531, 0.325353, 0.16819148...","[-0.12933655, 0.60260355, 0.24470265, -0.32232...",0.666525
1,1,1,2,Q100026,Q17372377-P279-Q100026-fd42bd71-0,Q17372377,P279,1974 fighter aircraft family by General Dynamics,F-16A/B Fighting Falcon,F-16 Fighting Falcon,initial series of the F-16 fighter aircraft,F-16A/B Fighting Falcon initial series of the ...,F-16 Fighting Falcon 1974 fighter aircraft fam...,"[-0.5292441, 0.21775092, 0.29205856, -0.043881...","[-0.12933655, 0.60260355, 0.24470265, -0.32232...",0.700515
2,2,2,3,Q100026,Q17372444-P279-Q100026-ca0fc4bd-0,Q17372444,P279,1974 fighter aircraft family by General Dynamics,F-16C/D Fighting Falcon,F-16 Fighting Falcon,multirole series of the F-16 fighter aircraft,F-16C/D Fighting Falcon multirole series of th...,F-16 Fighting Falcon 1974 fighter aircraft fam...,"[-0.43385798, 0.17806369, 0.35786152, 0.033635...","[-0.12933655, 0.60260355, 0.24470265, -0.32232...",0.683245
3,3,3,4,Q100026,Q17372455-P279-Q100026-c2b1bf36-0,Q17372455,P279,1974 fighter aircraft family by General Dynamics,F-16E/F Desert Falcon,F-16 Fighting Falcon,export strike fighter series of the F-16 fight...,F-16E/F Desert Falcon export strike fighter se...,F-16 Fighting Falcon 1974 fighter aircraft fam...,"[-0.35206, 0.22591999, 0.20191742, -0.02384881...","[-0.12933655, 0.60260355, 0.24470265, -0.32232...",0.689516
4,4,4,5,Q100026,Q2029940-P279-Q100026-ceba4380-0,Q2029940,P279,1974 fighter aircraft family by General Dynamics,F-16 VISTA,F-16 Fighting Falcon,experimental aircraft,F-16 VISTA experimental aircraft,F-16 Fighting Falcon 1974 fighter aircraft fam...,"[-0.27010456, 0.22041212, 0.8711852, 0.2625613...","[-0.12933655, 0.60260355, 0.24470265, -0.32232...",0.603161


In [13]:
hierDF.node1_emb.isna().sum(), hierDF.node2_emb.isna().sum()

(0, 0)

In [14]:
hierDF['bert2SentSim'] = hierDF.apply(lambda p: cosine_similarity(p.node1_emb.reshape(1,-1), p.node2_emb.reshape(1,-1))[0][0], axis=1)

In [18]:
hierDF.to_csv('../data/Master_P279_dataset/P279ChildPar_filtered.csv')

## Filter based on direct P279 counts

In [13]:
hierDF = pd.read_csv('../data/Master_P279_dataset/P279ChildPar_filtered.csv')

In [19]:
len(hierDF)

373463

In [20]:
dirP279Counts = pd.read_csv('../data/Master_P279_dataset/hierDF_direct_P279_counts.csv')

In [21]:
dirP279Counts.head()

Unnamed: 0,node2,node1
0,Q100026,7
1,Q1000371,1
2,Q100047,10
3,Q100052008,4
4,Q100052938,1


In [22]:
dirP279Counts = dirP279Counts.set_index('node2')['node1']

In [23]:
dirP279Counts1 = dirP279Counts[dirP279Counts >= 7]

In [24]:
dirP279Counts.head()

node2
Q100026        7
Q1000371       1
Q100047       10
Q100052008     4
Q100052938     1
Name: node1, dtype: int64

In [25]:
dirP279Counts.describe()

count    109631.000000
mean          5.686056
std         124.365173
min           1.000000
25%           1.000000
50%           2.000000
75%           4.000000
max       39217.000000
Name: node1, dtype: float64

In [26]:
dirP279CountsNeededSet = set(dirP279Counts1.index.to_list())

In [27]:
hierDF_dirP279_filtered = hierDF[hierDF.node2.apply(lambda p: p in dirP279CountsNeededSet)]

In [28]:
(hierDF_dirP279_filtered.node1_desc == hierDF_dirP279_filtered.node2_desc).sum()

4864

In [29]:
hierDF_dirP279_filtered1 = hierDF_dirP279_filtered[hierDF_dirP279_filtered.node1_desc != hierDF_dirP279_filtered.node2_desc]

In [30]:
len(hierDF_dirP279_filtered), len(hierDF_dirP279_filtered1)

(263907, 259043)

In [46]:
hierDF_dirP279_filtered1.to_csv('../data/Master_P279_dataset/P279ChildPar_dirP279_filtered.csv')

In [56]:
hierDF_dirP279_filtered1[['id', 'node1', 'label', 'node2']].to_csv('../data/Master_P279_dataset/P279ChildPar_dirP279_filtered_baremetal.csv')

In [8]:
pd.read_csv('../data/Master_P279_dataset/P279ChildPar_dirP279_filtered_baremetal.csv')[['id', 'node1', 'label', 'node2']].to_csv('../data/Master_P279_dataset/P279ChildPar_dirP279_filtered_baremetal.tsv', index=False, sep='\t')

## Filter based on transitive P279 counts

In [None]:
hierDF = pd.read_csv('../data/Master_P279_dataset/P279ChildPar_filtered.csv')

In [31]:
transitive_P279Counts = pd.read_csv('../data/class-counts.tsv', sep='\t', header=None)

In [32]:
transitive_P279Counts = transitive_P279Counts.set_index(0)[1]

In [33]:
transitive_P279Counts = transitive_P279Counts[transitive_P279Counts >= 7]

In [34]:
transitive_P279Counts.describe()

count    6.853900e+04
mean     2.466702e+04
std      5.315130e+05
min      7.000000e+00
25%      1.200000e+01
50%      2.600000e+01
75%      9.600000e+01
max      4.157572e+07
Name: 1, dtype: float64

In [35]:
transitive_P279CountsNeededSet = set(transitive_P279Counts.index.to_list())

In [36]:
hierDF_transP279_filtered = hierDF[hierDF.node2.apply(lambda p: p in transitive_P279CountsNeededSet)]

In [37]:
(hierDF_transP279_filtered.node1_desc == hierDF_transP279_filtered.node2_desc).sum()

6769

In [38]:
hierDF_transP279_filtered1 = hierDF_transP279_filtered[hierDF_transP279_filtered.node1_desc != hierDF_transP279_filtered.node2_desc]

In [39]:
len(hierDF_transP279_filtered), len(hierDF_transP279_filtered1)

(310053, 303284)

In [41]:
hierDF_transP279_filtered1.to_csv('../data/Master_P279_dataset/P279ChildPar_transP279_filtered.csv')

In [42]:
hierDF_transP279_filtered1[['id', 'node1', 'label', 'node2']].to_csv('../data/Master_P279_dataset/P279ChildPar_transP279_filtered_baremetal.csv')

In [43]:
pd.read_csv('../data/Master_P279_dataset/P279ChildPar_transP279_filtered_baremetal.csv')[['id', 'node1', 'label', 'node2']].to_csv('../data/Master_P279_dataset/P279ChildPar_transP279_filtered_baremetal.tsv', index=False, sep='\t')

In [59]:
hierDF_transP279_filtered1 = pd.read_csv('../data/Master_P279_dataset/P279ChildPar_transP279_filtered.csv')
hierDF_transP279_filtered1.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,index,node2,id,node1,label,node2_desc,node1_label,node2_label,node1_desc,node1_labDesc,node2_labDesc,node1_emb,node2_emb,bert2SentSim
0,0,0,1,Q100026,Q17372279-P279-Q100026-beba8cd1-0,Q17372279,P279,1974 fighter aircraft family by General Dynamics,YF-16,F-16 Fighting Falcon,initial prototype of the F-16 fighter aircraft,YF-16 initial prototype of the F-16 fighter ai...,F-16 Fighting Falcon 1974 fighter aircraft fam...,[-5.89544833e-01 1.19935878e-01 3.76383096e-...,[ 1.79253638e-01 6.10661507e-01 8.33987653e-...,0.666525
1,1,1,2,Q100026,Q17372377-P279-Q100026-fd42bd71-0,Q17372377,P279,1974 fighter aircraft family by General Dynamics,F-16A/B Fighting Falcon,F-16 Fighting Falcon,initial series of the F-16 fighter aircraft,F-16A/B Fighting Falcon initial series of the ...,F-16 Fighting Falcon 1974 fighter aircraft fam...,[-6.08038664e-01 5.00908047e-02 5.60190260e-...,[ 1.79253891e-01 6.10661447e-01 8.33987534e-...,0.700515
2,2,2,3,Q100026,Q17372444-P279-Q100026-ca0fc4bd-0,Q17372444,P279,1974 fighter aircraft family by General Dynamics,F-16C/D Fighting Falcon,F-16 Fighting Falcon,multirole series of the F-16 fighter aircraft,F-16C/D Fighting Falcon multirole series of th...,F-16 Fighting Falcon 1974 fighter aircraft fam...,[-5.40582001e-01 1.56212196e-01 5.69522917e-...,[ 1.79253638e-01 6.10661507e-01 8.33987653e-...,0.683245
3,3,3,4,Q100026,Q17372455-P279-Q100026-c2b1bf36-0,Q17372455,P279,1974 fighter aircraft family by General Dynamics,F-16E/F Desert Falcon,F-16 Fighting Falcon,export strike fighter series of the F-16 fight...,F-16E/F Desert Falcon export strike fighter se...,F-16 Fighting Falcon 1974 fighter aircraft fam...,[-4.99146163e-01 6.76080063e-02 3.48868877e-...,[ 1.79253638e-01 6.10661507e-01 8.33987653e-...,0.689516
4,4,4,5,Q100026,Q2029940-P279-Q100026-ceba4380-0,Q2029940,P279,1974 fighter aircraft family by General Dynamics,F-16 VISTA,F-16 Fighting Falcon,experimental aircraft,F-16 VISTA experimental aircraft,F-16 Fighting Falcon 1974 fighter aircraft fam...,[-1.47221148e-01 9.54690054e-02 1.05311513e+...,[ 1.79253638e-01 6.10661507e-01 8.33987653e-...,0.603161


In [60]:
hierDF_transP279_filtered1[['id', 'node1', 'label', 'node2', 'bert2SentSim']].to_csv('../data/Master_P279_dataset/P279ChildPar_transP279_filtered_min_cols.csv')

In [61]:
hierDF_transP279_filtered1['bert2SentSim'].describe()

count    303284.000000
mean          0.623925
std           0.176450
min          -0.111115
25%           0.496446
50%           0.630122
75%           0.751900
max           1.000000
Name: bert2SentSim, dtype: float64

## Correct bertsim values

In [3]:
hierDF = pd.read_csv('../data/Master_P279_dataset/P279ChildPar_filtered.bert.base.csv')

In [4]:
hierDF.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,index,node2,id,node1,label,node2_desc,node1_label,node2_label,node1_desc,node1_labDesc,node2_labDesc,node1_emb,node2_emb,bert2SentSim
0,0,0,1,Q100026,Q17372279-P279-Q100026-beba8cd1-0,Q17372279,P279,1974 fighter aircraft family by General Dynamics,YF-16,F-16 Fighting Falcon,initial prototype of the F-16 fighter aircraft,YF-16 initial prototype of the F-16 fighter ai...,F-16 Fighting Falcon 1974 fighter aircraft fam...,[-5.89544833e-01 1.19935878e-01 3.76383096e-...,[ 1.79253638e-01 6.10661507e-01 8.33987653e-...,0.666525
1,1,1,2,Q100026,Q17372377-P279-Q100026-fd42bd71-0,Q17372377,P279,1974 fighter aircraft family by General Dynamics,F-16A/B Fighting Falcon,F-16 Fighting Falcon,initial series of the F-16 fighter aircraft,F-16A/B Fighting Falcon initial series of the ...,F-16 Fighting Falcon 1974 fighter aircraft fam...,[-6.08038664e-01 5.00908047e-02 5.60190260e-...,[ 1.79253891e-01 6.10661447e-01 8.33987534e-...,0.700515
2,2,2,3,Q100026,Q17372444-P279-Q100026-ca0fc4bd-0,Q17372444,P279,1974 fighter aircraft family by General Dynamics,F-16C/D Fighting Falcon,F-16 Fighting Falcon,multirole series of the F-16 fighter aircraft,F-16C/D Fighting Falcon multirole series of th...,F-16 Fighting Falcon 1974 fighter aircraft fam...,[-5.40582001e-01 1.56212196e-01 5.69522917e-...,[ 1.79253638e-01 6.10661507e-01 8.33987653e-...,0.683245
3,3,3,4,Q100026,Q17372455-P279-Q100026-c2b1bf36-0,Q17372455,P279,1974 fighter aircraft family by General Dynamics,F-16E/F Desert Falcon,F-16 Fighting Falcon,export strike fighter series of the F-16 fight...,F-16E/F Desert Falcon export strike fighter se...,F-16 Fighting Falcon 1974 fighter aircraft fam...,[-4.99146163e-01 6.76080063e-02 3.48868877e-...,[ 1.79253638e-01 6.10661507e-01 8.33987653e-...,0.689516
4,4,4,5,Q100026,Q2029940-P279-Q100026-ceba4380-0,Q2029940,P279,1974 fighter aircraft family by General Dynamics,F-16 VISTA,F-16 Fighting Falcon,experimental aircraft,F-16 VISTA experimental aircraft,F-16 Fighting Falcon 1974 fighter aircraft fam...,[-1.47221148e-01 9.54690054e-02 1.05311513e+...,[ 1.79253638e-01 6.10661507e-01 8.33987653e-...,0.603161


In [5]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from time import time
import pandas as pd

def getSentEmbeddings(valSeries, modelName):
    model = SentenceTransformer(modelName)
    start = time()
    encodings = model.encode(valSeries.to_list())
    print(time()-start,'s')
    return encodings

In [6]:
modelName = 'sentence-transformers/all-distilroberta-v1'

In [7]:
hierDF = hierDF.reset_index()

In [8]:
hierDF.node1_labDesc.isna().sum(), hierDF.node2_labDesc.isna().sum()

(0, 0)

In [9]:
hierDF['node1_emb'] = pd.Series(list(getSentEmbeddings(hierDF.node1_labDesc, modelName)))
hierDF['node2_emb'] = pd.Series(list(getSentEmbeddings(hierDF.node2_labDesc, modelName)))

Exception when trying to download https://sbert.net/models/sentence-transformers/all-distilroberta-v1.zip. Response 404
SentenceTransformer-Model https://sbert.net/models/sentence-transformers/all-distilroberta-v1.zip not found. Try to create it from scratch
Try to create Transformer Model sentence-transformers/all-distilroberta-v1 with mean pooling


198.2163701057434 s


Exception when trying to download https://sbert.net/models/sentence-transformers/all-distilroberta-v1.zip. Response 404
SentenceTransformer-Model https://sbert.net/models/sentence-transformers/all-distilroberta-v1.zip not found. Try to create it from scratch
Try to create Transformer Model sentence-transformers/all-distilroberta-v1 with mean pooling


157.96648454666138 s


In [10]:
hierDF.head()

Unnamed: 0.2,level_0,Unnamed: 0,Unnamed: 0.1,index,node2,id,node1,label,node2_desc,node1_label,node2_label,node1_desc,node1_labDesc,node2_labDesc,node1_emb,node2_emb,bert2SentSim
0,0,0,0,1,Q100026,Q17372279-P279-Q100026-beba8cd1-0,Q17372279,P279,1974 fighter aircraft family by General Dynamics,YF-16,F-16 Fighting Falcon,initial prototype of the F-16 fighter aircraft,YF-16 initial prototype of the F-16 fighter ai...,F-16 Fighting Falcon 1974 fighter aircraft fam...,"[-0.0003191034, -0.31257066, 0.4071588, 0.4445...","[-0.26255214, -0.14327924, 0.36980763, 0.37005...",0.666525
1,1,1,1,2,Q100026,Q17372377-P279-Q100026-fd42bd71-0,Q17372377,P279,1974 fighter aircraft family by General Dynamics,F-16A/B Fighting Falcon,F-16 Fighting Falcon,initial series of the F-16 fighter aircraft,F-16A/B Fighting Falcon initial series of the ...,F-16 Fighting Falcon 1974 fighter aircraft fam...,"[-0.066474006, -0.17350458, 0.24638148, 0.4571...","[-0.26255214, -0.14327924, 0.36980763, 0.37005...",0.700515
2,2,2,2,3,Q100026,Q17372444-P279-Q100026-ca0fc4bd-0,Q17372444,P279,1974 fighter aircraft family by General Dynamics,F-16C/D Fighting Falcon,F-16 Fighting Falcon,multirole series of the F-16 fighter aircraft,F-16C/D Fighting Falcon multirole series of th...,F-16 Fighting Falcon 1974 fighter aircraft fam...,"[-0.11043589, -0.09184384, 0.24768801, 0.26410...","[-0.26255214, -0.14327924, 0.36980763, 0.37005...",0.683245
3,3,3,3,4,Q100026,Q17372455-P279-Q100026-c2b1bf36-0,Q17372455,P279,1974 fighter aircraft family by General Dynamics,F-16E/F Desert Falcon,F-16 Fighting Falcon,export strike fighter series of the F-16 fight...,F-16E/F Desert Falcon export strike fighter se...,F-16 Fighting Falcon 1974 fighter aircraft fam...,"[-0.40679306, -0.026886435, 0.032887965, 0.641...","[-0.26255214, -0.14327924, 0.36980763, 0.37005...",0.689516
4,4,4,4,5,Q100026,Q2029940-P279-Q100026-ceba4380-0,Q2029940,P279,1974 fighter aircraft family by General Dynamics,F-16 VISTA,F-16 Fighting Falcon,experimental aircraft,F-16 VISTA experimental aircraft,F-16 Fighting Falcon 1974 fighter aircraft fam...,"[-0.18904611, 0.10738096, 0.30325654, 0.650738...","[-0.26255214, -0.14327924, 0.36980763, 0.37005...",0.603161


In [11]:
hierDF.node1_emb.isna().sum(), hierDF.node2_emb.isna().sum()

(0, 0)

In [12]:
hierDF['similarity_value'] = hierDF.apply(lambda p: cosine_similarity(p.node1_emb.reshape(1,-1), p.node2_emb.reshape(1,-1))[0][0], axis=1)

In [17]:
hierDF.to_csv('../data/Master_P279_dataset/P279ChildPar_filtered.csv')

# P279 Siblings Dataset

In [44]:
# hierDF_dirP279_filtered = pd.read_csv('../data/Master_P279_dataset/P279ChildPar_dirP279_filtered_baremetal.csv')

In [2]:
hierDF_transP279_filtered = pd.read_csv('../data/Master_P279_dataset/P279ChildPar_transP279_filtered_baremetal.csv')

In [46]:
hierDF_transP279_filtered.head()

Unnamed: 0.1,Unnamed: 0,id,node1,label,node2
0,0,Q17372279-P279-Q100026-beba8cd1-0,Q17372279,P279,Q100026
1,1,Q17372377-P279-Q100026-fd42bd71-0,Q17372377,P279,Q100026
2,2,Q17372444-P279-Q100026-ca0fc4bd-0,Q17372444,P279,Q100026
3,3,Q17372455-P279-Q100026-c2b1bf36-0,Q17372455,P279,Q100026
4,4,Q2029940-P279-Q100026-ceba4380-0,Q2029940,P279,Q100026


In [3]:
# hierDF_dirP279_filtered_left = hierDF_dirP279_filtered.set_index('node2')
# hierDF_dirP279_filtered_right = hierDF_dirP279_filtered_left.copy()

hierDF_transP279_filtered_left = hierDF_transP279_filtered.set_index('node2')
hierDF_transP279_filtered_right = hierDF_transP279_filtered.copy().set_index('node2')

In [4]:
# hierDF_dirP279_filtered_sibs = hierDF_dirP279_filtered_left.join(hierDF_dirP279_filtered_right, rsuffix='_right')

In [5]:
%%time
hierDF_transP279_filtered_sibs = hierDF_transP279_filtered_left.join(hierDF_transP279_filtered_right, rsuffix='_right')

CPU times: user 19.2 s, sys: 12.7 s, total: 31.8 s
Wall time: 31.8 s


In [6]:
len(hierDF_transP279_filtered_sibs)

109578774

In [7]:
hierDF_transP279_filtered_sibs.head()

Unnamed: 0_level_0,Unnamed: 0,id,node1,label,Unnamed: 0_right,id_right,node1_right,label_right
node2,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Q100026,0,Q17372279-P279-Q100026-beba8cd1-0,Q17372279,P279,0,Q17372279-P279-Q100026-beba8cd1-0,Q17372279,P279
Q100026,0,Q17372279-P279-Q100026-beba8cd1-0,Q17372279,P279,1,Q17372377-P279-Q100026-fd42bd71-0,Q17372377,P279
Q100026,0,Q17372279-P279-Q100026-beba8cd1-0,Q17372279,P279,2,Q17372444-P279-Q100026-ca0fc4bd-0,Q17372444,P279
Q100026,0,Q17372279-P279-Q100026-beba8cd1-0,Q17372279,P279,3,Q17372455-P279-Q100026-c2b1bf36-0,Q17372455,P279
Q100026,0,Q17372279-P279-Q100026-beba8cd1-0,Q17372279,P279,4,Q2029940-P279-Q100026-ceba4380-0,Q2029940,P279


In [47]:
hierDF_transP279_filtered_sibs1 = hierDF_transP279_filtered_sibs.drop(columns=['Unnamed: 0', 'Unnamed: 0_right', 'label_right']).reset_index().rename(columns={'node2': 'par_node', 'node1_right': 'node2'})

In [48]:
hierDF_transP279_filtered_sibs1.head()

Unnamed: 0,par_node,id,node1,label,id_right,node2
0,Q100026,Q17372279-P279-Q100026-beba8cd1-0,Q17372279,P279,Q17372279-P279-Q100026-beba8cd1-0,Q17372279
1,Q100026,Q17372279-P279-Q100026-beba8cd1-0,Q17372279,P279,Q17372377-P279-Q100026-fd42bd71-0,Q17372377
2,Q100026,Q17372279-P279-Q100026-beba8cd1-0,Q17372279,P279,Q17372444-P279-Q100026-ca0fc4bd-0,Q17372444
3,Q100026,Q17372279-P279-Q100026-beba8cd1-0,Q17372279,P279,Q17372455-P279-Q100026-c2b1bf36-0,Q17372455
4,Q100026,Q17372279-P279-Q100026-beba8cd1-0,Q17372279,P279,Q2029940-P279-Q100026-ceba4380-0,Q2029940


In [49]:
len(hierDF_transP279_filtered_sibs1)

109578774

In [50]:
hierDF_transP279_filtered_sibs1 = hierDF_transP279_filtered_sibs1[hierDF_transP279_filtered_sibs1.node1 != hierDF_transP279_filtered_sibs1.node2]

In [51]:
len(hierDF_transP279_filtered_sibs1)

109275284

In [52]:
hierDF_transP279_filtered_sibs1.to_csv('../data/Master_P279_dataset/P279Siblings_transP279_filtered_baremetal.csv')

## Add labels + descs

In [3]:
from sklearn.utils.random import sample_without_replacement
import pandas as pd

In [4]:
hierDF_transP279_filtered_sibs = pd.read_csv('../data/Master_P279_dataset/P279Siblings_transP279_filtered_baremetal.csv')

ParserError: Error tokenizing data. C error: Calling read(nbytes) on source failed. Try engine='python'.

In [None]:
hierDF_transP279_filtered_sibs1 = hierDF_transP279_filtered_sibs.iloc[sample_without_replacement(len(hierDF_transP279_filtered_sibs), 10000000, random_state=13)]

In [None]:
hierDF_transP279_filtered_sibs1 = hierDF_transP279_filtered_sibs1.drop(columns=['Unnamed: 0']).reset_index()

In [None]:
hierDF_transP279_filtered_sibs1.head()

In [None]:
len(hierDF_transP279_filtered_sibs1)

In [None]:
labelsDF = pd.read_csv('../data/labels.en.tsv', sep='\t')

In [None]:
hierDF_transP279_filtered_set = set(hierDF_transP279_filtered_sibs1.node1.to_list() + hierDF_transP279_filtered_sibs1.node2.to_list() + hierDF_transP279_filtered_sibs1.par_node.to_list())

In [None]:
labelsDF = labelsDF[labelsDF.node1.apply(lambda p: p in hierDF_transP279_filtered_set)]

In [None]:
labelsDF.head()

In [None]:
labelsDict = {row['node1']: row['node2'] for _, row in labelsDF.iterrows() if row['node1'] in hierDF_transP279_filtered_set}

In [None]:
descriptionsDF = pd.read_csv('../../wd-correctness/gdrive-kgtk-dump-2020-12-07/descriptions.en.tsv.gz', compression='gzip', sep='\t')

In [None]:
descriptionsDF = descriptionsDF[descriptionsDF.node1.apply(lambda p: p in hierDF_transP279_filtered_set)]

In [None]:
descDict = {row['node1']: row['node2'] for _, row in descriptionsDF.iterrows() if row['node1'] in hierDF_transP279_filtered_set}

In [None]:
hierDF_transP279_filtered_sibs1['node1_label'] = hierDF_transP279_filtered_sibs1.node1.apply(lambda p: labelsDict[p][1:-4] if p in labelsDict else None)
hierDF_transP279_filtered_sibs1['node2_label'] = hierDF_transP279_filtered_sibs1.node2.apply(lambda p: labelsDict[p][1:-4] if p in labelsDict else None)
hierDF_transP279_filtered_sibs1['par_label'] = hierDF_transP279_filtered_sibs1.par_node.apply(lambda p: labelsDict[p][1:-4] if p in labelsDict else None)

In [None]:
hierDF_transP279_filtered_sibs1['node1_desc'] = hierDF_transP279_filtered_sibs1.node1.apply(lambda p: descDict[p][1:-4] if p in descDict else None)
hierDF_transP279_filtered_sibs1['node2_desc'] = hierDF_transP279_filtered_sibs1.node2.apply(lambda p: descDict[p][1:-4] if p in descDict else None)

In [None]:
hierDF_transP279_filtered_sibs1['node1_sent'] = hierDF_transP279_filtered_sibs1.apply(lambda p: p['node1_label'] + ' ' + p['node1_desc'] + ' is ' + p['par_label'],axis=1)
hierDF_transP279_filtered_sibs1['node2_sent'] = hierDF_transP279_filtered_sibs1.apply(lambda p: p['node2_label'] + ' ' + p['node2_desc'] + ' is ' + p['par_label'],axis=1)

In [None]:
hierDF_transP279_filtered_sibs1.head()

In [None]:
# hierDF_transP279_filtered_sibs1.to_csv('../data/Master_P279_dataset/P279Siblings_transP279_filtered.csv')

In [None]:
hierDF_transP279_filtered_sibs1[['id', 'node1', 'label', 'node2', 'par_node']].to_csv('../data/Master_P279_dataset/P279Siblings_transP279_filtered_min_cols.csv')

In [None]:
hierDF_transP279_filtered_sibs2 = hierDF_transP279_filtered_sibs1[hierDF_transP279_filtered_sibs1.node1_desc != hierDF_transP279_filtered_sibs1.node2_desc]

In [None]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from time import time
import pandas as pd

def getSentEmbeddings(valSeries, modelName):
    model = SentenceTransformer(modelName, device='cuda:2')
    start = time()
    encodings = model.encode(valSeries.to_list(), show_progress_bar=True, batch_size=1000)
    print(time()-start,'s')
    return encodings

In [None]:
modelName = 'sentence-transformers/all-distilroberta-v1'

In [24]:
hierDF_transP279_filtered_sibs2 = hierDF_transP279_filtered_sibs2.reset_index()

In [25]:
hierDF_transP279_filtered_sibs2['node1_emb'] = pd.Series(list(getSentEmbeddings(hierDF_transP279_filtered_sibs2.node1_sent, modelName)))
hierDF_transP279_filtered_sibs2['node2_emb'] = pd.Series(list(getSentEmbeddings(hierDF_transP279_filtered_sibs2.node2_sent, modelName)))

Exception when trying to download https://sbert.net/models/sentence-transformers/all-distilroberta-v1.zip. Response 404
SentenceTransformer-Model https://sbert.net/models/sentence-transformers/all-distilroberta-v1.zip not found. Try to create it from scratch
Try to create Transformer Model sentence-transformers/all-distilroberta-v1 with mean pooling


3682.2832567691803 s


Exception when trying to download https://sbert.net/models/sentence-transformers/all-distilroberta-v1.zip. Response 404
SentenceTransformer-Model https://sbert.net/models/sentence-transformers/all-distilroberta-v1.zip not found. Try to create it from scratch
Try to create Transformer Model sentence-transformers/all-distilroberta-v1 with mean pooling


3649.976934671402 s


In [25]:
# Alt scoring
from sentence_transformers.cross_encoder import CrossEncoder
crossEncModel = 'cross-encoder/nli-distilroberta-base'
model = CrossEncoder(crossEncModel)
crossEncSimVals = model.predict(list(zip(hierDF_transP279_filtered_sibs2.node1_sent.to_list(), hierDF_transP279_filtered_sibs2.node2_sent.to_list())))

In [26]:
crossEncSimVals[:2]

array([[ 0.70532274, -0.6136245 ,  0.10315116],
       [ 2.0433352 , -1.5074009 , -0.38424107]], dtype=float32)

In [29]:
model.predict(["mayor of Dobrești mayor of Dobrești, Bihor ", "mayor of Pâncota public office in Romania is"])

array([ 0.57001173, -1.8462102 ,  1.5409116 ], dtype=float32)

In [26]:
hierDF_transP279_filtered_sibs2.head()

Unnamed: 0,level_0,index,par_node,id,node1,label,id_right,node2,node1_label,node2_label,par_label,node1_desc,node2_desc,node1_sent,node2_sent,node1_emb,node2_emb
0,1,103575910,Q99762605,Q98956067-P279-Q99762605-c31fed7d-0,Q98956067,P279,Q98954409-P279-Q99762605-e294a05f-0,Q98954409,mayor of Dobrești,mayor of Pâncota,mayor of a place in Romania,"mayor of Dobrești, Bihor county",public office in Romania,"mayor of Dobrești mayor of Dobrești, Bihor cou...",mayor of Pâncota public office in Romania is m...,"[0.29432657, -0.12315029, -0.045695502, 0.3632...","[0.40726575, -0.3538948, -0.19948234, -0.10712..."
1,4,2387372,Q11436,Q135880-P279-Q11436-96699f83-0,Q135880,P279,Q7395320-P279-Q11436-c0a6c3fd-0,Q7395320,A-12,SZD-49 Jantar K,aircraft,1936 autogyro prototype by the Central Aero-Hy...,glider aircraft,A-12 1936 autogyro prototype by the Central Ae...,SZD-49 Jantar K glider aircraft is aircraft,"[-0.19948986, -0.35904005, 0.09085509, 0.16439...","[-0.22941455, -0.05107765, 0.3621345, 0.495969..."
2,5,24016847,Q21167512,Q69525859-P279-Q21167512-f5328a1d-0,Q69525859,P279,Q21175029-P279-Q21167512-56cdfd0e-0,Q21175029,sodium carbonate exposures,pentaborane exposure,chemical hazard,hazardous chemical exposures,hazardous chemical exposure,sodium carbonate exposures hazardous chemical ...,pentaborane exposure hazardous chemical exposu...,"[-0.23254174, -0.019664198, -0.49748045, 0.018...","[-0.57571846, -0.29551026, -0.50132465, -0.239..."
3,10,108650504,Q99762605,Q98958530-P279-Q99762605-fdded642-0,Q98958530,P279,Q98956734-P279-Q99762605-2dbce366-0,Q98956734,mayor of Urziceni,mayor of Ionești,mayor of a place in Romania,"mayor of Urziceni, Satu Mare county","mayor of Ionești, Vâlcea county","mayor of Urziceni mayor of Urziceni, Satu Mare...","mayor of Ionești mayor of Ionești, Vâlcea coun...","[0.5272284, -0.36796138, -0.053719513, 0.40885...","[0.11683124, -0.23857604, -0.057602327, -0.193..."
4,11,106811689,Q99762605,Q98957693-P279-Q99762605-5c11a747-0,Q98957693,P279,Q98955251-P279-Q99762605-400c2ed9-0,Q98955251,mayor of Racovițeni,mayor of Blandiana,mayor of a place in Romania,"mayor of Racovițeni, Buzău county","mayor of Blandiana, Alba county","mayor of Racovițeni mayor of Racovițeni, Buzău...","mayor of Blandiana mayor of Blandiana, Alba co...","[0.34337035, -0.25570026, -0.20981959, 0.10492...","[0.36922738, 0.20065087, -0.18316983, 0.193541..."


In [27]:
hierDF_transP279_filtered_sibs2.node1_emb.isna().sum(), hierDF_transP279_filtered_sibs2.node2_emb.isna().sum()

(0, 0)

In [28]:
hierDF_transP279_filtered_sibs2['similarity_value'] = hierDF_transP279_filtered_sibs2.apply(lambda p: cosine_similarity(p.node1_emb.reshape(1,-1), p.node2_emb.reshape(1,-1))[0][0], axis=1)

In [29]:
hierDF_transP279_filtered_sibs2['similarity_value'].describe()

count    3.707847e+06
mean     5.596878e-01
std      1.731009e-01
min     -1.290048e-01
25%      4.218415e-01
50%      5.907606e-01
75%      7.042572e-01
max      9.998616e-01
Name: similarity_value, dtype: float64

In [30]:
print(f"Dataset reduced from {len(hierDF_transP279_filtered_sibs1)} to {len(hierDF_transP279_filtered_sibs2)} by eliminating rows where node1_desc == node2_desc")

Dataset reduced from 10000000 to 3707847 by eliminating rows where node1_desc == node2_desc


In [31]:
hierDF_transP279_filtered_sibs2[['id', 'node1', 'label', 'node2', 'par_node', 'similarity_value']].to_csv('../data/Master_P279_dataset/P279Siblings_transP279_filtered_min_cols_with_desc_dups_removed.csv')

In [None]:
# import pandas as pd
# pd.read_csv('../data/Master_P279_dataset/P279Siblings_transP279_filtered_min_cols_with_desc_dups_removed.csv')['similarity_value'].describe()

In [6]:
wordSim353AnnotDF_New = pd.read_csv('../data/wordsim353_with_r3.csv')

In [7]:
wordsim_nodes = set(wordSim353AnnotDF_New.word1_kg_id.to_list() + wordSim353AnnotDF_New.word2_kg_id.to_list())

In [27]:
P279_3M_data = pd.read_csv('../data/Master_P279_dataset/P279Siblings_transP279_filtered_min_cols_with_desc_dups_removed.csv')

In [28]:
P279_10M_data = pd.read_csv('../data/Master_P279_dataset/P279Siblings_transP279_filtered_min_cols.csv')

In [29]:
%%time
P279_3M_data.apply(lambda p: p['node1'] in wordsim_nodes or p['node2'] in wordsim_nodes or p['par_node'] in wordsim_nodes, axis=1).sum()

CPU times: user 40.2 s, sys: 779 ms, total: 40.9 s
Wall time: 40.9 s


398221

In [30]:
%%time
P279_10M_data.apply(lambda p: p['node1'] in wordsim_nodes or p['node2'] in wordsim_nodes or p['par_node'] in wordsim_nodes, axis=1).sum()

CPU times: user 1min 48s, sys: 2.78 s, total: 1min 51s
Wall time: 1min 51s


406185

In [31]:
P279_10M_data[P279_10M_data.apply(lambda p: p['node1'] in wordsim_nodes or p['node2'] in wordsim_nodes or p['par_node'] in wordsim_nodes, axis=1)].to_csv('../data/Master_P279_dataset/P279Siblings_transP279_filtered_min_cols_wordsim_only.csv')

In [23]:
P279_19k_data = pd.read_csv('../data/P279_dataset/P279_19k_Siblings_Dataset.csv')

In [24]:
P279_19k_data.head()

Unnamed: 0.1,Unnamed: 0,index,node1,parent,node2,node1_label,par_label,node2_label,node1_sent,node2_sent,comb_sent,node1_emb,node2_emb,bert2SentSim
0,0,2897760,Q7047901,Q17517,Q7410591,Nokia 5330 Mobile TV Edition,mobile phone,Samsung SGH-i780,Nokia 5330 Mobile TV Edition is mobile phone,Samsung SGH-i780 is mobile phone,mobile phone is typically Nokia 5330 Mobile TV...,[-5.48004732e-02 3.50154340e-01 4.38094527e-...,[-2.98046470e-01 3.74356419e-01 5.27383387e-...,0.88349
1,1,10431390,Q15991265,Q2736,Q15991267,amateur football,association football,professional football,amateur football is association football,professional football is association football,association football is typically amateur foot...,[-1.95897296e-01 1.04528800e-01 4.40772206e-...,[-2.94859111e-01 1.88360184e-01 -1.96894854e-...,0.803443
2,2,417162,Q15222772,Q1420,Q7246913,Glider (automobiles),motor car,Probe 16,Glider (automobiles) is motor car,Probe 16 is motor car,motor car is typically Glider (automobiles) or...,[-1.21378571e-01 3.04012895e-01 4.18751419e-...,[ 0.16769831 0.62631613 1.0727373 0.235646...,0.760663
3,3,420822,Q15609267,Q1420,Q16997137,Isotta Fraschini Tipo D,motor car,Allard M2,Isotta Fraschini Tipo D is motor car,Allard M2 is motor car,motor car is typically Isotta Fraschini Tipo D...,[ 1.71256125e-01 2.56909907e-01 7.91365981e-...,[-3.38893235e-01 3.94817479e-02 6.53142512e-...,0.851455
4,4,3361954,Q10748135,Q2095,Q17116114,kelewele,food,Horsebread,kelewele is food,Horsebread is food,food is typically kelewele or Horsebread,[ 2.51727879e-01 -1.59988493e-01 1.39032674e+...,[-2.70331770e-01 -6.03621602e-01 7.06666887e-...,0.728477


In [26]:
%%time
P279_19k_data.apply(lambda p: p['node1'] in wordsim_nodes or p['node2'] in wordsim_nodes or p['parent'] in wordsim_nodes, axis=1).sum()

CPU times: user 305 ms, sys: 2.01 ms, total: 307 ms
Wall time: 304 ms


19454

## Correct bertsim values

In [19]:
hierDF_transP279_filtered_sibs1 = pd.read_csv('../data/Master_P279_dataset/P279Siblings_transP279_filtered.tsv', sep='\t')

EmptyDataError: No columns to parse from file

In [53]:
hierDF_transP279_filtered_sibs2 = hierDF_transP279_filtered_sibs1[hierDF_transP279_filtered_sibs1.node1_desc != hierDF_transP279_filtered_sibs1.node2_desc]

In [64]:
print(f"Dataset reduced from {len(hierDF_transP279_filtered_sibs1)} to {len(hierDF_transP279_filtered_sibs2)} by eliminating rows where node1_desc == node2_desc")

Dataset reduced from 10000000 to 3707847 by eliminating rows where node1_desc == node2_desc


In [None]:
hierDF_transP279_filtered_sibs1 = hierDF_transP279_filtered_sibs2

In [45]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from time import time
import pandas as pd

def getSentEmbeddings(valSeries, modelName):
    model = SentenceTransformer(modelName)
    start = time()
    encodings = model.encode(valSeries.to_list())
    print(time()-start,'s')
    return encodings

modelName = 'sentence-transformers/all-distilroberta-v1'

In [46]:
hierDF_transP279_filtered_sibs1['node1_emb'] = pd.Series(list(getSentEmbeddings(hierDF_transP279_filtered_sibs1.node1_sent, modelName)))
hierDF_transP279_filtered_sibs1['node2_emb'] = pd.Series(list(getSentEmbeddings(hierDF_transP279_filtered_sibs1.node2_sent, modelName)))

15682.8797955513 s
15108.38348031044 s


In [48]:
hierDF_transP279_filtered_sibs1.node1_emb.isna().sum(), hierDF_transP279_filtered_sibs1.node2_emb.isna().sum()

(0, 0)

In [49]:
hierDF_transP279_filtered_sibs1['similarity_value'] = hierDF_transP279_filtered_sibs1.apply(lambda p: cosine_similarity(p.node1_emb.reshape(1,-1), p.node2_emb.reshape(1,-1))[0][0], axis=1)

In [52]:
hierDF_transP279_filtered_sibs1['similarity_value'].describe()

count    1.000000e+07
mean     9.141334e-01
std      1.338514e-01
min     -5.180746e-02
25%      8.585989e-01
50%      9.876406e-01
75%      9.928079e-01
max      1.000000e+00
Name: bert2SentSim, dtype: float64

In [65]:
hierDF_transP279_filtered_sibs2[['id', 'node1', 'label', 'node2', 'par_node', 'similarity_value']].to_csv('../data/Master_P279_dataset/P279Siblings_transP279_filtered_min_cols_with_desc_dups_removed.csv')

# Class Similarity Datasets

## ChildPar

In [32]:
hierDF_transP279_filtered2 = pd.read_csv('../data/Master_P279_dataset/P279ChildPar_transP279_filtered_min_cols.csv')

In [39]:
hierDF_transP279_filtered2[['node1', 'node2']].rename(columns={'node1':'q1', 'node2':'q2'}).to_csv('../data/Master_P279_dataset/P279ChildPar_transP279_filtered_min_cols_FOR_CLASS.tsv', sep='\t', index=False)

In [42]:
# import os
# import requests
# import pandas as pd


# def call_semantic_similarity(input_file, url):
#     file_name = os.path.basename(input_file)

#     files = {
#         'file': (file_name, open(input_file, mode='rb'), 'application/octet-stream')
#     }
#     resp = requests.post(url, files=files)
#     print(resp)
#     s = resp.json()

#     return pd.DataFrame(s)
 

# url = 'https://kgtk.isi.edu/similarity_api'
# df = call_semantic_similarity('../data/Master_P279_dataset/P279ChildPar_transP279_filtered_min_cols_FOR_CLASS.tsv', url)
# # df.to_csv('test_file_similarity.tsv', index=False, sep='\t')

In [74]:
import requests
from tqdm.notebook import tqdm
import json
from joblib import Parallel, delayed
import sys

hierDF_transP279_filtered2_class_arr = []

def fetchClassSim(row):
    resp = requests.get("https://kgtk.isi.edu/similarity_api?q1="+row['node1']+"&q2="+row['node2']+"&embedding_type=class")
    try:
        row['classSim'] = float(resp.json()['similarity']) if resp else -1
    except Exception as exc:
        print(exc)
        print(f"Resp not found for {row['node1']}, {row['node2']}")
        row['classSim'] = None
    row['Resp_code'] = resp
    return row

hierDF_transP279_filtered2_class_arr = Parallel(n_jobs=5)(delayed(fetchClassSim)(row) for _, row in tqdm(hierDF_transP279_filtered2.iloc[:100].iterrows(), total=len(hierDF_transP279_filtered2)))
                                                                                                                                          
                                                                                                                                          

  0%|          | 0/303284 [00:00<?, ?it/s]

could not convert string to float: ''
Resp not found for Q102108504, Q100171002


In [75]:
hierDF_transP279_filtered2_class_arr

[Unnamed: 0                                      0
 id              Q17372279-P279-Q100026-beba8cd1-0
 node1                                   Q17372279
 label                                        P279
 node2                                     Q100026
 bert2SentSim                             0.666525
 classSim                                 0.835782
 Resp_code                        <Response [200]>
 Name: 0, dtype: object,
 Unnamed: 0                                      1
 id              Q17372377-P279-Q100026-fd42bd71-0
 node1                                   Q17372377
 label                                        P279
 node2                                     Q100026
 bert2SentSim                             0.700515
 classSim                                  0.94674
 Resp_code                        <Response [200]>
 Name: 1, dtype: object,
 Unnamed: 0                                      2
 id              Q17372444-P279-Q100026-ca0fc4bd-0
 node1                          

In [66]:
hierDF_transP279_filtered2_class = pd.DataFrame(hierDF_transP279_filtered2_class_arr)

In [67]:
hierDF_transP279_filtered2_class.head()

In [52]:
hierDF_transP279_filtered2_class.classSim.isna().sum()

302108

## Siblings

In [33]:
P279_3M_data = pd.read_csv('../data/Master_P279_dataset/P279Siblings_transP279_filtered_min_cols_with_desc_dups_removed.csv')
P279_10M_data = pd.read_csv('../data/Master_P279_dataset/P279Siblings_transP279_filtered_min_cols.csv')

In [None]:
import requests
from tqdm.notebook import tqdm
import json
from joblib import Parallel, delayed

P279_3M_data_class_arr = []

def fetchClassSim(row):
    try:
        resp = requests.get("https://kgtk.isi.edu/similarity_api?q1="+row['node1']+"&q2="+row['node2']+"&embedding_type=class").json()['similarity']
        row['classSim'] = float(resp) if resp else -1
    except:
        print(f"Resp not found for {row['node1']}, {row['node2']}")
        row['classSim'] = None
    return row

P279_3M_data_class_arr = Parallel(n_jobs=20)(delayed(fetchClassSim)(row) for _, row in tqdm(P279_3M_data.iterrows(), total=len(P279_3M_data)))

In [None]:
import requests
from tqdm.notebook import tqdm
import json
from joblib import Parallel, delayed

P279_10M_data_class_arr = []

def fetchClassSim(row):
    try:
        resp = requests.get("https://kgtk.isi.edu/similarity_api?q1="+row['node1']+"&q2="+row['node2']+"&embedding_type=class").json()['similarity']
        row['classSim'] = float(resp) if resp else -1
    except:
        print(f"Resp not found for {row['node1']}, {row['node2']}")
        row['classSim'] = None
    return row

P279_10M_data_class_arr = Parallel(n_jobs=20)(delayed(fetchClassSim)(row) for _, row in tqdm(P279_10M_data.iterrows(), total=len(P279_10M_data)))