In [2]:
import pandas as pd
import scipy.stats as stats
import json
from tqdm.notebook import tqdm
import gzip
from collections import defaultdict

In [3]:
# Filepaths

# DWD Files
DERIVED_IS_COUNTS_FILE = '../source_dataset_files/wikidata-20210215-dwd-v2/dwd_isa_class_count.compact.tsv.gz'
LABELS_FILE = "../source_dataset_files/wikidata-20210215-dwd-v2/labels.en.tsv.gz"
DESCRIPTIONS_FILE = "../source_dataset_files/wikidata-20210215-dwd-v2/descriptions.en.tsv.gz"
P279_EDGES_FILE = "../source_dataset_files/wikidata-20210215-dwd-v2/derived.P279.tsv.gz"
P279STAR_EDGES_FILE = "../source_dataset_files/wikidata-20210215-dwd-v2/derived.P279star.tsv.gz"
CLASS_COUNTS_FILE = "../data/supplementary_files/class-counts.tsv"
CLASS_TRANSITIVE_P279_COUNTS_FILE = "../data/supplementary_files/class-counts-P279star.tsv"

In [4]:
P279_CHILD_PAR_INTERM_FILE = "../data/basis/intermediate_files/P279_ChildPar.labDesc.csv"
P279_CHILD_PAR_DISTILBERT_COSSIM_FILE = "../data/basis/P279_ChildPar.all-distilroberta-v1.csv"
P279_CHILD_PAR_CLASSSIM_FILE = "../data/basis/P279_ChildPar.classSim.csv"

P279_SIBLINGS_INTERM_FILE = "../data/basis/intermediate_files/P279_Siblings.all_combinations.csv"
P279_SIBLINGS_DISTILBERT_COSSIM_FILE = "../data/basis/P279_Siblings.all-distilroberta-v1.csv"
P279_SIBLINGS_CLASSSIM_FILE = "../data/basis/P279_Siblings.classSim.csv"

In [11]:
WORDSIM_DF = '../data/evaluation/wordsim353_with_r3.csv'

In [31]:
PROBASE_SOURCE_DIR = "../source_dataset_files/probase/"
PROBASE_INTERM_FILE = "../data/basis/intermediate_files/probase_WQnodes.csv"
PROBASE_FINAL_FILE = '../data/basis/intermediate_files/probase_WQnodes_subset_and_sim.csv'

# Utilities

In [13]:
def fetch_labels_to_qnodes_dict():
    labels_to_qnodes_dict = defaultdict(str)
    with gzip.open(LABELS_FILE) as lab_f:
        first_line = True
        for line in tqdm(lab_f, total=41845781):
            if first_line:
                first_line = False
                continue
            line = line.decode('utf-8').strip().split()
            str1 = line[3][1:-4]
            qnode = line[1]
            if str1 in labels_to_qnodes_dict:
                if qnode < labels_to_qnodes_dict[str1]:
                    labels_to_qnodes_dict[str1] = qnode
            else:
                labels_to_qnodes_dict[str1] = qnode
    return labels_to_qnodes_dict
labels_to_qnodes_dict = fetch_labels_to_qnodes_dict()

  0%|          | 0/41845781 [00:00<?, ?it/s]

In [12]:
def fetch_worsim_labels_to_qnodes_dict():
    wordsim_DF = pd.read_csv(WORDSIM_DF)
    labels_to_qnodes_dict = {row['Word 1']: row['word1_kg_id'] for _, row in wordsim_DF.iterrows()}
    temp2 = {row['Word 2']: row['word2_kg_id'] for _, row in wordsim_DF.iterrows()}
    labels_to_qnodes_dict.update(temp2)
    return labels_to_qnodes_dict

wordsim_labels_to_qnodes_dict = fetch_worsim_labels_to_qnodes_dict()

# Class Counts File Construction

In [6]:
# Took almost 1.5 hours to execute
# !zcat ../source_dataset_files/wikidata-20210215-dwd-v2/dwd_isa_class_count.compact.tsv.gz | cut -f 3 | tail -n +2 | sed -e 's/|/\n/g' -e 's/:/\t/g' | sort | uniq > ../data/supplementary_files/class-counts.tsv

In [None]:
class_counts = defaultdict(int)
with gzip.open(P279STAR_EDGES_FILE, 'r') as f:
    first_line = True
    for line in tqdm(f):
        if first_line:
            first_line = False
            continue
        line = line.decode('utf-8').strip().split('\t')
        class_counts[line[2]] += 1

In [59]:
class_counts_df = pd.DataFrame.from_dict(class_counts, orient='index',columns=['class_count'])

In [88]:
class_counts_df.reset_index().rename(columns={'index': '0'}).to_csv(CLASS_TRANSITIVE_P279_COUNTS_FILE, sep='\t', header=None, index=None)

# P279 Datasets

## P279 ChildPar Dataset

Use descriptions wherever available, if not use labels, if not skip rows

**Old Version Details:**

[Wikidata OS File (Wikidata 2021-02-15 DWD version)](https://drive.google.com/drive/folders/168j3OfdVGXMTKcs6VyH8rq_p0n6w0GGj?usp=sharing)

There are 721983 rows in P279 dataset, 606996 unique number of nodes in this dataset, 511841 labels and 299844 descriptions

There are 373463 rows in P279 dataset s.t. both nodes have either a label or a description.     There are 373463 rows where both nodes have labels.     There are 373463 rows where both nodes have descriptions.

In [7]:
# !kgtk filter -i ../data/wikidataos.for.text-embedding.tsv.gz -p ";P279;" -o ../data/P279_dataset/wikidata-P279.tsv

In [8]:
p279DF = pd.read_csv(P279_EDGES_FILE,sep='\t')
p279DFNodesSet = set(p279DF.node1.to_list() + p279DF.node2.to_list())
labelsDF = pd.read_csv(LABELS_FILE, sep='\t')
descriptionsDF = pd.read_csv(DESCRIPTIONS_FILE, compression='gzip', sep='\t')
labelsDF = labelsDF[labelsDF.node1.apply(lambda p: p in p279DFNodesSet)]
descriptionsDF = descriptionsDF[descriptionsDF.node1.apply(lambda p: p in p279DFNodesSet)]

In [9]:
print(f"There are {len(p279DF)} rows in P279 dataset, {len(p279DFNodesSet)} unique number of nodes in this dataset, {len(labelsDF)} labels and {len(descriptionsDF)} descriptions")

There are 3077831 rows in P279 dataset, 2503943 unique number of nodes in this dataset, 2406580 labels and 1369514 descriptions


In [10]:
p279DFNew = p279DF
temp1 = p279DFNew.set_index('node1').join(labelsDF.set_index('node1'), rsuffix='_label')
temp1 = temp1[['id','label','node2','node2_label']]
temp2 = temp1.reset_index().set_index('node2').join(labelsDF.set_index('node1'), rsuffix='_label2')
temp3 = temp2[['id','node1','label','node2', 'node2_label']].rename(columns={'node2_label': 'node1_label', 'node2':'node2_label'}).reset_index().rename(columns={'index':'node2'})
temp3 = temp3[['id','node1','node1_label','label','node2','node2_label']]
temp3['node1_label'] = temp3['node1_label'].apply(lambda p: p[1:-4] if type(p) == str else None)
temp3['node2_label'] = temp3['node2_label'].apply(lambda p: p[1:-4] if type(p) == str else None)
hierDF = temp3.copy()

p279DFNew = hierDF
temp1 = p279DFNew.set_index('node1').join(descriptionsDF.set_index('node1'), rsuffix='_desc')
temp1 = temp1[['id','label','node2','node1_label','node2_label', 'node2_desc']]
temp2 = temp1.reset_index().set_index('node2').join(descriptionsDF.set_index('node1'), rsuffix='_desc2')
temp3 = temp2[['id','node1','label','node2', 'node1_label', 'node2_label', 'node2_desc']].rename(columns={'node2_desc': 'node1_desc', 'node2':'node2_desc'}).reset_index().rename(columns={'index':'node2'})
temp3['node1_desc'] = temp3['node1_desc'].apply(lambda p: p[1:-4] if type(p) == str else None)
temp3['node2_desc'] = temp3['node2_desc'].apply(lambda p: p[1:-4] if type(p) == str else None)
hierDF = temp3.copy()

In [11]:
def combineLabDesc(row, nodeNum):
    if type(row['node'+str(nodeNum)+'_desc']) == str and type(row['node'+str(nodeNum)+'_label']) == str :
        return row['node'+str(nodeNum)+'_label'] + ' ' + row['node'+str(nodeNum)+'_desc']
    else:
        return None
hierDF['node1_labDesc'] = hierDF.apply(combineLabDesc, axis=1, args=(1,))
hierDF['node2_labDesc'] = hierDF.apply(combineLabDesc, axis=1, args=(2,))
hierDF = hierDF[~hierDF['node1_labDesc'].isna()]
hierDF = hierDF[~hierDF['node2_labDesc'].isna()]

In [12]:
print(f"There are {len(hierDF)} rows in P279 dataset s.t. both nodes have either a label or a description. \
    There are {len(hierDF[(~hierDF.node1_label.isna()) & (~hierDF.node2_label.isna())])} rows where both nodes have labels. \
    There are {len(hierDF[(~hierDF.node1_desc.isna()) & (~hierDF.node2_desc.isna())])} rows where both nodes have descriptions.")

There are 1898589 rows in P279 dataset s.t. both nodes have either a label or a description.     There are 1898589 rows where both nodes have labels.     There are 1898589 rows where both nodes have descriptions.


In [13]:
hierDF.head()

Unnamed: 0,node2,id,node1,label,node2_desc,node1_label,node2_label,node1_desc,node1_labDesc,node2_labDesc
1,Q100026,Q17372279-P279-Q100026-beba8cd1-0,Q17372279,P279,1944 fighter aircraft family by Oberst R. Lehmann,YF-16,F-16,initial prototype of the F-16 fighter aircraft,YF-16 initial prototype of the F-16 fighter ai...,F-16 1944 fighter aircraft family by Oberst R....
2,Q100026,Q17372377-P279-Q100026-fd42bd71-0,Q17372377,P279,1944 fighter aircraft family by Oberst R. Lehmann,F-16A/B Fighting Falcon,F-16,initial series of the F-16 fighter aircraft,F-16A/B Fighting Falcon initial series of the ...,F-16 1944 fighter aircraft family by Oberst R....
3,Q100026,Q17372444-P279-Q100026-ca0fc4bd-0,Q17372444,P279,1944 fighter aircraft family by Oberst R. Lehmann,F-16C/D Fighting Falcon,F-16,multirole series of the F-16 fighter aircraft,F-16C/D Fighting Falcon multirole series of th...,F-16 1944 fighter aircraft family by Oberst R....
4,Q100026,Q17372455-P279-Q100026-c2b1bf36-0,Q17372455,P279,1944 fighter aircraft family by Oberst R. Lehmann,F-16E/F Desert Falcon,F-16,export strike fighter series of the F-16 fight...,F-16E/F Desert Falcon export strike fighter se...,F-16 1944 fighter aircraft family by Oberst R....
5,Q100026,Q2029940-P279-Q100026-ceba4380-0,Q2029940,P279,1944 fighter aircraft family by Oberst R. Lehmann,F-16 VISTA,F-16,experimental aircraft,F-16 VISTA experimental aircraft,F-16 1944 fighter aircraft family by Oberst R....


### Filter based on transitive P279 counts

6769
(310053, 303284)

In [None]:
# hierDF = pd.read_csv('../data/Master_P279_dataset/P279ChildPar_filtered.csv')

In [134]:
transitive_P279Counts = pd.read_csv(CLASS_COUNTS_FILE, sep='\t', header=None)
transitive_P279Counts = transitive_P279Counts.set_index(0)[1]

In [137]:
def extract_pairs_with_top_k_class_counts(transitive_P279Counts, k, hierDF):
    transitive_P279Counts = transitive_P279Counts[transitive_P279Counts >= k]
    transitive_P279CountsNeededSet = set(transitive_P279Counts.index.to_list())
    hierDF_transP279_filtered = hierDF[hierDF.node2.apply(lambda p: p in transitive_P279CountsNeededSet)]
#     print("No. of pairs with identical desc:", (hierDF_transP279_filtered.node1_desc == hierDF_transP279_filtered.node2_desc).sum())
    hierDF_transP279_filtered1 = hierDF_transP279_filtered[hierDF_transP279_filtered.node1_desc != hierDF_transP279_filtered.node2_desc]
#     print(len(hierDF_transP279_filtered), len(hierDF_transP279_filtered1))
    hierDF_transP279_filtered1 = hierDF_transP279_filtered1.groupby("node2").apply(lambda g: g if len(g) <= 500 else g.sample(500))
    return hierDF_transP279_filtered1, k, len(hierDF_transP279_filtered1), hierDF_transP279_filtered1.node2.nunique()

In [140]:
hierDF_transP279_filtered1, k, cnt, n_pars = extract_pairs_with_top_k_class_counts(transitive_P279Counts, 10, hierDF)
k, cnt, n_pars

(10, 304654, 28511)

In [141]:
# tempRes = []
# for k in range(10000,100000,10000):
#     _, k, cnt, n_pars = extract_pairs_with_top_k_class_counts(transitive_P279Counts, k, hierDF)
#     tempRes.append((k, cnt, n_pars))
# pd.DataFrame(tempRes, columns=['Thresh', 'No. of rows', 'No. of unique parents'])

In [142]:
hierDF_transP279_filtered1.node2.value_counts().describe()

count    28511.000000
mean        10.685490
std         28.508339
min          1.000000
25%          2.000000
50%          4.000000
75%         10.000000
max        500.000000
Name: node2, dtype: float64

In [144]:
hierDF_transP279_filtered1.to_csv(P279_CHILD_PAR_INTERM_FILE)

In [145]:
hierDF = pd.read_csv(P279_CHILD_PAR_INTERM_FILE)

In [146]:
hierDF.head()

Unnamed: 0,node2,Unnamed: 1,node2.1,id,node1,label,node2_desc,node1_label,node2_label,node1_desc,node1_labDesc,node2_labDesc
0,Q100026,1,Q100026,Q17372279-P279-Q100026-beba8cd1-0,Q17372279,P279,1944 fighter aircraft family by Oberst R. Lehmann,YF-16,F-16,initial prototype of the F-16 fighter aircraft,YF-16 initial prototype of the F-16 fighter ai...,F-16 1944 fighter aircraft family by Oberst R....
1,Q100026,2,Q100026,Q17372377-P279-Q100026-fd42bd71-0,Q17372377,P279,1944 fighter aircraft family by Oberst R. Lehmann,F-16A/B Fighting Falcon,F-16,initial series of the F-16 fighter aircraft,F-16A/B Fighting Falcon initial series of the ...,F-16 1944 fighter aircraft family by Oberst R....
2,Q100026,3,Q100026,Q17372444-P279-Q100026-ca0fc4bd-0,Q17372444,P279,1944 fighter aircraft family by Oberst R. Lehmann,F-16C/D Fighting Falcon,F-16,multirole series of the F-16 fighter aircraft,F-16C/D Fighting Falcon multirole series of th...,F-16 1944 fighter aircraft family by Oberst R....
3,Q100026,4,Q100026,Q17372455-P279-Q100026-c2b1bf36-0,Q17372455,P279,1944 fighter aircraft family by Oberst R. Lehmann,F-16E/F Desert Falcon,F-16,export strike fighter series of the F-16 fight...,F-16E/F Desert Falcon export strike fighter se...,F-16 1944 fighter aircraft family by Oberst R....
4,Q100026,5,Q100026,Q2029940-P279-Q100026-ceba4380-0,Q2029940,P279,1944 fighter aircraft family by Oberst R. Lehmann,F-16 VISTA,F-16,experimental aircraft,F-16 VISTA experimental aircraft,F-16 1944 fighter aircraft family by Oberst R....


In [147]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from time import time
import pandas as pd

def getSentEmbeddings(valSeries, modelName):
    model = SentenceTransformer(modelName, device='cuda:0')
    start = time()
    encodings = model.encode(valSeries.to_list(), show_progress_bar=True, batch_size=500)
    print(time()-start,'s')
    return encodings

In [148]:
modelName = 'sentence-transformers/all-distilroberta-v1'

In [149]:
hierDF = hierDF.reset_index()

In [150]:
hierDF.node1_labDesc.isna().sum(), hierDF.node2_labDesc.isna().sum()

(0, 0)

In [151]:
hierDF['node1_emb'] = pd.Series(list(getSentEmbeddings(hierDF.node1_labDesc, modelName)))
hierDF['node2_emb'] = pd.Series(list(getSentEmbeddings(hierDF.node2_labDesc, modelName)))

Exception when trying to download https://sbert.net/models/sentence-transformers/all-distilroberta-v1.zip. Response 404
SentenceTransformer-Model https://sbert.net/models/sentence-transformers/all-distilroberta-v1.zip not found. Try to create it from scratch
Try to create Transformer Model sentence-transformers/all-distilroberta-v1 with mean pooling


149.29957628250122 s


Exception when trying to download https://sbert.net/models/sentence-transformers/all-distilroberta-v1.zip. Response 404
SentenceTransformer-Model https://sbert.net/models/sentence-transformers/all-distilroberta-v1.zip not found. Try to create it from scratch
Try to create Transformer Model sentence-transformers/all-distilroberta-v1 with mean pooling


103.25607371330261 s


In [152]:
hierDF.head()

Unnamed: 0,index,node2,Unnamed: 1,node2.1,id,node1,label,node2_desc,node1_label,node2_label,node1_desc,node1_labDesc,node2_labDesc,node1_emb,node2_emb
0,0,Q100026,1,Q100026,Q17372279-P279-Q100026-beba8cd1-0,Q17372279,P279,1944 fighter aircraft family by Oberst R. Lehmann,YF-16,F-16,initial prototype of the F-16 fighter aircraft,YF-16 initial prototype of the F-16 fighter ai...,F-16 1944 fighter aircraft family by Oberst R....,"[-0.0003189822, -0.31257084, 0.4071587, 0.4445...","[-0.074766316, -0.20337783, 0.23880175, 0.4511..."
1,1,Q100026,2,Q100026,Q17372377-P279-Q100026-fd42bd71-0,Q17372377,P279,1944 fighter aircraft family by Oberst R. Lehmann,F-16A/B Fighting Falcon,F-16,initial series of the F-16 fighter aircraft,F-16A/B Fighting Falcon initial series of the ...,F-16 1944 fighter aircraft family by Oberst R....,"[-0.066473946, -0.17350475, 0.24638157, 0.4571...","[-0.074766316, -0.20337783, 0.23880175, 0.4511..."
2,2,Q100026,3,Q100026,Q17372444-P279-Q100026-ca0fc4bd-0,Q17372444,P279,1944 fighter aircraft family by Oberst R. Lehmann,F-16C/D Fighting Falcon,F-16,multirole series of the F-16 fighter aircraft,F-16C/D Fighting Falcon multirole series of th...,F-16 1944 fighter aircraft family by Oberst R....,"[-0.110435985, -0.09184393, 0.24768817, 0.2641...","[-0.074766316, -0.20337783, 0.23880175, 0.4511..."
3,3,Q100026,4,Q100026,Q17372455-P279-Q100026-c2b1bf36-0,Q17372455,P279,1944 fighter aircraft family by Oberst R. Lehmann,F-16E/F Desert Falcon,F-16,export strike fighter series of the F-16 fight...,F-16E/F Desert Falcon export strike fighter se...,F-16 1944 fighter aircraft family by Oberst R....,"[-0.40679273, -0.026886985, 0.032887716, 0.641...","[-0.074766316, -0.20337783, 0.23880175, 0.4511..."
4,4,Q100026,5,Q100026,Q2029940-P279-Q100026-ceba4380-0,Q2029940,P279,1944 fighter aircraft family by Oberst R. Lehmann,F-16 VISTA,F-16,experimental aircraft,F-16 VISTA experimental aircraft,F-16 1944 fighter aircraft family by Oberst R....,"[-0.18904628, 0.10738095, 0.3032565, 0.6507382...","[-0.074766316, -0.20337783, 0.23880175, 0.4511..."


In [153]:
hierDF.node1_emb.isna().sum(), hierDF.node2_emb.isna().sum()

(0, 0)

In [154]:
hierDF['similarity_value'] = hierDF.apply(lambda p: cosine_similarity(p.node1_emb.reshape(1,-1), p.node2_emb.reshape(1,-1))[0][0], axis=1)

In [167]:
hierDF[['id', 'node1', 'label', 'node2', 'similarity_value']].to_csv(P279_CHILD_PAR_DISTILBERT_COSSIM_FILE, index=None)

In [168]:
hierDF['similarity_value'].describe()

count    304654.000000
mean          0.499092
std           0.186887
min          -0.130950
25%           0.371610
50%           0.499928
75%           0.626436
max           0.989182
Name: similarity_value, dtype: float64

In [40]:
# # Alt scoring
# from sentence_transformers.cross_encoder import CrossEncoder
# crossEncModel = 'cross-encoder/stsb-roberta-large'
# model = CrossEncoder(crossEncModel, device='cuda:3')
# crossEncSimVals = model.predict(list(zip(hierDF.node1_labDesc.to_list(), hierDF.node2_labDesc.to_list())), show_progress_bar=True, batch_size=500)
# hierDF['similarity_value'] = crossEncSimVals

Batches:   0%|          | 0/747 [00:00<?, ?it/s]

## P279 Siblings Dataset

In [169]:
hierDF_transP279_filtered = pd.read_csv(P279_CHILD_PAR_DISTILBERT_COSSIM_FILE)
hierDF_transP279_filtered.head()

Unnamed: 0,id,node1,label,node2,similarity_value
0,Q17372279-P279-Q100026-beba8cd1-0,Q17372279,P279,Q100026,0.59253
1,Q17372377-P279-Q100026-fd42bd71-0,Q17372377,P279,Q100026,0.660948
2,Q17372444-P279-Q100026-ca0fc4bd-0,Q17372444,P279,Q100026,0.634662
3,Q17372455-P279-Q100026-c2b1bf36-0,Q17372455,P279,Q100026,0.552555
4,Q2029940-P279-Q100026-ceba4380-0,Q2029940,P279,Q100026,0.504576


In [170]:
hierDF_transP279_filtered_left = hierDF_transP279_filtered.set_index('node2')
hierDF_transP279_filtered_right = hierDF_transP279_filtered.copy().set_index('node2')

In [171]:
%%time
hierDF_transP279_filtered_sibs = hierDF_transP279_filtered_left.join(hierDF_transP279_filtered_right, rsuffix='_right')

CPU times: user 2.04 s, sys: 684 ms, total: 2.72 s
Wall time: 2.72 s


In [172]:
len(hierDF_transP279_filtered_sibs)

26426178

In [173]:
hierDF_transP279_filtered_sibs.head()

Unnamed: 0_level_0,id,node1,label,similarity_value,id_right,node1_right,label_right,similarity_value_right
node2,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Q100026,Q17372279-P279-Q100026-beba8cd1-0,Q17372279,P279,0.59253,Q17372279-P279-Q100026-beba8cd1-0,Q17372279,P279,0.59253
Q100026,Q17372279-P279-Q100026-beba8cd1-0,Q17372279,P279,0.59253,Q17372377-P279-Q100026-fd42bd71-0,Q17372377,P279,0.660948
Q100026,Q17372279-P279-Q100026-beba8cd1-0,Q17372279,P279,0.59253,Q17372444-P279-Q100026-ca0fc4bd-0,Q17372444,P279,0.634662
Q100026,Q17372279-P279-Q100026-beba8cd1-0,Q17372279,P279,0.59253,Q17372455-P279-Q100026-c2b1bf36-0,Q17372455,P279,0.552555
Q100026,Q17372279-P279-Q100026-beba8cd1-0,Q17372279,P279,0.59253,Q2029940-P279-Q100026-ceba4380-0,Q2029940,P279,0.504576


In [175]:
hierDF_transP279_filtered_sibs1 = hierDF_transP279_filtered_sibs.drop(columns=['label_right']).reset_index().rename(columns={'node2': 'par_node', 'node1_right': 'node2'})

In [176]:
hierDF_transP279_filtered_sibs1.head()

Unnamed: 0,par_node,id,node1,label,similarity_value,id_right,node2,similarity_value_right
0,Q100026,Q17372279-P279-Q100026-beba8cd1-0,Q17372279,P279,0.59253,Q17372279-P279-Q100026-beba8cd1-0,Q17372279,0.59253
1,Q100026,Q17372279-P279-Q100026-beba8cd1-0,Q17372279,P279,0.59253,Q17372377-P279-Q100026-fd42bd71-0,Q17372377,0.660948
2,Q100026,Q17372279-P279-Q100026-beba8cd1-0,Q17372279,P279,0.59253,Q17372444-P279-Q100026-ca0fc4bd-0,Q17372444,0.634662
3,Q100026,Q17372279-P279-Q100026-beba8cd1-0,Q17372279,P279,0.59253,Q17372455-P279-Q100026-c2b1bf36-0,Q17372455,0.552555
4,Q100026,Q17372279-P279-Q100026-beba8cd1-0,Q17372279,P279,0.59253,Q2029940-P279-Q100026-ceba4380-0,Q2029940,0.504576


In [177]:
len(hierDF_transP279_filtered_sibs1)

26426178

In [178]:
hierDF_transP279_filtered_sibs1 = hierDF_transP279_filtered_sibs1[hierDF_transP279_filtered_sibs1.node1 != hierDF_transP279_filtered_sibs1.node2]

In [179]:
len(hierDF_transP279_filtered_sibs1)

26121338

In [181]:
hierDF_transP279_filtered_sibs1.to_csv(P279_SIBLINGS_INTERM_FILE, index=None)

### Add labels + descs

In [182]:
from sklearn.utils.random import sample_without_replacement
import pandas as pd

In [193]:
hierDF_transP279_filtered_sibs = pd.read_csv(P279_SIBLINGS_INTERM_FILE)
hierDF_transP279_filtered_sibs1 = hierDF_transP279_filtered_sibs.iloc[sample_without_replacement(len(hierDF_transP279_filtered_sibs), 1000000, random_state=13)]
hierDF_transP279_filtered_sibs1 = hierDF_transP279_filtered_sibs1.reset_index()
print(len(hierDF_transP279_filtered_sibs1))
labelsDF = pd.read_csv(LABELS_FILE, compression='gzip', sep='\t')
hierDF_transP279_filtered_set = set(hierDF_transP279_filtered_sibs1.node1.to_list() + hierDF_transP279_filtered_sibs1.node2.to_list() + hierDF_transP279_filtered_sibs1.par_node.to_list())
labelsDF = labelsDF[labelsDF.node1.apply(lambda p: p in hierDF_transP279_filtered_set)]
labelsDict = {row['node1']: row['node2'] for _, row in labelsDF.iterrows() if row['node1'] in hierDF_transP279_filtered_set}
descriptionsDF = pd.read_csv(DESCRIPTIONS_FILE, compression='gzip', sep='\t')
descriptionsDF = descriptionsDF[descriptionsDF.node1.apply(lambda p: p in hierDF_transP279_filtered_set)]
descDict = {row['node1']: row['node2'] for _, row in descriptionsDF.iterrows() if row['node1'] in hierDF_transP279_filtered_set}

1000000


In [194]:
hierDF_transP279_filtered_sibs1['node1_label'] = hierDF_transP279_filtered_sibs1.node1.apply(lambda p: labelsDict[p][1:-4] if p in labelsDict else None)
hierDF_transP279_filtered_sibs1['node2_label'] = hierDF_transP279_filtered_sibs1.node2.apply(lambda p: labelsDict[p][1:-4] if p in labelsDict else None)
hierDF_transP279_filtered_sibs1['par_label'] = hierDF_transP279_filtered_sibs1.par_node.apply(lambda p: labelsDict[p][1:-4] if p in labelsDict else None)
hierDF_transP279_filtered_sibs1['node1_desc'] = hierDF_transP279_filtered_sibs1.node1.apply(lambda p: descDict[p][1:-4] if p in descDict else None)
hierDF_transP279_filtered_sibs1['node2_desc'] = hierDF_transP279_filtered_sibs1.node2.apply(lambda p: descDict[p][1:-4] if p in descDict else None)
hierDF_transP279_filtered_sibs1['node1_sent'] = hierDF_transP279_filtered_sibs1.apply(lambda p: p['node1_label'] + ' ' + p['node1_desc'] + ' is ' + p['par_label'],axis=1)
hierDF_transP279_filtered_sibs1['node2_sent'] = hierDF_transP279_filtered_sibs1.apply(lambda p: p['node2_label'] + ' ' + p['node2_desc'] + ' is ' + p['par_label'],axis=1)
hierDF_transP279_filtered_sibs2 = hierDF_transP279_filtered_sibs1[hierDF_transP279_filtered_sibs1.node1_desc != hierDF_transP279_filtered_sibs1.node2_desc]

In [195]:
print(f"Dataset reduced from {len(hierDF_transP279_filtered_sibs1)} to {len(hierDF_transP279_filtered_sibs2)} by eliminating rows where node1_desc == node2_desc")

Dataset reduced from 1000000 to 785418 by eliminating rows where node1_desc == node2_desc


In [196]:
hierDF_transP279_filtered_sibs1

Unnamed: 0,index,par_node,id,node1,label,similarity_value,id_right,node2,similarity_value_right,node1_label,node2_label,par_label,node1_desc,node2_desc,node1_sent,node2_sent
0,24998287,Q89,Q96627253-P279-Q89-51483924-0,Q96627253,P279,0.460166,Q595003-P279-Q89-ddc59387-0,Q595003,0.553182,Sevillana (manzana),Calville Blanc d\'hiver,apple,apple cultivar,apple cultivar,Sevillana (manzana) apple cultivar is apple,Calville Blanc d\'hiver apple cultivar is apple
1,6348351,Q17589470,Q56942-P279-Q17589470-d0932570-0,Q56942,P279,0.527086,Q69974281-P279-Q17589470-ed5eb339-0,Q69974281,0.466616,Wii U,Gamatic 7600,home video game console,eight-generation home video game console by Ni...,Pong console. The same of Otron Gamatic 7600,Wii U eight-generation home video game console...,Gamatic 7600 Pong console. The same of Otron G...
2,3244723,Q13357858,Q63845311-P279-Q13357858-c40a679c-0,Q63845311,P279,0.695520,Q79132633-P279-Q13357858-4ccf52ce-0,Q79132633,0.679767,2013 World Senior Badminton Championships O35,Slovak Badminton Championships U17,badminton tournament,badminton championships,badminton championships,2013 World Senior Badminton Championships O35 ...,Slovak Badminton Championships U17 badminton c...
3,5923963,Q174736,Q583164-P279-Q174736-262a2fe0-0,Q583164,P279,0.459580,Q2692256-P279-Q174736-e1333ac9-0,Q2692256,0.592024,Project 30bis,Bagley-class destroyer,destroyer,1949 Soviet destroyer class,class of U.S. destroyers,Project 30bis 1949 Soviet destroyer class is d...,Bagley-class destroyer class of U.S. destroyer...
4,10867070,Q215627,Q19840821-P279-Q215627-eb7ec03a-0,Q19840821,P279,0.201813,Q40554819-P279-Q215627-95d720c4-0,Q40554819,0.104633,Singaporeans,Sicilians,person,citizens or residents of Singapore,ethnic group indigenous to the island of Sicil...,Singaporeans citizens or residents of Singapor...,Sicilians ethnic group indigenous to the islan...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
999995,15478058,Q3382119,Q75086913-P279-Q3382119-a9ff163a-0,Q75086913,P279,0.769255,Q75828417-P279-Q3382119-59b8950c-0,Q75828417,0.621310,Sub-prefect of Nogent-le-Rotrou,Sub-prefect of Pont-Audemer,sub-prefect,French official position,former French official position (1800-1926),Sub-prefect of Nogent-le-Rotrou French officia...,Sub-prefect of Pont-Audemer former French offi...
999996,23394433,Q785745,Q23022056-P279-Q785745-f48ca89b-0,Q23022056,P279,0.405016,Q3511974-P279-Q785745-42da354d-0,Q3511974,0.306730,Tasmanian Government Railways J class,Alsace-Lorraine E 6,tank locomotive,class of 1 Australian 2-6-4-0T locomotive,class of 13 German (later French) metre-gauge ...,Tasmanian Government Railways J class class of...,Alsace-Lorraine E 6 class of 13 German (later ...
999997,19035228,Q483373,Q682971-P279-Q483373-6ff4adf5-0,Q682971,P279,0.314112,Q4598091-P279-Q483373-fd15b986-0,Q4598091,0.213652,Rhaetian Railway ABe 4/16,CTA 2000 series,electric multiple unit,multiple unit,class of Chicago Transit Authority cars,Rhaetian Railway ABe 4/16 multiple unit is ele...,CTA 2000 series class of Chicago Transit Autho...
999998,20618370,Q625151,Q841350-P279-Q625151-bd2c0728-0,Q841350,P279,0.459560,Q463577-P279-Q625151-4cfe245b-0,Q463577,0.466203,OSE class 120,BLS Re 4/4,electric locomotive,class of Greek electric locomotives,class of 35 Swiss electric locomoties,OSE class 120 class of Greek electric locomoti...,BLS Re 4/4 class of 35 Swiss electric locomoti...


In [197]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from time import time
import pandas as pd

def getSentEmbeddings(valSeries, modelName):
    model = SentenceTransformer(modelName, device='cuda:0')
    start = time()
    encodings = model.encode(valSeries.to_list(), show_progress_bar=True, batch_size=1000)
    print(time()-start,'s')
    return encodings
modelName = 'sentence-transformers/all-distilroberta-v1'
hierDF_transP279_filtered_sibs2 = hierDF_transP279_filtered_sibs2.reset_index()
hierDF_transP279_filtered_sibs2['node1_emb'] = pd.Series(list(getSentEmbeddings(hierDF_transP279_filtered_sibs2.node1_sent, modelName)))
hierDF_transP279_filtered_sibs2['node2_emb'] = pd.Series(list(getSentEmbeddings(hierDF_transP279_filtered_sibs2.node2_sent, modelName)))
hierDF_transP279_filtered_sibs2['similarity_value'] = hierDF_transP279_filtered_sibs2.apply(lambda p: cosine_similarity(p.node1_emb.reshape(1,-1), p.node2_emb.reshape(1,-1))[0][0], axis=1)
hierDF_transP279_filtered_sibs2['similarity_value'].describe()

Exception when trying to download https://sbert.net/models/sentence-transformers/all-distilroberta-v1.zip. Response 404
SentenceTransformer-Model https://sbert.net/models/sentence-transformers/all-distilroberta-v1.zip not found. Try to create it from scratch
Try to create Transformer Model sentence-transformers/all-distilroberta-v1 with mean pooling


Batches:   0%|          | 0/786 [00:00<?, ?it/s]

693.1311120986938 s


Exception when trying to download https://sbert.net/models/sentence-transformers/all-distilroberta-v1.zip. Response 404
SentenceTransformer-Model https://sbert.net/models/sentence-transformers/all-distilroberta-v1.zip not found. Try to create it from scratch
Try to create Transformer Model sentence-transformers/all-distilroberta-v1 with mean pooling


Batches:   0%|          | 0/786 [00:00<?, ?it/s]

688.8171112537384 s


count    785418.000000
mean          0.512485
std           0.160710
min          -0.145408
25%           0.399946
50%           0.515756
75%           0.632409
max           0.997993
Name: similarity_value, dtype: float64

In [214]:
hierDF_transP279_filtered_sibs2[['id', 'node1', 'label', 'node2', 'par_node', 'similarity_value']].to_csv(P279_SIBLINGS_DISTILBERT_COSSIM_FILE)

In [28]:
# # Alt scoring
# from sentence_transformers.cross_encoder import CrossEncoder
# crossEncModel = 'cross-encoder/stsb-roberta-large'
# model = CrossEncoder(crossEncModel, device='cuda:3')
# crossEncSimVals = model.predict(list(zip(hierDF_transP279_filtered_sibs2.node1_sent.to_list(), hierDF_transP279_filtered_sibs2.node2_sent.to_list())), show_progress_bar=True, batch_size=1000)

Downloading:   0%|          | 0.00/629 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/772 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/139 [00:00<?, ?B/s]

Batches:   0%|          | 0/3708 [00:00<?, ?it/s]

# Probase Datasets

In [6]:
probDF = pd.read_csv(PROBASE_SOURCE_DIR + 'data-concept/data-concept-instance-relations.txt',header=None, sep='\t')
probDF.columns=['node1_label', 'node2_label', 'no_of_relations']
probDF.head()

Unnamed: 0,node1_label,node2_label,no_of_relations
0,factor,age,35167
1,free rich company datum,size,33222
2,free rich company datum,revenue,33185
3,state,california,18062
4,supplement,msm glucosamine sulfate,15942


In [14]:
probDF['node1'] = probDF.node1_label.apply(lambda p: labels_to_qnodes_dict[p] if p in labels_to_qnodes_dict else None)
probDF['node2'] = probDF.node2_label.apply(lambda p: labels_to_qnodes_dict[p] if p in labels_to_qnodes_dict else None)

In [26]:
def fix_qnodes_to_wsim(row):
    if row.node1_label in wordsim_labels_to_qnodes_dict:
        row['node1'] = wordsim_labels_to_qnodes_dict[row.node1_label]
    if row.node2_label in wordsim_labels_to_qnodes_dict:
        row['node2'] = wordsim_labels_to_qnodes_dict[row.node2_label]
    return row
tqdm.pandas()
probDF1 = probDF.progress_apply(fix_qnodes_to_wsim, axis=1)

  0%|          | 0/33377320 [00:00<?, ?it/s]

In [27]:
probDF1

Unnamed: 0,node1_label,node2_label,no_of_relations,node1,node2
0,factor,age,35167,Q103858669,Q100343219
1,free rich company datum,size,33222,,Q322481
2,free rich company datum,revenue,33185,,Q850210
3,state,california,18062,P1310,
4,supplement,msm glucosamine sulfate,15942,Q2915731,
...,...,...,...,...,...
33377315,popular legacy datum structure,binary search tree,1,,
33377316,norwegian food,lefse,1,,
33377317,freeze skill,new fot,1,,
33377318,enhanced enforcement initiative,monthly impact inspections of problem mine,1,,


In [28]:
len(probDF1)

33377320

In [29]:
((~probDF1.node1.isna()) & (~probDF1.node2.isna())).sum()

955210

In [30]:
probDF1.node1_label.value_counts()

factor                         364111
feature                        203549
issue                          201986
product                        172106
item                           158829
                                ...  
non opec supplier                   1
proactive industry group            1
car s important information         1
mesofronts                          1
bucket related offer                1
Name: node1_label, Length: 5376524, dtype: int64

In [32]:
probDF1.to_csv(PROBASE_INTERM_FILE,index=False)

In [33]:
probDF_Qnodes_DF_WQnodes = pd.read_csv(PROBASE_INTERM_FILE)

In [34]:
probDF_Qnodes_DF_WQnodes1_subset = probDF_Qnodes_DF_WQnodes[(~probDF_Qnodes_DF_WQnodes.node1.isna()) & (~probDF_Qnodes_DF_WQnodes.node2.isna())]

In [35]:
probDF_Qnodes_DF_WQnodes1_subset['no_of_relations'].describe()

count    955210.000000
mean         10.028577
std          85.410631
min           1.000000
25%           1.000000
50%           2.000000
75%           5.000000
max       35167.000000
Name: no_of_relations, dtype: float64

In [36]:
maxNR = probDF_Qnodes_DF_WQnodes1_subset['no_of_relations'].max()
probDF_Qnodes_DF_WQnodes1_subset['similarity_value'] = probDF_Qnodes_DF_WQnodes1_subset['no_of_relations'].apply(np.log10) / np.log10(maxNR)
probDF_Qnodes_DF_WQnodes1_subset.similarity_value.describe()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  probDF_Qnodes_DF_WQnodes1_subset['similarity_value'] = probDF_Qnodes_DF_WQnodes1_subset['no_of_relations'].apply(np.log10) / np.log10(maxNR)


count    955210.000000
mean          0.091717
std           0.112667
min           0.000000
25%           0.000000
50%           0.066217
75%           0.153750
max           1.000000
Name: similarity_value, dtype: float64

In [37]:
probDF_Qnodes_DF_WQnodes1_subset.to_csv(PROBASE_FINAL_FILE, index=None)

# Class Similarity Datasets

`zcat derived.dwd_isa_class_count.compact.tsv.gz | cut -f 3 | tail -n +2 | sed -e 's/|/\n/g' -e 's/:/\t/g' | sort | uniq > class-counts.tsv`

In [201]:
import requests
from tqdm.notebook import tqdm
import json
from joblib import Parallel, delayed
import sys

hierDF_transP279_filtered2_class_arr = []

def fetchClassSim(row, similarity_type):
    resp = requests.get("https://kgtk.isi.edu/similarity_api?q1="+row['node1']+"&q2="+row['node2']+"&embedding_type="+similarity_type)
    try:
        row['similarity_value'] = float(resp.json()['similarity']) if resp else -1
    except Exception as exc:
        print(exc)
        print(f"Resp not found for {row['node1']}, {row['node2']}")
        row['similarity_value'] = None
    return row

In [200]:
P279childPar_class = pd.read_csv(P279_CHILD_PAR_DISTILBERT_COSSIM_FILE)

In [205]:
tqdm.pandas()
P279childPar_class_new = P279childPar_class.progress_apply(fetchClassSim, axis=1, args=('class',))

  0%|          | 0/304654 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
P279childPar_class_new.to_csv(P279_CHILD_PAR_CLASSSIM_FILE, index=None)

In [202]:
P279childPar_class_arr = Parallel(n_jobs=5)(delayed(fetchClassSim)(row, 'class') for _, row in tqdm(P279childPar_class.iterrows(), total=len(P279childPar_class)))
                                                                                                                                                                                                                                                                                  

  0%|          | 0/304654 [00:00<?, ?it/s]

KeyboardInterrupt: 

## ChildPar

In [32]:
hierDF_transP279_filtered2 = pd.read_csv('../data/Master_P279_dataset/P279ChildPar_transP279_filtered_min_cols.csv')

In [39]:
hierDF_transP279_filtered2[['node1', 'node2']].rename(columns={'node1':'q1', 'node2':'q2'}).to_csv('../data/Master_P279_dataset/P279ChildPar_transP279_filtered_min_cols_FOR_CLASS.tsv', sep='\t', index=False)

In [212]:
class_dict = {}
hierDF_transP279_filtered = pd.read_csv(P279_CHILD_PAR_DISTILBERT_COSSIM_FILE)
p279QnodesList = set(hierDF_transP279_filtered.node1.to_list() + hierDF_transP279_filtered.node2.to_list())
with gzip.open(DERIVED_IS_COUNTS_FILE) as class_file:
    for line in tqdm(class_file):
        line = line.decode('utf-8').strip().split('\t')
        if line[0] in p279QnodesList:
            class_dict[line[0]] = line[2].strip()
all_class_count_dict = {}
with open(CLASS_COUNTS_FILE) as class_file:
    for line in tqdm(class_file):
        line = line.split('\t')
        if line[0] in p279QnodesList:
            all_class_count_dict[line[0]] = line[1].strip()


class SemanticSimilarity(object):
    def __init__(self):
#         self.config = config
#         self.embeddings_type = ['class']
        self.N = float(52546967)
        self.all_class_count_dict = all_class_count_dict
        
    def get_qnode_details(self, qnodes: list):
        global class_dict
        qnodes_dict = {}
        qnodes_dict[qnodes[0]] = class_dict.get(qnodes[0],'')
        qnodes_dict[qnodes[1]] = class_dict.get(qnodes[1],'')
        return qnodes_dict
        
    def compute_class_similarity(self, q1, q2):
        qnodes_dict = self.get_qnode_details([q1, q2])
#         print(qnodes_dict)
        feature_dict, feature_count_dict = self.build_qnode_feature_dict(qnodes_dict)
#         print(feature_dict, feature_count_dict)
        normalized_classes_idf = self.normalize_idf_classes(feature_dict, feature_count_dict)
        if q1 in feature_dict and q2 in feature_dict:
            q1_cl = set(feature_dict[q1])
            q2_cl = set(feature_dict[q2])
            q1_q2_intersection = q1_cl.intersection(q2_cl)

            _similarity = sum([normalized_classes_idf.get(c) for c in q1_q2_intersection])
            return {
                'similarity': _similarity
            }
        return {
            'similarity': None
        }

    def build_qnode_feature_dict(self, qnodes_dict: dict) -> (dict, dict):
        feature_dict = {}
        feature_count_dict = {}

        for qnode in qnodes_dict:
            feature_val = []
            
            if '|' in qnodes_dict[qnode]:
                cl = qnodes_dict[qnode].split("|")

                for c in cl:
                    vals = c.split(":")
                    feature_val.append(vals[0])
                    feature_count_dict[vals[0]] = float(vals[1])
            if qnode not in feature_val:
                feature_val.append(qnode)
            if qnode not in feature_count_dict:
                feature_count_dict[qnode] = float(self.all_class_count_dict.get(qnode, 1.0))
            feature_dict[qnode] = feature_val

        return feature_dict, feature_count_dict

    def normalize_idf_classes(self, feature_dict, feature_count_dict):

        classes_count = {}
        for qnode in feature_dict:

            classes = feature_dict[qnode]
            for c in classes:
                if c not in classes_count:
                    classes_count[c] = 0
                classes_count[c] += 1

        classes_idf = self.calculate_idf_features(feature_count_dict)

        # multiply class count with idf
        for c in classes_idf:
            classes_idf[c] = classes_count[c] * classes_idf[c]

        # normalize the idf scores so that they sum to 1
        classes_idf_sum = sum([classes_idf[x] for x in classes_idf])
        for c in classes_idf:
            classes_idf[c] = classes_idf[c] / classes_idf_sum

        return classes_idf

    def calculate_idf_features(self, feature_count_dict):
        _ = {}
        for c in feature_count_dict:
            _[c] = math.log(self.N / feature_count_dict[c])
        return _

0it [00:00, ?it/s]

0it [00:00, ?it/s]

In [None]:
# import requests
# from tqdm.notebook import tqdm
# import json
# from joblib import Parallel, delayed
# import sys

# hierDF_transP279_filtered2_class_arr = []

# def fetchClassSim(row):
#     resp = requests.get("https://kgtk.isi.edu/similarity_api?q1="+row['node1']+"&q2="+row['node2']+"&embedding_type=class")
#     try:
#         row['classSim'] = float(resp.json()['similarity']) if resp else -1
#     except Exception as exc:
#         print(exc)
#         print(f"Resp not found for {row['node1']}, {row['node2']}")
#         row['classSim'] = None
#     row['Resp_code'] = resp
#     return row

# hierDF_transP279_filtered2_class_arr = Parallel(n_jobs=5)(delayed(fetchClassSim)(row) for _, row in tqdm(hierDF_transP279_filtered2.iloc[:100].iterrows(), total=len(hierDF_transP279_filtered2)))
                                                                                                                                          
                                                                                                                                          

In [None]:
hierDF_transP279_filtered2_class_arr

In [66]:
hierDF_transP279_filtered2_class = pd.DataFrame(hierDF_transP279_filtered2_class_arr)

In [67]:
hierDF_transP279_filtered2_class.head()

In [52]:
hierDF_transP279_filtered2_class.classSim.isna().sum()

302108

## Siblings

In [33]:
P279_3M_data = pd.read_csv('../data/Master_P279_dataset/P279Siblings_transP279_filtered_min_cols_with_desc_dups_removed.csv')
P279_10M_data = pd.read_csv('../data/Master_P279_dataset/P279Siblings_transP279_filtered_min_cols.csv')

In [None]:
# import requests
# from tqdm.notebook import tqdm
# import json
# from joblib import Parallel, delayed

# P279_3M_data_class_arr = []

# def fetchClassSim(row):
#     try:
#         resp = requests.get("https://kgtk.isi.edu/similarity_api?q1="+row['node1']+"&q2="+row['node2']+"&embedding_type=class").json()['similarity']
#         row['classSim'] = float(resp) if resp else -1
#     except:
#         print(f"Resp not found for {row['node1']}, {row['node2']}")
#         row['classSim'] = None
#     return row

# P279_3M_data_class_arr = Parallel(n_jobs=20)(delayed(fetchClassSim)(row) for _, row in tqdm(P279_3M_data.iterrows(), total=len(P279_3M_data)))

In [None]:
# import requests
# from tqdm.notebook import tqdm
# import json
# from joblib import Parallel, delayed

# P279_10M_data_class_arr = []

# def fetchClassSim(row):
#     try:
#         resp = requests.get("https://kgtk.isi.edu/similarity_api?q1="+row['node1']+"&q2="+row['node2']+"&embedding_type=class").json()['similarity']
#         row['classSim'] = float(resp) if resp else -1
#     except:
#         print(f"Resp not found for {row['node1']}, {row['node2']}")
#         row['classSim'] = None
#     return row

# P279_10M_data_class_arr = Parallel(n_jobs=20)(delayed(fetchClassSim)(row) for _, row in tqdm(P279_10M_data.iterrows(), total=len(P279_10M_data)))

# JC Similarity Datasets

## ChildPar

In [32]:
hierDF_transP279_filtered2 = pd.read_csv('../data/Master_P279_dataset/P279ChildPar_filtered.csv')

In [74]:
import requests
from tqdm.notebook import tqdm
import json
from joblib import Parallel, delayed
import sys

hierDF_transP279_filtered2_class_arr = []

def fetchJCSim(row):
    resp = requests.get("https://kgtk.isi.edu/similarity_api?q1="+row['node1']+"&q2="+row['node2']+"&embedding_type=class")
    try:
        row['similarity_value'] = float(resp.json()['similarity']) if resp else -1
    except Exception as exc:
        print(exc)
        print(f"Resp not found for {row['node1']}, {row['node2']}")
        row['similarity_value'] = None
    row['Resp_code'] = resp
    return row

hierDF_transP279_filtered2_class_arr = Parallel(n_jobs=5)(delayed(fetchClassSim)(row) for _, row in tqdm(hierDF_transP279_filtered2.iloc[:100].iterrows(), total=len(hierDF_transP279_filtered2)))
                                                                                                                                          
                                                                                                                                          

  0%|          | 0/303284 [00:00<?, ?it/s]

could not convert string to float: ''
Resp not found for Q102108504, Q100171002


In [75]:
hierDF_transP279_filtered2_class_arr

[Unnamed: 0                                      0
 id              Q17372279-P279-Q100026-beba8cd1-0
 node1                                   Q17372279
 label                                        P279
 node2                                     Q100026
 bert2SentSim                             0.666525
 classSim                                 0.835782
 Resp_code                        <Response [200]>
 Name: 0, dtype: object,
 Unnamed: 0                                      1
 id              Q17372377-P279-Q100026-fd42bd71-0
 node1                                   Q17372377
 label                                        P279
 node2                                     Q100026
 bert2SentSim                             0.700515
 classSim                                  0.94674
 Resp_code                        <Response [200]>
 Name: 1, dtype: object,
 Unnamed: 0                                      2
 id              Q17372444-P279-Q100026-ca0fc4bd-0
 node1                          

In [66]:
hierDF_transP279_filtered2_class = pd.DataFrame(hierDF_transP279_filtered2_class_arr)

In [67]:
hierDF_transP279_filtered2_class.head()

In [52]:
hierDF_transP279_filtered2_class.classSim.isna().sum()

302108

## Siblings

In [33]:
P279_3M_data = pd.read_csv('../data/Master_P279_dataset/P279Siblings_transP279_filtered_min_cols_with_desc_dups_removed.csv')
P279_10M_data = pd.read_csv('../data/Master_P279_dataset/P279Siblings_transP279_filtered_min_cols.csv')

In [None]:
import requests
from tqdm.notebook import tqdm
import json
from joblib import Parallel, delayed

P279_3M_data_class_arr = []

def fetchClassSim(row):
    try:
        resp = requests.get("https://kgtk.isi.edu/similarity_api?q1="+row['node1']+"&q2="+row['node2']+"&embedding_type=class").json()['similarity']
        row['classSim'] = float(resp) if resp else -1
    except:
        print(f"Resp not found for {row['node1']}, {row['node2']}")
        row['classSim'] = None
    return row

P279_3M_data_class_arr = Parallel(n_jobs=20)(delayed(fetchClassSim)(row) for _, row in tqdm(P279_3M_data.iterrows(), total=len(P279_3M_data)))

In [None]:
import requests
from tqdm.notebook import tqdm
import json
from joblib import Parallel, delayed

P279_10M_data_class_arr = []

def fetchClassSim(row):
    try:
        resp = requests.get("https://kgtk.isi.edu/similarity_api?q1="+row['node1']+"&q2="+row['node2']+"&embedding_type=class").json()['similarity']
        row['classSim'] = float(resp) if resp else -1
    except:
        print(f"Resp not found for {row['node1']}, {row['node2']}")
        row['classSim'] = None
    return row

P279_10M_data_class_arr = Parallel(n_jobs=20)(delayed(fetchClassSim)(row) for _, row in tqdm(P279_10M_data.iterrows(), total=len(P279_10M_data)))

# Wiki-CS

In [1]:
import pandas as pd
import os
from sklearn.metrics.pairwise import cosine_similarity
from matplotlib import pyplot

In [None]:
wikiDF = pd.read_csv('../data/wikidata-cs-20200504.tsv',sep='\t')
mapper1 = {'/r/DistinctFrom': 'distinctness',
          '/r/Antonym': 'antonymy',
          '/r/Synonym': 'synonymy',
          '/r/SimilarTo': 'similarity',
          '/r/DerivedFrom': 'derivation',
          '/r/IsA': 'inheritance',
          '/r/PartOf': 'meronymy',
          '/r/MadeOf': 'material',
          '/r/CreatedBy': 'attribution',
          '/r/UsedFor': 'utility',
          '/r/HasProperty': 'properties',
          '/r/Causes': 'causation',
          '/r/HasPrerequisite': 'ordering',
          '/r/HasContext': 'context',
          '/r/RelatedTo': 'other'}

wikiDF['dim'] = wikiDF['relation'].apply(lambda p: mapper1[p])
wikiDF['category'] = wikiDF['dim'].apply(lambda p: 'I' if p == 'synonymy' or p == 'similarity' else 'M')
wikiDF['dim'].value_counts()

In [None]:
wikiDF['dim'].apply(lambda p: 'I' if p == 'synonymy' else 'M').value_counts()

In [None]:
wikiDF1 = wikiDF[wikiDF.apply(lambda p: p['node1'].startswith('Q') and p['node2'].startswith('Q'), axis=1)]
wikiDF1.category.value_counts()

In [None]:
wikiDF1 = wikiDF1.rename(columns={'node1':'word1_kg_id', 'node2':'word2_kg_id'})
wikiDF1[['word1_kg_id', 'word2_kg_id', 'category', 'node1;label', 'node2;label']].to_csv('../data/wikidata-cs_categorized.csv',index=None)

In [3]:
wikiDF = pd.read_csv('../data/wikidata-cs_categorized.csv')

In [4]:
wikiDF.category.value_counts()

M    100110
I      1415
Name: category, dtype: int64

In [5]:
I_set_size = len(wikiDF[wikiDF.category == 'I'])
M_sampled_set = wikiDF[wikiDF.category == 'M'].sample(I_set_size, random_state=13)

In [19]:
from itertools import combinations

def find_relation_mapping(mainset, subset, needed_size):
    nodes = set([row['word1_kg_id'] for _,row in subset.iterrows()] 
            + [row['word2_kg_id'] for _,row in subset.iterrows()])
    node_label_mappings = {row['word1_kg_id']: row['node1;label'] for _, row in subset.iterrows()}.update(
                                {row['word2_kg_id']: row['node2;label'] for _, row in subset.iterrows()}
                        )
    mainset_pairs = set([(row['word1_kg_id'], row['word2_kg_id']) for _,row in mainset.iterrows()])

    all_pairs = set([pair for pair in list(combinations(list(nodes), 2))[10*needed_size]])
    
    removed_pairs = set()
    
    first_line = True
    with gzip.open('../data/wikidata-20210215-dwd/claims.tsv.gz', 'r') as claims_file:
        for line in tqdm(claims_file,total=98482585):
            if first_line:
                first_line = False
                continue
            line = line.decode("utf-8").strip().split('\t')
            curr_pair = (line[1], line[3])
            if curr_pair in all_pairs:
                removed_pairs.add(curr_pair)
    newDF = []
    for pair in all_pairs - removed_pairs - mainset_pairs:
        try:
            if pair[0] is None or pair[1] is None:
                continue
            newDF.append(pair[0], pair[1], 'U', node_label_mappings[pair[0]], node_label_mappings[pair[1]])
        except:
            print(pair)
    return pd.DataFrame(newDF, columns=subset.columns)

In [20]:
%%time

U_sampled_set = find_relation_mapping(wikiDF, wikiDF[wikiDF.category == 'M'], I_set_size)

  0%|          | 0/98482585 [00:00<?, ?it/s]

Q1070669
Q20755059
CPU times: user 13min 20s, sys: 1min 24s, total: 14min 44s
Wall time: 14min 40s


In [21]:
len(U_sampled_set)

0

# Check Diff in versions

In [44]:
import pandas as pd
def get_all_nodes():
    p279ChildPar = pd.read_csv('../data/Master_P279_dataset/P279ChildPar_transP279_filtered_baremetal.csv')
    wordsim_df = pd.read_csv('../data/wordsim353_with_r3.csv')
    wiki_cs_df = pd.read_csv('../data/wikidata-cs_categorized.csv')
    concept_net_df = pd.read_csv('../data/kgtk_conceptnet_evaluation.csv')
    p279QnodesList = set(p279ChildPar.node1.to_list() 
                        + p279ChildPar.node2.to_list()
                        + wordsim_df['word1_kg_id'].to_list() 
                        + wordsim_df['word2_kg_id'].to_list()
                        + wiki_cs_df['word1_kg_id'].to_list() 
                        + wiki_cs_df['word2_kg_id'].to_list()
                        + concept_net_df['word1_kg_id'].to_list()
                        + concept_net_df['word2_kg_id'].to_list())
    print(len(p279QnodesList))
    return p279QnodesList
allNodes = get_all_nodes()

275927


In [None]:
files_set_1 = ['../data/wikidata-20210215-dwd/descriptions.en.tsv.gz',
              '../data/wikidata-20210215-dwd/sitelinks.en.tsv.gz']

files_set_2 = [DESCRIPTIONS_FILE,
              '../../wd-correctness/gdrive-kgtk-dump-2020-12-07/sitelinks.en.tsv.gz']


all_nodes_set = set(allNodes)
for file1, file2 in zip(files_set_1, files_set_2):
    file1_dict, file2_dict = {}, {}
    with gzip.open(file1) as file1:
        for line in tqdm(file1):
            line = line.decode("utf-8").strip().split('\t')
            if line[1] in all_nodes_set:
                file1_dict[line[1]] = line[3][1:-4]
    with gzip.open(file2) as file2:
        for line in tqdm(file2):
            line = line.decode("utf-8").strip().split('\t')
            if line[1] in all_nodes_set:
                file2_dict[line[1]] = line[3][1:-4]
    for elem in file1_dict.keys():
        try:
            if file1_dict[elem] != file2_dict[elem]:
                print(elem, file1_dict[elem], file2_dict[elem])
        except:
            continue