In [20]:
import pandas as pd
from tqdm.notebook import tqdm
import gzip
import os
from sklearn.metrics.pairwise import cosine_similarity
from matplotlib import pyplot
from itertools import combinations
import random

In [3]:
LABELS_FILE = "../source_dataset_files/wikidata-20210215-dwd-v2/labels.en.tsv.gz"
ALL_EDGES_FILE = "../source_dataset_files/wikidata-20210215-dwd-v2/claims.tsv.gz"

In [26]:
CONCEPTNET_SOURCE_FILE = "../data/evaluation/source_files/kgtk_conceptnet.tsv"
CONCEPTNET_INTERM_FILE = "../data/evaluation/intermediate_files/kgtk_conceptnet.csv"
CONCEPTNET_FINAL_FILE = "../data/evaluation/kgtk_conceptnet_final.csv"

In [65]:
WIKI_CS_SOURCE_FILE = '../data/evaluation/source_files/wikidata-cs-20200504.tsv'
WIKI_CS_INTERM_FILE = '../data/evaluation/intermediate_files/wikidata-cs_categorized.csv'
WIKI_CS_FINAL_FILE = '../data/evaluation/wikidata-cs_final.csv'

In [57]:
def uniformize_dataset(wikiDF):
    needed_size = len(wikiDF[wikiDF.category == 'I'])
    M_sampled_set = wikiDF[wikiDF.category == 'M'].sample(needed_size, random_state=13)
    subset = wikiDF[wikiDF.category == 'M']
    mainset = wikiDF
    
    mainset_pairs = set([(row['word1_kg_id'], row['word2_kg_id']) for _,row in mainset.iterrows()])
    nodes = set([row['word1_kg_id'] for _,row in subset.iterrows()] 
            + [row['word2_kg_id'] for _,row in subset.iterrows()])
    node_label_mappings = {row['word1_kg_id']: row['node1;label'] for _, row in subset.iterrows()}
    node_label_mappings.update({row['word2_kg_id']: row['node2;label'] for _, row in subset.iterrows()})
    
    all_pairs = set([pair for pair in random.sample(list(combinations(list(nodes), 2)), 10*needed_size)])
    removed_pairs = set()

    first_line = True
    
    with gzip.open(ALL_EDGES_FILE, 'r') as claims_file:
        for line in tqdm(claims_file, total=491297976): #, total=1034927835):
            if first_line:
                first_line = False
                continue
            line = line.decode("utf-8").strip().split('\t')
            curr_pair = (line[1], line[3])
            if curr_pair in all_pairs:
                removed_pairs.add(curr_pair)
    newDF = []
    count = 0
    
    for pair in all_pairs - removed_pairs - mainset_pairs:
        if pair is None or pair[0] is None or pair[1] is None:
            continue
        try:
            newDF.append((pair[0], pair[1], 'U', node_label_mappings.get(pair[0],""), node_label_mappings.get(pair[1],""), None, None, None, None, None))
            count += 1
            if count == needed_size:
                break
        except:
            print(f"Error while processing pair: {pair}")
            
    return pd.concat([
            wikiDF[wikiDF.category == 'I'],
            M_sampled_set,
            pd.DataFrame(newDF, columns=subset.columns)
        ])

# ConceptNet

In [52]:
labelsMap = {}
with open(CONCEPTNET_SOURCE_FILE) as conceptFile:
    firstLine = True
    for line in tqdm(conceptFile):
        if firstLine:
            firstLine = False
            continue
        line = line.split('\t')
        labelsMap[line[3][1:-1]] = None
        labelsMap[line[4][1:-1]] = None
print("No. of conceptNet nodes fetched: ",len(labelsMap))
with gzip.open(LABELS_FILE, 'r') as labelsFile:
    firstLine = True
    for line in tqdm(labelsFile):
        if firstLine:
            firstLine = False
            continue
        line = line.decode('utf-8').strip().split('\t')
        line[3] = line[3][1:-5]
        qnode, label = line[1], line[3]
#         print(qnode, label)
        if label in labelsMap:
            if labelsMap[label] == None:
                labelsMap[label] = []
            labelsMap[label].append(qnode)

0it [00:00, ?it/s]

No. of conceptNet nodes fetched:  1165190


0it [00:00, ?it/s]

In [53]:
def fetchQnode(label):
    if pd.isna(label):
        return None
    label = label[1:-1]
    if label not in labelsMap:
        return None
    else:
        return labelsMap[label]

In [54]:
conceptDF = pd.read_csv(CONCEPTNET_SOURCE_FILE, sep='\t')
conceptDF['node1;qnode'] = conceptDF['node1;label'].apply(fetchQnode)
conceptDF['node2;qnode'] = conceptDF['node2;label'].apply(fetchQnode)
conceptDF1 = conceptDF[(~conceptDF['node1;qnode'].isna()) & (~conceptDF['node2;qnode'].isna())]
print(f"Concept Net dataset size reduced from {len(conceptDF)} pairs to {len(conceptDF1)} pairs")
conceptDF1['node1;qnode;len'] = conceptDF1['node1;qnode'].apply(len)
conceptDF1['node2;qnode;len'] = conceptDF1['node2;qnode'].apply(len)
conceptDF2 = conceptDF1[(conceptDF1['node1;qnode;len'] == 1) & (conceptDF1['node2;qnode;len'] == 1)]
print(f"Concept Net dataset size reduced from {len(conceptDF1)} pairs to {len(conceptDF2)} pairs by keeping only pairs which have one-one mapping with qnodes")
conceptDF2['relation;label'].value_counts()

  has_raised = await self.run_ast_nodes(code_ast.body, cell_name,


Concept Net dataset size reduced from 3423004 pairs to 99052 pairs


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  conceptDF1['node1;qnode;len'] = conceptDF1['node1;qnode'].apply(len)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  conceptDF1['node2;qnode;len'] = conceptDF1['node2;qnode'].apply(len)


Concept Net dataset size reduced from 99052 pairs to 2084 pairs by keeping only pairs which have one-one mapping with qnodes


related to                   758
synonym                      708
form of                      219
etymologically related to     82
derived from                  77
is a                          63
has context                   45
at location                   29
antonym                       15
manner of                     15
similar to                    15
part of                       12
causes                         9
distinct from                  6
used for                       5
receives action                4
has prerequisite               4
genus                          2
has a                          2
instance of                    2
has subevent                   2
capital                        2
motivated by goal              2
has last subevent              1
has property                   1
causes desire                  1
genre                          1
has first subevent             1
language                       1
Name: relation;label, dtype: int64

In [55]:
conceptDF2['category'] = conceptDF2['relation;label'].apply(lambda p: 'I' if p == 'synonym' or p == 'similar to' else 'M')
conceptDF2['relation;label'].apply(lambda p: 'I' if p == 'synonym' or p == 'similar to' else 'M').value_counts()
conceptDF2['word1_kg_id'] = conceptDF2['node1;qnode'].apply(lambda p: p[0])
conceptDF2['word2_kg_id'] = conceptDF2['node2;qnode'].apply(lambda p: p[0])
conceptDF2[['word1_kg_id', 'word2_kg_id', 'category', 'node1;label', 'node2;label', 'relation', 'relation;label', 'relation;dimension', 'source', 'sentence']].to_csv(CONCEPTNET_INTERM_FILE, index=None)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  conceptDF2['category'] = conceptDF2['relation;label'].apply(lambda p: 'I' if p == 'synonym' or p == 'similar to' else 'M')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  conceptDF2['word1_kg_id'] = conceptDF2['node1;qnode'].apply(lambda p: p[0])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  concep

In [58]:
conceptnet_uniform_df = uniformize_dataset(pd.read_csv(CONCEPTNET_INTERM_FILE))

  0%|          | 0/491297976 [00:00<?, ?it/s]

In [61]:
conceptnet_uniform_df.to_csv(CONCEPTNET_FINAL_FILE, index=None)

In [62]:
conceptnet_uniform_df.category.value_counts()

I    723
U    723
M    723
Name: category, dtype: int64

In [67]:
conceptnet_uniform_df[(conceptnet_uniform_df.category == 'I') & (conceptnet_uniform_df.word1_kg_id != conceptnet_uniform_df.word2_kg_id)].head(100)

Unnamed: 0,word1_kg_id,word2_kg_id,category,node1;label,node2;label,relation,relation;label,relation;dimension,source,sentence
1350,Q29736947,Q50414211,I,arco,bowed,/r/SimilarTo,similar to,,CN,[[arco]] is similar to [[bowed]]
1351,Q29782278,Q12835706,I,auxo,thallo,/r/SimilarTo,similar to,,CN,
1352,Q50414211,Q29736947,I,bowed,arco,/r/SimilarTo,similar to,,CN,[[bowed]] is similar to [[arco]]
1353,Q1146652,Q12835706,I,dike,thallo,/r/SimilarTo,similar to,,CN,
1354,Q61504726,Q3366001,I,empty,stripped,/r/SimilarTo,similar to,,CN,[[empty]] is similar to [[stripped]]
...,...,...,...,...,...,...,...,...,...,...
2025,Q190527,Q2655953,I,value,prize,/r/Synonym,synonym,,CN,[[value]] is a synonym of [[prize]]
2037,Q1128240,P3828,I,weary,aweary,/r/Synonym,synonym,,CN,[[weary]] is a synonym of [[aweary]]
2038,Q1128240,Q29713327,I,weary,tire,/r/Synonym,synonym,,CN,[[weary]] is a synonym of [[tire]]
2045,Q103859907,Q213449,I,whipping,beating,/r/Synonym,synonym,,CN,[[whipping]] is a synonym of [[beating]]


# Wiki-CS

In [41]:
wikiDF = pd.read_csv(WIKI_CS_SOURCE_FILE,sep='\t')
mapper1 = {'/r/DistinctFrom': 'distinctness',
          '/r/Antonym': 'antonymy',
          '/r/Synonym': 'synonymy',
          '/r/SimilarTo': 'similarity',
          '/r/DerivedFrom': 'derivation',
          '/r/IsA': 'inheritance',
          '/r/PartOf': 'meronymy',
          '/r/MadeOf': 'material',
          '/r/CreatedBy': 'attribution',
          '/r/UsedFor': 'utility',
          '/r/HasProperty': 'properties',
          '/r/Causes': 'causation',
          '/r/HasPrerequisite': 'ordering',
          '/r/HasContext': 'context',
          '/r/RelatedTo': 'other'}

wikiDF['dim'] = wikiDF['relation'].apply(lambda p: mapper1[p])
wikiDF['category'] = wikiDF['dim'].apply(lambda p: 'I' if p == 'synonymy' or p == 'similarity' else 'M')
wikiDF['dim'].value_counts()

inheritance     72707
meronymy         6886
context          5541
distinctness     4934
utility          2243
antonymy         2184
ordering         2107
material         1426
synonymy         1070
properties       1049
derivation        540
causation         510
similarity        345
attribution       187
other              42
Name: dim, dtype: int64

In [42]:
wikiDF['dim'].apply(lambda p: 'I' if p == 'synonymy' else 'M').value_counts()

M    100701
I      1070
Name: dim, dtype: int64

In [43]:
wikiDF1 = wikiDF[wikiDF.apply(lambda p: p['node1'].startswith('Q') and p['node2'].startswith('Q'), axis=1)]
wikiDF1.category.value_counts()

M    100110
I      1415
Name: category, dtype: int64

In [51]:
wikiDF1 = wikiDF1.rename(columns={'node1':'word1_kg_id', 'node2':'word2_kg_id'})
wikiDF1[['word1_kg_id', 'word2_kg_id', 'category', 'node1;label', 'node2;label', 'relation', 'relation;label', 'relation;dimension', 'source', 'sentence']].to_csv(WIKI_CS_INTERM_FILE,index=None)

In [59]:
wikiCS_uniform_df = uniformize_dataset(pd.read_csv(WIKI_CS_INTERM_FILE))

  0%|          | 0/491297976 [00:00<?, ?it/s]

In [66]:
wikiCS_uniform_df.to_csv(WIKI_CS_FINAL_FILE, index=None)

In [63]:
wikiCS_uniform_df.category.value_counts()

M    1415
I    1415
U    1415
Name: category, dtype: int64