In [1]:
from nltk.corpus import wordnet as wn
import pandas as pd
import json
import os

In [2]:
import config

In [3]:
def create_uri(ns, rel):
    """Return the namespaced identifier ``<ns>:<rel>``.

    Both arguments are rendered with their string form and joined by a colon.
    """
    return f'{ns}:{rel}'

In [4]:
# --- Values taken from the shared config module ---
VERSION = config.VERSION
EDGE_COLS = config.edges_cols
MOWGLI_NS = config.mowgli_ns
WORDNET_NS = config.wordnet_ns
data_source = config.wn_ds

# Constant weight attached to every generated edge (kept as a string).
weight = "1.0"

# --- Input files ---
cnfile = '../input/conceptnet/conceptnet-en-with-externalurl.csv'
wordnet30_ili_file = '../input/mappings/ili-map-pwn30.tab'
wordnet31_ili_file = '../input/mappings/ili-map-pwn31.tab'

# --- Node files read by later cells ---
cn_nodes_file = f'../output_v{VERSION}/conceptnet/nodes_v{VERSION}.csv'
wn_nodes_file = f'../output_v{VERSION}/wordnet/nodes_v{VERSION}.csv'
vg_nodes_file = f'../output_v{VERSION}/visualgenome/nodes_v{VERSION}.csv'

# --- Output location ---
output_dir = f'../output_v{VERSION}/mappings'
edges_file = f'{output_dir}/edges_v{VERSION}.csv'

# Predicate placed on every mapping edge built in this notebook.
SAMEAS_REL = create_uri(MOWGLI_NS, config.sameas)

In [5]:
# Create the output directory (and any missing parents). exist_ok=True makes
# this safe to re-run and avoids the check-then-create race; it also raises
# if the path exists but is not a directory instead of silently continuing.
os.makedirs(output_dir, exist_ok=True)

### Load the data in pandas

In [None]:
# Read the tab-separated, header-less input file; column 4 is decoded from JSON.
df = pd.read_csv(
    cnfile,
    sep='\t',
    header=None,
    converters={4: json.loads},
)

In [None]:
# Give the five positional columns readable names.
df.columns = [
    'assertion',
    'rel',
    'subj',
    'obj',
    'metadata',
]

In [None]:
# DataFrame.drop returns a new frame, so the result must be assigned back for
# the 'assertion' column to actually be removed. errors='ignore' keeps the cell
# re-runnable once the column is already gone.
df = df.drop(columns=['assertion'], errors='ignore')
# Show a short preview rather than the whole frame.
df.head()

In [None]:
# Number of rows loaded from cnfile.
len(df)

In [None]:
# Keep only ExternalURL rows whose object matches the WordNet URL pattern.
is_external_url = df['rel'] == '/r/ExternalURL'
points_to_wordnet = df['obj'].str.contains(r'http://wordnet-')
df_wordnet = df.loc[is_external_url & points_to_wordnet]

In [None]:
# Number of ExternalURL rows that point at a WordNet URL.
len(df_wordnet)

### Get previous nodes

In [None]:
# all_nodes collects node ids from every source read below;
# wn_nodes collects only the WordNet ones.
all_nodes, wn_nodes = set(), set()

In [None]:
# Add the first tab-separated field of every line of the ConceptNet nodes file.
with open(cn_nodes_file, 'r') as cn_handle:
    all_nodes.update(record.split('\t')[0] for record in cn_handle)

In [None]:
# Every id in the WordNet nodes file goes into both sets.
with open(wn_nodes_file, 'r') as wn_handle:
    wn_file_ids = [record.split('\t')[0] for record in wn_handle]
all_nodes.update(wn_file_ids)
wn_nodes.update(wn_file_ids)

In [None]:
# From the Visual Genome nodes file, keep only ids that start with 'wn:'.
with open(vg_nodes_file, 'r') as vg_handle:
    vg_ids = (record.split('\t')[0] for record in vg_handle)
    vg_wn_ids = [node_id for node_id in vg_ids if node_id.startswith('wn:')]
all_nodes.update(vg_wn_ids)
wn_nodes.update(vg_wn_ids)

In [None]:
# Number of distinct WordNet node ids collected from the node files.
len(wn_nodes)

### Load WordNet mappings (3.1 to 3.0, via ILI)

In [None]:
# ILI id -> {'31': ..., '30': ...}, filled in by the following cells.
mapping = dict()

In [None]:
# Each line holds two tab-separated fields: an ILI id and a WordNet 3.1 id.
# The second field is stored as read (including the line ending).
with open(wordnet31_ili_file, 'r') as ili_31_handle:
    for record in ili_31_handle:
        ili_id, synset_31 = record.split('\t')
        mapping[ili_id] = {'31': synset_31}

In [None]:
# Attach the WordNet 3.0 id to ILI entries already created from the 3.1 file;
# ILI ids that only occur in the 3.0 file are ignored.
with open(wordnet30_ili_file, 'r') as ili_30_handle:
    for record in ili_30_handle:
        ili_id, synset_30 = record.split('\t')
        entry = mapping.get(ili_id)
        if entry is not None:
            entry['30'] = synset_30

In [None]:
# Build a direct WordNet 3.1 id -> WordNet 3.0 id lookup from the ILI table.
# An entry only has a '30' key when its ILI id also appeared in the 3.0 map
# file, so entries known only to the 3.1 file are skipped here instead of
# raising KeyError.
mapping_31_30 = {}
for ili_data in mapping.values():
    if '30' not in ili_data:
        continue
    # Raw values may still carry whitespace such as the line's trailing newline.
    id_31 = ili_data['31'].strip()
    id_30 = ili_data['30'].strip()
    mapping_31_30[id_31] = id_30

In [None]:
# Peek at ten keys of mapping_31_30 to check the id format.
list(mapping_31_30.keys())[:10]

In [None]:
# Number of 3.1 ids that have a 3.0 counterpart.
len(mapping_31_30)

In [None]:
def extract_wn_version_id(uri):
    """Split a WordNet URL into its version segment and offset id.

    The URL is split on '/'. The fourth segment is returned as the version and
    the fifth segment, minus its first character, as the offset id. For
    example 'http://host/wn31/100001740-n' yields ('wn31', '00001740-n').

    Raises IndexError if the URL has fewer than five '/'-separated segments.
    """
    parts = uri.split('/')
    version, raw_id = parts[3], parts[4]
    return version, raw_id[1:]

In [None]:
# Number of candidate rows the edge-building loop below will iterate over.
len(df_wordnet)

In [None]:
# Build one sameAs edge per ConceptNet node whose WordNet 3.1 URL can be
# translated to a WordNet 3.0 synset that is present in wn_nodes.
missing_in_mapping = 0   # wn31 ids with no entry in mapping_31_30
missing_node_case = 0    # mapped synsets whose URI is not in wn_nodes
all_edges = []
for cn_node, wn_url in zip(df_wordnet['subj'], df_wordnet['obj']):
    wn_version, wn_offset_id = extract_wn_version_id(wn_url)
    # Only URLs whose version segment is 'wn31' are handled.
    if wn_version != 'wn31':
        continue

    wn_30_id = mapping_31_30.get(wn_offset_id)
    if wn_30_id is None:
        missing_in_mapping += 1
        continue

    # The 3.0 id has the form '<offset>-<pos>'; resolve it to a synset name.
    offset, pos = wn_30_id.split('-')
    wn_30_synset = wn.synset_from_pos_and_offset(pos, int(offset)).name()
    wn_30_synset_uri = create_uri(WORDNET_NS, wn_30_synset)

    if wn_30_synset_uri not in wn_nodes:
        missing_node_case += 1
        continue

    all_edges.append(
        [cn_node, SAMEAS_REL, wn_30_synset_uri, data_source, weight, {}]
    )

In [None]:
# Count of wn31 URLs whose id had no entry in mapping_31_30.
missing_in_mapping

In [None]:
# Count of mapped synsets whose URI was not found in wn_nodes.
missing_node_case

In [None]:
# Number of sameAs edges that were built.
len(all_edges)

In [None]:
# Assemble the edges into a frame, sort them and write them as a tab-separated file.
edges_df = pd.DataFrame(all_edges, columns=EDGE_COLS)
sorted_edges = edges_df.sort_values(by=['subject', 'predicate', 'object'])
sorted_edges.to_csv(edges_file, sep='\t', index=False)