In [61]:
import json
import pandas as pd
from collections import defaultdict
import os
from nltk.corpus import wordnet as wn
from copy import copy

import config

In [2]:
# Release version taken from the project config; it is embedded in the
# output directory and output file names built further down.
VERSION = config.VERSION

In [14]:
# Column layouts for the node and edge tables, plus the data-source value
# that is written into every node row; all three come from the project config.
NODE_COLS = config.nodes_cols
EDGE_COLS = config.edges_cols
datasource = config.wn_ds

In [4]:
# INPUT FILES
input_dir = '../input/wordnet'
subclass_file = '{}/Edges_Synset_subClassOf.csv'.format(input_dir)

# OUTPUT FILES (the directory and both file names carry VERSION)
output_dir = '../output_v{}/wordnet'.format(VERSION)
nodes_file = '{}/nodes_v{}.csv'.format(output_dir, VERSION)
edges_file = '{}/edges_v{}.csv'.format(output_dir, VERSION)

In [5]:
# Create the output directory (including missing parents). exist_ok=True makes
# this safe to re-run and avoids the check-then-create race of testing
# os.path.exists() first; it still raises if the path exists but is not a directory.
os.makedirs(output_dir, exist_ok=True)

### Store subclass edges

In [23]:
# Load the tab-separated subclass edges; the first line is the header and the
# column at position 5 is decoded from JSON text while reading.
df = pd.read_csv(
    subclass_file,
    sep='\t',
    header=0,
    converters={5: json.loads},
)

In [70]:
# Keep only edges whose subject and object are both something other than the
# literal string 'None'.
object_present = df['object'] != 'None'
subject_present = df['subject'] != 'None'
tmp_edges_df = df[object_present & subject_present]

In [71]:
# Row count of the filtered edge table.
tmp_edges_df.shape[0]

87291

In [68]:
# Expand edges whose 'object' holds several comma-separated values into one
# edge per value; edges with a single object are kept unchanged.
clean_edges = []
for _, edge in tmp_edges_df.iterrows():
    targets = edge['object']
    if ',' not in targets:
        clean_edges.append(edge)
        continue
    for target in targets.split(','):
        # Copy first so every expanded edge is its own row object.
        expanded = copy(edge)
        expanded['object'] = target
        clean_edges.append(expanded)

In [69]:
# Number of edge rows after comma-separated objects were split into separate rows.
len(clean_edges)

88776

In [72]:
# Assemble the collected edge rows into a table with the columns given by EDGE_COLS.
edges_df = pd.DataFrame(data=clean_edges, columns=EDGE_COLS)

In [73]:
# Row count of the final edge table.
edges_df.shape[0]

88776

In [74]:
# Write the edges as a tab-separated file ordered by subject, predicate and
# object; the DataFrame index is not written.
edges_sorted = edges_df.sort_values(by=['subject', 'predicate', 'object'])
edges_sorted.to_csv(edges_file, sep='\t', index=False)

### Create nodes file and store it

In [75]:
# Distinct node ids; filled from the edge endpoints in the next step.
nodes = set()

In [76]:
# Every subject and every object of an edge is a node.
for endpoint in ('subject', 'object'):
    nodes.update(edges_df[endpoint])

In [77]:
# Number of distinct ids that occur as the subject or object of an edge.
len(nodes)

87664

In [82]:
# Build one node row per node id: [id, label, aliases, pos, datasource, other].
node_data=[]
for a_node in nodes:
    # The piece after the first ':' of the node id is looked up as a WordNet synset name.
    n=a_node.split(':')[1]

    # Reset every per-node field so nothing can leak in from the previous
    # iteration (pos used to keep the previous node's value, or be undefined
    # on the first node, whenever the name could not be parsed).
    label=''
    aliases=''
    pos=''

    syn=wn.synset(n)
    lemmas=[str(lemma.name()) for lemma in syn.lemmas()]
    if lemmas:
        # First lemma is the label; the remaining lemmas that differ from it
        # become the comma-separated aliases. Underscores are shown as spaces.
        label=lemmas[0].replace('_', ' ')
        aliases=','.join(l.replace('_', ' ') for l in lemmas[1:] if l!=lemmas[0])

    # Synset names end in '.<pos>.<number>'; split from the right so a '.'
    # inside the lemma part cannot shift the POS field.
    name_parts=n.rsplit('.', 2)
    if len(name_parts)<3:
        # Report names that do not have the three expected parts; pos stays ''.
        # NOTE(review): wn.synset() above probably rejects such names already - confirm.
        print(n)
    else:
        pos=name_parts[1]

    other={}
    node_data.append([a_node, label, aliases, pos, datasource, other])

In [83]:
# Number of node rows built; the loop above appends one row per node id.
len(node_data)

87664

In [84]:
# Turn the node rows into a table with the NODE_COLS layout and write it as a
# tab-separated file ordered by id; the DataFrame index is not written.
nodes_df = pd.DataFrame(data=node_data, columns=NODE_COLS)
nodes_sorted = nodes_df.sort_values(by='id')
nodes_sorted.to_csv(nodes_file, sep='\t', index=False)