In [23]:
import csv

## Setup directories

In [27]:
# Input file: the WebChild comparative dump that is passed to load_file
WebChild_file = "comparative-cw0912.txt"

# Output file: the KGTK-format TSV that write_file creates
kgtk_webchild = "kgtk_webchild_comparative.tsv"

## Transfer WebChild Structure

Structure is similar to kgtk_wordnet.tsv or kgtk_conceptnet.tsv

**Document Header Meaning Explanation**

Link: http://people.mpi-inf.mpg.de/~ntandon/resources/readme-comparative.html

x         | character varying | ambiguous arg1 e.g. car

r         | character varying | ambiguous attribute e.g. fast

y         | character varying | ambiguous arg2 e.g. bike

awps      | character varying | disambiguated attribute e.g. fast#a#1

xwps      | character varying | disambiguated arg1 e.g. car#n#1

rnorm     | character varying | normalized comparing relation 
                             e.g. be faster than
                             
ywps      | character varying | disambiguated arg2 e.g. plant#n#2

direction | character varying | fwd or backward direction e.g. 
                             car faster bike is forward whereas bike slower car is backward direction 
                             
freq      | real              | corpus frequency of the triple.

normid    | character varying | id of the direction-normalized triple
                             e.g. car fast bike; bike slow car -- 12345
                             
observid  | character varying | id of the observation  e.g. car faster bike 1234 
                             
sources   | character varying | list of the sources e.g. ngram, wordnet

In [57]:
def load_file(filename):
    """Read a WebChild dump and return its header fields and data rows.

    The first line is treated as the header and split on commas. The second
    line is read and discarded. Every remaining line is split on tabs.

    Only the line terminator is removed before splitting, so empty leading
    or trailing fields (for example an empty last column) keep their
    position and each row retains its full column count. Lines that contain
    nothing but whitespace are skipped instead of producing a one-element
    row.

    Args:
        filename: Path of the WebChild text file; it is decoded as latin-1.

    Returns:
        A ``(head, lines)`` tuple: ``head`` is the list of header fields and
        ``lines`` is a list of rows, each row being a list of strings.
    """
    with open(filename, "r", encoding='latin-1') as f:
        head = f.readline().strip().split(",")
        f.readline()  # discard the line that follows the header

        lines = []
        for line in f:
            # A whitespace-only line carries no record; skip it.
            if not line.strip():
                continue
            # Strip only the newline: a plain strip() would also eat leading
            # and trailing tabs and thereby drop empty first/last columns.
            lines.append(line.rstrip("\r\n").split("\t"))

    return head, lines

def generate_wnId(id_):
    """Turn a '#'-separated identifier into a 'wn:'-prefixed dotted one.

    The identifier is split on '#'. If the last component is shorter than
    two characters, a single '0' is put in front of it. The components are
    then joined with '.' and prefixed with 'wn:'.

    Example: ``"car#n#1"`` becomes ``"wn:car.n.01"``.
    """
    *leading_parts, last_part = id_.split("#")
    padded_last = last_part if len(last_part) >= 2 else "0" + last_part
    return "wn:" + ".".join([*leading_parts, padded_last])

def write_file(data, filename):
    """Write WebChild rows to ``filename`` as a tab-separated KGTK file.

    The output starts with the header
    node1, relation, node2, node1;label, node2;label, relation;label,
    relation;dimension, source, sentence
    (the same columns as kgtk_wordnet.tsv or kgtk_conceptnet.tsv), followed
    by one output row per input row.

    Args:
        data: Iterable of rows; each row is indexed at positions 0-6 and 13.
        filename: Path of the TSV file to create (encoded as latin-1).
    """
    columns = ["node1","relation", "node2", "node1;label", "node2;label", "relation;label", "relation;dimension", "source", "sentence"]
    with open(filename, "w", newline="", encoding="latin-1") as out:
        tsv = csv.writer(out, delimiter='\t')
        tsv.writerow(columns)

        for row in data:
            # Identifiers come from positions 1, 4 and 6 of the input row.
            subject_id = generate_wnId(row[1])
            relation_id = generate_wnId(row[4])
            object_id = generate_wnId(row[6])

            subject_label, object_label = row[0], row[5]

            # A non-empty source loses its first character and has its
            # commas turned into '|' separators.
            sources = row[13]
            if sources:
                sources = sources[1:].replace(",", "|")

            # Sentence form: "[[<node1 label>]] <row[2]> [[<node2 label>]]"
            sentence = " ".join(["[[" + subject_label + "]]", row[2], "[[" + object_label + "]]"])

            tsv.writerow([
                subject_id,
                relation_id,
                object_id,
                subject_label,
                object_label,
                row[3],   # relation label
                "",       # relation dimension is always left empty
                sources,
                sentence,
            ])

In [58]:
# Load the WebChild file: `head` receives the header fields, `lines` the data rows
head, lines = load_file(WebChild_file)

# Convert the loaded rows and write them to the output file in KGTK format
write_file(lines, kgtk_webchild)