1. download cskg with dimensions filled from here: https://drive.google.com/file/d/152rNzL3dNQGU7ee0UOK3UMU5H-mvWaeP/view?usp=sharing
2. Select only edges that:

a). have a dimension that is either "spatial" or "part-whole", and

b). have a node label that is one of the ~50 labels of interest shown in Figure 2c of this paper: https://arxiv.org/pdf/1711.11543.pdf. The label can appear either in the subject (node1) or object (node2)
3. Store these in a file.
4. compute statistics over this file: how many edges, distribution per relation (table), distribution per dimension (table), distribution per source (table), distribution for each of the labels of interest (table)
5. look at the quality of the edges per label of interest, and see if some of the 50 labels have better quality of information than others. Create one file per label of interest so we can take a look together.
6. look at the quality of the edges per source, and see if some sources have better quality than others. Create one file per source so we can take a look together.
7. look at the quality of the edges per relation, and see if some relation have better quality than others. Create one file per relation so we can take a look together.

In [1]:
import pandas as pd

## Set up folder

In [38]:
# input
cskg_dim_path = "./data/cskg_dim/cskg_dim.tsv"

# output
filter_line_file = "./data/cskg_dim/filter_cskg_dim.tsv"
edges_label = "./data/cskg_dim/cskg_dim_label.tsv"
edges_source = "./data/cskg_dim/cskg_dim_source.tsv"
edges_relation = "./data/cskg_dim/cskg_dim_relation.tsv"
matrix= "./data/cskg_dim/matrix.xlsx"

In [3]:
def load_source(filename):
    """
    load cskg_dim file
    
    return: head, lines
    """
    
    with open(filename, "r") as f:
        head = f.readline()
        lines = []
        for line_str in f:
            line = line_str.strip().split("\t")
            lines.append(line)
            
    return head,lines

def save_data(data,filename):
    """
    save cskg_dim filtered file
    return : None
    """
    with open(filename, "w") as f:
        f.write(head)
        for line in data:
            f.write(("\t").join(line)+"\n")
            
    return

def save_dataByItem(data_dict,filename):
    """
    save line by item
    """
    with open(filename,"w") as f:
        for key_ in data_dict:
            lines = data_dict[key_]
            f.write("**"+key_+"**"+"\n")
            for line in lines:
                f.write(("\t").join(line)+"\n")
            f.write("##########################################\n")
    return

def data_filter(data, dim_limit=None, label_limit=None):
    """
    only select the edge whose relation dimension in dim_limit and label in label_limit
    """
    label_count = dict()
    for item in label_limit:
        label_count[item] = 0
    
    f_line=[]
    
    for line in data:
        dim = line[7]
        label1_s = line[4].split("|")
        label2_s = line[5].split("|")
        
        if dim in dim_limit:
            for label1_, label2_ in zip(label1_s,label2_s):
                if label1_ in label_limit:
                    label_count[label1_] = label_count.get(label1_,0)+1
                    f_line.append(line)
                    continue
                    
                elif label1_ in label_limit:
                    label_count[label2_] = label_count.get(label2_,0)+1
                    f_line.append(line)
                    continue
                    
                else:
                    continue
    return f_line,label_count

## Load File

In [4]:
# load cskg_dim
head, lines = load_source(cskg_dim_path)

# example of lines
lines[0]

['/c/en/0-/r/DefinedAs-/c/en/empty_set-0000',
 '/c/en/0',
 '/r/DefinedAs',
 '/c/en/empty_set',
 '0',
 'empty set',
 'defined as',
 'similarity',
 'CN',
 '[[0]] is the [[empty set]].']

In [5]:
len(lines)

5895122

In [6]:
# limit dimension
dim_limit = set(["spatial", "part-whole"])

# limit label
label_limit = set(["rug","piano","dryer","computer","fireplace","whiteboard","bookshelf","wardrobe cabinet",
                  "pan","toilet","plates","ottoman","fish tank","dishwasher","microwave","water dispenser",
                  "bed","table","mirror","tv stand","stereo set","chessboard","playstation","vacuum cleaner",
                  "cup","xbox","heater","bathtub","shoe rack","range oven","refrigerator","coffee machine",
                  "sink","sofa","kettle","dresser","knife rack","towel rack","loudspeaker","utensil holder",
                  "desk","vase","shower","washer","fruit bowl","television","dressing table","cutting board",
                  "ironing board","food processor"])

# filter data

filter_lines,label_dict = data_filter(lines, dim_limit=dim_limit, label_limit=label_limit)

In [7]:
save_data(filter_lines,filter_line_file)

## Statistic

In [8]:
# number of edges
print("Number of edges:", len(filter_lines))

Number of edges: 3734


In [10]:
filter_lines[0]

['/c/en/aquarium/n/wn/artifact-/r/LocatedNear-/c/en/couch/n/wn/artifact-0000',
 '/c/en/aquarium/n/wn/artifact',
 '/r/LocatedNear',
 '/c/en/couch/n/wn/artifact',
 'fish tank',
 'couch',
 'next to',
 'spatial',
 'VG']

In [12]:
# distribution of relation id
relation_id_dict=dict()
for edge in filter_lines:
    rel_id=edge[2]
    relation_id_dict[rel_id]=relation_id_dict.get(rel_id,0)+1

In [14]:
rel_id_distri=pd.DataFrame(sorted(relation_id_dict.items(),key=lambda k:k[1],reverse=True))
rel_id_distri

Unnamed: 0,0,1
0,/r/LocatedNear,3446
1,/r/AtLocation,228
2,/r/PartOf,25
3,/r/HasA,20
4,/r/MadeOf,15


In [26]:
relation_id_idx=dict()
n=0
for item in relation_id_dict:
    relation_id_idx[item]=n
    n+=1

In [27]:
relation_id_idx

{'/r/LocatedNear': 0,
 '/r/AtLocation': 1,
 '/r/HasA': 2,
 '/r/MadeOf': 3,
 '/r/PartOf': 4}

In [28]:
edge

['Q806617-/r/PartOf-Q2092603-0000',
 'Q806617',
 '/r/PartOf',
 'Q2092603',
 'bed',
 'member',
 'has part|part of',
 'part-whole',
 'WD']

In [39]:
# build matrix
node_ids=dict()
for edge in filter_lines:
    subject_=edge[1]
    rel_=edge[2]
    object_=edge[3]
    
    temp1=node_ids.get(subject_,[0]*len(rel_id_distri))
    temp1[relation_id_idx[rel_]]=1
    node_ids[subject_]=temp1
    
    temp2=node_ids.get(subject_,[0]*len(rel_id_distri))
    temp2[relation_id_idx[rel_]]=1
    node_ids[subject_]=temp2

In [40]:
pd_dict=dict()
pd_dict["node_id"]=list(node_ids.keys())

for name in relation_id_idx:
    pd_dict[name]=[node_ids[item][relation_id_idx[name]] for item in node_ids]
    
df1 = pd.DataFrame(pd_dict)
df1

Unnamed: 0,node_id,/r/LocatedNear,/r/AtLocation,/r/HasA,/r/MadeOf,/r/PartOf
0,/c/en/aquarium/n/wn/artifact,1,0,0,0,0
1,/c/en/bag/n/wn/artifact,1,0,0,0,0
2,/c/en/banana/n/wn/plant,1,0,0,0,0
3,/c/en/barn/n/wn/artifact,1,0,0,0,0
4,/c/en/barrow/n/wn/artifact,1,0,0,0,0
...,...,...,...,...,...,...
190,Q14748,0,0,0,1,1
191,Q339805,0,0,0,1,0
192,Q35197,0,0,0,0,1
193,Q7857,0,0,0,0,1


In [42]:
# distribution of label
label_distri=pd.DataFrame(sorted(label_dict.items(),key=lambda k:k[1],reverse=True))
label_distri

Unnamed: 0,0,1
0,table,523
1,cup,406
2,mirror,284
3,vase,240
4,bed,229
5,toilet,223
6,computer,191
7,sink,166
8,desk,160
9,pan,148


In [43]:
# distribution of relation
relation_dict = dict()

for line in filter_lines:
    relation_s = line[6].split("|")
    
    for relation_ in relation_s:
        relation_dict[relation_] = relation_dict.get(relation_,0)+1

In [44]:
# filter relation having only one
relation_dict_2 = dict()
for line in filter_lines:
    relation_ = line[6]
    relation_dict_2[relation_] = relation_dict_2.get(relation_,0)+1

In [45]:
len(relation_dict), len(relation_dict_2)

(1338, 1388)

In [46]:
relation_distri=pd.DataFrame(sorted(relation_dict.items(),key=lambda k:k[1],reverse=True))
relation_distri

Unnamed: 0,0,1
0,has,1166
1,on,831
2,with,702
3,in,547
4,next to,358
...,...,...
1333,stands over,1
1334,to cook,1
1335,separated by,1
1336,makes up,1


In [48]:
relation_label_idx=dict()
n=0
for item in relation_distri[0][:10]:
    relation_label_idx[item]=n
    n+=1
relation_label_idx

{'has': 0,
 'on': 1,
 'with': 2,
 'in': 3,
 'next to': 4,
 'near': 5,
 'has a': 6,
 'under': 7,
 'behind': 8,
 'of': 9}

In [63]:
# build matrix
node_ids=dict()
for edge in filter_lines:
    subject_=edge[1]
    rel_labels=edge[6].split("|")
    object_=edge[3]
    
    temp1=node_ids.get(subject_,[0]*len(relation_label_idx))   
    temp2=node_ids.get(subject_,[0]*len(relation_label_idx))
    
    for rel_label in rel_labels:
        if rel_label in relation_label_idx:
            temp1[relation_label_idx[rel_label]]=1
            node_ids[subject_]=temp1

            temp2[relation_label_idx[rel_label]]=1
            node_ids[subject_]=temp2

In [64]:
pd_dict=dict()
pd_dict["node_id"]=list(node_ids.keys())

for name in relation_label_idx:
    pd_dict[name]=[node_ids[item][relation_label_idx[name]] for item in node_ids]
    
df2 = pd.DataFrame(pd_dict)
df2

Unnamed: 0,node_id,has,on,with,in,next to,near,has a,under,behind,of
0,/c/en/aquarium/n/wn/artifact,1,1,0,1,1,0,0,0,1,0
1,/c/en/bag/n/wn/artifact,1,1,1,0,1,1,0,1,1,0
2,/c/en/banana/n/wn/plant,0,1,0,1,0,0,0,1,0,0
3,/c/en/barn/n/wn/artifact,1,0,1,1,0,0,0,0,0,0
4,/c/en/barrow/n/wn/artifact,1,1,1,1,1,0,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...
153,/c/en/vase/n/wn/artifact,1,1,1,1,1,1,1,1,1,1
154,/c/en/vegetable/n/wn/food,0,1,0,1,0,0,0,0,0,0
155,/c/en/washer/n/wn/person,1,1,1,1,1,1,0,1,0,0
156,/c/en/watering_place/n/wn/location,0,0,0,0,0,0,0,0,1,0


In [21]:
relation_distri_2=pd.DataFrame(sorted(relation_dict_2.items(),key=lambda k:k[1],reverse=True))
relation_distri_2

Unnamed: 0,0,1
0,has,464
1,at location,228
2,on,196
3,in,172
4,with,153
...,...,...
1383,alongside|and|near|next to|under|with,1
1384,reflected in,1
1385,to cook,1
1386,has|separated by,1


In [22]:
# because the table have too many lines, save to excel
relation_distri_excel = "./data/cskg_dim/relation_distribution.xlsx"
with pd.ExcelWriter(relation_distri_excel) as writer:  
    relation_distri.to_excel(writer, sheet_name='Sheet_name_1')
    relation_distri_2.to_excel(writer, sheet_name='Sheet_name_2')

In [66]:
# distribution of source
source_dict = dict()

for line in filter_lines:
    source = line[8]
    
    source_dict[source] = source_dict.get(source,0)+1

In [67]:
source_dict

{'VG': 3446, 'CN': 265, 'CN|WN': 9, 'WD': 14}

In [72]:
dim_idx=dict()
n=0
for item in dim_limit:
    dim_idx[item]=n
    n+=1
dim_idx

{'part-whole': 0, 'spatial': 1}

In [74]:
# build matrix
node_ids=dict()
for edge in filter_lines:
    subject_=edge[1]
    dim=edge[7]
    object_=edge[3]
    
    temp1=node_ids.get(subject_,[0]*len(dim_idx))   
    temp2=node_ids.get(subject_,[0]*len(dim_idx))
    
    temp1[dim_idx[dim]]=1
    node_ids[subject_]=temp1

    temp2[dim_idx[dim]]=1
    node_ids[subject_]=temp2

In [75]:
pd_dict=dict()
pd_dict["node_id"]=list(node_ids.keys())

for name in dim_idx:
    pd_dict[name]=[node_ids[item][dim_idx[name]] for item in node_ids]
    
df3 = pd.DataFrame(pd_dict)
df3

Unnamed: 0,node_id,part-whole,spatial
0,/c/en/aquarium/n/wn/artifact,0,1
1,/c/en/bag/n/wn/artifact,0,1
2,/c/en/banana/n/wn/plant,0,1
3,/c/en/barn/n/wn/artifact,0,1
4,/c/en/barrow/n/wn/artifact,0,1
...,...,...,...
190,Q14748,1,0
191,Q339805,1,0
192,Q35197,1,0
193,Q7857,1,0


## Save File

In [34]:
# save file by labels
lines_by_label=dict()
for line in filter_lines:
    label1_s=line[4].split("\t")
    label2_s=line[5].split("\t")
    for label1_, label2_ in zip(label1_s,label2_s):
        if label1_ in label_limit:
            temp = lines_by_label.get(label1_,[])
            temp.append(line)
            lines_by_label[label1_]=temp
            continue

        elif label1_ in label_limit:
            temp = lines_by_label.get(label2_,[])
            temp.append(line)
            lines_by_label[label2_]=temp
            continue

        else:
            continue

In [35]:
line

['Q806617-/r/PartOf-Q2092603-0000',
 'Q806617',
 '/r/PartOf',
 'Q2092603',
 'bed',
 'member',
 'has part|part of',
 'part-whole',
 'WD']

In [36]:
save_dataByItem(lines_by_label,edges_label)

In [37]:
# save file by source
lines_by_source=dict()
for line in filter_lines:
    source=line[8]
    
    temp=lines_by_source.get(source,[])
    temp.append(line)
    lines_by_source[source]=temp

In [38]:
save_dataByItem(lines_by_source,edges_source)

In [39]:
# save file by relation
lines_by_relation=dict()
for line in filter_lines:
    relation_s = line[6].split("|")
    for relation_ in relation_s:
        temp=lines_by_relation.get(relation_,[])
        temp.append(line)
        lines_by_relation[relation_]=temp

In [76]:
with pd.ExcelWriter(matrix) as writer:  
    df1.to_excel(writer, sheet_name='Sheet_name_1')
    df2.to_excel(writer, sheet_name='Sheet_name_2')
    df3.to_excel(writer, sheet_name='Sheet_name_3')