# Summary of CSKG Dimensions by Human and Machine

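This notebook summarizes the CSKG edges per dimension (cluster): for each dimension it reports the number of relations and nodes, together with the most frequent ones. The results are shown from both the human and the machine perspective.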

Parameters are set up in the first cell so that we can run this notebook in batch mode.

### Parameters for invoking the notebook

- `cskg_path`: a folder containing the CSKG edges file and all the analysis products.
- `cskg_raw`: the name of the raw CSKG data file
- `human_dim`: the name of the human dimension file 
- `machine_dim`: the name of the machine dimension file


Tip:
1. We have prepared the `cskg_raw`, `human_dim`, and `machine_dim` files; you can download them from
https://drive.google.com/file/d/1mihQeiXFSmotXl7bIZ-6Na3eVnV74UOe/view?usp=sharing<br>
https://drive.google.com/file/d/1Zct-IW8w5U4S-lRViJEyXaFwXQOzLx4o/view?usp=sharing,<br> and
https://drive.google.com/file/d/1-TUd8I3MsSJ8mZTh-XiVaq4fbmfPma92/view?usp=sharing, <br>
(**TODO:** Professor, you may want to wrap these files into a new folder.)
2. If you run `wc -l file_path` to check the number of edges in those two files, you can see that some edges are missing from `human_dim`; this is because some of the edges were removed when the humans did the clustering.


# Preamble

Set up the paths and environment variables. Make sure those variables point to the correct files and paths.

In [1]:
# Parameters
# These defaults can be overridden when the notebook is run in batch mode.
cskg_path = "../output"  # folder containing the CSKG edges file and the analysis products
cskg_raw = 'cskg_dim.tsv'  # name of the raw CSKG data file inside cskg_path
human_dim = "human_dim.tsv"  # name of the human dimension file inside cskg_path
machine_dim = "machine_dim.tsv"  # name of the machine dimension file inside cskg_path

In [2]:
import os
from itertools import islice

In [3]:
# Build the three input file paths once, then publish them to the environment
# so that the `!` shell cells can refer to them as $KG, $HD and $MD.
kg_path = f"{cskg_path}/{cskg_raw}"
hd_path = f"{cskg_path}/{human_dim}"
md_path = f"{cskg_path}/{machine_dim}"

os.environ.update({
    'CSKG': cskg_path,
    'KG': kg_path,
    'HD': hd_path,
    'MD': md_path,
})

In [4]:
# Show the first five lines of the raw CSKG file ($KG), then count its lines.
!head -5 $KG
! wc -l $KG

id	node1	relation	node2	node1;label	node2;label	relation;label	relation;dimension	source	sentence
/c/en/0-/r/DefinedAs-/c/en/empty_set-0000	/c/en/0	/r/DefinedAs	/c/en/empty_set	0	empty set	defined as	similarity	CN	[[0]] is the [[empty set]].
/c/en/0-/r/DefinedAs-/c/en/first_limit_ordinal-0000	/c/en/0	/r/DefinedAs	/c/en/first_limit_ordinal	0	first limit ordinal	defined as	similarity	CN	[[0]] is the [[first limit ordinal]].
/c/en/0-/r/DefinedAs-/c/en/number_zero-0000	/c/en/0	/r/DefinedAs	/c/en/number_zero	0	number zero	defined as	similarity	CN	[[0]] is the [[number zero]].
/c/en/0-/r/HasContext-/c/en/internet_slang-0000	/c/en/0	/r/HasContext	/c/en/internet_slang	0	internet slang	has context	rel-other	CN	
5895123 ../output/cskg_dim.tsv


In [5]:
# Print the path of the human dimension file ($HD), then count its lines.
! echo $HD
! wc -l $HD

../output/human_dim.tsv
5895123 ../output/human_dim.tsv


In [6]:
# Show the first five lines of the human dimension file ($HD).
!head -5 $HD

id	dimension
/c/en/0-/r/DefinedAs-/c/en/empty_set-0000	similarity
/c/en/0-/r/DefinedAs-/c/en/first_limit_ordinal-0000	similarity
/c/en/0-/r/DefinedAs-/c/en/number_zero-0000	similarity
/c/en/0-/r/HasContext-/c/en/internet_slang-0000	rel-other


In [7]:
# Print the path of the machine dimension file ($MD), then count its lines.
! echo $MD
! wc -l $MD

../output/machine_dim.tsv
5957576 ../output/machine_dim.tsv


In [8]:
# Show the first five lines of the machine dimension file ($MD).
!head -5 $MD

id	dimension
/c/en/0.22_inch_calibre-/r/IsA-/c/en/5.6_millimetres-0000	3
/c/en/0/a/wn-/r/SimilarTo-/c/en/cardinal/a/wn-0000	8
/c/en/0/n/wn/quantity-/r/Synonym-/c/en/zero/n/wn/quantity-0000	4
/c/en/0/n/wp/number-/r/Synonym-/c/en/0/n/wp/number-0000	7


# Utilities

In [9]:
def load_edge(kg_file):
    """Load the nodes and relation of every edge in a tab-separated edge file.

    The first line is treated as a header and skipped. Every other line is
    expected to provide at least four tab-separated columns, in this order:
    edge id, node1, relation, node2. Any further columns are ignored.

    Args:
        kg_file: path of the tab-separated edge file.

    Returns:
        dict mapping each edge id to ``([node1, node2], relation)``. If an
        edge id occurs more than once, the last occurrence wins.
    """
    edge_info = {}
    # Read as UTF-8 explicitly so the result does not depend on the platform's
    # default locale encoding.
    with open(kg_file, encoding='utf-8') as f:
        for line in islice(f, 1, None):
            # Strip the line terminator before splitting; otherwise it would
            # stay attached to node2 whenever node2 is the last column.
            content = line.rstrip('\n').split('\t')
            if len(content) < 4:
                # Blank or truncated line (e.g. a trailing empty line): skip
                # it instead of failing with an IndexError.
                continue
            edge_id, node1, relation, node2 = content[:4]
            edge_info[edge_id] = ([node1, node2], relation)
    return edge_info
            
            

def load_dimension(dim_file):
    """Load the dimension assigned to every edge from a tab-separated file.

    The first line is treated as a header and skipped. Every other line is
    expected to provide at least two tab-separated columns: the edge id and
    its dimension. The dimension is kept as a string.

    Args:
        dim_file: path of the tab-separated dimension file.

    Returns:
        dict mapping each edge id to its dimension. If an edge id occurs more
        than once, the last occurrence wins.
    """
    edge_dim = {}
    # Read as UTF-8 explicitly so the result does not depend on the platform's
    # default locale encoding.
    with open(dim_file, encoding='utf-8') as f:
        for line in islice(f, 1, None):
            content = line.rstrip('\n').split('\t')
            if len(content) < 2:
                # Blank or truncated line (e.g. a trailing empty line): skip
                # it instead of failing with an IndexError.
                continue
            edge_dim[content[0]] = content[1]
    return edge_dim


def dim_relations(edge_dim,edge_info,topk=10):
    """Rank the most frequent relations inside each dimension.

    Args:
        edge_dim: dict mapping edge id -> dimension.
        edge_info: dict mapping edge id -> ``([node1, node2], relation)``.
        topk: maximum number of relations kept per dimension.

    Returns:
        dict mapping each dimension to a list of ``(relation, count)`` pairs,
        ordered by decreasing count and cut to ``topk`` entries. Edges that
        are absent from ``edge_info`` are ignored.
    """
    missing = (None, None)
    counts_by_dim = {}
    for edge_id, dim in edge_dim.items():
        relation = edge_info.get(edge_id, missing)[1]
        if relation is not None:
            per_dim = counts_by_dim.setdefault(dim, {})
            per_dim[relation] = per_dim.get(relation, 0) + 1

    return {
        dim: sorted(counts.items(), key=lambda pair: pair[1], reverse=True)[:topk]
        for dim, counts in counts_by_dim.items()
    }
        
def dim_nodes(edge_dim,edge_info,topk=10):
    """Rank the most frequent nodes inside each dimension.

    Both endpoints of an edge are counted, so a node gains one occurrence for
    every edge end it appears at.

    Args:
        edge_dim: dict mapping edge id -> dimension.
        edge_info: dict mapping edge id -> ``([node1, node2], relation)``.
        topk: maximum number of nodes kept per dimension.

    Returns:
        dict mapping each dimension to a list of ``(node, count)`` pairs,
        ordered by decreasing count and cut to ``topk`` entries. Edges that
        are absent from ``edge_info`` are ignored.
    """
    missing = (None, None)
    counts_by_dim = {}
    for edge_id, dim in edge_dim.items():
        endpoints = edge_info.get(edge_id, missing)[0]
        if endpoints is not None:
            per_dim = counts_by_dim.setdefault(dim, {})
            for node in endpoints:
                per_dim[node] = per_dim.get(node, 0) + 1

    return {
        dim: sorted(counts.items(), key=lambda pair: pair[1], reverse=True)[:topk]
        for dim, counts in counts_by_dim.items()
    }
        

def num_nodes(edge_dim,edge_info):
    """Count the distinct nodes that occur in each dimension.

    Args:
        edge_dim: dict mapping edge id -> dimension.
        edge_info: dict mapping edge id -> ``([node1, node2], relation)``.

    Returns:
        list of ``(dimension, number of distinct nodes)`` pairs ordered by
        decreasing count. Edges that are absent from ``edge_info`` are
        ignored.
    """
    missing = (None, None)
    members_by_dim = {}
    for edge_id, dim in edge_dim.items():
        endpoints = edge_info.get(edge_id, missing)[0]
        if endpoints is not None:
            members_by_dim.setdefault(dim, set()).update(endpoints)

    totals = [(dim, len(members)) for dim, members in members_by_dim.items()]
    totals.sort(key=lambda pair: pair[1], reverse=True)
    return totals


def num_relations(edge_dim,edge_info):
    """Count the distinct relations that occur in each dimension.

    Args:
        edge_dim: dict mapping edge id -> dimension.
        edge_info: dict mapping edge id -> ``([node1, node2], relation)``.

    Returns:
        list of ``(dimension, number of distinct relations)`` pairs ordered
        by decreasing count. Edges that are absent from ``edge_info`` are
        ignored.
    """
    missing = (None, None)
    relations_by_dim = {}
    for edge_id, dim in edge_dim.items():
        relation = edge_info.get(edge_id, missing)[1]
        if relation is not None:
            relations_by_dim.setdefault(dim, set()).add(relation)

    totals = [(dim, len(found)) for dim, found in relations_by_dim.items()]
    totals.sort(key=lambda pair: pair[1], reverse=True)
    return totals


# Summary

In [10]:
# Load every edge of the raw CSKG file: edge id -> ([node1, node2], relation).
edge_info = load_edge(kg_path)
# Example lookup (uncomment to inspect a single edge):
# edge_info['/c/en/0-/r/DefinedAs-/c/en/empty_set-0000']

## According to human dimension

In [11]:
# Map every edge id to its dimension as listed in the human dimension file.
human_dims = load_dimension(hd_path)

### Number of relations for each dimension

In [12]:
# Number of distinct relations per human dimension, largest first.
human_dim_relnum = num_relations(human_dims, edge_info)

# Two left-aligned columns, each 20 characters wide.
print(f"{'dimension':<20}{'# relation':<20}")
for dim_name, relation_count in human_dim_relnum:
    print(f"{dim_name:<20}{relation_count:<20}")

dimension           # relation          
temporal            11                  
desire              7                   
quality             4                   
lexical             4                   
utility             4                   
similarity          3                   
rel-other           3                   
taxonomic           3                   
part-whole          3                   
distinctness        2                   
spatial             2                   
creation            1                   


### Top relations for each dimension
Because each dimension contains only a few relations, a dimension may return fewer than `topk` relations; for example, with `topk=5` some dimensions return only 3 relations.

In [13]:
# Keep at most `topk` of the most frequent relations of each human dimension.
topk = 5
human_dim_rels = dim_relations(human_dims, edge_info, topk)

print(f"dimension\ttop{topk}-relations")
for dim_name, top_relations in human_dim_rels.items():
    # The extra newline leaves a blank line between dimensions.
    print(f"{dim_name}\t{top_relations}", end="\n\n")

dimension	top5-relations
similarity	[('/r/Synonym', 1245289), ('/r/SimilarTo', 30635), ('/r/DefinedAs', 2173)]

rel-other	[('/r/RelatedTo', 1703968), ('/r/HasContext', 238277), ('/r/EtymologicallyRelatedTo', 32075)]

quality	[('at:xAttr', 133281), ('/r/HasProperty', 9482), ('/r/NotHasProperty', 327), ('/r/SymbolOf', 4)]

taxonomic	[('/r/IsA', 316351), ('/r/MannerOf', 12618), ('/r/InstanceOf', 1480)]

distinctness	[('/r/Antonym', 401003), ('/r/DistinctFrom', 8249)]

lexical	[('/r/FormOf', 378859), ('/r/DerivedFrom', 325914), ('fn:HasLexicalUnit', 58765), ('/r/EtymologicallyDerivedFrom', 71)]

spatial	[('/r/LocatedNear', 152274), ('/r/AtLocation', 27797)]

utility	[('/r/UsedFor', 42580), ('/r/CapableOf', 42375), ('/r/ReceivesAction', 6037), ('/r/NotCapableOf', 329)]

part-whole	[('/r/PartOf', 31964), ('/r/HasA', 16973), ('/r/MadeOf', 2757)]

desire	[('at:xWant', 129171), ('at:xIntent', 57685), ('at:oWant', 53910), ('/r/MotivatedByGoal', 9489), ('/r/CausesDesire', 4688)]

temporal	[('at:x

### Number of nodes for each dimension

In [14]:
# Number of distinct nodes per human dimension, largest first.
human_dim_nodenum = num_nodes(human_dims, edge_info)

# Two left-aligned columns, each 20 characters wide.
print(f"{'dimension':<20}{'# node':<20}")
for dim_name, node_count in human_dim_nodenum:
    print(f"{dim_name:<20}{node_count:<20}")

dimension           # node              
lexical             861123              
rel-other           854024              
similarity          335829              
taxonomic           261836              
temporal            233148              
desire              164661              
utility             70975               
distinctness        52132               
part-whole          46128               
quality             45545               
spatial             23137               
creation            752                 


### Top nodes for each dimension

In [15]:
# Keep at most `topk` of the most frequent nodes of each human dimension.
topk = 5
human_dim_nodes = dim_nodes(human_dims, edge_info, topk)

print(f"dimension\ttop{topk}-nodes")
for dim_name, top_nodes in human_dim_nodes.items():
    # The extra newline leaves a blank line between dimensions.
    print(f"{dim_name}\t{top_nodes}", end="\n\n")

dimension	top5-nodes
similarity	[('/c/en/black', 860), ('/c/en/blue', 855), ('/c/en/green', 833), ('/c/en/red', 831), ('/c/en/good', 823)]

rel-other	[('/c/en/slang', 11000), ('/c/en/mineral', 8288), ('/c/en/us', 7677), ('/c/en/medicine', 7192), ('/c/en/zoology', 6685)]

quality	[('/c/en/helpful', 2534), ('/c/en/curious', 1976), ('/c/en/friendly', 1913), ('/c/en/responsible', 1854), ('/c/en/determined', 1780)]

taxonomic	[('/c/en/plant/n', 2769), ('/c/en/chemical_compound/n', 2526), ('/c/en/person/n', 2039), ('Q171318', 1997), ('/c/en/tangible_thing/n', 1923)]

distinctness	[('/c/en/good', 769), ('/c/en/zero', 699), ('/c/en/neutral', 611), ('/c/en/gray', 598), ('/c/en/grey', 592)]

lexical	[('fn:fe:descriptor', 1828), ('fn:fe:manner', 1774), ('fn:fe:entity', 1333), ('fn:fe:agent', 1314), ('fn:fe:type', 1056)]

spatial	[('/c/en/man/n/wn/person', 2639), ('/c/en/sign/n/wn/communication', 2012), ('/c/en/woman/n/wn/person', 1929), ('/c/en/partition/n/wn/artifact', 1863), ('/c/en/table/n/wn/

## According to machine dimension

In [16]:
# Map every edge id to its dimension as listed in the machine dimension file.
machine_dims = load_dimension(md_path)

### Number of relations for each dimension

In [17]:
# Number of distinct relations per machine dimension, largest first.
machine_dim_relnum = num_relations(machine_dims, edge_info)

# Two left-aligned columns, each 20 characters wide.
print(f"{'dimension':<20}{'# relation':<20}")
for dim_name, relation_count in machine_dim_relnum:
    print(f"{dim_name:<20}{relation_count:<20}")

dimension           # relation          
6                   47                  
3                   46                  
12                  46                  
2                   46                  
0                   46                  
4                   45                  
11                  45                  
9                   45                  
7                   43                  
5                   42                  
10                  42                  
1                   42                  
8                   39                  


### Top relations for each dimension

In [18]:
# Keep at most `topk` of the most frequent relations of each machine dimension.
topk=5
machine_dim_rels = dim_relations(machine_dims,edge_info,topk)

# This table lists relations, so the header must say "relations"
# (it previously said "nodes", copied from the top-nodes cell).
print(f"dimension\ttop{topk}-relations")
for dim in machine_dim_rels:
    print(f"{dim}\t{machine_dim_rels[dim]}",)
    print()

dimension	top5-nodes
3	[('/r/RelatedTo', 81028), ('/r/LocatedNear', 43329), ('/r/FormOf', 31631), ('/r/DerivedFrom', 29829), ('/r/IsA', 24375)]

8	[('/r/Synonym', 554717), ('/r/SimilarTo', 11124), ('/r/RelatedTo', 3981), ('/r/IsA', 1425), ('/r/FormOf', 1342)]

4	[('/r/Synonym', 279104), ('/r/RelatedTo', 13539), ('/r/SimilarTo', 7041), ('/r/FormOf', 4860), ('/r/DerivedFrom', 4353)]

7	[('/r/Synonym', 382100), ('/r/RelatedTo', 10393), ('/r/SimilarTo', 5479), ('at:xAttr', 4955), ('/r/IsA', 3691)]

5	[('/r/Antonym', 397014), ('/r/DistinctFrom', 8089), ('/r/FormOf', 7208), ('/r/DerivedFrom', 6975), ('/r/RelatedTo', 5678)]

11	[('/r/RelatedTo', 127524), ('/r/HasContext', 117651), ('/r/IsA', 51542), ('/r/DerivedFrom', 31006), ('/r/LocatedNear', 30904)]

12	[('/r/RelatedTo', 338631), ('/r/IsA', 50109), ('/r/LocatedNear', 42181), ('/r/DerivedFrom', 31858), ('/r/FormOf', 26384)]

2	[('/r/RelatedTo', 226293), ('/r/FormOf', 96888), ('/r/DerivedFrom', 78769), ('/r/IsA', 54636), ('/r/LocatedNear', 1

### Number of nodes for each dimension

In [19]:
# Number of distinct nodes per machine dimension, largest first.
machine_dim_nodenum = num_nodes(machine_dims, edge_info)

# Two left-aligned columns, each 20 characters wide.
print(f"{'dimension':<20}{'# node':<20}")
for dim_name, node_count in machine_dim_nodenum:
    print(f"{dim_name:<20}{node_count:<20}")

dimension           # node              
0                   521399              
2                   456313              
9                   344413              
11                  341273              
12                  336577              
10                  263519              
6                   244813              
8                   222316              
3                   217931              
1                   155072              
7                   137090              
4                   95402               
5                   85589               


### Top nodes for each dimension

In [20]:
# Keep at most `topk` of the most frequent nodes of each machine dimension.
topk = 5
machine_dim_nodes = dim_nodes(machine_dims, edge_info, topk)

print(f"dimension\ttop{topk}-nodes")
for dim_name, top_nodes in machine_dim_nodes.items():
    # The extra newline leaves a blank line between dimensions.
    print(f"{dim_name}\t{top_nodes}", end="\n\n")

dimension	top5-nodes
3	[('/c/en/partition/n/wn/artifact', 939), ('/c/en/man/n/wn/person', 874), ('/c/en/person/n/wn', 852), ('/c/en/derogatory', 805), ('/c/en/grass/n/wn/plant', 716)]

8	[('/c/en/green', 808), ('/c/en/red', 801), ('/c/en/blue', 741), ('/c/en/yellow', 675), ('/c/en/zero', 673)]

4	[('/c/en/bad', 1092), ('/c/en/dirty', 680), ('/c/en/weak', 634), ('/c/en/wild', 554), ('/c/en/upset', 544)]

7	[('/c/en/set', 614), ('/c/en/close', 588), ('/c/en/good', 546), ('/c/en/open', 541), ('/c/en/big', 539)]

5	[('/c/en/nonstandard', 1294), ('/c/en/good', 763), ('/c/en/zero', 706), ('/c/en/neutral', 660), ('/c/en/gray', 598)]

11	[('/c/en/us', 6035), ('/c/en/computing', 4716), ('/c/en/historical', 4416), ('/c/en/medicine', 4196), ('/c/en/uk', 4180)]

12	[('/c/en/slang', 1896), ('/c/en/drug', 1845), ('/c/en/money', 1139), ('/c/en/metal', 985), ('/c/en/food', 926)]

2	[('/c/en/slang', 2034), ('/c/en/england', 1464), ('/c/en/tree', 1271), ('/c/en/bird/n', 1226), ('/c/en/plant/n', 1083)]

