# Mapping the Wikidata relations to CSKG relations

## Map relations and prepare for CSKG

In [3]:
# Column layout of the output edge table, in the order the rows are built.
new_columns = [
    'node1',
    'relation',
    'node2',
    'node1;label',
    'node2;label',
    'relation;label',
    'relation;dimension',
    'source',
    'sentence',
]

In [4]:
# Working directory of the Wikidata dump being processed.
# Alternative dumps (swap in to process a different snapshot):
#   '../tmp/wikidata20171227'
#   '../tmp/wikidata20181210'
tmp_dir = '../tmp/wikidata20200504'

# Input edges, the intermediate mapped file, and the compacted result.
wd_file = f'{tmp_dir}/wikidata_cs.tsv'
mapped = f'{tmp_dir}/kgtk_wikidata_tmp.tsv'
dedup = f'{tmp_dir}/kgtk_wikidata.tsv'

# Destination of the graph statistics computed on the compacted file.
stats = f'{tmp_dir}/stats_mapped.tsv'

In [5]:
import pandas as pd
import mapping
import csv
from kgtk.kgtkformat import KgtkFormat

In [6]:
# Load the tab-separated Wikidata edge file into a DataFrame.
df1 = pd.read_csv(filepath_or_buffer=wd_file, sep='\t')

In [7]:
# Number of edges read from the input file.
df1.shape[0]

106103

In [9]:
# Translate every Wikidata edge into a row laid out like `new_columns`.
# Relations found in mapping.fw_mapping keep their direction; relations found
# in mapping.bw_mapping are emitted with subject and object (and their labels)
# swapped. Relations covered by neither mapping are skipped.
mapped_fw=set()  # not populated in this cell; kept so the names stay defined
mapped_bw=set()
rows=[]
# itertuples yields plain tuples, so row[0]..row[5] are genuine positional
# lookups (Series[int] on a label-indexed row is deprecated in recent pandas).
for row in df1.itertuples(index=False, name=None):
    node1=row[0]
    rel=row[1]
    node2=row[2]
    # [1:-4] drops the first character and the last four of each label --
    # presumably the quotes and language tag of a value like 'text'@en;
    # TODO confirm against the input file.
    node1_label=KgtkFormat.stringify(row[3].strip()[1:-4])
    node2_label=KgtkFormat.stringify(row[4].strip()[1:-4])
    rel_label=KgtkFormat.stringify(row[5].strip()[1:-4])
    if rel in mapping.fw_mapping:
        cn_rel=mapping.fw_mapping[rel]
        my_row=[node1,cn_rel,node2,node1_label,node2_label,rel_label, '', '"WD"', '']
    elif rel in mapping.bw_mapping:
        cn_rel=mapping.bw_mapping[rel]
        my_row=[node2,cn_rel,node1,node2_label,node1_label,rel_label, '', '"WD"', '']
    else:
        # No mapping for this relation: skip it. Without this branch the
        # previous iteration's my_row would be appended a second time (or a
        # NameError raised if the very first relation is unmapped).
        continue
    rows.append(my_row)


In [10]:
# Assemble the mapped rows into a DataFrame with the output column layout.
df2 = pd.DataFrame(data=rows, columns=new_columns)

In [11]:
# Remove rows that are identical in every column.
df2 = df2.drop_duplicates()

In [12]:
# Number of mapped edges left after dropping exact duplicates.
df2.shape[0]

106103

In [13]:
# Edge count per mapped relation, most frequent first.
df2['relation'].value_counts(sort=True, ascending=False)

/r/IsA                72985
/r/PartOf              7938
/r/HasContext          6152
/r/DistinctFrom        4934
/r/HasPrerequisite     4131
/r/UsedFor             2469
/r/Antonym             2184
/r/MadeOf              1426
/r/Synonym             1070
/r/HasProperty         1049
/r/Causes               651
/r/DerivedFrom          540
/r/SimilarTo            345
/r/CreatedBy            187
/r/RelatedTo             42
Name: relation, dtype: int64

In [14]:
# Preview the first five mapped edges.
df2.head(n=5)

Unnamed: 0,node1,relation,node2,node1;label,node2;label,relation;label,relation;dimension,source,sentence
0,Q8,/r/IsA,Q331769,"""happiness""","""mood""","""instance of""",,"""WD""",
1,Q8,/r/IsA,Q60539479,"""happiness""","""positive emotion""","""instance of""",,"""WD""",
2,Q8,/r/IsA,Q9415,"""happiness""","""emotion""","""instance of""",,"""WD""",
3,Q8,/r/Antonym,Q169251,"""happiness""","""sadness""","""opposite of""",,"""WD""",
4,Q8,/r/Synonym,Q935526,"""happiness""","""joy""","""said to be the same as""",,"""WD""",


In [15]:
# Write the mapped edges as a tab-separated file without a row index;
# QUOTE_NONE stops the CSV writer from adding its own quoting around fields.
df2.to_csv(
    mapped,
    sep='\t',
    columns=new_columns,
    index=False,
    quoting=csv.QUOTE_NONE,
)

In [16]:
%%bash -s "$mapped" "$dedup"
# $1 = mapped edge file (input), $2 = compacted edge file (output).
# Runs `kgtk compact` keyed on node1/relation/node2 over unsorted input.
# The positional arguments are quoted so paths containing whitespace or glob
# characters reach kgtk as a single argument.
kgtk compact -i "$1" -o "$2" --columns node1 relation node2 --presorted False

In [17]:
# Load the compacted edge file produced by `kgtk compact`.
df3 = pd.read_csv(filepath_or_buffer=dedup, sep='\t')

In [18]:
# Edge count per relation in the compacted file, most frequent first.
df3['relation'].value_counts(sort=True, ascending=False)

/r/IsA                72707
/r/PartOf              6886
/r/HasContext          5541
/r/DistinctFrom        4934
/r/UsedFor             2243
/r/Antonym             2184
/r/HasPrerequisite     2107
/r/MadeOf              1426
/r/Synonym             1070
/r/HasProperty         1049
/r/DerivedFrom          540
/r/Causes               510
/r/SimilarTo            345
/r/CreatedBy            187
/r/RelatedTo             42
Name: relation, dtype: int64

In [88]:
%%bash -s "$dedup" "$stats"
# $1 = compacted edge file (input), $2 = file receiving the command's stdout.
# The run log is written to summary.txt in the current working directory.
# Both arguments are quoted: an unquoted redirect target containing whitespace
# makes bash fail with "ambiguous redirect".
kgtk graph_statistics --directed --degrees --pagerank --hits --log summary.txt -i "$1" > "$2"

In [89]:
%%bash
# Print the log that the graph_statistics run wrote via `--log summary.txt`.
cat summary.txt

loading the TSV graph now ...
graph loaded! It has 71243 nodes and 101771 edges

###Top relations:
/r/IsA	72707
/r/PartOf	6886
/r/HasContext	5541
/r/DistinctFrom	4934
/r/UsedFor	2243
/r/Antonym	2184
/r/HasPrerequisite	2107
/r/MadeOf	1426
/r/Synonym	1070
/r/HasProperty	1049

###Degrees:
in degree stats: mean=1.428505, std=0.045290, max=1
out degree stats: mean=1.428505, std=0.004051, max=1
total degree stats: mean=2.857011, std=0.045806, max=1

###PageRank
Max pageranks
288	Q1047113	0.007052
308	Q16889133	0.008416
201	Q171318	0.009107
139	Q11862829	0.009335
806	Q5962346	0.009287

###HITS
HITS hubs
23616	Q58837291	0.003989
14803	Q84048852	0.005686
2864	Q84048850	0.007898
982	Q23009870	0.466863
201	Q171318	0.884159
HITS auth
29195	Q41810352	0.027366
62987	Q72204903	0.027396
7138	Q57832811	0.027429
42987	Q57832810	0.027474
8674	Q12758374	0.035754
