# Mapping the Wikidata relations to CSKG relations

## Map relations and prepare for CSKG

In [1]:
# Column names, in order, used for the mapped edge table built and written below.
new_columns = [
    'node1',
    'relation',
    'node2',
    'node1;label',
    'node2;label',
    'relation;label',
    'relation;dimension',
    'source',
    'sentence',
]

In [99]:
# Working directory for one Wikidata dump; switch dumps by swapping the active line.
#tmp_dir='../tmp/wikidata20200504'
#tmp_dir='../tmp/wikidata20171227'
tmp_dir = '../tmp/wikidata20181210'

# Input edge file read by this notebook.
wd_file = tmp_dir + '/wikidata_cs.tsv'
# Mapped edges before compaction, and the compacted result.
mapped = tmp_dir + '/kgtk_wikidata_tmp.tsv'
dedup = tmp_dir + '/kgtk_wikidata.tsv'

# Destination for the graph statistics output.
stats = tmp_dir + '/stats_mapped.tsv'

In [100]:
import pandas as pd
import mapping

In [101]:
# Load the tab-separated Wikidata edge file.
df1 = pd.read_csv(wd_file, delimiter='\t')

In [102]:
# Number of input edges.
df1.shape[0]

66163

In [103]:
# Translate each Wikidata edge into a row following the `new_columns` layout.
#
# A relation found in mapping.fw_mapping keeps its direction; one found in
# mapping.bw_mapping has node1/node2 (and their labels) swapped. A relation
# found in neither mapping is skipped and recorded in `unmapped_rels`.
# Previously such a row re-appended the stale `my_row` from the preceding
# iteration (or raised NameError when it was the very first row).
mapped_fw = set()      # Wikidata relations that were mapped in the forward direction
mapped_bw = set()      # Wikidata relations that were mapped in the backward direction
unmapped_rels = set()  # Wikidata relations with no entry in either mapping
rows = []
# itertuples yields plain tuples, so positional access is explicit and avoids
# the deprecated integer indexing of a label-indexed row Series.
for record in df1.itertuples(index=False, name=None):
    node1, rel, node2 = record[0], record[1], record[2]
    # [1:-4] drops the first character and the last four; presumably the
    # labels look like 'text'@en (quote + language tag) -- TODO confirm.
    node1_label = record[3].strip()[1:-4]
    node2_label = record[4].strip()[1:-4]
    rel_label = record[5].strip()[1:-4]
    if rel in mapping.fw_mapping:
        cn_rel = mapping.fw_mapping[rel]
        mapped_fw.add(rel)
        my_row = [node1, cn_rel, node2, node1_label, node2_label, rel_label, '', 'WD', '']
    elif rel in mapping.bw_mapping:
        cn_rel = mapping.bw_mapping[rel]
        mapped_bw.add(rel)
        my_row = [node2, cn_rel, node1, node2_label, node1_label, rel_label, '', 'WD', '']
    else:
        unmapped_rels.add(rel)
        continue
    rows.append(my_row)


In [104]:
# Assemble the mapped rows into a frame using the new_columns layout.
df2 = pd.DataFrame(data=rows, columns=new_columns)

In [105]:
# Remove exact duplicate rows. Rebinding the result instead of using
# inplace=True avoids mutating the frame created in the previous cell
# in place, which is the form pandas recommends.
df2 = df2.drop_duplicates()

In [106]:
# Number of mapped edges after dropping duplicate rows.
df2.shape[0]

66163

In [107]:
# Edge count per mapped relation, most frequent first.
mapped_relation_counts = df2['relation'].value_counts()
mapped_relation_counts

/r/IsA                45757
/r/PartOf              5312
/r/HasPrerequisite     3854
/r/HasContext          3435
/r/DistinctFrom        2011
/r/Antonym             1530
/r/UsedFor             1331
/r/MadeOf               834
/r/Synonym              655
/r/HasProperty          651
/r/Causes               315
/r/DerivedFrom          293
/r/SimilarTo             77
/r/CreatedBy             68
/r/RelatedTo             40
Name: relation, dtype: int64

In [108]:
# Write the mapped edges as TSV, in new_columns order and without the index.
df2[new_columns].to_csv(mapped, sep='\t', index=False)

In [109]:
%%bash -s "$mapped" "$dedup"
# $1 = mapped edge file (input), $2 = compacted edge file (output).
# The positional arguments are quoted so that paths containing spaces or
# glob characters are passed to kgtk as single, literal arguments.
kgtk compact -i "$1" -o "$2" --columns node1 relation node2 --presorted False

In [110]:
# Load the compacted edge file produced by the kgtk compact step.
df3 = pd.read_csv(dedup, delimiter='\t')

In [111]:
# Edge count per relation in the compacted file, most frequent first.
dedup_relation_counts = df3['relation'].value_counts()
dedup_relation_counts

/r/IsA                45606
/r/PartOf              4416
/r/HasContext          3189
/r/DistinctFrom        2011
/r/HasPrerequisite     1965
/r/Antonym             1530
/r/UsedFor             1215
/r/MadeOf               834
/r/Synonym              655
/r/HasProperty          650
/r/DerivedFrom          293
/r/Causes               238
/r/SimilarTo             77
/r/CreatedBy             68
/r/RelatedTo             40
Name: relation, dtype: int64

In [112]:
%%bash -s "$dedup" "$stats"
# $1 = compacted edge file (input); stdout is redirected to $2 (statistics file).
# The log named by --log is summary.txt, a relative path in the current working directory.
# The positional arguments are quoted so that paths containing spaces or
# glob characters are not split or expanded by the shell.
kgtk graph_statistics --directed --degrees --pagerank --hits --log summary.txt -i "$1" > "$2"

In [113]:
%%bash
# Print summary.txt, the file passed to --log in the graph_statistics step.
cat summary.txt

loading the TSV graph now ...
graph loaded! It has 47056 nodes and 62787 edges

###Top relations:
/r/IsA	45606
/r/PartOf	4416
/r/HasContext	3189
/r/DistinctFrom	2011
/r/HasPrerequisite	1965
/r/Antonym	1530
/r/UsedFor	1215
/r/MadeOf	834
/r/Synonym	655
/r/HasProperty	650

###Degrees:
in degree stats: mean=1.334304, std=0.035105, max=1
out degree stats: mean=1.334304, std=0.004760, max=1
total degree stats: mean=2.668608, std=0.035834, max=1

###PageRank
Max pageranks
271	Q483394	0.006469
764	Q386724	0.006726
14293	Q1744628	0.006793
595	Q5962346	0.007980
258	Q16889133	0.007421

###HITS
HITS hubs
20700	Q213901	0.002638
7480	Q591041	0.002679
29404	Q317309	0.002641
4195	Q61476	0.031417
2139	Q13442814	0.999481
HITS auth
30646	Q34524203	0.037400
31312	Q36536067	0.037400
30647	Q34524288	0.037400
7479	Q18918145	0.072521
34987	Q43916273	0.037450
