# Extracting commonsense knowledge from Wikidata

## Define relevant properties

In [None]:
%env properties="P1889,P461,P527,P186,P463,P276,P170,P366,P279,P1552,P828,P1542,P462" #,P31"

## Filter relevant properties

In [None]:
%%bash
# Filter the Wikidata edge dump with a three-field " ; ; " pattern in which only
# the middle field is set (to the property list held in $properties).
kgtk filter \
    -p " ; $properties ; " \
    input/wikidata/wikidata_edges_20200504.tsv.gz \
    > tmp/kgtk_wikidata_filter.tsv

## Remove columns

In [1]:
# Comma-separated names of the columns that the next cell passes to `kgtk remove_columns -c`.
%env ignore_cols=rank,node2;magnitude,node2;unit,node2;item,node2;lower,node2;upper,node2;entity-type,node2;longitude,node2;latitude,node2;date,node2;calendar,node2;precision

env: ignore_cols=rank,node2;magnitude,node2;unit,node2;item,node2;lower,node2;upper,node2;entity-type,node2;longitude,node2;latitude,node2;date,node2;calendar,node2;precision


In [2]:
%%bash
# Strip the columns named in $ignore_cols from the filtered edge file.
kgtk remove_columns \
    -i tmp/kgtk_wikidata_filter.tsv \
    -c "$ignore_cols" \
    > tmp/kgtk_wikidata_cols.tsv

## Deduplicate

In [3]:
%%bash
# Deduplication step: run `kgtk compact` on the column-reduced edge file.
kgtk compact \
    -o tmp/kgtk_wikidata_compact.tsv \
    -i tmp/kgtk_wikidata_cols.tsv

## Add labels

In [4]:
%%bash
# Look up labels for node1, node2 and the edge label in wiki_labels.tsv.
# The lifted file gains the columns node1;label, node2;label and label;label.
kgtk --debug lift --verbose \
     --prefilter-labels \
     --columns-to-lift node1 node2 label \
     --label-file input/wikidata/wiki_labels.tsv \
     --input-file tmp/kgtk_wikidata_compact.tsv \
     --output-file tmp/kgtk_wikidata.tsv

Opening the input file: tmp/kgtk_wikidata_compact.tsv
KgtkReader: File_path.suffix: .tsv
KgtkReader: reading file tmp/kgtk_wikidata_compact.tsv
header: id	node1	label	node2
node1 column found, this is a KGTK edge file
KgtkReader: Special columns: node1=1 label=2 node2=3 id=0
KgtkReader: Reading an edge file.
Opening the label file: input/wikidata/wiki_labels.tsv
KgtkReader: File_path.suffix: .tsv
KgtkReader: reading file input/wikidata/wiki_labels.tsv
header: node1	label	node2
node1 column found, this is a KGTK edge file
KgtkReader: Special columns: node1=0 label=1 node2=2 id=-1
KgtkReader: Reading an edge file.
Lifting with in-memory buffering.
Reading input data to prefilter the labels.
Loading input rows without labels from tmp/kgtk_wikidata_compact.tsv
Labels needed: 5611919
Loading labels from the label file.
Loading labels from input/wikidata/wiki_labels.tsv
Filtering for needed labels
label_match_column_idx=0 (node1).
label_select_column_idx=1 (label).
label_value_column_idx=2 (

## Add PageRank

In [5]:
%%bash
# Second lift pass: attach the `vertex_pagerank` value of node1 and node2,
# writing them to new columns suffixed with ";pagerank".
kgtk --debug lift --verbose \
     --prefilter-labels \
     --property vertex_pagerank \
     --lift-suffix ";pagerank" \
     --columns-to-lift node1 node2 \
     --label-file input/wikidata/wikidata-pagerank-only-sorted2.tsv \
     --input-file tmp/kgtk_wikidata.tsv \
     --output-file tmp/kgtk_wikidata_with_pr.tsv

Opening the input file: tmp/kgtk_wikidata.tsv
KgtkReader: File_path.suffix: .tsv
KgtkReader: reading file tmp/kgtk_wikidata.tsv
header: id	node1	label	node2	node1;label	node2;label	label;label
node1 column found, this is a KGTK edge file
KgtkReader: Special columns: node1=1 label=2 node2=3 id=0
KgtkReader: Reading an edge file.
Opening the label file: input/wikidata/wikidata-pagerank-only-sorted2.tsv
KgtkReader: File_path.suffix: .tsv
KgtkReader: reading file input/wikidata/wikidata-pagerank-only-sorted2.tsv
header: node1	relation	node2	id
node1 column found, this is a KGTK edge file
KgtkReader: Special columns: node1=0 label=1 node2=2 id=3
KgtkReader: Reading an edge file.
Lifting with in-memory buffering.
Reading input data to prefilter the labels.
Loading input rows without labels from tmp/kgtk_wikidata.tsv
Labels needed: 5611908
Loading labels from the label file.
Loading labels from input/wikidata/wikidata-pagerank-only-sorted2.tsv
Filtering for needed labels
label_match_column_id

## Filter on PageRank

In [6]:
import pandas as pd
# Load the tab-separated edge file produced by the PageRank lift step.
df = pd.read_csv(
    'tmp/kgtk_wikidata_with_pr.tsv',
    sep='\t',
)

In [7]:
# Number of edges loaded.
df.shape[0]

8262087

In [8]:
# Preview the first five edges with their lifted label and PageRank columns.
df.head(5)

Unnamed: 0,id,node1,label,node2,node1;label,node2;label,label;label,node1;pagerank,node2;pagerank
0,P1005-P1552-1,P1005,P1552,Q26921380,'Portuguese National Library ID'@en,'VIAF component'@en,'has quality'@en,3.630733e-09,4e-06
1,P1006-P1552-1,P1006,P1552,Q26921380,'Nationale Thesaurus voor Auteurs ID'@en,'VIAF component'@en,'has quality'@en,7.545395e-08,4e-06
2,P1015-P1552-1,P1015,P1552,Q26921380,'NORAF ID'@en,'VIAF component'@en,'has quality'@en,3.637108e-09,4e-06
3,P1017-P1552-1,P1017,P1552,Q26921380,'Vatican Library ID'@en,'VIAF component'@en,'has quality'@en,2.345845e-07,4e-06
4,P1048-P1552-1,P1048,P1552,Q26921380,'NCL ID'@en,'VIAF component'@en,'has quality'@en,2.483013e-09,4e-06


In [9]:
# Minimum PageRank an endpoint needs for its edge to be kept:
# the PageRank of the 752876th most popular node.
limit = 1.419947235126343e-08

In [10]:
# PageRank columns that are compared against `limit`.
cols = [
    'node1;pagerank',
    'node2;pagerank',
]

In [None]:
# Vectorized replacement for a row-by-row `iterrows()` loop over millions of edges.
# An edge is dropped when any of the PageRank columns in `cols` is below `limit`.
# A missing PageRank (NaN) compares False against `limit`, so such edges are kept,
# exactly as a per-row `row[c] < limit` test would do.
below_limit = (df[cols] < limit).any(axis=1)

# Positional slice of the first seven columns:
# id, node1, label (relation), node2, node1;label, node2;label, label;label.
kept_edges = df.loc[~below_limit].iloc[:, :7]

# Append six constant fields to every kept edge; only the third one is non-empty ("WD").
new_rows = [
    [*edge, "", "", "WD", "", "", ""]
    for edge in kept_edges.itertuples(index=False, name=None)
]

In [None]:
# Output schema: one name per value in each element of `new_rows`.
# Each row starts with the edge `id` (it copies the first seven input columns),
# so `id` must be listed here too; without it the DataFrame constructor gets
# 13 values per row for only 12 column names and raises.
new_columns = [
    'id',
    'node1', 'relation', 'node2',
    'node1;label', 'node2;label', 'relation;label',
    'relation;dimension', 'weight', 'source', 'origin', 'sentence', 'question',
]

In [None]:
# Assemble the filtered edges into a frame using the output column names.
df2 = pd.DataFrame(
    data=new_rows,
    columns=new_columns,
)

In [None]:
# Edge count per property. In df2 the property column is named 'relation'
# (there is no 'label' column), so that is the column to count.
df2['relation'].value_counts()

In [None]:
# Write every column of df2. Selecting `df.columns` here would fail, because df2
# has no 'label', 'label;label', 'node1;pagerank' or 'node2;pagerank' columns.
df2.to_csv('tmp/kgtk_wikidata_ready.tsv', index=False, sep='\t')

## Compute statistics

In [None]:
%%bash
# The shell redirect below fails when its target directory is missing, so create it first.
mkdir -p tmp/stats
# NOTE(review): the log file name looks left over from a P463-only run; confirm it is intended.
kgtk graph_statistics --directed --degrees --pagerank --hits \
    --log p463_summary.txt \
    -i tmp/kgtk_wikidata_ready.tsv \
    > tmp/stats/wiki_stats.tsv