# Extracting commonsense knowledge from Wikidata

## Define relevant properties

In [None]:
%env properties="P1889,P461,P527,P186,P463,P276,P170,P366,P279,P1552,P828,P1542,P462" #,P31"

## Filter relevant properties

In [None]:
%%bash
kgtk filter -p " ; $properties ; " input/wikidata/wikidata_edges_20200504.tsv.gz > tmp/kgtk_wikidata_filter.tsv

## Remove columns

In [1]:
%env ignore_cols=id,rank,node2;magnitude,node2;unit,node2;item,node2;lower,node2;upper,node2;entity-type,node2;longitude,node2;latitude,node2;date,node2;calendar,node2;precision

env: ignore_cols=id,rank,node2;magnitude,node2;unit,node2;item,node2;lower,node2;upper,node2;entity-type,node2;longitude,node2;latitude,node2;date,node2;calendar,node2;precision


In [2]:
%%bash
kgtk remove_columns -c "$ignore_cols" -i tmp/kgtk_wikidata_filter.tsv > tmp/kgtk_wikidata_cols.tsv

## Deduplicate

In [3]:
%%bash
kgtk compact -i tmp/kgtk_wikidata_cols.tsv -o tmp/kgtk_wikidata_compact.tsv

## Add labels

In [4]:
%%bash
kgtk --debug lift --verbose \
     --input-file tmp/kgtk_wikidata_compact.tsv \
     --label-file input/wikidata/wiki_labels.tsv \
     --output-file tmp/kgtk_wikidata.tsv \
     --columns-to-lift node1 node2 label \
     --prefilter-labels

Opening the input file: tmp/kgtk_wikidata_compact.tsv
KgtkReader: File_path.suffix: .tsv
KgtkReader: reading file tmp/kgtk_wikidata_compact.tsv
header: node1	label	node2
node1 column found, this is a KGTK edge file
KgtkReader: Special columns: node1=0 label=1 node2=2 id=-1
KgtkReader: Reading an edge file.
Opening the label file: input/wikidata/wiki_labels.tsv
KgtkReader: File_path.suffix: .tsv
KgtkReader: reading file input/wikidata/wiki_labels.tsv
Traceback (most recent call last):
  File "/Users/filipilievski/mcs/kgtk/kgtk/cli/lift.py", line 277, in run
    kl.process()
  File "/Users/filipilievski/mcs/kgtk/kgtk/lift/kgtklift.py", line 680, in process
    very_verbose=self.very_verbose,
  File "/Users/filipilievski/mcs/kgtk/kgtk/io/kgtkreader.py", line 494, in open
    source: ClosableIter[str] = cls._openfile(file_path, options=options, error_file=error_file, verbose=verbose)
  File "/Users/filipilievski/mcs/kgtk/kgtk/io/kgtkreader.py", line 668, in _openfile
    return ClosableIte

CalledProcessError: Command 'b'kgtk --debug lift --verbose \\\n     --input-file tmp/kgtk_wikidata_compact.tsv \\\n     --label-file input/wikidata/wiki_labels.tsv \\\n     --output-file tmp/kgtk_wikidata.tsv \\\n     --columns-to-lift node1 node2 label \\\n     --prefilter-labels\n'' returned non-zero exit status 1.

## Add PageRank

In [None]:
%%bash
kgtk --debug lift --verbose \
     --input-file tmp/kgtk_wikidata.tsv \
     --label-file input/wikidata/wikidata-pagerank-only-sorted2.tsv \
     --output-file tmp/kgtk_wikidata_with_pr.tsv \
     --columns-to-lift node1 node2 \
     --property vertex_pagerank \
     --lift-suffix ";pagerank" \
     --prefilter-labels

## Filter concepts vs instances

In [5]:
import pandas as pd
df=pd.read_csv('tmp/kgtk_wikidata_with_pr.tsv', sep='\t')

In [6]:
len(df)

8256446

In [7]:
df.head()

Unnamed: 0,node1,label,node2,node1;label,node2;label,label;label,node1;pagerank,node2;pagerank
0,P1005,P1552,Q26921380,'Portuguese National Library ID'@en,'VIAF component'@en,'has quality'@en,3.630733e-09,4e-06
1,P1006,P1552,Q26921380,'Nationale Thesaurus voor Auteurs ID'@en,'VIAF component'@en,'has quality'@en,7.545395e-08,4e-06
2,P1015,P1552,Q26921380,'NORAF ID'@en,'VIAF component'@en,'has quality'@en,3.637108e-09,4e-06
3,P1017,P1552,Q26921380,'Vatican Library ID'@en,'VIAF component'@en,'has quality'@en,2.345845e-07,4e-06
4,P1048,P1552,Q26921380,'NCL ID'@en,'VIAF component'@en,'has quality'@en,2.483013e-09,4e-06


In [8]:
limit=1.419947235126343e-08 # PageRank of the 752876th most popular node

In [9]:
cols=['node1;pagerank', 'node2;pagerank']
cols=['node1;label', 'node2;label']

In [10]:
from tqdm import tqdm

In [11]:
new_rows=[]
for i, row in tqdm(df.iterrows(), total=df.shape[0]):
    store=True
    if row['node1;label']=='' or row['node2;label']=='' or not isinstance(row['node1;label'], str) or not isinstance(row['node2;label'], str): continue
    node1_label=row['node1;label'].strip()[1:-3]
    node2_label=row['node2;label'].strip()[1:-3]
    if node1_label[0].islower() and node2_label[0].islower():
        a_row=[*row[:6], "", "", "WD", "", "", ""] # copy id, node1, relation, node2, node1:label, node2:label, relation:label
        a_row[3]=node1_label
        a_row[4]=node2_label
        new_rows.append(a_row)

100%|██████████| 8256446/8256446 [18:00<00:00, 7642.88it/s] 


## Filter by usage

In [12]:
from wordfreq import word_frequency

In [13]:
threshold=1e-06

In [14]:
len(new_rows)

1255113

In [15]:
filtered_rows=[]
for a_row in new_rows:
    node1_label=a_row[3]
    node2_label=a_row[4]
    wf1=word_frequency(node1_label, 'en')
    wf2=word_frequency(node2_label, 'en')
    if wf1 >threshold and wf2>threshold:
        filtered_rows.append(a_row)

In [16]:
len(filtered_rows)

207798

In [17]:
new_columns=['node1', 'relation', 'node2', 'node1;label', 'node2;label','relation;label', 'relation;dimension', 'weight', 'source', 'origin', 'sentence', 'question']

In [18]:
df2=pd.DataFrame(filtered_rows, columns=new_columns)

In [19]:
df2['relation'].value_counts()

P279     187128
P527       8153
P461       3114
P366       3071
P186       2358
P1552      1773
P828        868
P1542       778
P170        368
P276        170
P463         17
Name: relation, dtype: int64

In [20]:
df2.to_csv('tmp/kgtk_wikidata_ready.tsv', index=False, sep='\t', columns=new_columns)

In [21]:
with pd.option_context('display.max_rows', None, 'display.width', 1000):  # more options can be specified also
    print(df2[df2['relation']=='P186'].head(100))

          node1 relation      node2            node1;label             node2;label      relation;label relation;dimension weight source origin sentence question
149    Q1007164     P186     Q12117          grain whisky'                 cereal'  'material used'@en                               WD                         
220    Q1018244     P186     Q11469           crown glass'                  glass'  'material used'@en                               WD                         
424      Q10289     P186     Q11427                barrel'                  steel'  'material used'@en                               WD                         
425      Q10289     P186     Q11474                barrel'                plastic'  'material used'@en                               WD                         
426      Q10289     P186       Q287                barrel'                   wood'  'material used'@en                               WD                         
475    Q1030197     P186     Q1142

## Compute statistics

In [22]:
%%bash
kgtk graph_statistics --directed --degrees --pagerank --hits --log summary.txt -i tmp/kgtk_wikidata_ready.tsv > tmp/stats/wiki_stats.tsv

In [23]:
%%bash
cat summary.txt

loading the TSV graph now ...
graph loaded! It has 174958 nodes and 207798 edges

###Top relations:
P279	187128
P527	8153
P461	3114
P366	3071
P186	2358
P1552	1773
P828	868
P1542	778
P170	368
P276	170

###Degrees:
in degree stats: mean=1.187702, std=0.491553, max=1
out degree stats: mean=1.187702, std=0.001702, max=1
total degree stats: mean=2.375404, std=0.491586, max=1

###PageRank
Max pageranks
6643	Q27096213	0.017968
2320	Q17334923	0.020545
3446	Q8054	0.079553
6107	Q35120	0.026404
3963	Q8066	0.019283

###HITS
HITS hubs
1754	Q127980	0.000023
391	Q2095	0.000023
37007	Q2449730	0.000536
3446	Q8054	1.000000
421	Q283	0.000058
HITS auth
141724	Q63035501	0.003413
118460	Q62636719	0.003413
120538	Q62653627	0.003413
117590	Q62631098	0.003413
118967	Q62640253	0.003413
