# Extracting commonsense knowledge from Wikidata

## Define files

In [12]:
edge_file='input/wikidata-20181210'
label_file=''
tmp_dir='tmp/wikidata20200504'

## Define relevant properties

In [1]:
%env properties="P1889,P461,P527,P186,P463,P276,P170,P366,P279,P1552,P828,P1542,P462" #,P31"

env: properties="P1889,P461,P527,P186,P463,P276,P170,P366,P279,P1552,P828,P1542,P462" #,P31"


## Filter relevant properties

In [None]:
%%bash
kgtk filter -p " ; $properties ; " input/wikidata/wikidata_edges_20200504.tsv.gz > tmp/kgtk_wikidata_filter.tsv

## Remove columns

In [1]:
%env ignore_cols=id,rank,node2;magnitude,node2;unit,node2;item,node2;lower,node2;upper,node2;entity-type,node2;longitude,node2;latitude,node2;date,node2;calendar,node2;precision

env: ignore_cols=id,rank,node2;magnitude,node2;unit,node2;item,node2;lower,node2;upper,node2;entity-type,node2;longitude,node2;latitude,node2;date,node2;calendar,node2;precision


In [2]:
%%bash
kgtk remove_columns -c "$ignore_cols" -i tmp/kgtk_wikidata_filter.tsv > tmp/kgtk_wikidata_cols.tsv

## Deduplicate

In [3]:
%%bash
kgtk compact -i tmp/kgtk_wikidata_cols.tsv -o tmp/kgtk_wikidata_compact.tsv

## Add labels

In [4]:
%%bash
kgtk --debug lift --verbose \
     --input-file tmp/kgtk_wikidata_compact.tsv \
     --label-file input/wikidata/wiki_labels.tsv \
     --output-file tmp/kgtk_wikidata.tsv \
     --columns-to-lift node1 node2 label \
     --prefilter-labels

Opening the input file: tmp/kgtk_wikidata_compact.tsv
KgtkReader: File_path.suffix: .tsv
KgtkReader: reading file tmp/kgtk_wikidata_compact.tsv
header: node1	label	node2
node1 column found, this is a KGTK edge file
KgtkReader: Special columns: node1=0 label=1 node2=2 id=-1
KgtkReader: Reading an edge file.
Opening the label file: input/wikidata/wiki_labels.tsv
KgtkReader: File_path.suffix: .tsv
KgtkReader: reading file input/wikidata/wiki_labels.tsv
Traceback (most recent call last):
  File "/Users/filipilievski/mcs/kgtk/kgtk/cli/lift.py", line 277, in run
    kl.process()
  File "/Users/filipilievski/mcs/kgtk/kgtk/lift/kgtklift.py", line 680, in process
    very_verbose=self.very_verbose,
  File "/Users/filipilievski/mcs/kgtk/kgtk/io/kgtkreader.py", line 494, in open
    source: ClosableIter[str] = cls._openfile(file_path, options=options, error_file=error_file, verbose=verbose)
  File "/Users/filipilievski/mcs/kgtk/kgtk/io/kgtkreader.py", line 668, in _openfile
    return ClosableIte

CalledProcessError: Command 'b'kgtk --debug lift --verbose \\\n     --input-file tmp/kgtk_wikidata_compact.tsv \\\n     --label-file input/wikidata/wiki_labels.tsv \\\n     --output-file tmp/kgtk_wikidata.tsv \\\n     --columns-to-lift node1 node2 label \\\n     --prefilter-labels\n'' returned non-zero exit status 1.

## Add PageRank

In [None]:
%%bash
kgtk --debug lift --verbose \
     --input-file tmp/kgtk_wikidata.tsv \
     --label-file input/wikidata/wikidata-pagerank-only-sorted2.tsv \
     --output-file tmp/kgtk_wikidata_with_pr.tsv \
     --columns-to-lift node1 node2 \
     --property vertex_pagerank \
     --lift-suffix ";pagerank" \
     --prefilter-labels

## Filter concepts vs instances

In [3]:
import pandas as pd
df=pd.read_csv('tmp/wikidata20200504/kgtk_wikidata_with_pr.tsv', sep='\t')

In [4]:
len(df)

8256446

In [5]:
df.head()

Unnamed: 0,node1,label,node2,node1;label,node2;label,label;label,node1;pagerank,node2;pagerank
0,P1005,P1552,Q26921380,'Portuguese National Library ID'@en,'VIAF component'@en,'has quality'@en,3.630733e-09,4e-06
1,P1006,P1552,Q26921380,'Nationale Thesaurus voor Auteurs ID'@en,'VIAF component'@en,'has quality'@en,7.545395e-08,4e-06
2,P1015,P1552,Q26921380,'NORAF ID'@en,'VIAF component'@en,'has quality'@en,3.637108e-09,4e-06
3,P1017,P1552,Q26921380,'Vatican Library ID'@en,'VIAF component'@en,'has quality'@en,2.345845e-07,4e-06
4,P1048,P1552,Q26921380,'NCL ID'@en,'VIAF component'@en,'has quality'@en,2.483013e-09,4e-06


In [6]:
from tqdm import tqdm

In [7]:
new_rows=[]
for i, row in tqdm(df.iterrows(), total=df.shape[0]):
    store=True
    if row['node1;label']=='' or row['node2;label']=='' or not isinstance(row['node1;label'], str) or not isinstance(row['node2;label'], str): continue
    node1_label=row['node1;label'].strip()[1:-4]
    node2_label=row['node2;label'].strip()[1:-4]
    if node1_label[0].islower() and node2_label[0].islower():
        a_row=[*row[:6], "", "", "WD", "", "", ""] # copy id, node1, relation, node2, node1:label, node2:label, relation:label
        a_row[3]=node1_label
        a_row[4]=node2_label
        new_rows.append(a_row)

100%|██████████| 8256446/8256446 [19:42<00:00, 6982.58it/s] 


In [9]:
new_columns=['node1', 'relation', 'node2', 'node1;label', 'node2;label','relation;label', 'relation;dimension', 'weight', 'source', 'origin', 'sentence', 'question']

In [10]:
df1=pd.DataFrame(new_rows, columns=new_columns)

In [13]:
df1.to_csv('%s/kgtk_wikidata_concepts.tsv' % tmp_dir, index=False, sep='\t', columns=new_columns)

## Filter by usage

In [14]:
from wordfreq import word_frequency

In [15]:
threshold=1e-06

In [16]:
len(new_rows)

1255113

In [17]:
filtered_rows=[]
for a_row in new_rows:
    node1_label=a_row[3]
    node2_label=a_row[4]
    wf1=word_frequency(node1_label, 'en')
    wf2=word_frequency(node2_label, 'en')
    if wf1 >=threshold and wf2>=threshold:
        filtered_rows.append(a_row)
#    elif wf1>lower_t and wf2>lower_t:
#        print(node1_label, wf1, node2_label, wf2)

In [18]:
len(filtered_rows)

208062

## Map relations

In [19]:
mapping={'P1889': '/r/DistinctFrom', 
        'P461': '/r/Antonym', 
        'P31': '/r/IsA',
        'P279': '/r/IsA',
         'P463': '/r/PartOf',
         'P186': '/r/MadeOf',
         'P276': '/r/AtLocation',
         'P170': '/r/CreatedBy',
         'P366': '/r/UsedFor',
         'P462': '/r/HasProperty',
         'P1552': '/r/HasProperty',
         'P1542': '/r/Causes'
        }

In [20]:
inverse_mapping={'P527': '/r/PartOf', 
                'P828': '/r/Causes'}

In [21]:
from copy import copy

In [22]:
for a_row in filtered_rows:
    row=copy(a_row)
    relation=row[1]
    if relation in mapping.keys():
        row[1]=mapping[row[1]]
        ready_rows.append(row)
    elif relation in inverse_mapping.keys():
        row[1]=inverse_mapping[row[1]]
        tmp=row[0]
        row[0]=row[2]
        row[2]=tmp
        ready_rows.append(row)
    else:
        print(row)
        input('c')

NameError: name 'ready_rows' is not defined

In [2]:
len(filtered_rows)

NameError: name 'filtered_rows' is not defined

In [37]:
df2=pd.DataFrame(ready_rows, columns=new_columns)

In [38]:
df2['relation'].value_counts()

/r/IsA            187320
/r/PartOf           8193
/r/Antonym          3127
/r/UsedFor          3080
/r/MadeOf           2367
/r/HasProperty      1782
/r/Causes           1651
/r/CreatedBy         369
/r/AtLocation        173
Name: relation, dtype: int64

In [39]:
df2.to_csv('%s/kgtk_wikidata_ready.tsv' % tmp_dir, index=False, sep='\t', columns=new_columns)

In [40]:
with pd.option_context('display.max_rows', None, 'display.width', 1000):  # more options can be specified also
    print(df2[df2['relation']=='/r/AtLocation'].head(100))

           node1       relation      node2                                        node1;label                      node2;label relation;label relation;dimension weight source origin sentence question
807      Q104499  /r/AtLocation       Q634                                  planetary science                           planet  'location'@en                               WD                         
913    Q10480682  /r/AtLocation    Q175185                             agricultural structure                       rural area  'location'@en                               WD                         
1044    Q1050571  /r/AtLocation  Q71180315                                            catcher                       home plate  'location'@en                               WD                         
2003    Q1072280  /r/AtLocation    Q133215                                       casino token                           casino  'location'@en                               WD                         


## Compute statistics

In [27]:
%%bash
kgtk graph_statistics --directed --degrees --pagerank --hits --log summary.txt -i tmp/wikidata20200504/kgtk_wikidata_ready.tsv > tmp/stats/wiki_stats.tsv


In [28]:
%%bash
cat summary.txt

loading the TSV graph now ...
graph loaded! It has 175128 nodes and 208062 edges

###Top relations:
P279	187320
P527	8176
P461	3127
P366	3080
P186	2367
P1552	1782
P828	872
P1542	779
P170	369
P276	173

###Degrees:
in degree stats: mean=1.188057, std=0.491150, max=1
out degree stats: mean=1.188057, std=0.001703, max=1
total degree stats: mean=2.376113, std=0.491183, max=1

###PageRank
Max pageranks
6664	Q27096213	0.017967
3975	Q8066	0.019265
2326	Q17334923	0.020543
6126	Q35120	0.026408
3453	Q8054	0.079477

###HITS
HITS hubs
1760	Q127980	0.000023
394	Q2095	0.000023
424	Q283	0.000058
37097	Q2449730	0.000535
3453	Q8054	1.000000
HITS auth
120195	Q62650194	0.003412
118028	Q62633105	0.003412
117796	Q62631641	0.003412
120090	Q62649439	0.003412
118937	Q62639082	0.003412
