# Extracting commonsense knowledge from Wikidata

## Define files

In [2]:
year='2020'

In [22]:
if year=='2020':
    data_dir='input/wikidata-20200504'
    edge_file='%s/wikidata_edges_20200504.tsv' % data_dir
    node_file='%s/wikidata_nodes_20200504_clean.tsv' % data_dir
    label_file='%s/wikidata_labels_etc.tsv' % data_dir
    tmp_dir='tmp/wikidata20200504'
    trimmed_edge_file='%s/edges_trimmed.tsv' % tmp_dir
    concept_file='%s/concepts.tsv' % tmp_dir
    concepts_edge_file='%s/concept_edges.tsv' % tmp_dir
    compact_concepts_edge_file='%s/compact_concept_edges.tsv' % tmp_dir
    concept_edges_with_labels_file='%s/compact_concept_edges_with_labels.tsv' % tmp_dir
    ready_file='%s/wikidata_cs.tsv' % tmp_dir
    stats_file='%s/stats.tsv' % tmp_dir
else:
    pass

## Step I: Filter out named entities

### Extract concept nodes

In [11]:
from tqdm import tqdm

In [None]:
concepts=set()
with open(node_file, 'r') as f:
    header=next(f)
    for line in tqdm(f, total=84000000):
        data=line.split('\t')
        label=data[1].strip()
        if label=='' or not isinstance(label, str): continue
        label=label[1:-4]
        if label[0].islower():
            node_id=data[0]
            concepts.add(node_id)

In [None]:
len(concepts)

In [None]:
with open(concept_file, 'w') as w:
    w.write('id\n')
    for c in concepts:
        w.write('%s\n' % c)

### Only keep edges with relevant nodes and trim columns

In [None]:
%env ignore_cols=id,rank,node2;magnitude,node2;unit,node2;item,node2;lower,node2;upper,node2;entity-type,node2;longitude,node2;latitude,node2;date,node2;calendar,node2;precision

In [None]:
%%bash -s "$edge_file" "$concept_file" "$concepts_edge_file"
kgtk ifexists $1 --filter-on $2 / ifexists --filter-on $2 --input-keys node2 > $3

In [None]:
%%bash -s "$concepts_edge_file" "$trimmed_edge_file"
kgtk remove_columns -i $1 -c "$ignore_cols" > $2

### Deduplicate

In [None]:
import pandas as pd
df=pd.read_csv(trimmed_edge_file, sep='\t')

In [None]:
len(df)

### Add labels

In [7]:
%%bash -s "$trimmed_edge_file" "$node_file" "$concept_edges_with_labels_file"
kgtk --debug lift --verbose \
     --input-file $1 \
     --label-file $2 \
     --output-file $3 \
     --columns-to-lift node1 node2 label \
     --prefilter-labels \
     --label-value-column label \
     --expert

Opening the input file: tmp/wikidata20200504/edges_trimmed.tsv
KgtkReader: File_path.suffix: .tsv
KgtkReader: reading file tmp/wikidata20200504/edges_trimmed.tsv
header: node1	label	node2
node1 column found, this is a KGTK edge file
KgtkReader: Special columns: node1=0 label=1 node2=2 id=-1
KgtkReader: Reading an edge file.
Opening the label file: input/wikidata-20200504/wikidata_nodes_20200504_clean.tsv
KgtkReader: File_path.suffix: .tsv
KgtkReader: reading file input/wikidata-20200504/wikidata_nodes_20200504_clean.tsv
header: id	label	type	description	alias
node1 column not found, assuming this is a KGTK node file
KgtkReader: Special columns: node1=-1 label=1 node2=-1 id=0
KgtkReader: Reading an node file.
Lifting with in-memory buffering.
Reading input data to prefilter the labels.
Loading input rows without labels from tmp/wikidata20200504/edges_trimmed.tsv
Labels needed: 1209037
Loading labels from the label file.
Loading labels from input/wikidata-20200504/wikidata_nodes_20200504

CalledProcessError: Command 'b'kgtk --debug lift --verbose \\\n     --input-file $1 \\\n     --label-file $2 \\\n     --output-file $3 \\\n     --columns-to-lift node1 node2 label \\\n     --prefilter-labels \\\n     --label-value-column label \\\n     --expert\n'' returned non-zero exit status 1.

## Step II: Filter by usage

In [8]:
from wordfreq import word_frequency

In [9]:
threshold=1e-06

In [12]:
filtered_rows=[]
rels=[]
with open(concept_edges_with_labels_file, 'r') as f:
    header=next(f)
    for line in tqdm(f, total=3500000):
        a_row=line.strip().split('\t')
        node1_label=a_row[3][1:-4]
        node2_label=a_row[4][1:-4]
        if (any(x.isupper() for x in node1_label) or any(x.isupper() for x in node2_label)): continue
        wf1=word_frequency(node1_label, 'en')
        if wf1 >=threshold:
            wf2=word_frequency(node2_label, 'en')
            if wf2>=threshold:
                filtered_rows.append(a_row)
                rel='%s (%s)' % (a_row[5].strip()[1:-4], a_row[1])
                rels.append(rel)

 96%|█████████▋| 3376431/3500000 [00:24<00:00, 140421.84it/s]


In [13]:
len(filtered_rows)

420822

In [14]:
from collections import Counter

In [15]:
dist_rels=Counter(rels)

In [16]:
len(dist_rels)

414

In [51]:
s=0
for rel, freq in dist_rels.most_common(50):
    print(rel, freq)
    s+=freq

subclass of (P279) 172535
instance of (P31) 141499
part of (P361) 9118
different from (P1889) 7767
has part (P527) 6252
cell component (P681) 5607
property constraint (P2302) 5180
facet of (P1269) 4792
strand orientation (P2548) 4345
use (P366) 3045
opposite of (P461) 3028
properties for this type (P1963) 2382
molecular function (P680) 2369
see also (P1659) 2344
sport (P641) 2338
followed by (P156) 2244
follows (P155) 2234
material used (P186) 2047
is a list of (P360) 1914
Wikidata property (P1687) 1746
has quality (P1552) 1739
said to be the same as (P460) 1664
field of this occupation (P425) 1616
biological process (P682) 1509
uses (P2283) 1431
subject item of this property (P1629) 1341
regulates (molecular biology) (P128) 1277
decays to (P816) 1238
has parts of the class (P2670) 1187
practiced by (P3095) 1081
studied by (P2579) 1010
has cause (P828) 839
has list (P2354) 823
sex or gender (P21) 806
health specialty (P1995) 804
has effect (P1542) 768
depicts (P180) 731
used by (P1535)

In [50]:
s

409775

In [47]:
wanted='P689'
for row in filtered_rows:
    if row[1]==wanted:
        print(row)
        input('c')

['Q2359404', 'P689', 'Q2191986', "'fall in older adults'@en", "'elderly'@en", "'afflicts'@en"]


c 


['Q2879095', 'P689', 'Q186029', "'optic nerve disease'@en", "'optic nerve'@en", "'afflicts'@en"]


c 


['Q3065932', 'P689', 'Q2200417', "'cognitive disorder'@en", "'cognition'@en", "'afflicts'@en"]


c 


['Q6742919', 'P689', 'Q8441', "'male breast cancer'@en", "'man'@en", "'afflicts'@en"]


c 


['Q7314317', 'P689', 'Q7895', "'reproductive system disease'@en", "'reproductive system'@en", "'afflicts'@en"]


KeyboardInterrupt: Interrupted by user

## Step III: Compute statistics

In [23]:
%%bash -s "$ready_file" "$stats_file"
kgtk graph_statistics --directed --degrees --pagerank --hits --log summary.txt -i $1 > $2

In [24]:
%%bash
cat summary.txt

loading the TSV graph now ...
graph loaded! It has 194595 nodes and 420822 edges

###Top relations:
P279	172535
P31	141499
P361	9118
P1889	7767
P527	6252
P681	5607
P2302	5180
P1269	4792
P2548	4345
P366	3045

###Degrees:
in degree stats: mean=2.162553, std=0.888783, max=1
out degree stats: mean=2.162553, std=0.003335, max=1
total degree stats: mean=4.325106, std=0.888878, max=1

###PageRank
Max pageranks
105	Q2996394	0.013144
8246	Q53869507	0.015716
2201	Q8054	0.069251
11444	P2302	0.014661
9720	Q21502402	0.015182

###HITS
HITS hubs
512	Q14349455	0.003317
11953	Q14633865	0.003356
2201	Q8054	0.999862
10041	Q40260	0.014562
11971	Q14327652	0.004324
HITS auth
94794	Q61517789	0.003509
169886	Q61638672	0.003510
70746	Q61598819	0.003509
29407	Q61586361	0.003535
169907	Q61646611	0.003535
