# Extracting commonsense knowledge from Wikidata

## Define files

In [1]:
# Wikidata dump to process; the path-selection cell below handles '2017', '2018' and '2020'.
year='2018'

In [6]:
# Resolve the input files and the scratch directory for the selected dump.
if year=='2020':
    data_dir='input/wikidata-20200504'
    edge_file='%s/wikidata_edges_20200504.tsv' % data_dir
    node_file='%s/wikidata_nodes_20200504_clean.tsv' % data_dir
    tmp_dir='tmp/wikidata20200504'
elif year=='2017':
    data_dir='input/wikidata-20171227'
    edge_file='%s/wikidata20171227-all-edges.tsv' % data_dir
    node_file='%s/wikidata20171227-all-nodes.tsv' % data_dir
    tmp_dir='tmp/wikidata20171227'
elif year=='2018':
    data_dir='input/wikidata-20181210'
    edge_file='%s/wikidata-20181210-all-edges.tsv' % data_dir
    node_file='%s/wikidata-20181210-all-nodes.tsv' % data_dir
    tmp_dir='tmp/wikidata20181210'
else:
    # Fail here with a clear message instead of a NameError on tmp_dir further down.
    raise ValueError("Unsupported year %r; expected '2017', '2018' or '2020'" % (year,))


# Intermediate and final files, all placed under tmp_dir.
trimmed_edge_file='%s/edges_trimmed.tsv' % tmp_dir  # concept edges after column removal
concept_file='%s/concepts.tsv' % tmp_dir  # one-column list of concept node ids
concepts_edge_file='%s/concept_edges.tsv' % tmp_dir  # edges restricted to concept nodes
compact_concepts_edge_file='%s/compact_concept_edges.tsv' % tmp_dir
concept_edges_with_labels_file='%s/compact_concept_edges_with_labels.tsv' % tmp_dir  # output of the label lift
usage_file='%s/usage.tsv' % tmp_dir  # edges that pass the word-frequency filter
ready_file='%s/wikidata_cs.tsv' % tmp_dir  # edges whose property has a mapping
stats_file='%s/stats.tsv' % tmp_dir  # graph statistics output

In [7]:
%%bash -s "$tmp_dir"
# Create the scratch directory (and missing parents); quoted so a path with spaces stays one argument.
mkdir -p "$1"

## Step I: Filter out named entities

### Extract concept nodes

In [8]:
from tqdm import tqdm
import pandas as pd

In [9]:
# Collect the ids of nodes whose label starts with a lowercase letter; this is the
# notebook's test for "concept" as opposed to named entity.
concepts=set()
with open(node_file, 'r') as f:
    header=next(f)  # skip the header row
    # total is only an estimate used to scale the progress bar.
    for line in tqdm(f, total=84000000):
        data=line.split('\t')
        # A blank or malformed line has no label column to inspect.
        if len(data)<2: continue
        # Labels look like 'text'@en: drop the opening quote and the closing quote plus tag.
        # NOTE(review): the fixed [1:-4] slice assumes a two-letter language tag - confirm.
        label=data[1].strip()[1:-4]
        # A missing label, or one that is empty between the quotes, has no first character.
        if not label: continue
        if label[0].islower():
            node_id=data[0]
            concepts.add(node_id)

 63%|██████▎   | 53004762/84000000 [01:09<00:40, 763962.97it/s]


In [10]:
# Number of distinct concept node ids collected above.
len(concepts)

946945

In [11]:
# Persist the concept ids as a one-column file with an 'id' header row.
with open(concept_file, 'w') as out:
    out.write('id\n')
    out.writelines('%s\n' % concept_id for concept_id in concepts)

### Only keep edges with relevant nodes and trim columns

In [12]:
# Columns for `kgtk remove_columns` to drop; set as an environment variable so the
# %%bash cell below can read it as $ignore_cols.
%env ignore_cols=rank
# Alternative, longer column list (disabled):
#%env ignore_cols=id,rank,node2;magnitude,node2;unit,node2;item,node2;lower,node2;upper,node2;entity-type,node2;longitude,node2;latitude,node2;date,node2;calendar,node2;precision

env: ignore_cols=rank


In [13]:
%%bash -s "$edge_file" "$concept_file" "$concepts_edge_file"
# Two chained ifexists filters against the concept list: the first uses the default
# input key (presumably node1 - confirm against the kgtk docs), the second uses node2.
# Positional parameters are quoted so paths with spaces or glob characters stay intact.
kgtk ifexists "$1" --filter-on "$2" / ifexists --filter-on "$2" --input-keys node2 > "$3"

In [14]:
%%bash -s "$concepts_edge_file" "$trimmed_edge_file"
# Drop the columns named in $ignore_cols (set with %env above). File arguments are
# quoted like $ignore_cols so paths with spaces stay intact.
kgtk remove_columns -i "$1" -c "$ignore_cols" > "$2"

### Deduplicate

In [15]:
# Load the trimmed edge file so its size can be inspected.
# NOTE(review): despite the section title, no de-duplication step (e.g. drop_duplicates)
# is visible here and the file is not rewritten - confirm whether one is still intended.
df=pd.read_csv(trimmed_edge_file, sep='\t')

In [16]:
# Number of rows in the trimmed edge file.
len(df)

2065872

### Add labels

In [17]:
%%bash -s "$trimmed_edge_file" "$node_file" "$concept_edges_with_labels_file"
# Lift labels from the node file onto the node1, node2 and label columns of every edge.
# Positional parameters are quoted so paths with spaces or glob characters stay intact.
kgtk --debug lift --verbose \
     --input-file "$1" \
     --label-file "$2" \
     --output-file "$3" \
     --columns-to-lift node1 node2 label \
     --prefilter-labels \
     --label-value-column label \
     --expert

Opening the input file: tmp/wikidata20181210/edges_trimmed.tsv
KgtkReader: File_path.suffix: .tsv
KgtkReader: reading file tmp/wikidata20181210/edges_trimmed.tsv
header: id	node1	label	node2
node1 column found, this is a KGTK edge file
KgtkReader: Special columns: node1=1 label=2 node2=3 id=0
KgtkReader: Reading an edge file.
Opening the label file: input/wikidata-20181210/wikidata-20181210-all-nodes.tsv
KgtkReader: File_path.suffix: .tsv
KgtkReader: reading file input/wikidata-20181210/wikidata-20181210-all-nodes.tsv
header: id	label	type	description	alias
node1 column not found, assuming this is a KGTK node file
KgtkReader: Special columns: node1=-1 label=1 node2=-1 id=0
KgtkReader: Reading an node file.
Lifting with in-memory buffering.
Reading input data to prefilter the labels.
Loading input rows without labels from tmp/wikidata20181210/edges_trimmed.tsv
Labels needed: 800378
Loading labels from the label file.
The label file is a node file, defaulting to the ID column for the mat

## Step II: Filter by usage

In [18]:
from wordfreq import word_frequency

In [19]:
# Minimum English word_frequency that both node labels of an edge must reach to be kept.
threshold=1e-06

In [20]:
# Keep only edges whose two node labels contain no uppercase character and are both
# frequent enough in English; also record a readable "label (property)" per kept edge.
filtered_rows=[]
rels=[]
with open(concept_edges_with_labels_file, 'r') as f:
    # Remove the line terminator before splitting. Otherwise the last column name keeps
    # a trailing '\n' and breaks the header row of every file written with columns=header.
    header=next(f).rstrip('\r\n').split('\t')
    # More than six columns means a leading id column is present. Decide once and apply
    # the same trimming to the header and to every row so they always stay aligned.
    has_id_column=len(header)>6
    if has_id_column:
        header=header[1:]
    # total is only an estimate used to scale the progress bar.
    for line in tqdm(f, total=3500000):
        a_row=line.strip().split('\t')
        if has_id_column:
            a_row=a_row[1:]
        # Expected layout: node1, property, node2, node1 label, node2 label, property label.
        # Rows lacking any of these cannot be checked, so skip them instead of crashing.
        if len(a_row)<6: continue

        # Labels look like 'text'@en: strip the opening quote and the closing quote plus tag.
        node1_label=a_row[3][1:-4]
        node2_label=a_row[4][1:-4]
        # Any uppercase character marks the label as a likely named entity.
        if (any(x.isupper() for x in node1_label) or any(x.isupper() for x in node2_label)): continue
        # node2 is only looked up when node1 already passes the threshold.
        if word_frequency(node1_label, 'en')<threshold: continue
        if word_frequency(node2_label, 'en')<threshold: continue
        filtered_rows.append(a_row)
        rels.append('%s (%s)' % (a_row[5].strip()[1:-4], a_row[1]))

 59%|█████▉    | 2065872/3500000 [00:18<00:12, 112048.74it/s]


In [21]:
# Number of edges that passed the usage filter.
len(filtered_rows)

160387

In [22]:
from collections import Counter

In [23]:
# Count how many kept edges use each "label (property)" relation.
dist_rels=Counter(rels)

In [24]:
# Number of distinct relations among the kept edges.
len(dist_rels)

349

In [25]:
# Print the 50 most frequent relations and total up how many edges they cover.
top_rels=dist_rels.most_common(50)
for rel, freq in top_rels:
    print(rel, freq)
s=sum(count for _, count in top_rels)

subclass of (P279) 59382
instance of (P31) 39041
part of (P361) 5711
has part (P527) 4972
property constraint (P2302) 4418
different from (P1889) 3214
facet of (P1269) 2695
opposite of (P461) 2179
followed by (P156) 2078
follows (P155) 2071
use (P366) 1880
properties for this type (P1963) 1823
sport (P641) 1772
see also (P1659) 1772
is a list of (P360) 1639
regulates (molecular biology) (P128) 1293
field of this occupation (P425) 1258
decays to (P816) 1241
material used (P186) 1229
has quality (P1552) 1154
said to be the same as (P460) 1070
Wikidata property (P1687) 1057
subject item of this property (P1629) 1024
uses (P2283) 777
has parts of the class (P2670) 747
health specialty (P1995) 659
symptoms (P780) 561
has effect (P1542) 485
practiced by (P3095) 480
has cause (P828) 457
has list (P2354) 446
field of work (P101) 418
afflicts (P689) 413
used by (P1535) 408
taxon rank (P105) 391
subproperty of (P1647) 372
depicts (P180) 345
main subject (P921) 338
studied by (P2579) 332
anatomic

In [26]:
# Number of kept edges covered by the 50 most frequent relations.
s

154637

In [27]:
# Show one example edge for a given property id, if any kept edge uses it.
wanted='P689'
example=next((candidate for candidate in filtered_rows if candidate[1]==wanted), None)
if example is not None:
    print(example)

['Q3055380', 'P689', 'Q9639', "'intestinal disease'@en", "'intestine'@en", "'afflicts'@en"]


In [28]:
# Tabulate the usage-filtered edges under the column names read from the labelled edge file.
df1=pd.DataFrame(filtered_rows, columns=header)

In [29]:
# Row count of the usage-filtered table.
len(df1)

160387

In [30]:
# Write the usage-filtered edges as a tab-separated file without the pandas index.
df1.to_csv(usage_file, index=False, sep='\t', columns=header)

## Step III: Map properties

In [31]:
from mapping import fw_mapping, bw_mapping

In [32]:
import mapping

In [33]:
# Reload the mapping module so edits to it are picked up without restarting the kernel.
import importlib
importlib.reload(mapping)

<module 'mapping' from '/Users/filipilievski/mcs/cskg/mapping.py'>

In [34]:
# Keep only the edges whose property id appears in the forward or backward mapping.
mapped_fw=set()
mapped_bw=set()
known_props=set(mapping.fw_mapping.keys()).union(mapping.bw_mapping.keys())
rows=[edge for edge in filtered_rows if edge[1] in known_props]

In [35]:
# Number of edges whose property has a mapping.
len(rows)

145956

In [36]:
# Tabulate the mapped edges under the same column names as the usage-filtered table.
df2=pd.DataFrame(rows, columns=header)

In [37]:
# Write the mapped edges as a tab-separated file without the pandas index.
df2.to_csv(ready_file, index=False, sep='\t', columns=header)

## Step IV: Compute statistics

In [38]:
%%bash -s "$ready_file" "$stats_file"
# Compute degree, PageRank and HITS statistics for the final edge file; the summary log
# goes to summary.txt in the working directory. Positional parameters are quoted so
# paths with spaces or glob characters stay intact.
kgtk graph_statistics --directed --degrees --pagerank --hits --log summary.txt -i "$1" > "$2"

In [39]:
%%bash
# Show the summary log written by the graph_statistics run above.
cat summary.txt

loading the TSV graph now ...
graph loaded! It has 76066 nodes and 145956 edges

###Top relations:
P279	59382
P31	39041
P361	5711
P527	4972
P1889	3214
P1269	2695
P461	2179
P156	2078
P155	2071
P366	1880

###Degrees:
in degree stats: mean=1.918807, std=0.102935, max=1
out degree stats: mean=1.918807, std=0.005634, max=1
total degree stats: mean=3.837615, std=0.103896, max=1

###PageRank
Max pageranks
71397	P1918	0.008616
288	Q151885	0.012225
1906	Q7187	0.010134
310	Q2996394	0.009513
2502	Q3249551	0.008727

###HITS
HITS hubs
310	Q2996394	0.000000
36441	Q62536	0.000000
2683	Q8054	0.000281
1906	Q7187	1.000000
14572	Q20747295	0.000770
HITS auth
14572	Q20747295	0.013871
49434	Q18969817	0.013878
14571	Q26738515	0.013878
70327	Q20969150	0.013878
37064	Q18256813	0.013878
