# Extracting commonsense knowledge from Wikidata

## Define files

In [6]:
from tqdm import tqdm
import pandas as pd

In [1]:
# Selects which Wikidata dump the path configuration below points at;
# that configuration handles the values '2020', '2017' and '2018'.
year='2020'

In [9]:
# Dump-specific locations, keyed by the value of `year`:
# (input directory, edge file name, node file name, working directory).
dump_settings={
    '2020': ('input/wikidata-20200504',
             'wikidata_edges_20200504.tsv',
             'wikidata_nodes_20200504_clean.tsv',
             'tmp/wikidata20200504'),
    '2017': ('input/wikidata-20171227',
             'wikidata20171227-all-edges.tsv',
             'wikidata20171227-all-nodes.tsv',
             'tmp/wikidata20171227'),
    '2018': ('input/wikidata-20181210',
             'wikidata-20181210-all-edges.tsv',
             'wikidata-20181210-all-nodes.tsv',
             'tmp/wikidata20181210'),
}

if year not in dump_settings:
    # Fail here with a clear message: otherwise an unknown year leaves the
    # paths undefined (or stale from a previous run of this cell) and the
    # problem only surfaces later as a NameError or as silently wrong inputs.
    raise ValueError('Unsupported year %r; expected one of: %s'
                     % (year, ', '.join(sorted(dump_settings))))

data_dir, edge_name, node_name, tmp_dir=dump_settings[year]
edge_file='%s/%s' % (data_dir, edge_name)
node_file='%s/%s' % (data_dir, node_name)

# Intermediate and final files, all placed in the working directory.
trimmed_edge_file='%s/edges_trimmed.tsv' % tmp_dir
concept_file='%s/concepts.tsv' % tmp_dir
concepts_edge_file='%s/concept_edges.tsv' % tmp_dir
compact_concepts_edge_file='%s/compact_concept_edges.tsv' % tmp_dir
concept_edges_with_labels_file='%s/compact_concept_edges_with_labels.tsv' % tmp_dir
usage_file='%s/usage.tsv' % tmp_dir
ready_file='%s/wikidata_cs.tsv' % tmp_dir
stats_file='%s/stats.tsv' % tmp_dir
rels_file='%s/rels.tsv' % tmp_dir

In [7]:
%%bash -s "$tmp_dir"
# $1: working directory for intermediate files. Quoted so that a path with
# spaces or glob characters is passed to mkdir as a single argument.
mkdir -p "$1"

## Step I: Filter out named entities

### Extract concept nodes

In [9]:
# Collect the ids of all nodes whose label starts with a lowercase letter.
# Capitalisation of the label is the heuristic this step uses to tell
# concepts apart from named entities.
concepts=set()
with open(node_file, 'r', encoding='utf-8') as f:
    header=next(f)  # skip the column header line
    # total is only an approximate row count used to scale the progress bar
    for line in tqdm(f, total=84000000):
        data=line.split('\t')
        if len(data)<2: continue  # malformed row without a label column
        # Labels are expected in the form 'text'@en: drop the opening quote
        # and the 4-character "'@en" suffix.
        label=data[1].strip()[1:-4]
        # label[:1] is '' when nothing is left after slicing, and
        # ''.islower() is False, so empty labels are skipped instead of
        # raising IndexError as label[0] would.
        if label[:1].islower():
            concepts.add(data[0])

 63%|██████▎   | 53004762/84000000 [01:09<00:40, 763962.97it/s]


In [10]:
# Number of node ids kept as concepts.
len(concepts)

946945

In [11]:
# Persist the concept ids as a one-column file with an 'id' header row.
with open(concept_file, 'w') as out:
    out.write('id\n')
    out.writelines('%s\n' % concept_id for concept_id in concepts)

### Only keep edges with relevant nodes and trim columns

In [12]:
# Columns to remove from the edge file; exported as an environment variable
# so that the %%bash cell calling `kgtk remove_columns` can read it as $ignore_cols.
%env ignore_cols=rank
# Alternative, wider column list (currently disabled):
#%env ignore_cols=id,rank,node2;magnitude,node2;unit,node2;item,node2;lower,node2;upper,node2;entity-type,node2;longitude,node2;latitude,node2;date,node2;calendar,node2;precision

env: ignore_cols=rank


In [13]:
%%bash -s "$edge_file" "$concept_file" "$concepts_edge_file"
# $1: full edge file, $2: concept id file, $3: output edge file.
# Two chained ifexists filters against the concept ids: the second is keyed
# on node2; the first uses kgtk's default input key (presumably node1 --
# confirm against the installed kgtk version).
# Arguments are quoted so paths with spaces or glob characters stay intact.
kgtk ifexists "$1" --filter-on "$2" / ifexists --filter-on "$2" --input-keys node2 > "$3"

In [14]:
%%bash -s "$concepts_edge_file" "$trimmed_edge_file"
# $1: concept-only edge file, $2: trimmed output file.
# $ignore_cols is read from the environment (set with %env in an earlier cell).
# Arguments are quoted so paths with spaces or glob characters stay intact.
kgtk remove_columns -i "$1" -c "$ignore_cols" > "$2"

### Deduplicate

In [15]:
# Load the trimmed edges so their number can be inspected.
# NOTE(review): despite the section title, no duplicate removal happens here,
# and the label-lifting step reads trimmed_edge_file directly -- confirm
# whether a deduplication step is missing.
df=pd.read_csv(trimmed_edge_file, sep='\t')

In [16]:
# Number of rows in the trimmed edge file.
len(df)

2065872

### Add labels

In [17]:
%%bash -s "$trimmed_edge_file" "$node_file" "$concept_edges_with_labels_file"
# $1: trimmed edge file, $2: node file used as the label source, $3: output file.
# Lifts labels for the node1, node2 and label columns of every edge.
# Arguments are quoted so paths with spaces or glob characters stay intact.
kgtk --debug lift --verbose \
     --input-file "$1" \
     --label-file "$2" \
     --output-file "$3" \
     --columns-to-lift node1 node2 label \
     --prefilter-labels \
     --label-value-column label \
     --expert

Opening the input file: tmp/wikidata20181210/edges_trimmed.tsv
KgtkReader: File_path.suffix: .tsv
KgtkReader: reading file tmp/wikidata20181210/edges_trimmed.tsv
header: id	node1	label	node2
node1 column found, this is a KGTK edge file
KgtkReader: Special columns: node1=1 label=2 node2=3 id=0
KgtkReader: Reading an edge file.
Opening the label file: input/wikidata-20181210/wikidata-20181210-all-nodes.tsv
KgtkReader: File_path.suffix: .tsv
KgtkReader: reading file input/wikidata-20181210/wikidata-20181210-all-nodes.tsv
header: id	label	type	description	alias
node1 column not found, assuming this is a KGTK node file
KgtkReader: Special columns: node1=-1 label=1 node2=-1 id=0
KgtkReader: Reading an node file.
Lifting with in-memory buffering.
Reading input data to prefilter the labels.
Loading input rows without labels from tmp/wikidata20181210/edges_trimmed.tsv
Labels needed: 800378
Loading labels from the label file.
The label file is a node file, defaulting to the ID column for the mat

## Step II: Filter by usage

In [3]:
from wordfreq import word_frequency

In [4]:
# Minimum English word frequency (as returned by wordfreq) that both node
# labels of an edge must reach for the edge to be kept.
threshold=1e-06

In [7]:
# Keep only the edges whose two node labels contain no uppercase character and
# are both frequent enough in English (wordfreq frequency >= threshold).
# For every kept edge, also record its relation as 'label (property id)'.
filtered_rows=[]
rels=[]
with open(concept_edges_with_labels_file, 'r', encoding='utf-8') as f:
    # Strip the line terminator first: otherwise the last column name keeps a
    # trailing newline, which then leaks into every file written from `header`.
    header=next(f).rstrip('\n').split('\t')
    # More than 6 columns means there is an extra leading column (the 2018
    # run shows an `id` column in front of node1/label/node2). Decide this
    # once from the header so that header and rows are always trimmed
    # together, instead of trimming rows based on `year`.
    drop_first_column=len(header)>6
    if drop_first_column:
        header=header[1:]
    for line in tqdm(f, total=3500000):
        # Remove only the newline: a full strip() would also eat leading or
        # trailing tabs and shift the columns when an edge field is empty.
        a_row=line.rstrip('\n').split('\t')
        if drop_first_column:
            a_row=a_row[1:]

        # Lifted labels are expected in the form 'text'@en: drop the opening
        # quote and the 4-character "'@en" suffix.
        node1_label=a_row[3][1:-4]
        node2_label=a_row[4][1:-4]
        if any(x.isupper() for x in node1_label) or any(x.isupper() for x in node2_label): continue
        if word_frequency(node1_label, 'en')<threshold: continue
        if word_frequency(node2_label, 'en')<threshold: continue
        filtered_rows.append(a_row)
        rel='%s (%s)' % (a_row[5].strip()[1:-4], a_row[1])
        rels.append(rel)

 96%|█████████▋| 3376431/3500000 [00:24<00:00, 136861.82it/s]


In [8]:
# Number of edges that passed the usage filter.
len(filtered_rows)

420822

In [11]:
from collections import Counter

In [12]:
# Count how many kept edges use each relation; keys are the
# 'label (property id)' strings collected during the usage filter.
dist_rels=Counter(rels)

In [13]:
# Number of distinct relations among the kept edges.
len(dist_rels)

414

In [14]:
# Write the 500 most frequent relations as "<relation>\t<count>" lines and
# keep the sum of the written counts in `s`.
top_rels=dist_rels.most_common(500)
with open(rels_file, 'w') as out:
    out.writelines('%s\t%s\n' % rel_and_count for rel_and_count in top_rels)
s=sum(rel_and_count[1] for rel_and_count in top_rels)

In [15]:
# Sum of the counts written to the relations file; it matches
# len(filtered_rows) when no relation was cut off by the most_common limit.
s

420822

In [16]:
# Print the first kept edge that uses the property id in `wanted`, if any.
wanted='P689'
first_match=next((edge for edge in filtered_rows if edge[1]==wanted), None)
if first_match is not None:
    print(first_match)

['Q2359404', 'P689', 'Q2191986', "'fall in older adults'@en", "'elderly'@en", "'afflicts'@en"]


In [17]:
# Tabular view of the kept edges; the column names come from the header line
# of the labelled edge file.
df1=pd.DataFrame(filtered_rows, columns=header)

In [18]:
# Number of rows in the usage-filtered frame.
len(df1)

420822

In [19]:
# Write the usage-filtered edges as a tab-separated file, without the index column.
df1.to_csv(usage_file, index=False, sep='\t', columns=header)

## Step III: Map properties

In [20]:
from mapping import fw_mapping, bw_mapping

In [21]:
import mapping

In [34]:
import importlib
# Re-execute the mapping module so that edits to it take effect without
# restarting the kernel.
importlib.reload(mapping)

<module 'mapping' from '/Users/filipilievski/mcs/cskg/mapping.py'>

In [37]:
# Property ids singled out for exclusion.
# NOTE(review): no active code in this notebook reads this list -- confirm
# whether it is still needed.
blacklisted_rels=['P681', 'P2548', 'P680', 'P682', 'P816']

In [61]:
# Keep only the edges whose property id has a forward or a backward mapping
# defined in the mapping module. Collecting the nodes touched by
# blacklisted relations is a disabled experiment and is not performed.
mapped_fw=set()
mapped_bw=set()
rows=[
    edge for edge in filtered_rows
    if edge[1] in mapping.fw_mapping.keys() or edge[1] in mapping.bw_mapping.keys()
]

In [62]:
# Tabular view of the edges that have a mapped property; same columns as the
# usage-filtered frame.
df2=pd.DataFrame(rows, columns=header)

In [63]:
# Write the final edge set as a tab-separated file, without the index column.
df2.to_csv(ready_file, index=False, sep='\t', columns=header)

## Step IV: Compute statistics

In [64]:
%%bash -s "$ready_file" "$stats_file"
# $1: final edge file, $2: statistics output file.
# The textual summary is logged to summary.txt in the current working directory.
# Arguments are quoted so paths with spaces or glob characters stay intact.
kgtk graph_statistics --directed --degrees --pagerank --hits --log summary.txt -i "$1" > "$2"

In [65]:
%%bash
# Show the summary log that the graph_statistics step wrote via --log.
cat summary.txt

loading the TSV graph now ...
graph loaded! It has 193193 nodes and 387932 edges

###Top relations:
P279	172535
P31	141499
P361	9118
P1889	7767
P527	6252
P1269	4792
P366	3045
P461	3028
P1963	2382
P1659	2344

###Degrees:
in degree stats: mean=2.008002, std=0.894863, max=1
out degree stats: mean=2.008002, std=0.002790, max=1
total degree stats: mean=4.016005, std=0.894949, max=1

###PageRank
Max pageranks
463	Q5058355	0.007770
1058	Q3249551	0.010063
95	Q2996394	0.016572
2070	Q8054	0.069876
98	Q11862829	0.008882

###HITS
HITS hubs
8680	Q7187	0.000015
1248	Q84467700	0.000076
11767	Q2449730	0.000314
1247	Q423042	0.002960
2070	Q8054	0.999996
HITS auth
83896	Q62640253	0.003417
106987	Q62627511	0.003417
68085	Q56709542	0.003417
71373	Q62631641	0.003417
107201	Q62652568	0.003417
