# Extracting commonsense knowledge from Wikidata

## Define files

In [14]:
from tqdm import tqdm
import pandas as pd
import pickle as pkl

In [83]:
# Wikidata dump to process; the path configuration cell handles '2017', '2018' and '2020'.
year='2018'

In [84]:
# Resolve the input files and the scratch directory for the selected Wikidata dump.
if year=='2020':
    data_dir='../input/wikidata-20200504'
    edge_file='%s/wikidata_edges_20200504.tsv' % data_dir
    node_file='%s/wikidata_nodes_20200504_clean.tsv' % data_dir
    tmp_dir='../tmp/wikidata20200504'
elif year=='2017':
    data_dir='../input/wikidata-20171227'
    edge_file='%s/wikidata20171227-all-edges.tsv' % data_dir
    node_file='%s/wikidata20171227-all-nodes.tsv' % data_dir
    tmp_dir='../tmp/wikidata20171227'
elif year=='2018':
    data_dir='../input/wikidata-20181210'
    edge_file='%s/wikidata-20181210-all-edges.tsv' % data_dir
    node_file='%s/wikidata-20181210-all-nodes.tsv' % data_dir
    tmp_dir='../tmp/wikidata20181210'
else:
    # Fail right here with a clear message. Without this branch an unknown year either raises a
    # bare NameError on tmp_dir below or, in a kernel that already ran this cell, silently keeps
    # the paths of the previously selected dump.
    raise ValueError("Unsupported year %r; expected '2017', '2018' or '2020'" % (year,))


# Intermediate and output files, all placed in the dump-specific scratch directory.
trimmed_edge_file='%s/edges_trimmed.tsv' % tmp_dir
concept_file='%s/concepts.tsv' % tmp_dir
concepts_edge_file='%s/concept_edges.tsv' % tmp_dir
compact_concepts_edge_file='%s/compact_concept_edges.tsv' % tmp_dir
concept_edges_with_labels_file='%s/compact_concept_edges_with_labels.tsv' % tmp_dir
usage_file='%s/usage.tsv' % tmp_dir
ready_file='%s/wikidata_cs.tsv' % tmp_dir
nopro_file='%s/noprotein.tsv' % tmp_dir
stats_file='%s/stats.tsv' % tmp_dir
rels_file='%s/rels.tsv' % tmp_dir
# The node blacklist lives outside the per-dump directory, so the same file is used for every year.
black_nodes_file='../tmp/blacklist.pkl'

In [7]:
%%bash -s "$tmp_dir"
# Create the scratch directory (and any missing parents).
# $1 is quoted so a path containing spaces or glob characters stays a single argument.
mkdir -p "$1"

## Step I: Filter out named entities

### Extract concept nodes

In [9]:
# Collect the ids of all nodes whose label starts with a lowercase letter; these are treated as
# concepts (as opposed to named entities) in the rest of the notebook.
concepts=set()
with open(node_file, 'r') as f:
    header=next(f, None)  # skip the column header line (None if the file is empty)
    for line in tqdm(f, total=84000000):
        data=line.split('\t')
        # A line without a tab has no label column at all.
        if len(data)<2: continue
        # The slice drops the first character and the last four.
        # NOTE(review): this assumes labels of the form 'text'@en (two-letter language tag) — confirm.
        label=data[1].strip()[1:-4]
        # Short values such as ''@en are empty after slicing; indexing label[0] on them would raise.
        if not label: continue
        if label[0].islower():
            concepts.add(data[0])

 63%|██████▎   | 53004762/84000000 [01:09<00:40, 763962.97it/s]


In [10]:
# Number of node ids collected as concepts.
len(concepts)

946945

In [11]:
# Persist the concept ids as a single-column TSV: an 'id' header followed by one id per line.
with open(concept_file, 'w') as out:
    out.write('id\n')
    out.writelines('%s\n' % concept_id for concept_id in concepts)

### Only keep edges with relevant nodes and trim columns

In [12]:
# Columns to drop from the concept edge file. Exported as an environment variable so that the
# `kgtk remove_columns` shell cell can read it as "$ignore_cols".
%env ignore_cols=rank
# Broader column list, currently disabled:
#%env ignore_cols=id,rank,node2;magnitude,node2;unit,node2;item,node2;lower,node2;upper,node2;entity-type,node2;longitude,node2;latitude,node2;date,node2;calendar,node2;precision

env: ignore_cols=rank


In [13]:
%%bash -s "$edge_file" "$concept_file" "$concepts_edge_file"
# $1 = edge file, $2 = concept id file, $3 = output file.
# Two chained kgtk ifexists filters against the concept ids; the second one matches on node2.
# NOTE(review): the first filter relies on kgtk's default input keys — presumably node1; confirm.
# Positional parameters are quoted so paths with spaces or glob characters are passed intact.
kgtk ifexists "$1" --filter-on "$2" / ifexists --filter-on "$2" --input-keys node2 > "$3"

In [14]:
%%bash -s "$concepts_edge_file" "$trimmed_edge_file"
# $1 = concept edge file, $2 = trimmed output file.
# Drops the columns named in the ignore_cols environment variable (set with %env in the notebook).
# Positional parameters are quoted so paths with spaces or glob characters are passed intact.
kgtk remove_columns -i "$1" -c "$ignore_cols" > "$2"

### Deduplicate

In [15]:
# Load the trimmed concept edges.
# NOTE(review): despite the "Deduplicate" heading, nothing is deduplicated here; in the visible
# notebook this frame is only used to report a row count.
df=pd.read_csv(trimmed_edge_file, sep='\t')

In [16]:
# Row count of the trimmed edge file.
len(df)

2065872

### Add labels

In [17]:
%%bash -s "$trimmed_edge_file" "$node_file" "$concept_edges_with_labels_file"
# $1 = trimmed edge file, $2 = node file used as the label source, $3 = output file.
# kgtk lift is asked to lift the node1, node2 and label columns, taking label values from the
# node file's "label" column.
# Positional parameters are quoted so paths with spaces or glob characters are passed intact.
kgtk --debug lift --verbose \
     --input-file "$1" \
     --label-file "$2" \
     --output-file "$3" \
     --columns-to-lift node1 node2 label \
     --prefilter-labels \
     --label-value-column label \
     --expert

Opening the input file: tmp/wikidata20181210/edges_trimmed.tsv
KgtkReader: File_path.suffix: .tsv
KgtkReader: reading file tmp/wikidata20181210/edges_trimmed.tsv
header: id	node1	label	node2
node1 column found, this is a KGTK edge file
KgtkReader: Special columns: node1=1 label=2 node2=3 id=0
KgtkReader: Reading an edge file.
Opening the label file: input/wikidata-20181210/wikidata-20181210-all-nodes.tsv
KgtkReader: File_path.suffix: .tsv
KgtkReader: reading file input/wikidata-20181210/wikidata-20181210-all-nodes.tsv
header: id	label	type	description	alias
node1 column not found, assuming this is a KGTK node file
KgtkReader: Special columns: node1=-1 label=1 node2=-1 id=0
KgtkReader: Reading an node file.
Lifting with in-memory buffering.
Reading input data to prefilter the labels.
Loading input rows without labels from tmp/wikidata20181210/edges_trimmed.tsv
Labels needed: 800378
Loading labels from the label file.
The label file is a node file, defaulting to the ID column for the mat

## Step II: Filter by usage

In [85]:
from wordfreq import word_frequency

In [86]:
# Minimum wordfreq English frequency that both node labels of an edge must reach for it to be kept.
threshold=1e-06

In [87]:
# Keep only edges whose two node labels contain no uppercase character and whose English word
# frequency (wordfreq) is at least `threshold`. Also record, for every kept edge, a
# "relation label (relation id)" string in `rels`.
filtered_rows=[]
rels=[]
malformed_rows=0
with open(concept_edges_with_labels_file, 'r') as f:
    header=next(f).strip().split('\t')
    # With more than six columns, the leading column is dropped from the header.
    if len(header)>6:
        header=header[1:]
    for line in tqdm(f, total=3500000):
        # Remove only the line terminator. A bare strip() also eats trailing tabs, i.e. empty
        # trailing label columns, which shortens the row and breaks the positional indexing below.
        a_row=line.rstrip('\r\n').split('\t')
        # NOTE(review): rows are trimmed by year while the header is trimmed by column count;
        # the two conditions agree for the 2018 dump shown here — confirm for the other dumps.
        if year in ['2017', '2018']:
            a_row=a_row[1:]
        # Rows lacking any of the six expected columns cannot be processed; count and skip them.
        if len(a_row)<6:
            malformed_rows+=1
            continue

        # The slice drops the first character and the last four.
        # NOTE(review): assumes labels of the form 'text'@en — confirm.
        node1_label=a_row[3][1:-4]
        node2_label=a_row[4][1:-4]
        # An edge with a missing label cannot pass the frequency test; skip it explicitly.
        if not node1_label or not node2_label: continue
        if (any(x.isupper() for x in node1_label) or any(x.isupper() for x in node2_label)): continue
        if word_frequency(node1_label, 'en')<threshold: continue
        if word_frequency(node2_label, 'en')<threshold: continue
        filtered_rows.append(a_row)
        rels.append('%s (%s)' % (a_row[5].strip()[1:-4], a_row[1]))

# Surface skipped input instead of dropping it silently.
if malformed_rows:
    print('Skipped %d rows with fewer than 6 columns' % malformed_rows)

 59%|█████▉    | 2065872/3500000 [00:16<00:11, 128933.96it/s]


In [88]:
# Number of edges that passed the uppercase and word-frequency filters.
len(filtered_rows)

160387

In [89]:
# Column names that go with the entries of filtered_rows.
header

['node1', 'label', 'node2', 'node1;label', 'node2;label', 'label;label']

In [90]:
from collections import Counter

In [91]:
# Count how many kept edges use each relation; keys are the "label (id)" strings built in rels.
dist_rels=Counter(rels)

In [92]:
# Number of distinct relations among the kept edges.
len(dist_rels)

349

In [93]:
# Write the (up to) 500 most frequent relations as "relation<TAB>count" lines and keep the
# total number of edges they cover in `s`.
top_rels=dist_rels.most_common(500)
s=sum(count for _rel, count in top_rels)
with open(rels_file, 'w') as out:
    out.writelines('%s\t%s\n' % pair for pair in top_rels)


In [94]:
# Edges covered by the relations written out; equals len(filtered_rows) as long as there are
# at most 500 distinct relations.
s

160387

In [95]:
# Print the first kept edge that uses the relation id in `wanted`, if there is one.
wanted='P689'
first_match=next((candidate for candidate in filtered_rows if candidate[1]==wanted), None)
if first_match is not None:
    print(first_match)

['Q3055380', 'P689', 'Q9639', "'intestinal disease'@en", "'intestine'@en", "'afflicts'@en"]


In [96]:
# Tabular form of the usage-filtered edges, one column per header entry.
df1=pd.DataFrame(filtered_rows, columns=header)

In [97]:
# Row count of the usage-filtered frame.
len(df1)

160387

In [98]:
# Save the usage-filtered edges as a tab-separated file without the pandas index.
df1.to_csv(usage_file, index=False, sep='\t', columns=header)

## Step III: Map properties

In [99]:
from mapping import fw_mapping, bw_mapping

In [100]:
import mapping

In [101]:
# Re-execute mapping.py so that edits to it are picked up without restarting the kernel.
# reload() does not refresh names bound earlier with `from mapping import ...`; code that reads
# them through the module (mapping.fw_mapping, mapping.bw_mapping) sees the reloaded values.
import importlib
importlib.reload(mapping)

<module 'mapping' from '/Users/filipilievski/mcs/cskg/wikidata/mapping.py'>

In [102]:
# Keep the edges whose relation id appears in the forward or backward mapping. For the 2020
# dump, the endpoints of every other edge are collected into the node blacklist.
mapped_fw=set()
mapped_bw=set()
rows=[]
collect_blacklist=(year=='2020')
if collect_blacklist:
    blacklisted_nodes=set()
for row in filtered_rows:
    rel=row[1]
    if rel in mapping.fw_mapping or rel in mapping.bw_mapping:
        rows.append(row)
        continue
    if collect_blacklist:
        blacklisted_nodes.update((row[0], row[2]))

In [103]:
# The node blacklist is built only when year is '2020' (endpoints of edges whose relation has no
# mapping) and is reused as-is for the other years, so a 2020 run must have written the file first.
if year=='2020':
    with open(black_nodes_file, 'wb') as w:
        pkl.dump(blacklisted_nodes, w)
else:
    # NOTE(review): pickle.load can execute arbitrary code; only load a blacklist file that was
    # produced by this notebook.
    with open(black_nodes_file, 'rb') as f:
        blacklisted_nodes=pkl.load(f)

In [104]:
# Number of edges whose relation has a forward or backward mapping.
len(rows)

146216

In [105]:
# Size of the node blacklist.
len(blacklisted_nodes)

25219

In [106]:
# Keep only the mapped edges where neither endpoint is on the node blacklist.
the_rows=[
    edge for edge in rows
    if edge[0] not in blacklisted_nodes and edge[2] not in blacklisted_nodes
]

In [107]:
# Number of mapped edges left after removing those that touch a blacklisted node.
len(the_rows)

66188

In [108]:
# Tabular form of the mapped, non-blacklisted edges.
df2=pd.DataFrame(the_rows, columns=header)

In [109]:
# Remove rows that are exact duplicates across all columns; df2 is modified in place.
df2.drop_duplicates(inplace=True)

In [110]:
# Save the final edge set as a tab-separated file without the pandas index.
df2.to_csv(ready_file, index=False, sep='\t', columns=header)

## Protein analysis

In [79]:
# Count how often Q8054 / Q7187 occur in the final edge file: csub counts lines where one of them
# is in the first column, cobj counts the remaining lines where one of them is in the third.
# Lines with fewer than three tab-separated fields are printed and not counted.
# NOTE(review): presumably these ids are the protein and gene items — confirm.
csub=0
cobj=0
target_ids=('Q8054', 'Q7187')
with open(ready_file, 'r') as f:
    for line in f:
        data=line.strip().split('\t')
        if len(data)<3:
            print(data)
        elif data[0] in target_ids:
            csub+=1
        elif data[2] in target_ids:
            cobj+=1
#            elif data[0]=='Q11173' or data[2]=='Q11173':
#                print(data)
#                input('c')

In [80]:
# Subject-side and object-side counts from the cell that scans the final edge file.
print(csub, cobj)

0 0


## Step IV: Compute statistics

In [111]:
%%bash -s "$ready_file" "$stats_file"
# $1 = final edge file, $2 = statistics output file.
# The run log goes to summary.txt in the current working directory.
# Positional parameters are quoted so paths with spaces or glob characters are passed intact.
kgtk graph_statistics --directed --degrees --pagerank --hits --log summary.txt -i "$1" > "$2"

In [112]:
%%bash
# Show the log written by kgtk graph_statistics (--log summary.txt).
cat summary.txt

loading the TSV graph now ...
graph loaded! It has 47056 nodes and 66163 edges

###Top relations:
P279	30186
P31	15570
P361	2595
P527	2327
P1889	2011
P155	1928
P156	1926
P461	1530
P1269	931
P366	814

###Degrees:
in degree stats: mean=1.406048, std=0.035338, max=1
out degree stats: mean=1.406048, std=0.004493, max=1
total degree stats: mean=2.812096, std=0.036066, max=1

###PageRank
Max pageranks
1378	Q3249551	0.007343
201	Q386724	0.008876
1261	Q5962346	0.007850
1226	Q16889133	0.009857
6706	Q16686448	0.010196

###HITS
HITS hubs
38867	Q23927052	0.002626
10912	Q591041	0.002672
1373	Q13442814	0.999466
9475	Q737498	0.002695
6294	Q61476	0.031202
HITS auth
30127	Q33947320	0.037269
44180	Q37893165	0.037269
39578	Q43916273	0.037318
30759	Q44687559	0.037269
15665	Q18918145	0.072281
