Merge pull request #73 from h-peng17/master
Use anchor link instead of alias to get wikidata item id.
zzy14 committed Apr 15, 2021
2 parents d4f455e + fad6332 commit 9a4ab4a
Showing 5 changed files with 109 additions and 5 deletions.
14 changes: 12 additions & 2 deletions README.md
@@ -18,8 +18,8 @@ Run the following command to create training instances.
```shell
# Download Wikidump
wget https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2
-# Download alise
-wget -c https://cloud.tsinghua.edu.cn/f/a519318708df4dc8a853/?dl=1 -O alias_entity.txt
+# Download anchor2id
+wget -c https://cloud.tsinghua.edu.cn/f/1c956ed796cb4d788646/?dl=1 -O anchor2id.txt
# WikiExtractor
python3 pretrain_data/WikiExtractor.py enwiki-latest-pages-articles.xml.bz2 -o pretrain_data/output -l --min_text_length 100 --filter_disambig_pages -it abbr,b,big --processes 4
# Modify anchors with 4 processes
@@ -32,6 +32,16 @@ Run the following command to create training instances.
python3 code/merge.py
```

If you want to build `anchor2id.txt` yourself, run the following commands after `python3 pretrain_data/extract.py 4` (this takes about half a day):
```shell
# Extract anchors
python3 pretrain_data/utils.py get_anchors
# Query the MediaWiki API with each anchor link to get its wikibase item id; for more details, see https://en.wikipedia.org/w/api.php?action=help (a single-anchor sketch follows this block).
python3 pretrain_data/create_anchors.py 256
# Aggregate anchors
python3 pretrain_data/utils.py agg_anchors
```
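Under the hood, each anchor link is resolved with a single `pageprops` query against the MediaWiki API. Below is a minimal single-anchor sketch of that lookup, not the exact batched script; the helper name and the example title are assumptions for illustration.

```python
import requests

def anchor_to_wikidata_id(title):
    """Resolve a Wikipedia title/anchor link to its Wikidata item id (e.g. 'Q937')."""
    params = {
        "action": "query",
        "prop": "pageprops",
        "ppprop": "wikibase_item",
        "redirects": 1,
        "titles": title,
        "format": "json",
    }
    resp = requests.get("https://en.wikipedia.org/w/api.php", params=params)
    pages = resp.json()["query"]["pages"]
    page = next(iter(pages.values()))
    # Pages without a linked Wikidata item carry no "pageprops" key.
    return page.get("pageprops", {}).get("wikibase_item", "#UNK#")

# Illustrative usage: anchor_to_wikidata_id("Albert Einstein") -> "Q937"
```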

Run the following command to pretrain:

43 changes: 43 additions & 0 deletions pretrain_data/create_anchor.py
@@ -0,0 +1,43 @@
import requests
import sys
import json
import os
import math
from multiprocessing import Pool
from tqdm import tqdm


anchors = json.load(open("pretrain_data/all_anchors_name.json"))
part = int(math.ceil(len(anchors) / 256.))  # IMPORTANT: must be consistent with the number of worker processes
anchors = [anchors[i:i+part] for i in range(0, len(anchors), part)]
print(len(anchors))

def run_proc(idx, n, input_names):
    # Each worker appends "anchor<TAB>wikibase_item" lines to its own file.
    folder = "pretrain_data/anchor"
    target = "{}/{}".format(folder, idx)
    fout = open(target + "_anchor2id", "a+")
    for input_name in tqdm(input_names):
        try:
            entity_url = f"https://en.wikipedia.org/w/api.php?action=query&prop=pageprops&ppprop=wikibase_item&redirects=1&titles={input_name}&format=json"
            entity_info = requests.get(entity_url)
            id = list(entity_info.json()['query']['pages'].items())[0][1]['pageprops']['wikibase_item']
        except Exception:
            # Request failure, missing page, or page without a linked Wikidata item.
            id = '#UNK#'
        fout.write(f"{input_name}\t{id}\n")
    fout.close()

folder = "pretrain_data/anchor"
if not os.path.exists(folder):
    os.makedirs(folder)

n = int(sys.argv[1])
p = Pool(n)
for i in range(n):
    p.apply_async(run_proc, args=(i, n, anchors[i]))
p.close()
p.join()
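For reference, the nested indexing in `run_proc` above unpacks a JSON response of roughly the following shape; the page id, title, and item id below are illustrative values, not taken from a live request.

```python
# Illustrative pageprops response; values are example data.
example_response = {
    "query": {
        "pages": {
            "736": {
                "pageid": 736,
                "ns": 0,
                "title": "Albert Einstein",
                "pageprops": {"wikibase_item": "Q937"},
            }
        }
    }
}

# Same unpacking as in run_proc:
page = list(example_response["query"]["pages"].items())[0][1]
assert page["pageprops"]["wikibase_item"] == "Q937"
```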



2 changes: 1 addition & 1 deletion pretrain_data/create_ids.py
@@ -24,7 +24,7 @@

# load entity dict
d_ent = {}
with open("alias_entity.txt", "r") as fin:
with open("anchor2id.txt", "r") as fin:
for line in fin:
v = line.strip().split("\t")
if len(v) != 2:
4 changes: 2 additions & 2 deletions pretrain_data/extract.py
@@ -36,14 +36,14 @@ def run_proc(idx, n, file_list):
content = "".join(content[1:])

try:
lookup = [(x.get_text().strip(), parse.unquote(x.get('href')))
lookup = [(x.get_text().strip(), x.get('href'))
for x in doc.find_all("a")]
lookup = "[_end_]".join(
["[_map_]".join(x) for x in lookup])
fout.write(content+"[_end_]"+lookup+"\n")
except Exception as e:
logging.warning(
'Error {} when parsing file {}'.format(str(e), input_name))
fout.write(content+"[_end_]"+lookup+"\n")
logging.info('Finished {}'.format(target))
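For context on the one-line change above: `parse.unquote` percent-decodes the href, so dropping it keeps the anchor link exactly as it appears in the extracted HTML. A tiny illustration (the example href is hypothetical):

```python
from urllib import parse

href = "Albert_Einstein%27s_brain"   # raw anchor link as it might appear in the dump
print(parse.unquote(href))           # removed behaviour: "Albert_Einstein's_brain"
print(href)                          # new behaviour: the percent-encoded link is kept as-is
```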


51 changes: 51 additions & 0 deletions pretrain_data/utils.py
@@ -0,0 +1,51 @@
import os
import sys
import json
from tqdm import trange

def get_all_anchors_name():
    # Collect every unique anchor link produced by pretrain_data/extract.py.
    input_folder = "pretrain_data/ann"
    file_list = []
    for path, _, filenames in os.walk(input_folder):
        for filename in filenames:
            file_list.append(os.path.join(path, filename))

    anchors = {}
    for i in trange(len(file_list)):
        input_name = file_list[i]
        fin = open(input_name, "r")
        for doc in fin.readlines():
            doc = doc.strip()
            segs = doc.split("[_end_]")
            map_segs = segs[1:]
            for x in map_segs:
                v = x.split("[_map_]")
                if len(v) != 2:
                    continue
                if anchors.get(v[1], -1) != -1:
                    continue
                anchors[v[1]] = 1
        fin.close()
    print(len(anchors))
    json.dump(list(anchors.keys()), open("pretrain_data/all_anchors_name.json", 'w'))

def aggregate_anchor2id():
    # Concatenate the per-worker files written by create_anchor.py into anchor2id.txt.
    fout = open("anchor2id.txt", 'w')
    files = os.listdir("pretrain_data/anchor")
    for file in files:
        f = open(os.path.join("pretrain_data/anchor", file))
        fout.write(f.read())
        f.close()
    fout.close()


if __name__ == "__main__":
    mode = sys.argv[1]
    if mode == 'get_anchors':
        get_all_anchors_name()
    elif mode == 'agg_anchors':
        aggregate_anchor2id()
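The aggregated `anchor2id.txt` is a two-column, tab-separated file: anchor link, then its Wikidata item id (or `#UNK#` when the lookup failed), which is what the loader in `create_ids.py` above expects. A minimal sketch of reading it back into a dict (the file name comes from the scripts above; the rest is illustrative):

```python
# Minimal sketch: load anchor2id.txt, skipping malformed lines and failed lookups.
anchor2id = {}
with open("anchor2id.txt") as fin:
    for line in fin:
        fields = line.rstrip("\n").split("\t")
        if len(fields) != 2 or fields[1] == "#UNK#":
            continue
        anchor2id[fields[0]] = fields[1]
print(len(anchor2id))
```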

