Merge pull request #73 from h-peng17/master
Use anchor link instead of alias to get wikidata item id.
zzy14 committed Apr 15, 2021
2 parents d4f455e + fad6332 commit 9a4ab4a
Showing 5 changed files with 109 additions and 5 deletions.
14 changes: 12 additions & 2 deletions README.md
@@ -18,8 +18,8 @@ Run the following command to create training instances.
```shell
# Download Wikidump
wget https://dumps.wikimedia.org/enwiki/latest/enwiki-latest-pages-articles.xml.bz2
-# Download alise
-wget -c https://cloud.tsinghua.edu.cn/f/a519318708df4dc8a853/?dl=1 -O alias_entity.txt
+# Download anchor2id
+wget -c https://cloud.tsinghua.edu.cn/f/1c956ed796cb4d788646/?dl=1 -O anchor2id.txt
# WikiExtractor
python3 pretrain_data/WikiExtractor.py enwiki-latest-pages-articles.xml.bz2 -o pretrain_data/output -l --min_text_length 100 --filter_disambig_pages -it abbr,b,big --processes 4
# Modify anchors with 4 processes
@@ -32,6 +32,16 @@ Run the following command to create training instances.
python3 code/merge.py
```

If you want to build `anchor2id.txt` yourself, run the following commands after `python3 pretrain_data/extract.py 4` (this takes about half a day):
```shell
# Extract anchors
python3 pretrain_data/utils.py get_anchors
# Query the MediaWiki API with each anchor link to get its wikibase item id; for more details, see https://en.wikipedia.org/w/api.php?action=help (a single-anchor sketch follows this block).
python3 pretrain_data/create_anchors.py 256
# Aggregate anchors
python3 pretrain_data/utils.py agg_anchors
```
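Under the hood, each anchor link is resolved with a single `pageprops` query against the MediaWiki API. Below is a minimal single-anchor sketch of that lookup, not the exact batched script; the helper name and the example title are assumptions for illustration.

```python
import requests

def anchor_to_wikidata_id(title):
    """Resolve a Wikipedia title/anchor link to its Wikidata item id (e.g. 'Q937')."""
    params = {
        "action": "query",
        "prop": "pageprops",
        "ppprop": "wikibase_item",
        "redirects": 1,
        "titles": title,
        "format": "json",
    }
    resp = requests.get("https://en.wikipedia.org/w/api.php", params=params)
    pages = resp.json()["query"]["pages"]
    page = next(iter(pages.values()))
    # Pages without a linked Wikidata item carry no "pageprops" key.
    return page.get("pageprops", {}).get("wikibase_item", "#UNK#")

# Illustrative usage: anchor_to_wikidata_id("Albert Einstein") -> "Q937"
```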

Run the following command to pretrain:

43 changes: 43 additions & 0 deletions pretrain_data/create_anchor.py
@@ -0,0 +1,43 @@
import requests
import sys
import json
import os
import math
from multiprocessing import Pool
from tqdm import tqdm


anchors = json.load(open("pretrain_data/all_anchors_name.json"))
part = int(math.ceil(len(anchors) / 256.))  # IMPORTANT: must be consistent with the number of worker processes
anchors = [anchors[i:i+part] for i in range(0, len(anchors), part)]
print(len(anchors))

def run_proc(idx, n, input_names):
    # Each worker appends "anchor<TAB>wikibase_item" lines to its own file.
    folder = "pretrain_data/anchor"
    target = "{}/{}".format(folder, idx)
    fout = open(target + "_anchor2id", "a+")
    for input_name in tqdm(input_names):
        try:
            entity_url = f"https://en.wikipedia.org/w/api.php?action=query&prop=pageprops&ppprop=wikibase_item&redirects=1&titles={input_name}&format=json"
            entity_info = requests.get(entity_url)
            id = list(entity_info.json()['query']['pages'].items())[0][1]['pageprops']['wikibase_item']
        except Exception:
            # Request failure, missing page, or page without a linked Wikidata item.
            id = '#UNK#'
        fout.write(f"{input_name}\t{id}\n")
    fout.close()

folder = "pretrain_data/anchor"
if not os.path.exists(folder):
    os.makedirs(folder)

n = int(sys.argv[1])
p = Pool(n)
for i in range(n):
    p.apply_async(run_proc, args=(i, n, anchors[i]))
p.close()
p.join()
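For reference, the nested indexing in `run_proc` above unpacks a JSON response of roughly the following shape; the page id, title, and item id below are illustrative values, not taken from a live request.

```python
# Illustrative pageprops response; values are example data.
example_response = {
    "query": {
        "pages": {
            "736": {
                "pageid": 736,
                "ns": 0,
                "title": "Albert Einstein",
                "pageprops": {"wikibase_item": "Q937"},
            }
        }
    }
}

# Same unpacking as in run_proc:
page = list(example_response["query"]["pages"].items())[0][1]
assert page["pageprops"]["wikibase_item"] == "Q937"
```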



2 changes: 1 addition & 1 deletion pretrain_data/create_ids.py
@@ -24,7 +24,7 @@

# load entity dict
d_ent = {}
with open("alias_entity.txt", "r") as fin:
with open("anchor2id.txt", "r") as fin:
for line in fin:
v = line.strip().split("\t")
if len(v) != 2:
4 changes: 2 additions & 2 deletions pretrain_data/extract.py
@@ -36,14 +36,14 @@ def run_proc(idx, n, file_list):
content = "".join(content[1:])

try:
lookup = [(x.get_text().strip(), parse.unquote(x.get('href')))
lookup = [(x.get_text().strip(), x.get('href'))
for x in doc.find_all("a")]
lookup = "[_end_]".join(
["[_map_]".join(x) for x in lookup])
fout.write(content+"[_end_]"+lookup+"\n")
except Exception as e:
logging.warning(
'Error {} when parsing file {}'.format(str(e), input_name))
fout.write(content+"[_end_]"+lookup+"\n")
logging.info('Finished {}'.format(target))
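For context on the one-line change above: `parse.unquote` percent-decodes the href, so dropping it keeps the anchor link exactly as it appears in the extracted HTML. A tiny illustration (the example href is hypothetical):

```python
from urllib import parse

href = "Albert_Einstein%27s_brain"   # raw anchor link as it might appear in the dump
print(parse.unquote(href))           # removed behaviour: "Albert_Einstein's_brain"
print(href)                          # new behaviour: the percent-encoded link is kept as-is
```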


51 changes: 51 additions & 0 deletions pretrain_data/utils.py
@@ -0,0 +1,51 @@
import os
import sys
import json
from tqdm import trange

def get_all_anchors_name():
    # Collect every unique anchor link produced by pretrain_data/extract.py.
    input_folder = "pretrain_data/ann"
    file_list = []
    for path, _, filenames in os.walk(input_folder):
        for filename in filenames:
            file_list.append(os.path.join(path, filename))

    anchors = {}
    for i in trange(len(file_list)):
        input_name = file_list[i]
        fin = open(input_name, "r")
        for doc in fin.readlines():
            doc = doc.strip()
            segs = doc.split("[_end_]")
            map_segs = segs[1:]
            for x in map_segs:
                v = x.split("[_map_]")
                if len(v) != 2:
                    continue
                if anchors.get(v[1], -1) != -1:
                    continue
                anchors[v[1]] = 1
        fin.close()
    print(len(anchors))
    json.dump(list(anchors.keys()), open("pretrain_data/all_anchors_name.json", 'w'))

def aggregate_anchor2id():
    # Concatenate the per-worker files written by create_anchor.py into anchor2id.txt.
    fout = open("anchor2id.txt", 'w')
    files = os.listdir("pretrain_data/anchor")
    for file in files:
        f = open(os.path.join("pretrain_data/anchor", file))
        fout.write(f.read())
        f.close()
    fout.close()


if __name__ == "__main__":
    mode = sys.argv[1]
    if mode == 'get_anchors':
        get_all_anchors_name()
    elif mode == 'agg_anchors':
        aggregate_anchor2id()
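The aggregated `anchor2id.txt` is a two-column, tab-separated file: anchor link, then its Wikidata item id (or `#UNK#` when the lookup failed), which is what the loader in `create_ids.py` above expects. A minimal sketch of reading it back into a dict (the file name comes from the scripts above; the rest is illustrative):

```python
# Minimal sketch: load anchor2id.txt, skipping malformed lines and failed lookups.
anchor2id = {}
with open("anchor2id.txt") as fin:
    for line in fin:
        fields = line.rstrip("\n").split("\t")
        if len(fields) != 2 or fields[1] == "#UNK#":
            continue
        anchor2id[fields[0]] = fields[1]
print(len(anchor2id))
```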

