In [3]:
import os

# Report which conda environment this kernel runs in. CONDA_DEFAULT_ENV is only
# present when a conda environment is active, so use .get() with a placeholder
# rather than indexing, which would raise KeyError and abort a Run All.
print(os.environ.get('CONDA_DEFAULT_ENV', '<no conda environment active>'))

bootleg


In [1]:
import numpy as np
from tqdm import tqdm
import os
import pickle
import re
from pathlib import Path
from collections import defaultdict
import ujson

Load the alias map that we want to filter.

In [4]:
from bootleg.symbols.entity_profile import EntityProfile
# Root of the training data; the cached entity profile is read from its "entity_db" subfolder.
root_dir = Path("/opt/data/cchang/bootleg_train/data")
entity_db_dir = root_dir.joinpath("entity_db")
entity_dump = EntityProfile.load_from_cache(load_dir=entity_db_dir)

In [5]:
# Alias map the filter operates on (alias -> QID candidates, per the accessor name).
# NOTE: this reaches into the profile's private `_entity_symbols` attribute.
entity_symbols = entity_dump._entity_symbols
curr_aliases = entity_symbols.get_alias2qids_dict()

Load the count files for all of Wikipedia. These were computed with `compute_statistics.py` (in `utils/preprocessing`) over the merged data file of test, dev, and train.

In [11]:
# Both count files are read inside `with` blocks so the file handles are closed
# as soon as parsing finishes, and with an explicit UTF-8 encoding so non-ASCII
# aliases decode the same way regardless of the machine's locale.

# number of times alias phrase occurs in the text across ALL of wikipedia
with open(root_dir / 'stats/alias_text_counts.json', encoding='utf-8') as in_f:
    alias_text_counts = ujson.load(in_f)

# number of times alias occurs as an alias across ALL of wikipedia
with open(root_dir / 'stats/alias_counts.json', encoding='utf-8') as in_f:
    alias_counts = ujson.load(in_f)

Simple function to find aliases to remove based on the count files above.

In [12]:
def get_norm_value(alias, verbose=False):
    """
    Ratio of how often ``alias`` is used as an alias to how often it appears in text.

    Reads the module-level ``alias_counts`` and ``alias_text_counts`` dicts.

    Args:
        alias: alias string to look up.
        verbose: when True, print both raw counts (0 for a missing entry) first.

    Returns:
        ``alias_counts`` value (0 if missing) divided by the ``alias_text_counts``
        value, or -1 when ``alias`` has no entry in ``alias_text_counts``.
    """
    times_as_alias = alias_counts.get(alias, 0)
    if verbose:
        print('# times occurs as alias:', times_as_alias)
        print('# times occurs in text:', alias_text_counts.get(alias, 0))
    # No text count means the ratio is undefined; -1 is the sentinel for that case.
    if alias not in alias_text_counts:
        return -1
    return times_as_alias / alias_text_counts[alias]


def get_aliases_to_remove(curr_aliases, keep_wikidata=False, norm_threshold=0.017, min_seen=500, min_alias_count=10000):
    """
    Pick out aliases that are frequent phrases in text but rarely (or never) used
    as aliases, either through rarity or mislabeling (e.g. band "themselves").

    Counts are read from the module-level ``alias_counts`` and ``alias_text_counts``.

    Args:
        curr_aliases: mapping from alias to its candidates. Only the keys and the
            number of candidates per alias are used.
        keep_wikidata: if True, an alias is never removed merely for being absent
            from ``alias_counts``.
        norm_threshold: only aliases whose ``get_norm_value`` is below this value
            are considered by the ratio filter.
        min_seen: text-occurrence cutoff. Aliases absent from ``alias_counts`` are
            left alone when seen in text fewer than this many times; the ratio
            filter only removes aliases seen in text more than this many times.
        min_alias_count: the ratio filter only removes aliases whose alias count
            is below this value.

    Returns:
        ``(aliases_to_remove, cnts, grps)``: the set of aliases to drop, a
        defaultdict of per-bucket counts and a defaultdict of per-bucket alias
        lists. Buckets are "not_in_wikipedia", "removed_filter",
        "grt_min_alias_cnt" and "lt_min_seen".
    """
    to_drop = set()
    bucket_sizes = defaultdict(int)
    bucket_members = defaultdict(list)

    def _tally(bucket, name):
        # Count and list are always updated together for a bucket.
        bucket_sizes[bucket] += 1
        bucket_members[bucket].append(name)

    for name in tqdm(curr_aliases):
        in_text = name in alias_text_counts

        # Case 1: the alias is in our alias map but never labeled as an alias.
        if name not in alias_counts:
            # Too few text occurrences to make a decision.
            too_rare = in_text and alias_text_counts[name] < min_seen
            # A single candidate indicates a fairly unique alias, so keep it.
            if too_rare or len(curr_aliases[name]) == 1 or keep_wikidata:
                continue
            to_drop.add(name)
            _tally("not_in_wikipedia", name)
            continue

        # Case 2: labeled as an alias but never counted in the text (max alias
        # length and weak labels can cause this); nothing to compare against.
        if not in_text:
            continue

        # Case 3: only aliases that are uncommon relative to their text frequency
        # go through the ratio filter.
        if not (get_norm_value(name) < norm_threshold):
            continue

        if not (alias_text_counts[name] > min_seen):
            # Not a common enough phrase in text to justify removal.
            _tally("lt_min_seen", name)
        elif alias_counts[name] < min_alias_count:
            to_drop.add(name)
            _tally("removed_filter", name)
        else:
            # Low ratio, but still used as an alias often in absolute terms.
            _tally("grt_min_alias_cnt", name)

    return to_drop, bucket_sizes, bucket_members

In [13]:
# Run the filter with its default thresholds, then report the per-bucket counts
# and how many aliases will be dropped.
aliases_to_remove, cnts, grps = get_aliases_to_remove(curr_aliases)
bucket_summary = ujson.dumps(cnts, indent=4)
print(bucket_summary)
num_removed, num_total = len(aliases_to_remove), len(curr_aliases)
print(f"Will remove {num_removed} out of {num_total}")

100%|███████████████████████████████████████████████████████████████████████████████████████████████| 2326245/2326245 [00:01<00:00, 1405098.30it/s]

{
    "not_in_wikipedia": 24852,
    "lt_min_seen": 26,
    "removed_filter": 14
}
Will remove 24866 out of 2326245





Sanity checks on the filter step. 

In [23]:
# sample what aliases are getting removed
num_to_sample = 50
# Sort before sampling: set iteration order for strings is not stable across
# kernel restarts, so a fixed seed alone would not reproduce the same sample.
removal_pool = sorted(aliases_to_remove)
sample_rng = np.random.default_rng(0)
# Draw without replacement (np.random.choice defaults to replace=True, which can
# print the same alias more than once) and cap the size so that a pool smaller
# than num_to_sample, or an empty one, does not raise.
sample_size = min(num_to_sample, len(removal_pool))
if sample_size:
    for alias in sample_rng.choice(removal_pool, size=sample_size, replace=False):
        print(alias)

三墩站
下多布林卡农村居民点
巴尔卡尼乡
岭南四大园林
王氏大宗祠
七十三
善夫
墨西哥卷
雷涛
第25届fism世界魔术大会
新化站
阿史那 (突厥)
乌戈尔语
六十
加拉西米夫卡
圣克雷阿克
vaio fs
微毛诃子
葛麻姆
乐民河
黑皮油松
哥姆巴尼甘杰乌帕齐拉
aich
煤炭龟
沃夫昌西克
帕拉图
美国领导的对叙利亚的军事干预
刺齿复叶耳蕨
超铁暴龙
解放军东北军需学校
印度驻中华民国大使列表
与立
舞鹤东交流道
mondi
psp slim lite
朱家相
gba机模拟器
nokia internet tablet
新达尼利夫卡
欲海奇女子
烫衣机
詹姆斯·金
马蹄决明
罗吉兹纳
双结节神螺
马徐妍
蒙莫尔
中国主席
布兰维尔
库佩瓦哈


In [27]:
# check for existence of certain words in aliases_to_remove
# Each entry pairs an alias with whether the filter is expected to have flagged it.
sanity_checks = [
    ('the', True),
    ('你好', False),
    ('七十三', True),
    ('我', False),
]
for term, expected in sanity_checks:
    flagged = term in aliases_to_remove
    # On failure the message shows: alias, expected membership, actual membership.
    assert flagged is expected, f'{term} {expected} {flagged}'

Remove aliases and save new candidate mapping.

In [28]:
# Load the same "entity_db" profile a second time, now with edit_mode=True.
print("Loading edit mode, may take some minutes")
entity_db_path = root_dir / "entity_db"
entity_dump_edit = EntityProfile.load_from_cache(load_dir=entity_db_path, edit_mode=True)

Loading edit mode, may take some minutes


In [29]:
# Detach every flagged alias from each of its candidate QIDs.
for mention in tqdm(aliases_to_remove):
    # Snapshot the candidates with list() before removing; presumably
    # remove_mention edits the collection get_qid_cands returns.
    candidate_qids = list(entity_dump_edit.get_qid_cands(mention))
    for candidate in candidate_qids:
        entity_dump_edit.remove_mention(candidate, mention)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████| 24866/24866 [00:00<00:00, 74148.39it/s]


In [30]:
# Save the edited profile to a sibling "entity_db_filt" directory rather than
# back into "entity_db".
new_dir = root_dir.joinpath('entity_db_filt')
entity_dump_edit.save(new_dir)

Creating trie:   0%|          | 0/2301379 [00:00<?, ?it/s]

There were 1.4773750868501017e-05% of items that lost information because max_connections was too small.


Creating trie:   0%|          | 0/1252201 [00:00<?, ?it/s]

There were 0.08978430779084189% of items that lost information because max_connections was too small.


Prepping trie data:   0%|          | 0/1252201 [00:00<?, ?it/s]

Creating trie with 1252201 values. This can take a few minutes.
There were 0.000967097135364051% of items that lost information because max_connections was too small.
