In [99]:
import pandas as pd
import numpy as np
import os
import requests
import re

In [299]:
# Directory that holds the NER model files together with its test-set outputs.
data_dir = '../bert_ner/model_test/'
# List the directory to confirm that test_predictions.txt is present.
os.listdir(data_dir)

['test_predictions.txt',
 'special_tokens_map.json',
 'test_results.txt',
 'added_tokens.json',
 'pytorch_model.bin',
 'vocab.txt',
 'tokenizer_config.json',
 'eval_results.txt',
 'config.json',
 'training_args.bin']

In [300]:
# Load the token-level predictions: each line is a space-separated pair that is read
# into the columns 形態素 (token) and IOB2 (predicted tag).
# NOTE(review): default NA parsing and quoting stay enabled, so a literal token such as
# "NA" or a lone double quote would presumably be misread - confirm the file has none.
df_test_result = pd.read_csv(os.path.join(data_dir,"test_predictions.txt"), sep=' ', names=['形態素', 'IOB2'])
df_test_result

Unnamed: 0,形態素,IOB2
0,[NULL],O
1,平成,O
2,三十,O
3,年,O
4,七,O
...,...,...
209857,時,O
209858,五十,O
209859,九,O
209860,分,O


In [301]:
# Concatenate every tag into one string so the tag sequence can be searched with
# string/regex operations (position i of the string corresponds to row i as long as
# each tag is a single character).
IOB2_string = ''.join(df_test_result['IOB2'].tolist())

In [302]:
# Repair tag sequences on the one-character-per-token string.
# Lookarounds are used instead of chained str.replace: str.replace consumes the
# characters it matches, so overlapping cases were missed ('BBB' became 'BIB' and
# 'OBOBO' became 'OOOBO'), and a lone tag at the very start/end was never matched.
# Both substitutions keep the string length unchanged.
# 1) A 'B' directly after another 'B' continues the same mention -> 'I'.
IOB2_string = re.sub(r'(?<=B)B', 'I', IOB2_string)
# 2) A 'B'/'I' with no tagged neighbour on either side is a single-token mention -> 'O'.
IOB2_string = re.sub(r'(?<![BI])[BI](?![BI])', 'O', IOB2_string)

In [303]:
# A run of 'I' tags that directly follows an 'O' has no opening 'B'. Promote the first
# 'I' of each such run to 'B'. The offset of the preceding 'O' and the tag that gets
# overwritten are printed for inspection.
IOB2_list = list(IOB2_string)
matches = re.finditer(r'O[I]+', IOB2_string)
for m in matches:
    first_inside = m.start() + 1
    print(m.start(), IOB2_list[first_inside])
    IOB2_list[first_inside] = 'B'

12332 I
14532 I
24477 I
72163 I


In [304]:
# Spot check: offset 12332 was printed by the repair loop, so the tag right after it
# should now read 'B'.
IOB2_list[12332+1]

'B'

In [305]:
# Write the repaired tags back into the frame. A Series built from a list gets a
# 0..n-1 index and assignment aligns on the index, so this relies on df_test_result
# having the default integer index.
df_test_result['IOB2'] = pd.Series(IOB2_list)

In [306]:
# Flag tokens that are either written entirely in hiragana or listed as punctuation /
# placeholder tokens; the flags are used below to trim such tokens off mention edges.
p = re.compile('[\u3041-\u309F]+')  # one or more hiragana characters
special_tokens = ['、', '。', '　', ' ', '・', '[SPACE]', '[BAR]', '[NULL]', '[LBAR]', '[SEP]']
USELESS_or_not = [
    (p.fullmatch(w) is not None) or (w in special_tokens)
    for w in df_test_result['形態素'].tolist()
]

In [307]:
# Number of tokens flagged as useless (True counts as 1).
sum(USELESS_or_not)

135525

In [308]:
# Rows where a mention begins (tag 'B') on a token flagged as useless.
USELESS_B = df_test_result.loc[df_test_result['IOB2'].eq('B') & USELESS_or_not]


In [309]:
# Drop a useless leading token from each mention and move the 'B' tag one token forward.
# - `.loc[row, col]` writes into the frame itself; the previous chained form
#   `df[col][row] = ...` assigns through an intermediate object, which pandas warns
#   about and which is not guaranteed to update the frame.
# - The next row is only promoted when it exists and is an 'I' (i.e. really belongs to
#   the same mention); otherwise an unrelated 'O' token would be turned into a mention.
# Relies on df_test_result having the default 0..n-1 integer index (as `index + 1` did).
n_rows = len(df_test_result)
for index in USELESS_B.index:
    df_test_result.loc[index, 'IOB2'] = 'O'
    next_index = index + 1
    if next_index < n_rows and df_test_result.loc[next_index, 'IOB2'] == 'I':
        df_test_result.loc[next_index, 'IOB2'] = 'B'

In [310]:
# Rows where a mention ends on a useless token: the row is tagged 'I', the following
# row is not 'I', and the token is flagged in USELESS_or_not.
# - Boolean Series are combined with `&`; multiplying them with `*` makes pandas emit
#   a warning and fall back to Python-space evaluation.
# - shift(-1) leaves the last row without a successor; it is filled with True because
#   a mention that runs to the final row does end there (False hid that case).
is_inside = df_test_result['IOB2'] == 'I'
next_is_not_inside = (~is_inside).shift(periods=-1, fill_value=True)
USELESS_E = df_test_result[is_inside & next_is_not_inside & USELESS_or_not]


  f"evaluating in Python space because the {repr(op_str)} "


In [311]:
# Remove the useless trailing token from each mention by resetting its tag to 'O'.
# A single `.loc` assignment writes into the frame itself; the previous per-row chained
# form `df[col][row] = ...` assigns through an intermediate object and is not
# guaranteed to update the frame.
df_test_result.loc[USELESS_E.index, 'IOB2'] = 'O'


In [312]:
# Build one entry per row: the full surface text of the mention the token belongs to
# (the same string repeated for every token of that mention), or NaN for tokens
# outside any mention.
mention_list = []
mention_len = 0     # number of tokens collected for the currently open mention
mention_name = ''   # concatenated token text of the currently open mention
for token, tag in zip(df_test_result['形態素'], df_test_result['IOB2']):
    if tag == 'I':
        # Extend the open mention (an 'I' with nothing open starts a new one).
        mention_len += 1
        mention_name += token
        continue
    # Any other tag closes the open mention: emit its text once per collected token.
    if mention_len > 0:
        mention_list.extend([mention_name] * mention_len)
        mention_len = 0
        mention_name = ''
    if tag == 'B':
        mention_len = 1
        mention_name = token
    else:
        # 'O' (or any unexpected value) is outside a mention; always emit one entry
        # so the list stays aligned with the frame.
        mention_list.append(np.nan)
# Flush a mention that is still open when the data ends; without this the list is
# shorter than the frame and the trailing tokens lose their mention text.
if mention_len > 0:
    mention_list.extend([mention_name] * mention_len)
    mention_len = 0
    mention_name = ''
assert len(mention_list) == len(df_test_result), "mention_list must have one entry per row"

In [313]:
# Attach the per-token mention text as a new column, then preview the first ten rows.
df_test_result['メンション'] = pd.Series(mention_list)
df_test_result.head(10)

Unnamed: 0,形態素,IOB2,メンション
0,[NULL],O,
1,平成,O,
2,三十,O,
3,年,O,
4,七,O,
5,月,O,
6,十,O,
7,八,O,
8,日,O,
9,（,O,


In [314]:
def judge_mention_by_end(mention):
    """Tell whether ``mention`` is a string that ends with one of the law-like suffixes.

    Args:
        mention: Candidate mention text. Anything whose type is not exactly ``str``
            (for example the NaN stored for tokens outside a mention) is rejected.

    Returns:
        True if ``mention`` is a ``str`` ending with one of the suffixes below,
        otherwise False.
    """
    law_suffixes = (
        '法',
        '法案',
        '法制',
        'に関する法律',
        'に関する法律案',
        'の一部を改正する法律案',
        '改正案',
        '法律',
    )
    # str.endswith accepts a tuple and is True when any of the suffixes matches.
    return type(mention) is str and mention.endswith(law_suffixes)

In [315]:
# Discard mentions whose text does not end with a law-like suffix: every token of such
# a mention is reset to tag 'O' and loses its mention text.
mention_list = list(df_test_result['メンション'].values)
fake_mention_or_not = [not judge_mention_by_end(mention) for mention in mention_list]
fake_mention = df_test_result[(df_test_result['IOB2']!='O') & fake_mention_or_not]

# `.loc` writes into the frame itself; the previous per-row chained form
# `df[col][row] = ...` assigns through an intermediate object and is not guaranteed
# to update the frame.
df_test_result.loc[fake_mention.index, 'IOB2'] = 'O'
df_test_result.loc[fake_mention.index, 'メンション'] = np.nan


In [316]:
# NOTE(review): this import sits in the middle of the notebook; consider moving it to
# the import cell at the top.
from wikipedia2vec import Wikipedia2Vec
# Pretrained embedding file used to resolve mention strings to Wikipedia entities.
# NOTE(review): the .pkl extension suggests a pickle-based format - presumably safe
# only for files from a trusted source; confirm.
pretrained_emb_path = '../pretrained_embedding/jawiki_20180420_300d.pkl'
wiki2vec = Wikipedia2Vec.load(pretrained_emb_path)

In [317]:
# Scratch check that the name 'リゾート法' resolves to an entity and what its title is.
# Only the last expression of a cell is displayed, so the `is None` result is discarded.
wiki2vec.get_entity('リゾート法') is None
wiki2vec.get_entity('リゾート法').title

'総合保養地域整備法'

In [318]:
def get_official_name(mention_name):
    """Resolve a mention string to the title of a Wikipedia entity.

    The mention is looked up as-is first. If that fails, one variant is tried:
    a trailing '案' is removed, or - when the mention ends with '法' - '案' is appended.

    Args:
        mention_name: Mention text to resolve.

    Returns:
        The ``title`` of the matching entity, or ``None`` if neither the mention nor
        its variant is known to ``wiki2vec``.
    """
    entity = wiki2vec.get_entity(mention_name)
    if entity is not None:
        return entity.title

    # Direct lookup failed: fall back to the bill/law spelling variant, if one applies.
    variant = None
    if mention_name.endswith('案'):
        variant = wiki2vec.get_entity(mention_name[:-1])
    elif mention_name.endswith('法'):
        variant = wiki2vec.get_entity(mention_name+'案')
    return None if variant is None else variant.title

In [319]:
# Scratch check: `[:-1]` drops the last character, the same slicing get_official_name
# applies to a mention that ends with '案'.
n = '総合保養地域整備法'
n[:-1]

'総合保養地域整備'

In [320]:
def get_url_from_title(title):
    """Look up the page URL of a Japanese Wikipedia article via the MediaWiki API.

    Sends one ``action=query`` request with ``prop=info`` and ``inprop=url`` to
    ja.wikipedia.org and reads ``fullurl`` from the first page in the response.

    Args:
        title: Article title to look up.

    Returns:
        The ``fullurl`` string of the first returned page, or ``None`` when the
        response contains no page or that page has no ``fullurl``.

    Raises:
        requests.HTTPError: if the API answers with an error status code.
        requests.RequestException: on connection failure or timeout.
    """
    URL = "https://ja.wikipedia.org/w/api.php"
    PARAMS = {
        "action": "query",
        "format": "json",
        "titles": title,
        "prop": "info",
        "inprop": "url"
    }
    # The context manager closes the session (and its pooled connections) on exit;
    # the timeout keeps one stalled request from hanging the whole run.
    with requests.Session() as session:
        response = session.get(url=URL, params=PARAMS, timeout=30)
        response.raise_for_status()
        data = response.json()

    pages = data.get("query", {}).get("pages", {})
    # Only one title is requested, so the first page entry is the answer.
    for page in pages.values():
        return page.get("fullurl")
    return None

In [321]:
# Translation table from full-width ASCII variants (U+FF01..U+FF5E) to their
# half-width counterparts (U+0021..U+007E), e.g. 'ＩＲ' -> 'IR'.
ZEN = "".join(chr(0xff01 + i) for i in range(94))
HAN = "".join(chr(0x21 + i) for i in range(94))

ZEN2HAN = str.maketrans(ZEN, HAN)

# Resolve every mention to a Wikipedia title and URL.
# The mention text is repeated on each token of a mention and the same law is mentioned
# many times, so results are memoised per normalised mention: each distinct mention
# costs at most one entity lookup and one HTTP request instead of one per token row.
link_cache = {}  # normalised mention -> (title, url)
title_list = []
url_list = []
for index,item in df_test_result.iterrows():
    if index%10000==0:
        # Progress indicator.
        print(index, item['形態素'], item['IOB2'])
    mention_name = item['メンション']
    if not isinstance(mention_name, str):
        # Token outside any mention (NaN placeholder).
        title_list.append(np.nan)
        url_list.append(np.nan)
        continue
    normalized_name = mention_name.translate(ZEN2HAN)
    if normalized_name not in link_cache:
        official_name = get_official_name(normalized_name)
        if official_name is None:
            link_cache[normalized_name] = (np.nan, np.nan)
        else:
            page_url = get_url_from_title(official_name)
            # Keep the missing-value marker uniform when no URL could be obtained.
            link_cache[normalized_name] = (official_name, np.nan if page_url is None else page_url)
    linked_title, linked_url = link_cache[normalized_name]
    title_list.append(linked_title)
    url_list.append(linked_url)

df_test_result['wikipediaタイトル'] = pd.Series(title_list)
df_test_result['wikipediaページ'] = pd.Series(url_list)
df_test_result[:20]

0 [NULL] O
10000 三 O
20000 下げ O
30000 ませ O
40000 [SPACE] O
50000 に O
60000 なら O
70000 の O
80000 交渉 O
90000 もらい O
100000 、 O
110000 伺い O
120000 人材 O
130000 もらっ O
140000 て O
150000 し O
160000 、 O
170000 総務 O
180000 退席 O
190000 ます O
200000 、 O


Unnamed: 0,形態素,IOB2,メンション,wikipediaタイトル,wikipediaページ
0,[NULL],O,,,
1,平成,O,,,
2,三十,O,,,
3,年,O,,,
4,七,O,,,
5,月,O,,,
6,十,O,,,
7,八,O,,,
8,日,O,,,
9,（,O,,,


In [322]:
# Scratch check: resolve a mention written with half-width letters.
get_official_name('IR法案')

'特定複合観光施設区域の整備の推進に関する法律'

In [323]:
# Scratch check: the same mention written with full-width letters resolves once it has
# been normalised with ZEN2HAN.
get_official_name('ＩＲ法案'.translate(ZEN2HAN))

'特定複合観光施設区域の整備の推進に関する法律'

In [324]:
# Raise the row display limit so the listing below is not truncated, then show the
# first token (tag 'B') of every remaining mention together with its linking result.
pd.options.display.max_rows = 1000
df_test_result.loc[df_test_result['IOB2'].eq('B')]

Unnamed: 0,形態素,IOB2,メンション,wikipediaタイトル,wikipediaページ
53,健康,B,健康増進法の一部を改正する法律案,,
78,北方,B,北方領土問題等の解決の促進のための特別措置に関する法律の一部を改正する法律案,,
113,北方,B,北方地域旧漁業権者等に対する特別措置に関する法律の一部を改正する法律案,,
635,施設,B,施設区域整備法案,,
642,ＩＲ,B,ＩＲ整備法案,,
656,カジノ,B,カジノ法案,特定複合観光施設区域の整備の推進に関する法律,https://ja.wikipedia.org/wiki/%E7%89%B9%E5%AE%...
1184,カジノ,B,カジノ法案,特定複合観光施設区域の整備の推進に関する法律,https://ja.wikipedia.org/wiki/%E7%89%B9%E5%AE%...
1194,カジノ,B,カジノ法案,特定複合観光施設区域の整備の推進に関する法律,https://ja.wikipedia.org/wiki/%E7%89%B9%E5%AE%...
2045,ＩＲ,B,ＩＲ整備法,,
2093,ＩＲ,B,ＩＲ法案,特定複合観光施設区域の整備の推進に関する法律,https://ja.wikipedia.org/wiki/%E7%89%B9%E5%AE%...


In [325]:
# Row count of the prediction frame, for comparison with the answer sheet loaded below.
len(df_test_result)

209862

In [326]:
# Load the official answer-sheet TSV for the test set.
# NOTE(review): read with default NA/quote handling - presumably fine for this file,
# but a literal token such as "NA" would be parsed as missing; confirm.
df_test = pd.read_table('../data/PoliInfo2-EntityLinking-FormalRun-v20200703/AnswerSheet/PoliInfo2-EntityLinking-JA-Formal-Test.tsv')


In [327]:
# Row count of the answer sheet; it should equal the prediction frame's row count,
# because the next step copies a column across by row position/index.
len(df_test)

209862

In [328]:
# Replace the token column with the one from the answer sheet. Assignment aligns on the
# index, so both frames must share the same 0..n-1 index and row order.
df_test_result['形態素'] = df_test['形態素']

In [333]:
# Turn 'O' tags into missing values so that they are written as empty fields in the
# exported file; 'B' and 'I' are left as they are.
df_test_result['IOB2'] = df_test_result['IOB2'].replace('O', np.nan)

In [334]:
# Export all columns of the result frame as a tab-separated file, without the index.
df_test_result.to_csv('../submission/submission.tsv', sep='\t', index=False)