In [1]:
import numpy as np
import pandas as pd
import json
import re
import MeCab
from collections import Counter

import shinra_util as util

  return f(*args, **kwds)
  return f(*args, **kwds)


In [2]:
%load_ext autoreload
%autoreload 2

In [138]:
def read_jasonl(filename):
    """Read a JSON Lines file and return its records as a list.

    Args:
        filename: Path of a file holding one JSON document per line.

    Returns:
        list: One decoded Python object per non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Decode as UTF-8 explicitly so the result does not depend on the
    # platform's locale encoding.
    with open(filename, encoding='utf-8') as f:
        # Iterate the file lazily instead of materialising it with readlines(),
        # and skip blank lines (e.g. a trailing empty line), which json.loads()
        # would otherwise reject.
        return [json.loads(line) for line in f if line.strip()]

In [5]:
def flatten(l):
    """Flatten one level of nesting, dropping items whose length is zero.

    Args:
        l: Iterable of iterables whose items support ``len()``.

    Returns:
        list: Every non-empty item of every sub-iterable, in order.
    """
    # Compare the length by value. `is not 0` tests object identity, which only
    # works through CPython's small-int caching and raises a SyntaxWarning on
    # Python 3.8+.
    return [item for sublist in l for item in sublist if len(item) != 0]
# Module-level MeCab tagger configured with "-Owakati" and the
# mecab-ipadic-neologd dictionary at the absolute path below.
# NOTE(review): get_noun_list() creates its own local `mecab_param`, so this
# global is not what that function uses; the dictionary path is machine-specific.
mecab_param = MeCab.Tagger("-Owakati -d /usr/local/lib/mecab/dic/mecab-ipadic-neologd")

In [3]:
# Load the NITE category list, keep only rows whose label is 1, and normalise
# the category names in a single chain.
NITE_existence_category = (
    pd.read_csv("../data/NITE_existence_list.csv")
    .loc[lambda df: df.label == 1]
    .reset_index(drop=True)
    .assign(
        # Names matching ".+の化合物" have every "の" removed; others are kept as-is.
        category=lambda df: df.category.apply(
            lambda x: re.sub(r'の', '', x) if re.match(r'.+の化合物', x) else x
        )
    )
)
NITE_existence_category.head()

Unnamed: 0,category,label
0,アルコール,1
1,アミン,1
2,フェノールエーテル,1
3,ケトン,1
4,無機化合物,1


In [3]:
# Load the training annotations; only the list stored under the 'entry' key is kept.
# The encoding is given explicitly so that the Japanese text is decoded the
# same way regardless of the platform's locale default.
with open("../data/compound_train.json", encoding="utf-8") as f:
    train = json.load(f)['entry']

In [5]:
# Gold annotations for the '種類' attribute, built by util.train2dict.
# Later cells look this mapping up by page `_id` and iterate each value as that
# page's collection of type strings.
# NOTE(review): the exact structure is defined in shinra_util — confirm there.
train_type_dict = util.train2dict(train, '種類')
# Disabled: unique list of all gold type strings.
#train_type_list = list(set(flatten(train_type_dict.values())))

In [8]:
# Load the Wikipedia (CirrusSearch) dump entries for compound pages.
# Note: this calls util.read_jasonl, not the read_jasonl defined in this notebook.
wiki_data = util.read_jasonl("../data/jawiki-cirrussearch-dump_of_Compound.jsonl")

In [9]:
# One row per wiki entry that has a gold annotation in train_type_dict.
# The document is the entry's opening text when that key exists and is not None,
# otherwise the entry's full text.
train_wiki_doc_df = (
    pd.DataFrame(
        [
            {
                "_id": entry['index']['_id'],
                "document": (
                    entry['text']
                    if entry.get('opening_text') is None
                    else entry['opening_text']
                ),
                "category": entry['category'],
            }
            for entry in wiki_data
        ],
        columns=["_id", "document", "category"],
    )
    .loc[lambda df: df._id.isin(train_type_dict.keys())]
    .reset_index(drop=True)
)

In [9]:
def _get_noun_tagger():
    """Return the MeCab tagger shared by get_noun_list, creating it on first use."""
    tagger = getattr(_get_noun_tagger, "_tagger", None)
    if tagger is None:
        tagger = MeCab.Tagger("-Ochasen -d /usr/local/lib/mecab/dic/mecab-ipadic-neologd")
        # Kept from the original: parse an empty string once before the first
        # parseToNode() call.
        # NOTE(review): presumably the usual workaround for unreadable
        # node.surface values in some binding versions — confirm it is still needed.
        tagger.parse("")
        _get_noun_tagger._tagger = tagger
    return tagger


# Return the list of nouns and compound nouns.
def get_noun_list(doc):
    """Extract nouns and compound nouns from ``doc``.

    The text is tokenised with MeCab. Each maximal run of consecutive tokens
    whose first feature field is '名詞' (noun) is concatenated into one string.
    Tokens with an empty surface are ignored and do not end a run.

    The tagger is created once and reused across calls. Building it on every
    call reloads the dictionary, which is costly when this function is applied
    to each row of a DataFrame.

    Args:
        doc: Text to analyse.

    Returns:
        list: Noun / compound-noun strings in order of appearance.
    """
    node = _get_noun_tagger().parseToNode(doc)

    noun_list = []
    noun = []  # surfaces of the noun run currently being collected
    while node:
        if node.surface:
            if node.feature.split(',')[0] == '名詞':
                noun.append(node.surface)
            elif noun:
                # A non-noun token closes the current run.
                noun_list.append(''.join(noun))
                noun = []
        node = node.next

    # Flush a run that reaches the end of the text.
    if noun:
        noun_list.append(''.join(noun))

    return noun_list

In [11]:
# Extract the words that match category names present in NITE:
# one row per training page, one boolean column per NITE category name, True when
# the name is in the list returned by util.get_noun_list for the page's document.

type_list = NITE_existence_category.category.tolist()

type_using_NITE_df = (
    train_wiki_doc_df
    .set_index('_id')
    .apply(
        lambda row: pd.Series(type_list, index=type_list).isin(
            util.get_noun_list(row.document)
        ),
        axis=1,
    )
)

In [12]:
# Extract the words that match the category names attached to each page:
# for every training page, flag which of its own categories appear in the list
# returned by util.get_noun_list for the page's document.
type_using_categories_df = (
    train_wiki_doc_df
    .set_index('_id')
    .apply(
        lambda x: pd.Series(x.category, index=x.category).isin(
            util.get_noun_list(x.document)
        ),
        axis=1,
    )
    # Each row only carries its own categories, so the combined frame holds NaN
    # in every other column; treat those as "not present".
    .fillna(False)
    # Force a real boolean dtype. Without it the columns can stay `object`
    # (pandas is dropping the implicit downcast in fillna), and the frame could
    # then not be used as a boolean mask by later cells.
    .astype(bool)
)

In [19]:
# For every page, collect the column labels whose flag is True.
type_using_NITE_list = (
    type_using_NITE_df
    .apply(lambda flags: flags.index.values[flags.values].tolist(), axis=1)
    .tolist()
)

type_using_categories_list = (
    type_using_categories_df
    .apply(lambda flags: flags.index.values[flags.values].tolist(), axis=1)
    .tolist()
)

# Disabled alternative: union of the NITE-based and page-category-based matches.
#extracted_type_list = [list(set(type1 + type2)) for type1, type2 in zip(type_using_NITE_list, type_using_categories_list)]
# Only the NITE-based matches are used (shallow copy).
extracted_type_list = list(type_using_NITE_list)

In [20]:
# Pair each page _id (the index of type_using_NITE_df) with its extracted types.
extracted_type_dict = dict(zip(type_using_NITE_df.index, extracted_type_list))

## Validation

In [21]:
# Collect one boolean per decision, pooled over all pages:
#   result_precision - for each extracted type, is it among the page's gold types?
#   result_recall    - for each gold type, was it extracted for the page?
result_precision = []
result_recall = []
for _id, types in extracted_type_dict.items():
    gold_types = train_type_dict[_id]
    result_precision.extend(_type in gold_types for _type in types)
    result_recall.extend(train_type in types for train_type in gold_types)

In [22]:
# Precision and recall are the fraction of True entries in their result lists.
precision = sum(result_precision) / len(result_precision)
recall = sum(result_recall) / len(result_recall)
# Harmonic mean of precision and recall.
f1 = (2 * precision * recall) / (precision + recall)

print(
    f"Precision:{precision}",
    f"Recall:{recall}",
    f"f1:{f1}",
    sep="\n",
)

Precision:0.4584487534626039
Recall:0.4384105960264901
f1:0.4482058226134055


## 出力

In [5]:
# Reload the full dump for the output stage (same file as in the validation
# part above), again through util.read_jasonl rather than the notebook-local function.
wiki_data = util.read_jasonl("../data/jawiki-cirrussearch-dump_of_Compound.jsonl")

In [6]:
# One row per wiki entry (no filtering by training ids here).
# The document is the entry's opening text when that key exists and is not None,
# otherwise the entry's full text.
all_wiki_doc_df = (
    pd.DataFrame(
        [
            {
                "_id": entry['index']['_id'],
                "document": (
                    entry['text']
                    if entry.get('opening_text') is None
                    else entry['opening_text']
                ),
                "category": entry['category'],
            }
            for entry in wiki_data
        ],
        columns=["_id", "document", "category"],
    )
    .reset_index(drop=True)
)

In [8]:
# Extract the words that match category names present in NITE:
# one row per page, one boolean column per NITE category name, True when the
# name is in the list returned by util.get_noun_list for the page's document.
type_list = NITE_existence_category.category.tolist()

type_using_NITE_df = (
    all_wiki_doc_df
    .set_index('_id')
    .apply(
        lambda row: pd.Series(type_list, index=type_list).isin(
            util.get_noun_list(row.document)
        ),
        axis=1,
    )
)

# Extract the words that match the category names attached to each page:
# for every page, flag which of its own categories appear in the list returned
# by util.get_noun_list for the page's document.
type_using_categories_df = (
    all_wiki_doc_df
    .set_index('_id')
    .apply(
        lambda x: pd.Series(x.category, index=x.category).isin(
            util.get_noun_list(x.document)
        ),
        axis=1,
    )
    # Each row only carries its own categories, so the combined frame holds NaN
    # in every other column; treat those as "not present".
    .fillna(False)
    # Force a real boolean dtype. Without it the columns can stay `object`
    # (pandas is dropping the implicit downcast in fillna), and the frame could
    # then not be used as a boolean mask further down.
    .astype(bool)
)

# For every page, collect the column labels whose flag is True.
type_using_NITE_list = (
    type_using_NITE_df
    .apply(lambda flags: flags.index.values[flags.values].tolist(), axis=1)
    .tolist()
)

type_using_categories_list = (
    type_using_categories_df
    .apply(lambda flags: flags.index.values[flags.values].tolist(), axis=1)
    .tolist()
)

# Only the NITE-based matches are used (shallow copy).
extracted_type_list = list(type_using_NITE_list)

# Pair each page _id (the index of type_using_NITE_df) with its extracted types.
extracted_type_dict = dict(zip(type_using_NITE_df.index, extracted_type_list))

In [10]:
# Number of pages for which a result entry was produced.
len(extracted_type_dict)

5820

In [11]:
# Serialise the {_id: extracted types} mapping as JSON into the output file.
with open("../output/type.json", 'w') as out_file:
    out_file.write(json.dumps(extracted_type_dict))