In [1]:
import numpy as np
import pandas as pd
import MeCab
import json
import re
from collections import Counter

import shinra_util as util

  return f(*args, **kwds)
  return f(*args, **kwds)


In [2]:
%load_ext autoreload
%autoreload 2

In [141]:
# 名詞・複合名詞のリストを返す
def get_noun_list(doc, join=True):
    mecab_param = MeCab.Tagger("-Ochasen -d /usr/local/lib/mecab/dic/mecab-ipadic-neologd")
    mecab_param.parse("")
    node = mecab_param.parseToNode(doc)
    
    noun_list = []
    noun = []
    while node:
        if len(node.surface) == 0:
            node = node.next
            continue

        hinshi = node.feature.split(',')
        if hinshi[0] in ['名詞', '接頭詞']:
            if join:
                noun.append(node.surface)
            else:
                noun_list.append(node.surface)
        elif (len(noun) > 0) and join:            
            noun_list.append(''.join(noun))
            noun = []
        
        node = node.next
    
    if (len(noun) > 0) and join:
        noun_list.append(''.join(noun))

    return noun_list

In [4]:
def read_jasonl(filename):
    with open(filename) as f:
        return [json.loads(line.rstrip('\r\n')) for line in f.readlines()]

In [2]:
flatten = lambda l: [item for sublist in l for item in sublist if len(item) is not 0]

In [4]:
wiki_data = util.read_jasonl("../data/jawiki-cirrussearch-dump_of_Compound.jsonl")

In [7]:
with open("../data/compound_train.json") as f:
    train = json.load(f)['entry']
    
train_char_dict = dict([[str(entry['WikipediaID']), entry['Attributes']['特性']] for entry in train])

In [210]:
train_char_dict = util.train2dict(train, '特性')

In [140]:
train_wiki_doc_df = \
pd.DataFrame({
    "_id": [entry['index']['_id'] for entry in wiki_data]
    , "document": [entry['opening_text'] if ('opening_text' in entry.keys() and entry['opening_text'] != None) else entry['text'] for entry in wiki_data]
    , "auxiliary_text": [' '.join(text['auxiliary_text']) for text in wiki_data]
}).pipe(
    lambda x: x.loc[x._id.isin(train_char_dict.keys())]
).reset_index(drop=True)

In [None]:
Counter(flatten(train_char_dict.values())).most_common()

In [3]:
def extract_characteristic(noun_list):
    result_char = [noun for noun in noun_list \
                   if (is_characteristic(noun) or is_color(noun) or is_status(noun) or is_smell(noun) or is_taste(noun))]
    
    return list(set(result_char))

def is_characteristic(word):
    patt = r'[^特|危険|男|女]+性$'
    return re.match(patt, word)

def is_color(word):
    patt = r'[^着|染]+色$'
    return re.match(patt, word)

def is_status(word):
    patt = r'(気体|液体|個体|結晶|粉末)$'
    return re.match(patt, word)

def is_smell(word):
    patt = r'\w+臭$'
    return re.match(patt, word)

def is_taste(word):
    patt = r'[^意]+味$'
    return re.match(patt, word)

In [211]:
train_wiki_doc_df["extracted_char"] = \
train_wiki_doc_df.apply(
    lambda x: extract_characteristic(util.get_noun_list(x.document)) + extract_characteristic(util.get_noun_list(x.auxiliary_text, join=False))
    , axis=1
)

In [212]:
predict_df = pd.DataFrame()
for _id, group in train_wiki_doc_df.groupby('_id'):
    if (not train_char_dict.get(_id)) or (len(group.extracted_char.values[0]) == 0):
        continue
    
    tmp_df = pd.DataFrame({"_id": _id, "extracted_char": group.extracted_char.values[0]})
    predict_df = predict_df.append(tmp_df)
    
extracted_dict = util.df2dict(predict_df, 'extracted_char')

In [213]:
util.validation(extracted_dict, train_char_dict)

{'precision': 0.8076923076923077,
 'recall': 0.43005181347150256,
 'f1': 0.5612622766060216}

## 出力

In [5]:
wiki_doc_df = \
pd.DataFrame({
    "_id": [entry['index']['_id'] for entry in wiki_data]
    , "document": [entry['opening_text'] if ('opening_text' in entry.keys() and entry['opening_text'] != None) else entry['text'] for entry in wiki_data]
    , "auxiliary_text": [' '.join(text['auxiliary_text']) for text in wiki_data]
}).reset_index(drop=True)

In [6]:
wiki_doc_df["extracted_char"] = \
wiki_doc_df.apply(
    lambda x: extract_characteristic(util.get_noun_list(x.document)) + extract_characteristic(util.get_noun_list(x.auxiliary_text, join=False))
    , axis=1
)

NameError: name 'train_char_dict' is not defined

In [11]:
predict_df = pd.DataFrame()
for _id, group in wiki_doc_df.groupby('_id'):
    tmp_df = pd.DataFrame({"_id": _id, "extracted_char": group.extracted_char.values[0]})
    predict_df = predict_df.append(tmp_df)
    
extracted_dict = util.df2dict(predict_df, 'extracted_char')

In [14]:
with open("../output/characteristic.json", 'w') as f:
    json.dump(extracted_dict, f)