In [110]:
import numpy as np
import pandas as pd
import json
import re

In [179]:
def flatten(l):
    """Concatenate the items of every sub-iterable of ``l`` into a single flat list."""
    flat = []
    for sub_l in l:
        flat.extend(sub_l)
    return flat

def true_positive(true, predict):
    """Return the distinct predicted items that also occur in the gold items.

    Both arguments are converted to sets, so duplicates collapse and the order
    of the returned list is unspecified.
    """
    predicted = set(predict)
    gold = set(true)
    return list(predicted & gold)

def false_positive(true, predict):
    """Return the distinct predicted items that do not occur in the gold items.

    Both arguments are converted to sets, so duplicates collapse and the order
    of the returned list is unspecified.
    """
    predicted = set(predict)
    gold = set(true)
    return list(predicted - gold)

def get_FP_dict(result: dict):
    """Map every id in ``result`` to its title and its false-positive predictions.

    Args:
        result: Mapping ``_id -> entry`` where each entry provides the keys
            ``'title'``, ``'true'`` and ``'predict'``.

    Returns:
        dict: ``_id -> {'title': entry['title'], 'data': [...]}`` where ``data``
        holds the items of ``entry['predict']`` that are absent from ``entry['true']``.
    """
    return {
        _id: {'title': entry['title'], 'data': false_positive(entry['true'], entry['predict'])}
        for _id, entry in result.items()
    }

def get_TP_dict(result: dict):
    """Map every id in ``result`` to its title and its true-positive predictions.

    Args:
        result: Mapping ``_id -> entry`` where each entry provides the keys
            ``'title'``, ``'true'`` and ``'predict'``.

    Returns:
        dict: ``_id -> {'title': entry['title'], 'data': [...]}`` where ``data``
        holds the items of ``entry['predict']`` that also appear in ``entry['true']``.
    """
    return {
        _id: {'title': entry['title'], 'data': true_positive(entry['true'], entry['predict'])}
        for _id, entry in result.items()
    }

In [154]:
# Look-up table from the stringified 'WikipediaID' of each entry to its 'Name'.
with open("../../data/compound_train.json", 'r') as f:
    title_dict = dict(
        (str(entry['WikipediaID']), entry['Name'])
        for entry in json.load(f)['entry']
    )

In [376]:
# Training sentences, reduced to the columns needed for the join below.
train_df = pd.read_pickle("../../data/train_IOB_repl_compound.pkl")[['_id', 'title', 'sentence', 'repl_words']]

# Attach title / replaced tokens to the raw-material rows, then rebuild the
# replaced sentence by concatenating its tokens.
train_material_df = (
    pd.read_pickle("../../dump/train_raw-material_with_extracted.pkl")
    .merge(train_df, on=['_id', 'sentence'])
    .assign(repl_sentence=lambda df: df.repl_words.apply(''.join))
)
train_material_df.head()

Unnamed: 0,_id,sentence,extracted,title,repl_words,repl_sentence
0,10166,水に良く溶けるため、水溶液（アンモニア水）として使用されることも多く、化学工業では基礎的な窒...,[窒素],アンモニア,"[水, に, 良く, 溶ける, ため, 、, 水溶液, （, [compound], ）, ...",水に良く溶けるため、水溶液（[compound]）として使用されることも多く、化学工業では基...
1,10166,窒素原子上の孤立電子対のはたらきにより、金属錯体の配位子となり、その場合はアンミンと呼ばれる。,[窒素],アンモニア,"[窒素, 原子, 上, の, 孤立, 電子, 対, の, はたらき, により, 、, 金属,...",窒素原子上の孤立電子対のはたらきにより、金属錯体の配位子となり、その場合はアンミンと呼ばれる。
2,10166,アモンの塩が意味する化合物は食塩と尿から合成されていた塩化アンモニウムである。,[塩化アンモニウム],アンモニア,"[アモン, の, 塩, が, 意味, する, 化合, 物, は, [compound], と...",アモンの塩が意味する化合物は[compound]と尿から合成されていた[compound]である。
3,10166,アンモニア分子は窒素を中心とする四面体構造を取っており、各頂点には3つの水素原子と一対の孤立...,"[窒素, 水素]",アンモニア,"[[title-compound], 分子, は, 窒素, を, 中心, と, する, 四,...",[title-compound]分子は窒素を中心とする四面体構造を取っており、各頂点には3つ...
4,10166,塩化水素（塩酸）を近づけると塩化アンモニウム (NH4Cl) の白煙を生じる。,[塩化アンモニウム],アンモニア,"[[compound], （, [compound], ）, を, 近づける, と, [co...",[compound]（[compound]）を近づけると[compound]([compou...


In [377]:
# Test sentences with the replaced sentence rebuilt from its token list.
test_df = (
    pd.read_pickle("../../data/test_IOB_repl_compound.pkl")
    .loc[:, ['_id', 'title', 'sentence', 'repl_words']]
    .assign(repl_sentence=lambda df: df.repl_words.apply(''.join))
)

In [378]:
# Predictions of both runs, each joined with the test sentences on (_id, sentence).
predict_material_df = (
    pd.read_pickle("../../dump/pred_raw-material_with_tag_seq.pkl")
    .merge(test_df, on=['_id', 'sentence'])
)
predict_material_repl_df = (
    pd.read_pickle("../../dump/pred_raw-material_using_compound-list_with_tag_seq.pkl")
    .merge(test_df, on=['_id', 'sentence'])
)

In [379]:
# Result files of the baseline run and of the run that uses the compound list.
with open("../../output/result/raw-material.json", 'r') as f:
    result_materials = json.loads(f.read())

with open("../../output/result/raw-material_using_compound-list.json", 'r') as f:
    result_materials_repl = json.loads(f.read())

In [380]:
# False positives per id for the baseline run and for the compound-list run.
materials_fp_dict, materials_repl_fp_dict = (
    get_FP_dict(run_result) for run_result in (result_materials, result_materials_repl)
)

In [381]:
# Persist the false positives of both runs.
# ensure_ascii=False matches the other error-analysis dumps in this notebook and keeps
# non-ASCII text readable instead of \uXXXX escapes; the explicit UTF-8 encoding makes
# the written bytes independent of the platform's default encoding.
with open("../../output/error_analysis/raw-material_FP.json", 'w', encoding='utf-8') as f:
    json.dump(materials_fp_dict, f, ensure_ascii=False)

with open("../../output/error_analysis/raw-material_using_compound-list_FP.json", 'w', encoding='utf-8') as f:
    json.dump(materials_repl_fp_dict, f, ensure_ascii=False)

In [418]:
# True positives per id for the baseline run and for the compound-list run.
materials_tp_dict, materials_repl_tp_dict = (
    get_TP_dict(run_result) for run_result in (result_materials, result_materials_repl)
)

In [180]:
# Persist the true positives of both runs.
# ensure_ascii=False matches the other error-analysis dumps in this notebook and keeps
# non-ASCII text readable instead of \uXXXX escapes; the explicit UTF-8 encoding makes
# the written bytes independent of the platform's default encoding.
with open("../../output/error_analysis/raw-material_TP.json", 'w', encoding='utf-8') as f:
    json.dump(materials_tp_dict, f, ensure_ascii=False)

with open("../../output/error_analysis/raw-material_using_compound-list_TP.json", 'w', encoding='utf-8') as f:
    json.dump(materials_repl_tp_dict, f, ensure_ascii=False)

In [450]:
# Baseline false positives that are no longer false positives in the compound-list run.
materials_FP_diff = {
    _id: {
        'title': entry['title'],
        'data': list(
            set(materials_fp_dict[_id]['data']) - set(materials_repl_fp_dict[_id]['data'])
        ),
    }
    for _id, entry in result_materials.items()
}

In [445]:
# Persist the false-positive diff. The JSON is written unescaped (ensure_ascii=False),
# so the file encoding is pinned to UTF-8 rather than left to the platform default.
with open("../../output/error_analysis/raw-material_FP_diff.json", 'w', encoding='utf-8') as f:
    json.dump(materials_FP_diff, f, ensure_ascii=False)

In [451]:
# True positives of the compound-list run that the baseline run did not find.
materials_TP_diff = {
    _id: {
        'title': entry['title'],
        'data': list(
            set(materials_repl_tp_dict[_id]['data']) - set(materials_tp_dict[_id]['data'])
        ),
    }
    for _id, entry in result_materials.items()
}

In [453]:
# Persist the true-positive diff. The JSON is written unescaped (ensure_ascii=False),
# so the file encoding is pinned to UTF-8 rather than left to the platform default.
with open("../../output/error_analysis/raw-material_TP_diff.json", 'w', encoding='utf-8') as f:
    json.dump(materials_TP_diff, f, ensure_ascii=False)

In [414]:
# Anti-join on (_id, repl_sentence): keep the rows of the left frame that have no
# partner in the right frame.
# "Drop" variant (rows only in the baseline predictions) -- swap the operands:
#merge_df = pd.merge(predict_material_df, predict_material_repl_df, on=['_id', 'repl_sentence'], how='left')
# "Add" variant (rows only in the compound-list predictions) -- the one in use:
merge_df = predict_material_repl_df.merge(predict_material_df, on=['_id', 'repl_sentence'], how='left')
merge_df = (
    merge_df
    .loc[merge_df.sentence_y.isna(), ['_id', 'title_x', 'repl_sentence', 'extracted_x']]
    .rename(columns={'title_x': 'title', 'extracted_x': 'extracted'})
)

In [415]:
# Keywords picked from section subtitles that looked relevant.
# The trailing alternation uses a non-capturing group so that `Series.str.contains`
# does not warn about match groups in the pattern; the set of matched strings is unchanged.
patt = r'合成|製造|製法|反応|生成|得(?:る|られる)'

In [442]:
# Total number of rows, then the number whose replaced sentence matches `patt`.
print(merge_df.shape[0])
print(merge_df.loc[merge_df.repl_sentence.str.contains(patt), 'repl_sentence'].size)

99
32


  


In [399]:
# Distinct extracted items per _id, concatenated over all ids and counted.
extracted_per_id = merge_df.groupby('_id').extracted.apply(lambda x: list(set(x.sum())))
len(extracted_per_id.sum())

104

In [398]:
# Export the anti-join result (rows without a partner in the other prediction frame)
# as CSV, without the index column.
merge_df.to_csv("../../output/error_analysis/raw-material_diff_add_sentence.csv", index=False)

In [419]:
def with_sents(predict_df, extracted_dict):
    """Pair every listed item with the replaced sentences it was extracted from.

    Args:
        predict_df: DataFrame providing the columns ``_id``, ``repl_sentence`` and
            ``extracted``; each ``extracted`` cell must support the ``in`` operator.
        extracted_dict: Mapping ``_id -> {'title': ..., 'data': [item, ...]}``. It is
            indexed with every ``_id`` group key of ``predict_df``.

    Returns:
        dict: ``_id -> {'title': ..., 'data': [{'extracted': item, 'sentence': [...]}, ...]}``
        with one record per item, in the order of ``extracted_dict[_id]['data']``.
    """
    contain_sents_dict = {}
    for _id, rows in predict_df.groupby('_id'):
        target = extracted_dict[_id]
        records = []
        for item in target['data']:
            # NaN marks rows whose extraction list lacks the item; dropna() removes them.
            hits = rows.apply(
                lambda row: row.repl_sentence if item in row.extracted else np.nan,
                axis=1,
            )
            records.append({'extracted': item, 'sentence': hits.dropna().tolist()})
        contain_sents_dict[_id] = {'title': target['title'], 'data': records}

    return contain_sents_dict

In [459]:
def _sentence_frame(contain_sents):
    """Build a (title, sentence) frame holding each entry's distinct sentences.

    Args:
        contain_sents: Mapping as returned by ``with_sents``.

    Returns:
        pandas.DataFrame with the columns ``title`` and ``sentence``. Entries without
        any sentence are skipped; if nothing is left the frame is empty but still
        has both columns, so a later merge on ``sentence`` keeps working.
    """
    frames = []
    for entry in contain_sents.values():
        sents = list(set(flatten([extraction['sentence'] for extraction in entry['data']])))
        if not sents:
            continue
        frames.append(pd.DataFrame({'title': entry['title'], 'sentence': sents}))
    if not frames:
        return pd.DataFrame(columns=['title', 'sentence'])
    # One concat instead of DataFrame.append in a loop: append was removed in
    # pandas 2.0 and re-copied the accumulated frame on every iteration.
    return pd.concat(frames)


fp_sent_df = _sentence_frame(with_sents(predict_material_repl_df, materials_repl_fp_dict))
tp_sent_df = _sentence_frame(with_sents(predict_material_repl_df, materials_repl_tp_dict))

# Anti-join on the sentence text: keep the false-positive sentences that do not
# appear among the true-positive sentences.
sent_df = pd.merge(tp_sent_df, fp_sent_df, on='sentence', how='right')
sent_df = sent_df[sent_df.title_x.isna()]
sent_df = sent_df.drop(columns='title_x').rename(columns={'title_y': 'title'})[['title', 'sentence']]

In [461]:
# Export the (title, sentence) rows kept in sent_df as CSV, without the index column.
sent_df.to_csv("../../output/error_analysis/raw-material_using_compound-list_FP_sentence.csv", index=False)

In [440]:
# Total number of rows, then the number whose sentence matches `patt`.
print(sent_df.shape[0])
print(sent_df.loc[sent_df.sentence.str.contains(patt), 'sentence'].size)

68
47


  


In [198]:
# Built from the data without taking the diff: every baseline false positive paired
# with the replaced sentences it was extracted from. This is the same computation as
# `with_sents`, so the helper is called instead of repeating its body.
contain_sents_dict = with_sents(predict_material_df, materials_fp_dict)

In [229]:
# Built from the diffed data: every entry of materials_FP_diff paired with the replaced
# sentences it was extracted from. This is the same computation as `with_sents`, so the
# helper is called instead of repeating its body.
contain_sents_dict = with_sents(predict_material_df, materials_FP_diff)

In [173]:
# Build the annotation -> sentence mapping from the training data.
# NOTE(review): this rebinds `contain_sents_dict`, a name that the prediction-based
# cells also assign; later cells see whichever assignment ran last.
contain_sents_dict = {}
for _id, entry in train_material_df.groupby('_id'):
    # Distinct annotated materials of this article (the per-row lists are concatenated).
    materials = list(set(entry.extracted.values.sum()))
    sentence_records = []
    for material in materials:
        matched = entry.apply(
            lambda x: x.repl_sentence if material in x.extracted else np.nan,
            axis=1,
        )
        sentence_records.append({'extracted': material, 'sentence': matched.dropna().tolist()})
    contain_sents_dict[_id] = {'title': entry.title.tolist()[0], 'data': sentence_records}

In [178]:
# Number of training rows, then how many of them contain the "[title-compound]" token.
print(train_material_df.shape[0])
print(sum("[title-compound]" in words for words in train_material_df.repl_words))

1106
444


In [230]:
# Export the false-positive diff together with the replaced sentences each item was
# extracted from. The mapping is rebuilt here rather than read from the shared
# `contain_sents_dict` name: several cells rebind that name, so the file content would
# otherwise depend on which of them ran last.
fp_diff_with_sents = with_sents(predict_material_df, materials_FP_diff)
# Unescaped JSON (ensure_ascii=False) is written with an explicit UTF-8 encoding.
with open("../../output/error_analysis/raw-material_FP_diff_with_repl_sentence.json", 'w', encoding='utf-8') as f:
    json.dump(fp_diff_with_sents, f, ensure_ascii=False)

In [125]:
# Total number of extracted items over all entries of the current contain_sents_dict.
sum(len(v['data']) for v in contain_sents_dict.values())

174