In [67]:
import numpy as np
import pandas as pd
import json
import re
import itertools

In [114]:
def flatten(l):
    """Flatten one level of nesting.

    Written as a ``def`` instead of a lambda bound to a name so the function
    carries its real name in tracebacks and can hold a docstring.

    Parameters
    ----------
    l : iterable of iterables
        Outer collection whose elements are themselves iterable.

    Returns
    -------
    list
        All inner elements, in their original order.
    """
    return [i for sub_l in l for i in sub_l]

def true_positive(true, predict, partial=True):
    """Return the items counted as true positives.

    Parameters
    ----------
    true : iterable
        Reference (annotated) items.
    predict : iterable
        Predicted items.
    partial : bool, default True
        When True the decision is delegated to ``partial_match``; when False
        only items present in both collections are kept.

    Returns
    -------
    list
        For ``partial=False`` the list is built from a set intersection, so
        it has no duplicates and no defined order.
    """
    if not partial:
        predicted_items = set(predict)
        reference_items = set(true)
        return list(predicted_items & reference_items)
    return partial_match(true, predict)

def false_positive(true, predict, partial=True):
    """Return the predicted items that count as false positives.

    Parameters
    ----------
    true : iterable of str
        Reference (annotated) items.
    predict : iterable of str
        Predicted items.
    partial : bool, default True
        When True, a prediction is treated as correct if it contains, or is
        contained in, any reference item (literal substring comparison).
        When False, only exact matches are treated as correct.

    Returns
    -------
    list
        Distinct false-positive predictions, in order of first appearance in
        ``predict``.

    Notes
    -----
    The partial branch used to subtract the output of ``partial_match`` from
    the predictions.  ``partial_match`` yields the matching *reference*
    strings, so a prediction that matched only partially (and was therefore
    not string-equal to any reference item) was never removed and was
    reported as a false positive.  The matched predictions are now computed
    directly.
    """
    # dict.fromkeys de-duplicates while keeping first-seen order, which makes
    # the result reproducible (a bare set has no stable iteration order).
    unique_predictions = list(dict.fromkeys(predict))

    if partial:
        references = list(true)
        # `t in p` on strings is the literal-substring test that
        # re.search(re.escape(t), p) performs.
        correct = {
            p for p in unique_predictions
            if any(t in p or p in t for t in references)
        }
    else:
        correct = set(true)

    return [p for p in unique_predictions if p not in correct]

def get_FP_dict(result: dict):
    """Build the per-id false-positive summary.

    ``result`` maps an id to an entry with ``'title'``, ``'true'`` and
    ``'predict'`` keys.  The returned dict maps each id to
    ``{'title': ..., 'data': ...}`` where ``data`` is
    ``false_positive(entry['true'], entry['predict'])`` using that function's
    default matching mode.
    """
    return {
        key: {
            'title': item['title'],
            'data': false_positive(item['true'], item['predict']),
        }
        for key, item in result.items()
    }

def get_TP_dict(result: dict):
    """Build the per-id true-positive summary.

    ``result`` maps an id to an entry with ``'title'``, ``'true'`` and
    ``'predict'`` keys.  The returned dict maps each id to
    ``{'title': ..., 'data': ...}`` where ``data`` is
    ``true_positive(entry['true'], entry['predict'])`` using that function's
    default matching mode.
    """
    return {
        key: {
            'title': item['title'],
            'data': true_positive(item['true'], item['predict']),
        }
        for key, item in result.items()
    }

def partial_match(true, pred):
    """Return the reference items that partially match a prediction.

    Every (reference, prediction) pair is examined, reference items in the
    outer loop.  A pair matches when either string occurs literally inside
    the other.  The reference item is emitted once per matching pair, so it
    can appear several times in the result.
    """
    # Open question from the original author: should this return the
    # reference (true) items or the predicted items?  It currently returns
    # the reference items.
    matched = []
    for t, p in itertools.product(true, pred):
        t_inside_p = re.search(re.escape(t), p)
        if t_inside_p or re.search(re.escape(p), t):
            matched.append(t)
    return matched

def with_sents(predict_df, extracted_dict):
    """Attach to each extracted item the sentences it was extracted from.

    Parameters
    ----------
    predict_df : pandas.DataFrame
        Must provide the columns ``_id``, ``extracted`` (a collection per
        row) and ``repl_sentence``.
    extracted_dict : dict
        Maps an ``_id`` to ``{'title': ..., 'data': [item, ...]}``.  Every
        ``_id`` present in ``predict_df`` is looked up here.

    Returns
    -------
    dict
        ``{_id: {'title': ..., 'data': [{'extracted': item,
        'sentence': [repl_sentence, ...]}, ...]}}`` with one record per item
        of ``extracted_dict[_id]['data']``.
    """
    def sentences_for(group, item):
        # Rows of this group whose `extracted` contains the item; rows with a
        # missing repl_sentence are dropped.
        has_item = group.extracted.apply(lambda extracted: item in extracted)
        return group.loc[has_item, 'repl_sentence'].dropna().tolist()

    result = {}
    for _id, group in predict_df.groupby('_id'):
        info = extracted_dict[_id]
        records = []
        for item in info['data']:
            records.append({'extracted': item, 'sentence': sentences_for(group, item)})
        result[_id] = {'title': info['title'], 'data': records}

    return result

In [104]:
# NOTE: pd.read_pickle unpickles arbitrary objects; only load files you trust.

# Test data: keep the columns used in this notebook and rebuild each sentence
# by joining the entries of `repl_words`.  `assign` returns a new frame, so
# the column is never written onto a column-subset of the loaded frame (the
# pattern that triggers pandas' SettingWithCopyWarning).
test_df = (
    pd.read_pickle("../../data/test_IOB_repl_compound.pkl")
    .loc[:, ['_id', 'title', 'sentence', 'repl_words']]
    .assign(repl_sentence=lambda df: df.repl_words.apply(''.join))
)

# Training data: same column subset, joined onto the training rows that carry
# the `extracted` annotations via (_id, sentence).
train_df = (
    pd.read_pickle("../../data/train_IOB_repl_compound.pkl")
    .loc[:, ['_id', 'title', 'sentence', 'repl_words']]
)
train_production_df = (
    pd.read_pickle("../../dump/train_production_with_extracted.pkl")
    .merge(train_df, on=['_id', 'sentence'])
    .assign(repl_sentence=lambda df: df.repl_words.apply(''.join))
)
train_production_df.head(1)

Unnamed: 0,_id,sentence,extracted,title,repl_words,repl_sentence
0,10166,現在ではアンモニアの工業生産はハーバー・ボッシュ法によるものが一般的である。,[ハーバー・ボッシュ法],アンモニア,"[現在, で, は, [title-compound], の, 工業, 生産, は, ハーバ...",現在では[title-compound]の工業生産はハーバー・ボッシュ法によるものが一般的である。


In [48]:
# Join each set of pickled predictions with the test frame on (_id, sentence)
# so that every prediction row also carries the test_df columns.
predict_production_df = pd.read_pickle(
    "../../dump/pred_production_with_tag_seq.pkl"
).merge(test_df, on=['_id', 'sentence'])

predict_production_repl_df = pd.read_pickle(
    "../../dump/pred_production_using_compound-list_with_tag_seq.pkl"
).merge(test_df, on=['_id', 'sentence'])

In [57]:
# Load the two result files.  The encoding is given explicitly: without it
# open() falls back to the platform's locale encoding, which is not
# guaranteed to be UTF-8 and would break on non-ASCII (Japanese) text.
with open("../../output/result/production.json", 'r', encoding='utf-8') as f:
    result_production = json.load(f)

with open("../../output/result/production_using_compound-list.json", 'r', encoding='utf-8') as f:
    result_production_repl = json.load(f)

In [69]:
# False-positive summaries for each of the two loaded result sets.
production_fp_dict, production_repl_fp_dict = map(
    get_FP_dict, (result_production, result_production_repl)
)

In [115]:
# True-positive summaries for each of the two loaded result sets.
production_tp_dict, production_repl_tp_dict = map(
    get_TP_dict, (result_production, result_production_repl)
)

In [124]:
# Per id: true positives present in the compound-list results but absent from
# the other result set (set difference, so the order of `data` is not defined).
production_TP_diff = {
    _id: {
        'title': entry['title'],
        'data': list(
            set(production_repl_tp_dict[_id]['data'])
            - set(production_tp_dict[_id]['data'])
        ),
    }
    for _id, entry in result_production.items()
}

In [125]:
# ensure_ascii=False writes non-ASCII characters verbatim, so the file must be
# opened with an explicit encoding; the locale default may be unable to encode
# them or may produce a file that is not valid UTF-8 JSON.
with open("../../output/error_analysis/production_TP_add.json", 'w', encoding='utf-8') as f:
    json.dump(production_TP_diff, f, ensure_ascii=False)

In [85]:
# Sentences behind the false positives and the true positives of the
# compound-list predictions, one row per (title, sentence).
#
# The per-id frames are collected in a list and concatenated once:
# DataFrame.append is deprecated (removed in pandas 2.0) and copies the whole
# accumulated frame on every iteration.
fp_frames = []
for _id, entry in with_sents(predict_production_repl_df, production_repl_fp_dict).items():
    # sorted() gives a reproducible row order; a bare set has no stable order.
    sents = sorted(set(flatten([extraction['sentence'] for extraction in entry['data']])))
    if sents:
        fp_frames.append(pd.DataFrame({'title': entry['title'], 'sentence': sents}))
# With nothing collected, fall back to an empty frame that still has the
# expected columns so a merge on 'sentence' keeps working.
fp_sent_df = pd.concat(fp_frames) if fp_frames else pd.DataFrame(columns=['title', 'sentence'])

tp_frames = []
for _id, entry in with_sents(predict_production_repl_df, production_repl_tp_dict).items():
    sents = sorted(set(flatten([extraction['sentence'] for extraction in entry['data']])))
    if sents:
        tp_frames.append(pd.DataFrame({'title': entry['title'], 'sentence': sents}))
tp_sent_df = pd.concat(tp_frames) if tp_frames else pd.DataFrame(columns=['title', 'sentence'])

In [100]:
# Anti-join: keep the fp_sent_df rows whose sentence has no counterpart in
# tp_sent_df (title_x is missing after the right merge), then restore the
# plain `title` column name.
sent_df = (
    tp_sent_df
    .merge(fp_sent_df, on='sentence', how='right')
    .loc[lambda merged: merged.title_x.isna()]
    .drop(columns='title_x')
    .rename(columns={'title_y': 'title'})
    .loc[:, ['title', 'sentence']]
)

In [102]:
# Write sent_df to CSV without the index column.
# NOTE(review): the cell further down that exports merge_df writes to this
# same path, so a top-to-bottom run overwrites this file - confirm which
# output is intended.
sent_df.to_csv("../../output/error_analysis/production_diff_add_sentence.csv", index=False)

In [109]:
# Sentences behind the true positives of each prediction set, one row per
# (title, sentence): `tp_sent_df` from predict_production_df and
# `repl_tp_sent_df` from predict_production_repl_df.
#
# The per-id frames are collected in a list and concatenated once:
# DataFrame.append is deprecated (removed in pandas 2.0) and copies the whole
# accumulated frame on every iteration.
tp_frames = []
for _id, entry in with_sents(predict_production_df, production_tp_dict).items():
    # sorted() gives a reproducible row order; a bare set has no stable order.
    sents = sorted(set(flatten([extraction['sentence'] for extraction in entry['data']])))
    if sents:
        tp_frames.append(pd.DataFrame({'title': entry['title'], 'sentence': sents}))
# With nothing collected, fall back to an empty frame that still has the
# expected columns so a merge on 'sentence' keeps working.
tp_sent_df = pd.concat(tp_frames) if tp_frames else pd.DataFrame(columns=['title', 'sentence'])

repl_tp_frames = []
for _id, entry in with_sents(predict_production_repl_df, production_repl_tp_dict).items():
    sents = sorted(set(flatten([extraction['sentence'] for extraction in entry['data']])))
    if sents:
        repl_tp_frames.append(pd.DataFrame({'title': entry['title'], 'sentence': sents}))
repl_tp_sent_df = pd.concat(repl_tp_frames) if repl_tp_frames else pd.DataFrame(columns=['title', 'sentence'])

In [110]:
# Anti-join: keep the repl_tp_sent_df rows whose sentence has no counterpart
# in tp_sent_df (title_x is missing after the right merge), then restore the
# plain `title` column name.
sent_df = (
    tp_sent_df
    .merge(repl_tp_sent_df, on='sentence', how='right')
    .loc[lambda merged: merged.title_x.isna()]
    .drop(columns='title_x')
    .rename(columns={'title_y': 'title'})
    .loc[:, ['title', 'sentence']]
)

In [113]:
# Write sent_df to CSV without the index column.
sent_df.to_csv("../../output/error_analysis/production_diff_add_TP_sentence.csv", index=False)

In [52]:
# Two directions of the same comparison; only one is active at a time.
# "Drop" variant (disabled): start from predict_production_df instead.
#merge_df = pd.merge(predict_production_df, predict_production_repl_df, on=['_id', 'repl_sentence'], how='left')
# "Add" variant (active): rows of predict_production_repl_df that have no
# (_id, repl_sentence) partner in predict_production_df.
merge_df = (
    predict_production_repl_df
    .merge(predict_production_df, on=['_id', 'repl_sentence'], how='left')
    .loc[
        lambda merged: merged.sentence_y.isna(),
        ['_id', 'title_x', 'repl_sentence', 'extracted_x'],
    ]
    .rename(columns={'title_x': 'title', 'extracted_x': 'extracted'})
)

In [53]:
# Keywords picked from section subtitles that looked relevant.
# The alternation after 得 is a non-capturing group: Series.str.contains warns
# when a pattern contains capture groups, and the captured text is never used.
patt = r'合成|製造|製法|反応|生成|得(?:る|られる)'

In [54]:
# Row count of merge_df, then the number of rows whose repl_sentence matches `patt`.
print(len(merge_df))
print(len(merge_df.loc[merge_df.repl_sentence.str.contains(patt), 'repl_sentence']))

66
46


  


In [56]:
# Write merge_df to CSV without the index column.
# NOTE(review): the earlier cell that exports sent_df writes to this same
# path, so whichever cell runs last determines the file's content - confirm
# which output is intended.
merge_df.to_csv("../../output/error_analysis/production_diff_add_sentence.csv", index=False)

In [105]:
# Build the annotation-to-sentence mapping from the training data: for each
# _id, every distinct extracted item is paired with the repl_sentence values
# of the rows whose `extracted` contains it.
contain_sents_dict = {}
for _id, entry in train_production_df.groupby('_id'):
    # Summing the per-row collections concatenates them; set() removes repeats.
    productions = list(set(entry.extracted.values.sum()))
    contain_sent = [
        entry.loc[
            entry.extracted.apply(lambda extracted: prod in extracted),
            'repl_sentence',
        ].dropna().tolist()
        for prod in productions
    ]
    records = [
        {'extracted': prod, 'sentence': sents}
        for prod, sents in zip(productions, contain_sent)
    ]
    contain_sents_dict[_id] = {'title': entry.title.iloc[0], 'data': records}

In [107]:
# Number of training rows, then how many of them have "[title-compound]"
# among their repl_words.
print(len(train_production_df))
print(sum("[title-compound]" in words for words in train_production_df.repl_words))

520
198
