In [1]:
import numpy as np
import pandas as pd
import json
import re
from collections import Counter

In [33]:
def extract_words(word_seq, tag_seq):
    words_list = []
    words = []
    for word, tag in zip(word_seq, tag_seq):
        if tag == 'B' and words:
            words_list.append(''.join(words))
            words = [word]
        elif tag == 'B' or (tag == 'I' and words):
            words.append(word)
        elif words:
            words_list.append(''.join(words))
            words = []
            
    if words:
        words_list.append(''.join(words))
        
    return words_list

def annotaions_in_section(section_df, _set=True):
    annotaions = section_df.apply(lambda x: extract_words(x.words, x.raw_material_tag_seq), axis=1).sum()
    if _set:
        annotaions = list(set(annotaions))
    
    return annotaions

In [3]:
with open("../../data/compound_train.json", 'r') as f:
    annotations = {entry['WikipediaID']: entry['Attributes']['原材料'] for entry in json.load(f)['entry']}

In [4]:
train_df = pd.read_pickle("../../data/train_IOB_repl_compound.pkl")
train_df.head(1)

Unnamed: 0,_id,label,sentence,title,words,repl_words,furigana_tag_seq,another_name_tag_seq,use_tag_seq,type_tag_seq,...,melting_tag_seq,boiling_tag_seq,rational_formula_tag_seq,h2,h3,h4,h5,h6,cat_production_headline,cat_raw_material_headline
0,10166,False,アンモニア (英: ammonia) は分子式が NH 3 で表される無機化合物。,アンモニア,"[アンモニア, (, 英, :, ammonia, ), は, 分子, 式, が, NH, ...","[[title-compound], (, 英, :, [title-compound], ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[O, O, O, O, B, O, O, O, O, O, O, O, O, O, O, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...",...,"[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...",,,,,,2,2


In [5]:
flatten = lambda l: [i for sub_l in l for i in sub_l]

def get_only_one(l):
    return [item for i, item in enumerate(l) if l.count(item) is 1]

def get_duplicated(l):
    return [item for i, item in enumerate(l) if l.count(item) > 1]

def get_idx(l, unique_list):
    return [i for i, items in enumerate(l) if (set(unique_list) & set(items))]

In [7]:
only_one_material_in_entry = []
headline_list = []
for _, entry in train_df.fillna('NO_HEADING').groupby('_id'):
    section_group = \
    entry.groupby('h2').apply(
        lambda g: list(set(g.apply(lambda x: extract_words(x.words, x.raw_material_tag_seq), axis=1).sum()))
    )
    headings = section_group.index.values
    materials = section_group
    if not materials.sum():
        continue
        
    only_one_materials = get_only_one(materials.sum())
    if not only_one_materials:
        continue
    
    only_one_materials_idx = get_idx(materials, only_one_materials)
    
    only_one_material_in_entry.append(only_one_materials)
    headline_list.append(headings[only_one_materials_idx])

In [9]:
len(flatten([materials for materials in only_one_material_in_entry]))

861

In [10]:
Counter(flatten(headline_list)).most_common()

[('NO_HEADING', 111),
 ('合成', 48),
 ('製法', 23),
 ('合成法', 14),
 ('製造', 11),
 ('生合成', 7),
 ('生成', 7),
 ('性質', 6),
 ('用途', 4),
 ('調製', 4),
 ('生産', 3),
 ('反応', 3),
 ('化学合成', 2),
 ('概要', 2),
 ('歴史', 2),
 ('工業生産', 2),
 ('沿革', 2),
 ('合成方法', 2),
 ('その他', 1),
 ('構造と誘導体', 1),
 ('発見', 1),
 ('実験室的研究', 1),
 ('多形', 1),
 ('解糖系', 1),
 ('その他の代謝', 1),
 ('参考文献', 1),
 ('正塩', 1),
 ('天然での存在', 1),
 ('生化学', 1),
 ('四塩化ケイ素', 1),
 ('おもな誘導体', 1),
 ('マンガン(VI)酸塩', 1),
 ('効果・毒性・特徴', 1),
 ('合成と性質', 1),
 ('1,3-ジチオラン', 1),
 ('単離', 1),
 ('製造と性質', 1),
 ('錯体化学', 1),
 ('合成方法と反応性', 1),
 ('生成、性質', 1),
 ('有機化学', 1),
 ('無機化学', 1),
 ('自然発生', 1),
 ('合成・単離', 1),
 ('工業的な製造と利用', 1),
 ('食料品中の含有量', 1),
 ('生合成と代謝', 1),
 ('合成と反応', 1),
 ('誘導体', 1),
 ('化学', 1),
 ('パラフィン（固形）', 1),
 ('生成方法', 1),
 ('製造と用途', 1),
 ('特徴', 1),
 ('発見と利用', 1),
 ('チタン酸リチウム増殖材の合成', 1),
 ('全合成', 1),
 ('合成と構造', 1),
 ('合成と主な反応', 1),
 ('用途と製法', 1),
 ('人工的な合成法', 1),
 ('バンレイシ科からの抽出', 1),
 ('天然における存在', 1),
 ('生理学', 1),
 ('インドールの合成', 1),
 ('生合成と反応', 1),
 ('存在と合成法', 1),
 ('

In [297]:
duplicated_materials_in_entry = []
for _, entry in train_df.fillna('NO_HEADING').groupby('_id'):
    ssection_group = \
    entry.groupby('h2').apply(
        lambda g: list(set(g.apply(lambda x: extract_words(x.words, x.raw_material_tag_seq), axis=1).sum()))
    )
    headings = section_group.index.values
    materials = section_group
    if not materials.sum():
        continue
        
    only_one_materials = get_only_one(materials.sum())
    if not only_one_materials:
        continue
    
    # 複数のセクションに分布している原材料のみ抽出（重複あり）
    all_materials = entry.apply(
        lambda x: extract_words(x.words, x.raw_material_tag_seq)
        , axis=1
    ).sum()
    
    duplicated_materials = [material for material in all_materials if material not in only_one_materials]
    if not duplicated_materials:
        continue
        
    duplicated_materials_in_entry.append(duplicated_materials)

In [298]:
len(flatten([materials for materials in duplicated_materials_in_entry]))

1961

In [304]:
heading_df = pd.DataFrame()
for _id, entry in train_df.fillna('NO_HEADING').groupby('_id'):
    section_group = \
    entry.groupby('h2').apply(
        lambda g: g.apply(lambda x: extract_words(x.words, x.raw_material_tag_seq), axis=1).sum()
    )
    
    annotaions = section_group.sum()
    if not annotaions:
        continue
    
    heading_count = len(section_group)
    heading_mean = len(annotaions) / heading_count
    var = section_group.apply(lambda x: (len(x) - heading_mean) ** 2).sum() / heading_count
    
    heading_df = heading_df.append(pd.DataFrame({
        '_id': [_id]
        , 'title': [entry.title.iloc[0]]
        , 'count_heading': [heading_count]
        , 'count_heading_with_annotaions': len([anotation for anotation in section_group if anotation])
        , 'annotations': [len(annotaions)]
        , 'unique_annotations': [len(set(annotaions))]
        , 'annotations_var': [var]
        , 'annotations_std': [np.sqrt(var)]
    }))
    break

In [227]:
heading_df.sort_values('annotations_var', ascending=False).to_csv("../../output/error_analysis/variance_annotaions_in_heading.csv", index=False)

In [273]:
print(heading_df.unique_annotations.sum())
print(heading_df.annotations.sum())

1077
1968


## 1つのセクションに全てのアノテーションデータがある記事の数

In [178]:
converge_df = pd.DataFrame()
scatter_df = pd.DataFrame()
for _id, entry in train_df.fillna('NO_HEADING').groupby('_id'):
    section_group = \
    entry.groupby('h2').apply(
        lambda g: annotaions_in_section(g)
    )
    annotaions = set(section_group.sum())
    if not annotaions:
        continue
    
    # 全てのアノテーションデータが存在するセクションがあれば True
    if section_group.apply(lambda x: set(x) == annotaions).any():
        converge_df = converge_df.append(entry)
    else:
        scatter_df = scatter_df.append(entry)

In [179]:
print("Converge:", len(converge_df.groupby('_id')))
print("Scatter:", len(scatter_df.groupby('_id')))

Converge: 243
Scatter: 45


In [None]:
scatter_count_df = pd.DataFrame()
for _id, entry in scatter_df.groupby('_id'):
    section_group = \
    entry.groupby('h2').apply(
        lambda g: annotaions_in_section(g)
    )
    
    scatter_count_df = \
    scatter_count_df.append(
        pd.DataFrame({_id: section_group.apply(lambda x: len(x)).to_dict()}).T
    )
    scatter_count_df.loc[_id, 'total'] = len(set(section_group.sum()))

scatter_count_df.fillna(0, inplace=True)

In [204]:
converge_count_df = pd.DataFrame()
for _id, entry in converge_df.groupby('_id'):
    section_group = \
    entry.groupby('h2').apply(
        lambda g: annotaions_in_section(g)
    )
    
    converge_count_df = \
    converge_count_df.append(
        pd.DataFrame({_id: section_group.apply(lambda x: len(x)).to_dict()}).T
    )
    converge_count_df.loc[_id, 'total'] = len(set(section_group.sum()))

converge_count_df.fillna(0, inplace=True)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)


In [236]:
scatter_count_df.apply(lambda x: x.drop('total').argmax(), axis=1)

will be corrected to return the positional maximum in the future.
Use 'series.values.argmax' to get the position of the maximum now.
  """Entry point for launching an IPython kernel.


10166             その他
11006              製法
1162928            発見
1244436            多形
1270151    NO_HEADING
1273610            正塩
1326610    NO_HEADING
1429429            合成
156868             製造
161388             性質
161391     NO_HEADING
1668122            製法
1668370            合成
1747207    NO_HEADING
1859145            合成
1885021         製造と性質
1892810    NO_HEADING
1906013            製法
1933768    NO_HEADING
2017242          化学合成
2051615    NO_HEADING
2077209    NO_HEADING
218484           無機化学
223549           自然発生
226179            合成法
236551     NO_HEADING
245531            合成法
2602070           生合成
267435            誘導体
2707869    NO_HEADING
271215      パラフィン（固形）
27129            生成方法
27347             合成法
291072             特徴
3097272            合成
3350823            合成
3621491            合成
3692796    NO_HEADING
372433     NO_HEADING
37785      NO_HEADING
384083             生産
433929       インドールの合成
56097      NO_HEADING
652294     NO_HEADING
912412             合成
dtype: obj

In [232]:
# エントロピーデータ読み込み
entropy_s = pd.read_pickle("../../data/annotaions_entropy_in_section_raw-material.pkl")
entropy_s.drop('NO_HEADING', inplace=True)
entropy_s = entropy_s[entropy_s >= 1.]

In [203]:
def max_entropy_heading(count, entropy):
    heading_df = pd.concat([entropy, count], axis=1).dropna()
    if heading_df.empty:
        return 'NO_HEADING'
    
    return heading_df.sort_values('entropy', ascending=False).entropy.argmax()

In [238]:
selection_heading_df = pd.DataFrame()
for _id, counts in scatter_count_df.iterrows():
    grep_counts = counts[counts > 0].drop('total')
    heading = max_entropy_heading(grep_counts, entropy_s)
    
    selection_heading_df = selection_heading_df.append(pd.DataFrame({
        '_id': [_id]
        , 'max_entropy_heading': [heading]
        , 'counts': [grep_counts.get(heading, 0)]
        , 'loss': [counts['total'] - grep_counts.get(heading, 0)]
    }))

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  
will be corrected to return the positional maximum in the future.
Use 'series.values.argmax' to get the position of the maximum now.
  


In [233]:
selection_heading_df = pd.DataFrame()
for _id, counts in pd.concat([scatter_count_df, converge_count_df]).fillna(0).iterrows():
    grep_counts = counts[counts > 0].drop('total')
    heading = max_entropy_heading(grep_counts, entropy_s)
    
    selection_heading_df = selection_heading_df.append(pd.DataFrame({
        '_id': [_id]
        , 'max_entropy_heading': [heading]
        , 'counts': [grep_counts.get(heading, 0)]
        , 'loss': [counts['total'] - grep_counts.get(heading, 0)]
    }))

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  
will be corrected to return the positional maximum in the future.
Use 'series.values.argmax' to get the position of the maximum now.
  


In [239]:
print(selection_heading_df.counts.sum())
print(selection_heading_df.loss.sum())

118.0
159.0
