In [9]:
import pandas as pd
import numpy as np
import re
from collections import Counter

In [2]:
# Load the pickled train / test DataFrames that the rest of the notebook works on.
# NOTE(review): unpickling can execute arbitrary code; only load pickles from a trusted source.
train_df = pd.read_pickle("../../data/train_IOB_repl_compound.pkl")
test_df = pd.read_pickle("../../data/test_IOB_repl_compound.pkl")

## h2タグのみ考慮

In [30]:
def freq_headlines(df, n=None, _min=None):
    '''
    Return the most frequent values of the ``h2`` column of ``df``.

    Parameters
    ----------
    df : DataFrame with an ``h2`` column.
    n : if truthy, select by rank (takes precedence over ``_min``).
    _min : if truthy (and ``n`` is not), keep every headline that occurs
        at least ``_min`` times.

    Returns
    -------
    list of headlines ordered from most to least frequent, or ``None``
    when neither ``n`` nor ``_min`` is given.
    '''
    h_count = Counter()
    # DataFrame.iteritems() was removed in pandas 2.0; items() is the equivalent.
    for _, column in df.loc[:, ['h2']].items():
        # A missing h2 is "no headline", not a headline: keep NaN/None out of
        # the tally so it can never be returned as a frequent headline.
        h_count.update(column.dropna().tolist())

    if n:
        # NOTE(review): this yields n + 1 entries rather than n. Kept as in the
        # original because the intent is not visible here - confirm.
        return [h for h, _ in h_count.most_common(n + 1)]
    elif _min:
        return [h for h, count in h_count.most_common() if count >= _min]

    # Neither selector given: the original fell off the end and returned None.
    return None

def assign_cat_headline(df, headlines, col):
    '''
    Write a headline category for every row into ``df[col]`` and return ``df``.

    Other headline                       = 0
    h2 starts with a frequent headline   = 1
    No headline (h2 is missing)          = 2

    ``df`` is modified in place. The comparison uses ``Series.str.match``,
    which anchors at the start of the string only, so an h2 that merely
    begins with a frequent headline is also labelled 1.

    Parameters
    ----------
    df : DataFrame with an ``h2`` column.
    headlines : iterable of frequent headline strings; non-string or empty
        entries are ignored.
    col : name of the column to create / overwrite.
    '''
    # Non-strings (e.g. NaN) would crash re.escape, and an empty string would
    # produce a pattern that matches every row, so both are dropped.
    escaped = [re.escape(h) for h in headlines if isinstance(h, str) and h]

    df[col] = 0
    h2 = df.loc[:, ['h2']]

    # With no usable headline the joined pattern would be '' and match
    # everything; in that case no row can belong to category 1.
    if escaped:
        pattern = '|'.join(escaped)
        is_frequent = h2.fillna('NO_TITLE').apply(
            lambda x: x.str.match(pattern)
        ).sum(axis=1) > 0
        df.loc[is_frequent, col] = 1

    # Applied last so that rows without a headline always end up as 2.
    df.loc[h2.isna().all(axis=1), col] = 2

    return df

In [24]:
# Frequent h2 headlines (seen at least 5 times) among training rows whose
# production tag sequence contains a 'B' tag.
production_headlines = freq_headlines(train_df.loc[train_df.production_tag_seq.apply(lambda x: 'B' in x)], _min=5)

# The headline list comes from the training data only and is applied to both splits.
train_df = assign_cat_headline(train_df, headlines=production_headlines, col='cat_production_headline')
test_df = assign_cat_headline(test_df, headlines=production_headlines, col='cat_production_headline')

In [25]:
# Frequent h2 headlines (seen at least 5 times) among training rows whose
# raw-material tag sequence contains a 'B' tag.
raw_material_headlines = freq_headlines(train_df.loc[train_df.raw_material_tag_seq.apply(lambda x: 'B' in x)], _min=5)

# The headline list comes from the training data only and is applied to both splits.
train_df = assign_cat_headline(train_df, headlines=raw_material_headlines, col='cat_raw_material_headline')
test_df = assign_cat_headline(test_df, headlines=raw_material_headlines, col='cat_raw_material_headline')

In [26]:
# Persist the frames with the new category columns.
# NOTE(review): these are the same paths the notebook loads from, so the input
# pickles are overwritten by this cell.
train_df.to_pickle("../../data/train_IOB_repl_compound.pkl")
test_df.to_pickle("../../data/test_IOB_repl_compound.pkl")

In [27]:
# Non-null count of every column in the test set, per raw-material headline category.
test_df.groupby('cat_raw_material_headline').count()

Unnamed: 0_level_0,_id,label,sentence,title,words,repl_words,furigana_tag_seq,another_name_tag_seq,use_tag_seq,type_tag_seq,...,density_tag_seq,melting_tag_seq,boiling_tag_seq,rational_formula_tag_seq,h2,h3,h4,h5,h6,cat_production_headline
cat_raw_material_headline,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,633,633,633,633,633,633,633,633,633,633,...,633,633,633,633,633,142,0,0,0,633
1,492,492,492,492,492,492,492,492,492,492,...,492,492,492,492,492,142,0,0,0,492
2,439,439,439,439,439,439,439,439,439,439,...,439,439,439,439,0,1,0,0,0,439


## 直近のサブタイトルを考慮

In [96]:
# Rebuild both frequent-headline lists with a stricter threshold (at least 10
# occurrences) from training rows that carry a 'B' tag.
# NOTE(review): freq_headlines only counts the h2 column, while the labels in
# this section are computed from columns h2 through h6 - confirm this is intended.
raw_material_headlines = freq_headlines(train_df.loc[train_df.raw_material_tag_seq.apply(lambda x: 'B' in x)], _min=10)
production_headlines = freq_headlines(train_df.loc[train_df.production_tag_seq.apply(lambda x: 'B' in x)], _min=10)

In [81]:
def labeling_headline(headline_df, headlines):
    '''
    Label every row of ``headline_df`` by the headlines it contains.

    Other headlines                                   = 0
    Every headline in the row is a frequent headline  = 1
    No headline at all (all cells missing)            = 2

    Parameters
    ----------
    headline_df : DataFrame whose columns hold the headlines of a row;
        missing cells (NaN / None) are ignored.
    headlines : iterable of frequent headlines.

    Returns
    -------
    list of int, one label per row, in row order.
    '''
    # Built once instead of once per row.
    frequent = set(headlines)

    labels = []
    # Plain-Python pass over the rows: avoids the chained assignment
    # (label_df['label'][mask] = ...), which has no effect under pandas
    # Copy-on-Write, and also works for a frame with zero rows.
    for row in headline_df.itertuples(index=False, name=None):
        present = {h for h in row if pd.notna(h)}
        if not present:
            labels.append(2)
        elif present <= frequent:
            labels.append(1)
        else:
            labels.append(0)

    return labels

In [101]:
# Overwrite the category columns with labels computed from columns h2 through h6.
train_df['cat_raw_material_headline'] = labeling_headline(train_df.loc[:, 'h2':'h6'], raw_material_headlines)
test_df['cat_raw_material_headline'] = labeling_headline(test_df.loc[:, 'h2':'h6'], raw_material_headlines)

train_df['cat_production_headline'] = labeling_headline(train_df.loc[:, 'h2':'h6'], production_headlines)
test_df['cat_production_headline'] = labeling_headline(test_df.loc[:, 'h2':'h6'], production_headlines)

In [83]:
# Non-null count of every column in the test set, per raw-material headline category.
test_df.groupby('cat_raw_material_headline').count()

Unnamed: 0_level_0,_id,label,sentence,title,words,repl_words,furigana_tag_seq,another_name_tag_seq,use_tag_seq,type_tag_seq,...,density_tag_seq,melting_tag_seq,boiling_tag_seq,rational_formula_tag_seq,h2,h3,h4,h5,h6,cat_production_headline
cat_raw_material_headline,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,776,776,776,776,776,776,776,776,776,776,...,776,776,776,776,775,259,0,0,0,776
1,350,350,350,350,350,350,350,350,350,350,...,350,350,350,350,350,26,0,0,0,350
2,438,438,438,438,438,438,438,438,438,438,...,438,438,438,438,0,0,0,0,0,438


In [105]:
# Total number of production annotations ('B' tags) per production headline category.
# The count column is selected before aggregating so that only it is summed;
# summing every column first would also try to add up the object columns.
train_df.assign(
    annotation_count = train_df.production_tag_seq.apply(lambda x: x.count('B'))
).groupby('cat_production_headline')['annotation_count'].sum()

cat_production_headline
0    215
1    247
2    131
Name: annotation_count, dtype: int64

In [84]:
# Persist the frames with the re-computed category columns under new file names.
train_df.to_pickle("../../data/train_IOB_repl_compound_fix_cat_headlines.pkl")
test_df.to_pickle("../../data/test_IOB_repl_compound_fix_cat_headlines.pkl")