In [3]:
from bs4 import BeautifulSoup
import pathlib
import pandas as pd
import re
import json
import numpy as np
import difflib

In [4]:
def get_paragraph(headline):
    paragraph = []
    for tag in headline.parent.next_siblings:
        next_headlines = headline_tag_list[:headline_tag_dict[headline_tag] + 1]
        if tag.name in next_headlines:
            break
        if isinstance(tag, str):
            continue

        paragraph.append(tag)
    
    return paragraph

def get_paragraph_text(paragraph: list):
    paragraph_text = [re.sub(r'\n', '', tag.get_text()) for tag in paragraph]
    paragraph_text = [text for text in paragraph_text if re.search(r'。', text)]
    paragraph_text = ''.join(paragraph_text)
    
    return paragraph_text

def split_sentence(text: str):
    return [s.strip() for s in re.findall(r'[^。]+(?:。|$)', text.strip())]

def similar_sentences(sentences, candidates):
    return np.array([_is_similar(s, candidates) for s in sentences])

def _is_similar(s, candidates):
    similar = difflib.get_close_matches(s, candidates, n=1)
    if similar:
        return True
    else:
        return False

In [3]:
html_path = pathlib.Path("../../data/html/compound/")

In [4]:
headline_tag_list = ["h1", "h2", "h3", "h4", "h5", "h6"]
headline_tag_dict = {tag: i for i, tag in enumerate(headline_tag_list)}

In [5]:
target_filename = "../../data/train_IOB_repl_compound.pkl"
target_df = pd.read_pickle(target_filename)
target_df = target_df.assign(h2 = np.nan, h3 = np.nan, h4 = np.nan, h5 = np.nan, h6 = np.nan)

In [29]:
pageid_list = target_df._id.unique()
for path in html_path.glob("**/*.html"):
    pageid = re.sub(r'\.html$', '', path.name)
    
    # 全化合物記事のサブタイトルを取得する場合は不要
    if pageid not in pageid_list:
        continue
    
    with path.open() as f:
        print(pageid)
        soup = BeautifulSoup(f, 'lxml')
        
        for headline in soup.find_all(attrs={"class": "mw-headline"}):
            headline_name = str(headline.next_element)
            headline_tag = str(headline.parent.name)

            para = get_paragraph(headline)
            para_text = get_paragraph_text(para)
            para_sentences = split_sentence(para_text)
            if len(para_sentences) == 0:
                continue
            
            ### insert headline
            # exact matching
            target_df.loc[(target_df._id == pageid)
                          & (target_df.sentence.isin(para_sentences))
                          , [headline_tag]] = headline_name
            # partial matching
            target_df.loc[(target_df._id == pageid)
                        & target_df.sentence.str.contains('|'.join([re.escape(s) for s in para_sentences]))
                        & target_df[headline_tag].notna()
                        , [headline_tag]] = headline_name
            # fuzzy matching
            target_df.loc[(target_df._id == pageid) 
                        & target_df[headline_tag].isna() 
                        & similar_sentences(target_df.sentence.tolist(), para_sentences)
                        , [headline_tag]] = headline_name

3351755
2585955
841166
156868
2281679
1750599
1793404
221691
3574695
1124544
27129
3559104
161577
1586835
597426
1448447
2567100
2091962
3606300
2122889
267435
3346550
847067
161388
2522025
1654006
224041
3576446
458140
1140516
3329765
1688669
1237720
1670979
3100174
1671954
1747207
142523
477361
1164069
778307
1826878
1564095
1139287
2639020
470471
2550711
3598354
626464
501484
3505871
1791634
3682608
3681474
2372114
153172
488081
299263
2727625
2890783
2560867
2524390
899679
1105111
205750
3608743
3692832
1687568
1306689
2999402
3208855
3350763
1616604
3518556
620899
3579415
1238745
928404
1342797
1898168
177729
160146
3122842
3083097
464583
247647
3352154
2343828
2335596
226179
1413947
247807
3330343
1925357
3457795
2761823
358428
268606
1215871
1360982
612360
226952
322402
2427899
3124092
145521
2370416
3472595
3377786
2814756
1513907
3094939
1298963
1737545
1410927
1670912
1807175
1162928
3082087
3621491
1270151
1392735
3386984
1886366
3350532
2813380
1668122
804622
500378
1237927

In [31]:
target_df.count()

_id                         7435
label                       7435
sentence                    7435
title                       7435
words                       7435
repl_words                  7435
furigana_tag_seq            7435
another_name_tag_seq        7435
use_tag_seq                 7435
type_tag_seq                7435
trademark_tag_seq           7435
property_tag_seq            7435
raw_material_tag_seq        7435
production_tag_seq          7435
formation_tag_seq           7435
cas_tag_seq                 7435
chemical_formula_tag_seq    7435
density_tag_seq             7435
melting_tag_seq             7435
boiling_tag_seq             7435
rational_formula_tag_seq    7435
h2                          4970
h3                           865
h4                            10
h5                             0
h6                             0
dtype: int64

In [32]:
target_df.to_pickle(target_filename)

## 検証

In [15]:
target_df = pd.concat([
    pd.read_pickle("../../data/train_IOB_repl_compound.pkl")
    , pd.read_pickle("../../data/test_IOB_repl_compound.pkl")
])

In [25]:
target_df.count()

_id                         8999
label                       8999
sentence                    8999
title                       8999
words                       8999
repl_words                  8999
furigana_tag_seq            8999
another_name_tag_seq        8999
use_tag_seq                 8999
type_tag_seq                8999
trademark_tag_seq           8999
property_tag_seq            8999
raw_material_tag_seq        8999
production_tag_seq          8999
formation_tag_seq           8999
cas_tag_seq                 8999
chemical_formula_tag_seq    8999
density_tag_seq             8999
melting_tag_seq             8999
boiling_tag_seq             8999
rational_formula_tag_seq    8999
h2                          6095
h3                          1150
h4                            10
h5                             0
h6                             0
dtype: int64

In [36]:
tag = "h4"
target_df.fillna('NO_TITLE').loc[target_df.production_tag_seq.apply(lambda x: 'B' in x)].groupby(tag).count().sort_values(
    '_id', ascending=False)[['_id']].rename(columns={'_id': 'count'}).to_csv(
    f"../../output/production_count_in_{tag}_headline.csv"
)