In [1]:
from __future__ import annotations

import ctypes
import gc
import glob
import os
import re
import sys
from pathlib import Path

import blingfire as bf
import hydra
import numpy as np
import pandas as pd
from faiss import read_index, write_index
from hydra.core.hydra_config import HydraConfig
from omegaconf import DictConfig, OmegaConf
from sentence_transformers import SentenceTransformer
from tqdm.auto import tqdm

# Handle to the C runtime loaded through ctypes; none of the cells visible here use it.
# NOTE(review): "libc.so.6" is the glibc soname, so this presumably only loads on Linux — confirm.
libc = ctypes.CDLL("libc.so.6")
# Append the parent directory (os.pardir) to the module search path.
# NOTE(review): presumably so the `import utils` that follows resolves one level up — confirm.
sys.path.append(os.pardir)

import utils



In [4]:
# Read a single parquet file into a DataFrame; the path is relative to the working directory.
# NOTE(review): presumably one shard of a 2023-07-01 Wikipedia dump — confirm.
# A later cell reads the "title" and "text" columns of this frame.
df = pd.read_parquet("../input/wikipedia-20230701/a.parquet")

In [10]:
def extract_sections(title: str, text: str) -> list[tuple[str, str]]:
    """Split wiki-style article text into ``(heading, body)`` pairs.

    Headings are runs of two or more ``=`` around a label on a single line,
    e.g. ``== History ==``. The text before the first heading is returned as a
    lead section labelled with the article ``title``.

    Args:
        title: Article title; used as the heading of the lead section.
        text: Raw article text containing ``== heading ==`` markers.

    Returns:
        Sections in document order. Sections headed "See also", "References",
        "Further reading" or "External links" are dropped. When the text has
        at least one heading, the lead section is always present, even if its
        body is empty. When the text has no heading, the whole stripped text
        is returned as the lead section, or an empty list if it is blank.
    """
    pattern = re.compile(r"={2,}\s?(.*?)\s?={2,}")
    skipped_headings = ("See also", "References", "Further reading", "External links")

    matches = list(pattern.finditer(text))
    if not matches:
        # Without this branch an article that has no headings at all would
        # yield no sections and its content would be silently discarded.
        lead = text.strip()
        return [(title, lead)] if lead else []

    # Everything before the first heading belongs to the lead section.
    sections = [(title, text[: matches[0].start()].strip())]

    for i, match in enumerate(matches):
        # A section body runs from the end of its heading to the start of the
        # next heading, or to the end of the text for the last section.
        body_end = matches[i + 1].start() if i + 1 < len(matches) else len(text)
        heading = match.group(1).strip()
        if heading not in skipped_headings:
            sections.append((heading, text[match.end() : body_end].strip()))

    return sections


def compress_and_split_sections(
    sections: list[tuple[str, str]], max_sentence_length: int, max_sentence_num: int, filter_len: int = 3
) -> list[str]:
    """Flatten sections into one document and re-split it into word-bounded chunks.

    Every section is rendered as ``"<title>: <content>"`` on its own line
    (an empty title becomes ``"No Title"``). The document is split into
    sentences, and consecutive sentences are packed greedily into chunks.

    Args:
        sections: ``(title, content)`` pairs, in document order.
        max_sentence_length: Word budget per chunk. Words are counted by
            splitting on single spaces, so the newline that joins two
            sentences inside a chunk is not counted as a word boundary. A
            single sentence that exceeds the budget is still emitted as its
            own chunk.
        max_sentence_num: Maximum number of chunks to return.
        filter_len: Sentences whose character length is not greater than this
            are dropped.

    Returns:
        Non-empty chunks, sentences within a chunk separated by newlines,
        truncated to at most ``max_sentence_num`` entries.
    """
    document = "".join(f"{title or 'No Title'}: {content}\n" for title, content in sections)

    try:
        _, sentence_offsets = bf.text_to_sentences_and_offsets(document)
        document_sentences = [
            document[span[0] : span[1]] for span in sentence_offsets if span[1] - span[0] > filter_len
        ]
    except Exception:
        # Best-effort fallback: if sentence splitting fails, treat the whole
        # document as a single sentence. Only Exception is caught so that
        # KeyboardInterrupt / SystemExit still propagate.
        document_sentences = [document]

    chunks = []
    buffer = ""

    for sentence in document_sentences:
        if len((buffer + sentence).split(" ")) <= max_sentence_length:
            buffer += sentence + "\n"
            continue
        # The budget would be exceeded: flush what has been collected so far
        # (skipping a blank buffer) and start a new chunk with this sentence.
        finished = buffer.strip()
        if finished:
            chunks.append(finished)
        buffer = sentence + "\n"

    # Flush the trailing chunk; empty results are never emitted.
    last = buffer.strip()
    if last:
        chunks.append(last)

    return chunks[:max_sentence_num]

In [13]:
# Chunking parameters passed to compress_and_split_sections below:
# the per-chunk word budget and the cap on the number of chunks returned.
max_sentence_length = 300
max_sentence_num = 10000

# Try both helpers on a single article: the second row of df.
row = df.iloc[1]
section = extract_sections(row["title"], row["text"])
secs = compress_and_split_sections(section, max_sentence_length, max_sentence_num)

# Print every chunk followed by a blank line so the boundaries are visible.
for sec in secs:
    print(sec)
    print()

A & C Black: A & C Black is a British book publishing company, owned since 2002 by Bloomsbury Publishing.
The company is noted for publishing Who's Who since 1849 and the Encyclopedia Britannica between 1827 and 1903.
It offers a wide variety of books in fiction and nonfiction, and has published popular travel guides, novels, and science books.
History: The firm was founded in 1807 by Charles and Adam Black in Edinburgh.
In 1851, the company purchased the copyrights to Sir Walter Scott's Waverly novels for £27,000.
The company moved to the Soho district of London in 1889.
During the years 1827–1903 the firm published the seventh, eighth and ninth editions of the Encyclopædia Britannica.
This was purchased from Archibald Constable after his company's failure to publish the seventh edition of the encyclopedia.
Adam Black retired in 1870 due to his disapproval of his sons' extravagant plans for its ninth edition.
This edition, however, would sell half a million sets and was released in 24