In [1]:
from __future__ import annotations

import ctypes
import gc
import glob
import os
import re
import sys
from pathlib import Path

import blingfire as bf
import hydra
import numpy as np
import pandas as pd
from faiss import read_index, write_index
from hydra.core.hydra_config import HydraConfig
from omegaconf import DictConfig, OmegaConf
from sentence_transformers import SentenceTransformer
from tqdm.auto import tqdm

# Handle to the C standard library ("libc.so.6" is the glibc soname, so this is Linux-only).
# NOTE(review): not used in the cells visible here — presumably needed further down; confirm.
libc = ctypes.CDLL("libc.so.6")
# Put the parent directory on the import path so the `import utils` that follows can resolve
# (presumably a project-local module living one level up — confirm).
sys.path.append(os.pardir)

import utils



In [2]:
# Load one parquet file of the Wikipedia dump (presumably the shard of titles starting with "B" — confirm).
# Later cells read the "title" and "text" columns of this frame.
df = pd.read_parquet("../input/wikipedia-20230701/b.parquet")

In [21]:
def extract_sections(title: str, text: str) -> list[tuple[str, str]]:
    """Split a wiki-style article into ``(heading, body)`` pairs.

    Headings are lines of the form ``== Heading ==`` (two or more ``=`` on each
    side). The text before the first heading -- or the whole text when the
    article contains no heading at all -- is returned as a lead section whose
    heading is the article ``title``.

    Args:
        title: Article title, used as the heading of the lead section.
        text: Raw article text containing ``== ... ==`` heading markers.

    Returns:
        Sections in document order. Sections headed "See also", "References",
        "Further reading" or "External links" are dropped, as is any section
        whose body has fewer than three space-separated tokens.
    """
    heading_pattern = re.compile(r"={2,}\s?(.*?)\s?={2,}")
    skipped_headings = ("See also", "References", "Further reading", "External links")

    matches = list(heading_pattern.finditer(text))

    # Lead section: everything up to the first heading. When there are no
    # headings the whole text is the lead (previously such articles were lost
    # because the lead was only emitted from inside the heading loop).
    lead_end = matches[0].start() if matches else len(text)
    sections = [(title, text[:lead_end].strip())]

    for i, match in enumerate(matches):
        # A section body runs from the end of its heading to the start of the
        # next heading, or to the end of the text for the last heading.
        body_start = match.end()
        body_end = matches[i + 1].start() if i + 1 < len(matches) else len(text)
        heading = match.group(1).strip()
        if heading not in skipped_headings:
            sections.append((heading, text[body_start:body_end].strip()))

    # Skip sections whose text is (nearly) empty.
    return [section for section in sections if len(section[1].split(" ")) >= 3]


def compress_and_split_sections(
    sections: list[tuple[str, str]], max_sentence_length: int, max_sentence_num: int, filter_len: int = 3
) -> list[str]:
    """Pack the sentences of each section into newline-joined text chunks.

    Every section is rendered as ``"<heading>: <body>"`` (``"No Title"`` stands
    in for an empty heading), split into sentences with BlingFire, and the
    sentences are greedily accumulated into chunks. A chunk never spans two
    sections.

    Args:
        sections: ``(heading, body)`` pairs.
        max_sentence_length: Size limit of a chunk, measured as the number of
            pieces obtained by splitting on a single space. A single sentence
            that is longer than the limit still becomes a chunk of its own.
        max_sentence_num: Maximum number of chunks returned (despite the name,
            this caps chunks, not sentences).
        filter_len: Sentences whose offset span is ``filter_len`` or shorter
            are discarded.

    Returns:
        Non-empty chunks in document order, truncated to ``max_sentence_num``.
    """
    chunks = []
    for title, content in sections:
        document = f"{title or 'No Title'}: {content}" + "\n"
        try:
            _, sentence_offsets = bf.text_to_sentences_and_offsets(document)
            section_sentences = [
                document[o[0] : o[1]] for o in sentence_offsets if o[1] - o[0] > filter_len
            ]
        except Exception:
            # Best-effort fallback: if sentence splitting fails, treat the whole
            # section as a single sentence. `Exception` (not a bare `except`)
            # so that KeyboardInterrupt / SystemExit still propagate.
            section_sentences = [document]

        buffer = ""
        for text in section_sentences:
            # NOTE(review): sentences are joined with "\n" but counted by
            # splitting on " ", so the count is slightly low for multi-sentence
            # buffers. Kept as-is to preserve existing chunk boundaries.
            if len((buffer + text).split(" ")) <= max_sentence_length:
                buffer += text + "\n"
            else:
                # Flush the current chunk (if any) and start a new one.
                if buffer:
                    chunks.append(buffer.strip())
                buffer = text + "\n"

        if buffer:
            chunks.append(buffer.strip())

    # Filter out empty chunks (e.g. a buffer that was whitespace only).
    non_empty_chunks = [chunk for chunk in chunks if len(chunk) > 0]
    return non_empty_chunks[:max_sentence_num]

In [35]:
# Chunking limits passed to compress_and_split_sections below.
max_sentence_length = 250
max_sentence_num = 300

# Spot-check the pipeline on a single article (positional row 17 of df).
row = df.iloc[17]
section = extract_sections(row["title"], row["text"])
secs = compress_and_split_sections(
    section,
    max_sentence_length=max_sentence_length,
    max_sentence_num=max_sentence_num,
)

# Print every chunk followed by one blank line.
for sec in secs:
    print(sec, end="\n\n")

B 32 Muthal 44 Vare: B 32 Muthal 44 Vare is 2023 Indian Malayalam-language film written and directed by Shruthi Sharanyam featuring Remya Nambeesan and Anarkali Marikar.

Cast: * Remya Nambeesan * Anarkali Marikar * Zarin Shihab * Aswathy Babu * Raina Radhakrishnan * Krisha Kurup * Harish Uthaman * Sajitha Madathil * Sajin Cherukayil

Production: ‘Aanandam’ song from the film was released on 29 March 2023.
Later teaser and the trailer was released on 5 April 2023.
Film got "U/A" censored and scheduled to release on 6 April 2023.
Film selected for funding by the Kerala State Film Development Corporation (KSFDC) as part of a project launched in 2019-20 to promote women filmmakers.

Reception: Anjana George critic of Times of india gave 4 stars out of 5 and stated that "The film is a pathbreaking celebration for women and a thought provoker for society.".
S.R.Praveen, critic of The Hindu, stated that "A sensitive, nuanced take on body politics".
Cris critic of The News Minute gave 3.5 sta

In [28]:
# Display the (heading, body) pairs produced by extract_sections in the demo cell.
section

[('B (Los Angeles Railway)',
  'The B was a streetcar line in Los Angeles, California. It was operated by the Los Angeles Railway from 1920 to 1948, originally running from Ramona Boulevard and Miller Street in East Los Angeles to Ascot Avenue and 51st Street.'),
 ('Brooklyn and Ascot Lines (1895–1911)',
  'The first Brooklyn line was built in 1895 by the Los Angeles Consolidated Railway as a horsecar road. It terminated at the intersection of Brooklyn Avenue and Evergreen Avenue. The line was electrified the following year and rerouted downtown. In 1902 the route was bought by the Pacific Electric Railway to be standard gauged, but never was. In 1908, the Brooklyn Avenue Line ran from Arcade Depot to Evergreen Cemetery via 5th Street, Main Street, Macy, Pleasant Avenue, Bridge, and Brooklyn Avenue. Following the Great Merger of 1911, control of the route returned to the Los Angeles Railway. They extended the route north and east along Evergreen and Wabash Avenues to the city limits in