In [6]:
import nltk
# Fetch the Punkt sentence-tokenizer data into the local nltk_data directory.
nltk.download("punkt")          # classic Punkt models
nltk.download("punkt_tab")      # data unpacked under tokenizers/punkt_tab; this last call's return value is what the cell displays

[nltk_data] Downloading package punkt to /home/yunhengzou/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/yunhengzou/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [19]:
# single_doc_middle_sentence.py
# Python 3.11 – self‑contained helper for “middle **block** of sentences”

from __future__ import annotations
from typing import Dict, List, Optional
import random

import nltk
# sent_tokenize needs Punkt data on disk.  Older NLTK releases load the pickled
# "punkt" models while recent ones load "punkt_tab" instead, so make sure both
# are present and quietly fetch whichever one is missing.
for _resource in ("punkt", "punkt_tab"):
    try:
        nltk.data.find(f"tokenizers/{_resource}")
    except LookupError:
        nltk.download(_resource, quiet=True)


# ------------------------------------------------------------------------ #
def split_sentences(text: str) -> List[str]:
    """Break ``text`` into a list of sentences using NLTK's English Punkt tokenizer."""
    sentences = nltk.tokenize.sent_tokenize(text, language="english")
    return sentences


def middle_sample(
    text: str,
    mid_idx: Optional[int] = None,   # start index of the middle block
    mid_len: int = 1,                # how many “middle” sentences to predict
    n_prev: int = 3,
    n_next: int = 3,
) -> Dict[str, str | int]:
    """
    Return a training example with **multiple middle sentences**.

    Returns
    -------
    dict with:
        prev   : str   – `n_prev` sentences before the middle block
        target : str   – `mid_len` sentences to be predicted
        after  : str   – `n_next` sentences after the block
        index  : int   – sentence index of the *first* prev sentence
                         (i.e. start of the whole chunk)

    Parameters
    ----------
    mid_idx : int | None
        Starting sentence index of the block to predict.
        • If None → choose the centred valid position.
    mid_len : int
        Number of sentences in the middle block (≥1).
    n_prev, n_next : int
        Context lengths before and after the middle block.
    """
    if mid_len < 1:
        raise ValueError("mid_len must be ≥ 1")

    sents = split_sentences(text)
    total = len(sents)

    need = n_prev + mid_len + n_next
    if total < need:
        raise ValueError(f"doc too short: need ≥{need} sentences, got {total}")

    # establish valid range for the *start* index of the middle block
    start_min = n_prev
    start_max = total - mid_len - n_next

    if mid_idx is None:  # centred start within allowed band
        mid_idx = start_min + (start_max - start_min) // 2
    else:
        if not (start_min <= mid_idx <= start_max):
            raise IndexError(
                f"mid_idx {mid_idx} invalid; valid range [{start_min}, {start_max}]"
            )

    # build strings
    prev_ctx = " ".join(sents[mid_idx - n_prev : mid_idx])
    target   = " ".join(sents[mid_idx : mid_idx + mid_len])
    after_ctx = " ".join(sents[mid_idx + mid_len : mid_idx + mid_len + n_next])
    index_prev_start = mid_idx - n_prev   # as requested

    return {
        "prev": prev_ctx,
        "target": target,
        "after": after_ctx,
        "index": index_prev_start,
    }


# ---------------- quick demo ---------------------------------------------
if __name__ == "__main__":
    # Seven-sentence toy document used by the demo below.
    DOC = (
        "Transformer models have revolutionized natural language processing. "
        "However, their quadratic time complexity on long sequences motivates efficient variants. "
        "Sparse attention, kernel methods, and recurrent gating are popular directions. "
        "Recently, the Mamba architecture introduced state‑space gating for sequence modeling. "
        "It achieves linear scaling while outperforming previous efficient transformers. "
        "We explore its applicability to protein‑fold prediction tasks. "
        "Initial results are promising and pave the way for future research."
    )

    # Explicit start index (not auto-centred): with two sentences of context on
    # each side, sentences 0-1 are "prev", 2-3 the target, 4-5 "after".
    sample = middle_sample(DOC, mid_idx=2, mid_len=2, n_prev=2, n_next=2)

    # Dump every field under an upper-cased heading.
    for field, value in sample.items():
        print(f"{field.upper()}:\n{value}\n")


PREV:
Transformer models have revolutionized natural language processing. However, their quadratic time complexity on long sequences motivates efficient variants.

TARGET:
Sparse attention, kernel methods, and recurrent gating are popular directions. Recently, the Mamba architecture introduced state‑space gating for sequence modeling.

AFTER:
It achieves linear scaling while outperforming previous efficient transformers. We explore its applicability to protein‑fold prediction tasks.

INDEX:
0



In [13]:
# Build the example once and reuse it for all three prints, rather than
# re-tokenizing DOC with the same arguments for every field.
example = middle_sample(DOC, mid_idx=3, n_prev=2, n_next=1)
print("Prev Sentences: \n", example["prev"])
print("After Sentences: \n", example["after"])
print("Target Sentence: \n", example["target"])

Prev Sentences: 
 However, their quadratic time complexity on long sequences motivates efficient variants. Sparse attention, kernel methods, and recurrent gating are popular directions.
After Sentences: 
 It achieves linear scaling while outperforming previous efficient transformers.
Target Sentence: 
 Recently, the Mamba architecture introduced state‑space gating for sequence modeling.
