In [None]:
import PyPDF2
import re  # For regular expressions (if needed)

from datasets import load_dataset
from sklearn.model_selection import train_test_split

In [None]:

def _extract_pdf_text(pdf_path):
    """Return the text of every page of the PDF at ``pdf_path`` as one string."""
    with open(pdf_path, "rb") as file:
        reader = PyPDF2.PdfReader(file)
        # `or ""` is a defensive guard for pages that yield no text.
        # Pages are joined with a newline so that a heading opening a page
        # still begins its own line and can be found by ^...$ anchored patterns.
        return "\n".join(page.extract_text() or "" for page in reader.pages)


def _split_by_headings(text, headings):
    """Return the stripped body text following each literal heading line."""
    chunks = []
    for position, heading in enumerate(headings):
        # Headings are literal text, so escape regex metacharacters.
        start = rf"^{re.escape(heading)}$"
        if position + 1 < len(headings):
            following = re.escape(headings[position + 1])
            # \Z (end of text) rather than $: under re.MULTILINE, $ matches at
            # the end of every line, which would end the lazy body immediately.
            stop = rf"(?=^{following}$|\Z)"
        else:
            stop = r"\Z"
        section = re.compile(rf"{start}([\s\S]*?){stop}", re.MULTILINE)
        chunks.extend(match.group(1).strip() for match in section.finditer(text))
    return chunks


def _split_by_regex(text, section_start_pattern):
    """Return chunks that each run from one pattern match to the next match."""
    starts = [m.start() for m in re.finditer(section_start_pattern, text, re.MULTILINE)]
    # The final section has no following match, so it ends at the end of text.
    ends = starts[1:] + [len(text)]
    return [text[begin:end].strip() for begin, end in zip(starts, ends)]


def chunk_pdf_by_sections(pdf_path, section_markers=None):
    """Split the text of a PDF into section chunks.

    Args:
        pdf_path: Path of the PDF file to read.
        section_markers: Controls how sections are detected.
            * list/tuple of str: literal headings, each expected alone on a
              line. For every occurrence of a heading, the chunk is the text
              after that line up to the next occurrence of the following
              heading in the list (or the end of the text). The heading line
              itself is not part of the chunk.
            * str: a regular expression (compiled with ``re.MULTILINE``)
              matching the start of a section. Each chunk runs from one match
              to the next match (or the end of the text) and includes the
              matched heading.
            * None / empty: manual annotation, which is not implemented; an
              empty list is returned.

    Returns:
        A list of whitespace-stripped chunk strings (possibly empty).
    """
    text = _extract_pdf_text(pdf_path)

    if not section_markers:
        # Manual annotation (reading section boundaries from a separate file
        # or from markers in the text) is not implemented yet.
        return []
    if isinstance(section_markers, (list, tuple)):
        return _split_by_headings(text, section_markers)
    if isinstance(section_markers, str):
        return _split_by_regex(text, section_markers)
    # Unsupported marker type: no sections detected.
    return []

# Example usage (automatic - list of headings):
section_headings = ["Introduction", "Symptoms", "Treatment", "Prevention"]  # Your actual section headings
heading_chunks = chunk_pdf_by_sections("mental_health.pdf", section_headings)

# Example usage (automatic - regex):
section_regex = r"^##\s*(.+)$"  # Regex to find section starts (e.g. Markdown headings)
regex_chunks = chunk_pdf_by_sections("mental_health.pdf", section_regex)

# Example usage (manual annotation):
# chunks = chunk_pdf_by_sections("mental_health.pdf") # You'd need to implement the manual parsing

# Choose which result becomes the dataset. The two examples are kept under
# separate names so neither is silently overwritten; the regex result is used.
chunks = regex_chunks

# Write one chunk per line. Chunks normally span several lines, so internal
# whitespace (including newlines) is collapsed to single spaces; otherwise a
# line-based reader would split one chunk into many records.
with open("my_dataset.txt", "w", encoding="utf-8") as outfile:
    for chunk in chunks:
        single_line = " ".join(chunk.split())
        if single_line:  # skip chunks that are empty after normalisation
            outfile.write(single_line + "\n")

In [None]:
# Load the full dataset: the line-per-chunk file written earlier in this notebook.
# The "text" loader produces a single column named "text".
dataset = load_dataset("text", data_files={"full": "my_dataset.txt"})

# Split into train and (val+test) using the Dataset's own splitter, which
# returns a DatasetDict with "train" and "test" keys.
train_rest_split = dataset["full"].train_test_split(test_size=0.3, seed=42)  # Adjust test_size as needed
train_dataset = train_rest_split["train"]
eval_test_dataset = train_rest_split["test"]

# Split (val+test) evenly into validation and test
eval_test_split = eval_test_dataset.train_test_split(test_size=0.5, seed=42)
eval_dataset = eval_test_split["train"]
test_dataset = eval_test_split["test"]

# Tokenize datasets, dropping the raw "text" column afterwards.
# NOTE(review): tokenize_function is not defined in the cells shown here; it
# must be defined before this cell runs.
tokenized_train_dataset = train_dataset.map(tokenize_function, batched=True, remove_columns=["text"])
tokenized_eval_dataset = eval_dataset.map(tokenize_function, batched=True, remove_columns=["text"])
tokenized_test_dataset = test_dataset.map(tokenize_function, batched=True, remove_columns=["text"])