In [None]:
# Two options: randomly pair (brain, face, spine) reports or, if there are equal numbers of each, pair them deterministically.
# Both options are provided: this cell pairs randomly and the next cell pairs deterministically.

import os
import numpy as np
import pandas as pd
import random

# Input folders (presumably one CSV per report - confirm against the partitioning step)
# and the folder that receives the concatenated synthetic reports.
BRAIN_DIR = "MIMIC/partitioned_by_report"
FACE_DIR = "MIMIC/partitioned_by_report_face"
SPINE_DIR = "MIMIC/partitioned_by_report_spine"
OUTPUT_DIR = "MIMIC/synthetic_concat_reports"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Seed the stdlib RNG so the random face/spine pairing is the same on every run.
SEED = 42
random.seed(SEED)

# os.listdir() returns entries in arbitrary order; sorting keeps index i (used in the
# output file names) tied to the same brain report on every run and machine.
brain_files = sorted(f for f in os.listdir(BRAIN_DIR) if f.endswith('.csv'))
face_files = sorted(f for f in os.listdir(FACE_DIR) if f.endswith('.csv'))
spine_files = sorted(f for f in os.listdir(SPINE_DIR) if f.endswith('.csv'))

def reset_sentence_num(dfs):
    """Concatenate report frames in order and renumber their sentences.

    Args:
        dfs: list of DataFrames, in the order they should appear in the output.

    Returns:
        A new DataFrame with a fresh 0-based index whose "Sentence Num" column
        runs from 1 to the total number of rows. Input frames are copied, not modified.
    """
    renumbered = []
    next_num = 1
    for part in dfs:
        part = part.copy()
        part["Sentence Num"] = np.arange(next_num, next_num + len(part))
        next_num += len(part)
        renumbered.append(part)
    return pd.concat(renumbered, ignore_index=True)


for i, brain_file in enumerate(brain_files):
    # Pair this brain report with one randomly drawn face report and one spine report.
    face_file = random.choice(face_files)
    spine_file = random.choice(spine_files)

    df_brain = pd.read_csv(os.path.join(BRAIN_DIR, brain_file))
    df_face = pd.read_csv(os.path.join(FACE_DIR, face_file))
    df_spine = pd.read_csv(os.path.join(SPINE_DIR, spine_file))

    # Bring all three frames to the same column layout (in place).
    for df in (df_brain, df_face, df_spine):
        df.rename(columns={"sentence": "Sentence", "sentence_num": "Sentence Num"}, inplace=True)
        df.drop(columns=["report_index", "Unnamed: 0"], errors="ignore", inplace=True)
        if "Brain Related" not in df.columns:
            df["Brain Related"] = -1
        # Missing labels become -1 so the column can be stored as int.
        df["Brain Related"] = df["Brain Related"].fillna(-1).astype(int)

    brain_only = reset_sentence_num([df_brain])
    brain_only.to_csv(os.path.join(OUTPUT_DIR, f"synthetic_brain_{i:03d}.csv"), index=False)

    brain_face = reset_sentence_num([df_brain, df_face])
    brain_face.to_csv(os.path.join(OUTPUT_DIR, f"synthetic_brain_face_{i:03d}.csv"), index=False)

    brain_spine = reset_sentence_num([df_brain, df_spine])
    brain_spine.to_csv(os.path.join(OUTPUT_DIR, f"synthetic_brain_spine_{i:03d}.csv"), index=False)

    # The three-report variant randomly puts either face or spine directly after brain.
    if random.choice([True, False]):
        ordered_trio = [df_brain, df_face, df_spine]
        filename = f"synthetic_brain_face_spine_{i:03d}.csv"
    else:
        ordered_trio = [df_brain, df_spine, df_face]
        filename = f"synthetic_brain_spine_face_{i:03d}.csv"
    trio_df = reset_sentence_num(ordered_trio)
    trio_df.to_csv(os.path.join(OUTPUT_DIR, filename), index=False)

    print(f"Saved synthetic reports for brain report {i+1}/{len(brain_files)}")

print("All synthetic reports generated in", OUTPUT_DIR)


In [None]:
import os
import numpy as np
import pandas as pd

# Input folders for the three report types and the output folder for concatenated reports.
BRAIN_DIR = "MIMIC/partitioned_by_report"
FACE_DIR = "MIMIC/partitioned_by_report_face"
SPINE_DIR = "MIMIC/partitioned_by_report_spine"
OUTPUT_DIR = "MIMIC/synthetic_concat_reports"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Sorted CSV names per folder; entries at the same position are paired later.
brain_files, face_files, spine_files = (
    sorted(name for name in os.listdir(folder) if name.endswith('.csv'))
    for folder in (BRAIN_DIR, FACE_DIR, SPINE_DIR)
)

# Position-wise pairing only works when every folder holds the same number of reports.
assert len(brain_files) == len(face_files) == len(spine_files), "Counts must match"

def reset_sentence_num(dfs):
    """Stack the given frames in order and number their sentences consecutively.

    Args:
        dfs: list of DataFrames to concatenate, in output order.

    Returns:
        A new DataFrame with a fresh 0-based index whose "Sentence Num" column
        counts 1, 2, ... across all frames. The inputs are copied, not modified.
    """
    numbered_frames = []
    start = 1
    for frame in dfs:
        numbered = frame.copy()
        stop = start + len(numbered)
        numbered["Sentence Num"] = np.arange(start, stop)
        numbered_frames.append(numbered)
        start = stop
    return pd.concat(numbered_frames, ignore_index=True)

for i, (brain_file, face_file, spine_file) in enumerate(zip(brain_files, face_files, spine_files)):
    # Reports at the same position in the three sorted listings are paired together.
    df_brain = pd.read_csv(os.path.join(BRAIN_DIR, brain_file))
    df_face = pd.read_csv(os.path.join(FACE_DIR, face_file))
    df_spine = pd.read_csv(os.path.join(SPINE_DIR, spine_file))

    # Bring all three frames to the same column layout (in place).
    for df in (df_brain, df_face, df_spine):
        df.rename(columns={"sentence": "Sentence", "sentence_num": "Sentence Num"}, inplace=True)
        df.drop(columns=["report_index", "Unnamed: 0"], errors="ignore", inplace=True)
        if "Brain Related" not in df:
            df["Brain Related"] = -1
        df["Brain Related"] = df["Brain Related"].fillna(-1).astype(int)

    # Alternate the three-report order: even i -> brain, face, spine; odd i -> brain, spine, face.
    if i % 2:
        ordered = [df_brain, df_spine, df_face]
        fname = f"synthetic_brain_spine_face_{i:03d}.csv"
    else:
        ordered = [df_brain, df_face, df_spine]
        fname = f"synthetic_brain_face_spine_{i:03d}.csv"

    outputs = [
        ([df_brain], f"synthetic_brain_{i:03d}.csv"),
        ([df_brain, df_face], f"synthetic_brain_face_{i:03d}.csv"),
        ([df_brain, df_spine], f"synthetic_brain_spine_{i:03d}.csv"),
        (ordered, fname),
    ]
    for frames, out_name in outputs:
        reset_sentence_num(frames).to_csv(os.path.join(OUTPUT_DIR, out_name), index=False)

    print(f"Saved synthetic reports for brain report {i+1}/{len(brain_files)}")

print("All synthetic reports generated in", OUTPUT_DIR)


In [None]:
# individual reports
import os
import pandas as pd

# Source folders for the three report types.
BRAIN_DIR, FACE_DIR, SPINE_DIR = (
    "MIMIC/partitioned_by_report",
    "MIMIC/partitioned_by_report_face",
    "MIMIC/partitioned_by_report_spine",
)
# Destination for the standardized per-report copies; created if it does not exist.
OUTPUT_DIR = "MIMIC/real_individual_reports"
os.makedirs(OUTPUT_DIR, exist_ok=True)

def list_csvs(d):
    """Return the names of the entries in directory `d` that end in ".csv", sorted by name."""
    csv_names = [entry for entry in os.listdir(d) if entry.endswith(".csv")]
    csv_names.sort()
    return csv_names

# Sorted CSV names for each report type.
brain_files, face_files, spine_files = (
    list_csvs(folder) for folder in (BRAIN_DIR, FACE_DIR, SPINE_DIR)
)

def load_and_standardize(path):
    """Read one report CSV and normalize its column names.

    Drops a stray "Unnamed: 0" column if present, renames the lowercase
    column names to their title-case equivalents, and adds a "Brain Related"
    column filled with -1 when the file has none.

    Args:
        path: path of the CSV file to read.

    Returns:
        The standardized DataFrame.
    """
    column_aliases = {
        "sentence": "Sentence",
        "sentence_num": "Sentence Num",
        "report_index": "Report Index",
    }
    frame = (
        pd.read_csv(path)
        .drop(columns=["Unnamed: 0"], errors="ignore")
        .rename(columns=column_aliases)
    )

    if "Brain Related" not in frame.columns:
        frame["Brain Related"] = -1

    return frame

def copy_reports(files, source_dir, prefix):
    """Write a standardized copy of each report CSV into OUTPUT_DIR.

    Each output file is named "<prefix>_<original file name>". Files that
    already exist in OUTPUT_DIR are skipped and are not re-read.

    Args:
        files: CSV file names located in `source_dir`.
        source_dir: folder the files are read from.
        prefix: string prepended to each output file name.
    """
    for fname in files:
        out_path = os.path.join(OUTPUT_DIR, f"{prefix}_{fname}")
        # Check the destination first so reruns do not reload CSVs that will be skipped anyway.
        if os.path.exists(out_path):
            print(f"Skipping (already exists): {out_path}")
            continue

        df = load_and_standardize(os.path.join(source_dir, fname))
        df.to_csv(out_path, index=False)
        print(f"Saved {len(df)} rows -> {out_path}")

# Copy every report type into OUTPUT_DIR, prefixing each file name with its type.
for files, source_dir, prefix in (
    (brain_files, BRAIN_DIR, "brain"),
    (face_files, FACE_DIR, "face"),
    (spine_files, SPINE_DIR, "spine"),
):
    copy_reports(files, source_dir, prefix)

print("All individual reports copied to", OUTPUT_DIR)


In [None]:
# scrambling reports together
import os
import numpy as np
import pandas as pd
import random

# Source folders for the three report types and the destination for scrambled reports.
BRAIN_DIR, FACE_DIR, SPINE_DIR = (
    "MIMIC/partitioned_by_report",
    "MIMIC/partitioned_by_report_face",
    "MIMIC/partitioned_by_report_spine",
)
OUTPUT_DIR = "MIMIC/synthetic_scrambled_reports"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# One seed drives both random sources used in this cell:
# the stdlib `random` module and a NumPy Generator.
SEED = 42
random.seed(SEED)
rng = np.random.default_rng(SEED)

def list_csvs(d):
    """List the ".csv" file names found in directory `d`, in sorted order."""
    return sorted(filter(lambda name: name.endswith(".csv"), os.listdir(d)))

def load_and_standardize(path):
    """Load a per-report CSV and standardize its columns.

    Removes an "Unnamed: 0" column if present, maps the lowercase column
    names to "Sentence", "Sentence Num" and "Report Index", and adds a
    "Brain Related" column of -1 when the file does not have one.
    "Sentence Num" is left as read; callers renumber it after scrambling.
    """
    frame = pd.read_csv(path)

    stray_index = [col for col in ("Unnamed: 0",) if col in frame.columns]
    if stray_index:
        frame = frame.drop(columns=stray_index)

    renames = {
        "sentence": "Sentence",
        "sentence_num": "Sentence Num",
        "report_index": "Report Index",
    }
    frame = frame.rename(columns=renames)

    has_label = "Brain Related" in frame.columns
    if not has_label:
        frame["Brain Related"] = -1

    return frame

def scramble_and_renumber(df, rng):
    """Return a row-shuffled copy of `df` with "Sentence Num" rewritten as 1..len(df).

    Args:
        df: DataFrame whose rows are shuffled; it is not modified.
        rng: NumPy random Generator. For a non-empty frame one integer is drawn
            from it and used as the pandas `random_state` of the shuffle.

    Returns:
        A new DataFrame with a fresh 0-based index. An empty input yields an
        empty copy that still carries a "Sentence Num" column.
    """
    if not len(df):
        empty = df.copy()
        empty["Sentence Num"] = []
        return empty
    shuffle_seed = int(rng.integers(0, 2**32 - 1))
    shuffled = df.sample(frac=1.0, random_state=shuffle_seed)
    shuffled = shuffled.reset_index(drop=True).copy()
    shuffled["Sentence Num"] = np.arange(1, len(shuffled) + 1)
    return shuffled

def concat_scramble_renumber(dfs, rng):
    """Concatenate `dfs` into one frame, then shuffle and renumber it via scramble_and_renumber."""
    return scramble_and_renumber(pd.concat(dfs, ignore_index=True), rng)

# Sorted CSV names for each report type.
brain_files, face_files, spine_files = (
    list_csvs(folder) for folder in (BRAIN_DIR, FACE_DIR, SPINE_DIR)
)

for i, brain_fname in enumerate(brain_files):
    # Each brain report is paired with one randomly drawn face report and one spine report.
    face_fname = random.choice(face_files)
    spine_fname = random.choice(spine_files)

    df_brain = load_and_standardize(os.path.join(BRAIN_DIR, brain_fname))
    df_face = load_and_standardize(os.path.join(FACE_DIR, face_fname))
    df_spine = load_and_standardize(os.path.join(SPINE_DIR, spine_fname))

    # Brain-only variant: just shuffle the brain sentences.
    scramble_and_renumber(df_brain, rng).to_csv(
        os.path.join(OUTPUT_DIR, f"scrambled_brain_{i:03d}.csv"), index=False
    )

    # Two-report variants: brain + face, then brain + spine.
    for partner_tag, df_partner in (("face", df_face), ("spine", df_spine)):
        pair_scr = concat_scramble_renumber([df_brain, df_partner], rng)
        pair_scr.to_csv(
            os.path.join(OUTPUT_DIR, f"scrambled_brain_{partner_tag}_{i:03d}.csv"), index=False
        )

    # Three-report variant: the file name records which order was drawn.
    face_first = random.choice([True, False])
    if face_first:
        trio_dfs = [df_brain, df_face, df_spine]
        trio_name = f"scrambled_brain_face_spine_{i:03d}.csv"
    else:
        trio_dfs = [df_brain, df_spine, df_face]
        trio_name = f"scrambled_brain_spine_face_{i:03d}.csv"
    concat_scramble_renumber(trio_dfs, rng).to_csv(os.path.join(OUTPUT_DIR, trio_name), index=False)

    print(f"Saved scrambled variants for brain report {i+1}/{len(brain_files)}")

print("All scrambled reports written to:", OUTPUT_DIR)


In [None]:
import os
import numpy as np
import pandas as pd
import random
import re
from typing import Dict, List, Tuple

# Source folders for the three report types and the destination for bundled reports.
BRAIN_DIR, FACE_DIR, SPINE_DIR = (
    "MIMIC/partitioned_by_report",
    "MIMIC/partitioned_by_report_face",
    "MIMIC/partitioned_by_report_spine",
)
OUTPUT_DIR = "MIMIC/synthetic_bundled_reports"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Header keywords recognized in report text, grouped per section with the
# longest spelling first; the flattened order is the regex alternation order.
SECTION_HEADERS = [
    alias
    for aliases in (
        ("EXAMINATION", "EXAM"),
        ("HISTORY", "HIST"),
        ("INDICATION", "IND"),
        ("TECHNIQUE", "TECH"),
        ("DOSE",),
        ("COMPARISON", "COMP"),
        ("FINDINGS", "FIND"),
        ("IMPRESSIONS", "IMPRESSION", "IMP"),
        ("NOTIFICATION", "NOTIFY"),
        ("RECOMMENDATIONS", "RECOMMENDATION", "REC"),
    )
    for alias in aliases
]

# Only these three sections are written to CSV, in this order. They carry the
# information that matters most for filtering; other sections (history etc.)
# may hold conflicting information that would not make sense to merge.
ALLOWED_SECTIONS = ["EXAMINATION", "FINDINGS", "IMPRESSIONS"]

def extract_sections(text: str) -> Dict[str, str]:
    """Split report text into sections keyed by canonical header name.

    Header keywords from SECTION_HEADERS are matched case-insensitively at a
    word boundary when followed by a colon or whitespace, so a keyword that
    appears inside a sentence also starts a new section. The text between one
    match and the next is that section's content.

    Args:
        text: full report text; empty or missing input yields an empty dict.

    Returns:
        Dict mapping canonical header (e.g. "IMPRESSIONS", "RECOMMENDATION")
        to its content. When the same header occurs more than once, the
        chunks are joined with a space in order of appearance instead of the
        last chunk replacing the earlier ones.
    """
    if not text or pd.isna(text):
        return {}
    text = str(text).strip()

    header_pattern = r'\b(' + '|'.join(SECTION_HEADERS) + r')(?:\s*:|\s*\n|\s+)'
    matches = list(re.finditer(header_pattern, text, re.IGNORECASE))

    # Abbreviations and plural/singular variants mapped to one canonical name each.
    canonical = {
        "EXAM": "EXAMINATION",
        "HIST": "HISTORY",
        "IND": "INDICATION",
        "TECH": "TECHNIQUE",
        "COMP": "COMPARISON",
        "FIND": "FINDINGS",
        "IMPRESSION": "IMPRESSIONS",
        "IMP": "IMPRESSIONS",
        "NOTIFY": "NOTIFICATION",
        "RECOMMENDATIONS": "RECOMMENDATION",
        "REC": "RECOMMENDATION",
    }

    sections: Dict[str, str] = {}
    for i, match in enumerate(matches):
        header = match.group(1).upper()
        header = canonical.get(header, header)
        end_pos = matches[i + 1].start() if i + 1 < len(matches) else len(text)
        content = text[match.end():end_pos].strip()
        if not content:
            continue
        if header in sections:
            # Keep earlier text for a repeated header rather than overwriting it.
            sections[header] = sections[header] + " " + content
        else:
            sections[header] = content
    return sections

def get_report_text(df: pd.DataFrame) -> str:
    """Join the non-missing values of the "Sentence" column with single spaces.

    Returns an empty string when the frame has no "Sentence" column.
    """
    if 'Sentence' in df.columns:
        return ' '.join(df['Sentence'].dropna().tolist())
    return ""

def text_to_sentences(text: str) -> List[str]:
    """Split text into sentences at whitespace that follows '.', '!' or '?'.

    This suffices for the reports we labeled, but more complex handling may be
    needed for larger scale experiments. Empty input returns an empty list.
    """
    if not text:
        return []
    pieces = (chunk.strip() for chunk in re.split(r'(?<=[.!?])\s+', text.strip()))
    return [piece for piece in pieces if piece]

def merge_sections_labeled(sections_list: List[Dict[str, str]], labels: List[int]) -> Dict[str, List[Tuple[str, int]]]:
    """Group section contents from several reports under each section name.

    Args:
        sections_list: one section dict per report (header -> content).
        labels: one label per report, paired position-wise with `sections_list`.

    Returns:
        Dict in canonical section order; each value lists (content, label)
        tuples in report order. Sections that are absent or whitespace-only in
        every report are omitted.
    """
    canonical_order = (
        "EXAMINATION", "HISTORY", "INDICATION", "TECHNIQUE",
        "DOSE", "COMPARISON", "FINDINGS", "IMPRESSIONS",
        "NOTIFICATION", "RECOMMENDATION",
    )
    merged: Dict[str, List[Tuple[str, int]]] = {}
    for name in canonical_order:
        labeled_parts = [
            (report_sections[name], label)
            for report_sections, label in zip(sections_list, labels)
            if name in report_sections and report_sections[name].strip()
        ]
        if labeled_parts:
            merged[name] = labeled_parts
    return merged

def flatten_exam_to_single_sentence(exam_parts: List[Tuple[str, int]]) -> Tuple[str, int]:
    """Collapse all examination contents into one punctuation-free sentence.

    The contents are joined with spaces, whitespace runs are collapsed, and
    '.', '!', '?', ';' and ':' characters are removed. The labels in
    `exam_parts` are ignored; the returned label is always 1.

    Returns:
        (sentence, 1); the sentence is "" when there is no non-blank content.
    """
    pieces = [text for text, _ in exam_parts if text and text.strip()]
    if pieces:
        joined = ' '.join(pieces).replace('\n', ' ')
        joined = re.sub(r'\s+', ' ', joined)
        return (re.sub(r'[.!?;:]+', '', joined).strip(), 1)
    return ("", 1)

def create_dataframe_from_merged(merged_labeled: Dict[str, List[Tuple[str, int]]]) -> pd.DataFrame:
    """Build the output frame from merged, labeled sections.

    The EXAMINATION section becomes a single first sentence whose label is
    forced to 1, because we want the header included. FINDINGS and then
    IMPRESSIONS follow, split into sentences that keep their source label.

    Returns:
        DataFrame with columns "Sentence", "Sentence Num" (1-based) and
        "Brain Related".
    """
    rows: List[Tuple[str, int]] = []

    if "EXAMINATION" in merged_labeled:
        exam_sentence, _ = flatten_exam_to_single_sentence(merged_labeled["EXAMINATION"])
        if exam_sentence:
            rows.append((exam_sentence, 1))  # force label

    for section in ("FINDINGS", "IMPRESSIONS"):
        for content, lab in merged_labeled.get(section, []):
            rows.extend((sentence, lab) for sentence in text_to_sentences(content))

    sentences = [sentence for sentence, _ in rows]
    labels = [lab for _, lab in rows]
    return pd.DataFrame({
        "Sentence": sentences,
        "Sentence Num": range(1, len(sentences) + 1),
        "Brain Related": labels
    })

# Seed the stdlib RNG so the random face/spine pairing below is reproducible across runs.
SEED = 42
random.seed(SEED)

# os.listdir() returns entries in arbitrary order; sorting keeps index i (used in the
# output file names) tied to the same brain report on every run and machine.
brain_files = sorted(f for f in os.listdir(BRAIN_DIR) if f.endswith('.csv'))
face_files = sorted(f for f in os.listdir(FACE_DIR) if f.endswith('.csv'))
spine_files = sorted(f for f in os.listdir(SPINE_DIR) if f.endswith('.csv'))

for i, brain_file in enumerate(brain_files):
    # can also remove randomness in the same way - for lack of redundancy and generality, we just provide this script
    face_file = random.choice(face_files)
    spine_file = random.choice(spine_files)

    df_brain = pd.read_csv(os.path.join(BRAIN_DIR, brain_file))
    df_face = pd.read_csv(os.path.join(FACE_DIR, face_file))
    df_spine = pd.read_csv(os.path.join(SPINE_DIR, spine_file))

    # Bring all three frames to the same column layout (in place).
    for df in (df_brain, df_face, df_spine):
        df.rename(columns={"sentence": "Sentence", "sentence_num": "Sentence Num"}, inplace=True)
        df.drop(columns=["report_index"], errors="ignore", inplace=True)
        if "Brain Related" not in df.columns:
            df["Brain Related"] = -1
        df["Brain Related"] = df["Brain Related"].fillna(-1).astype(int)
        df.drop(columns=["Unnamed: 0"], errors="ignore", inplace=True)

    # Rebuild each report's text from its sentences and split it into sections.
    brain_sections, face_sections, spine_sections = (
        extract_sections(get_report_text(frame)) for frame in (df_brain, df_face, df_spine)
    )

    # (file stem, section dicts in merge order, label per report): brain content is 1, the rest 0.
    variants = [
        ("synthetic_brain", [brain_sections], [1]),
        ("synthetic_brain_face", [brain_sections, face_sections], [1, 0]),
        ("synthetic_brain_spine", [brain_sections, spine_sections], [1, 0]),
    ]
    # The three-report variant randomly puts either face or spine directly after brain.
    if random.choice([True, False]):
        variants.append(("synthetic_brain_face_spine", [brain_sections, face_sections, spine_sections], [1, 0, 0]))
    else:
        variants.append(("synthetic_brain_spine_face", [brain_sections, spine_sections, face_sections], [1, 0, 0]))

    for stem, section_dicts, source_labels in variants:
        merged = merge_sections_labeled(section_dicts, source_labels)
        create_dataframe_from_merged(merged).to_csv(
            os.path.join(OUTPUT_DIR, f"{stem}_{i:03d}.csv"), index=False
        )

    print("Processed bundled reports")

print("All bundled reports generated in", OUTPUT_DIR)


In [None]:
# Generation script that keeps the output reports fully independent: each source report is used in at most one output file.
import os
import numpy as np
import pandas as pd
import random
import re
from typing import Dict, List, Tuple

# Source folders for the three report types and the destination for bundled reports.
BRAIN_DIR, FACE_DIR, SPINE_DIR = (
    "MIMIC/partitioned_by_report",
    "MIMIC/partitioned_by_report_face",
    "MIMIC/partitioned_by_report_spine",
)
OUTPUT_DIR = "MIMIC/synthetic_bundled_reports_deterministic"
# Appended to every output file name, right before ".csv".
SUFFIX = "_deterministic"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Header keywords recognized in report text, grouped per section with the
# longest spelling first; the flattened order is the regex alternation order.
SECTION_HEADERS = [
    alias
    for aliases in (
        ("EXAMINATION", "EXAM"),
        ("HISTORY", "HIST"),
        ("INDICATION", "IND"),
        ("TECHNIQUE", "TECH"),
        ("DOSE",),
        ("COMPARISON", "COMP"),
        ("FINDINGS", "FIND"),
        ("IMPRESSIONS", "IMPRESSION", "IMP"),
        ("NOTIFICATION", "NOTIFY"),
        ("RECOMMENDATIONS", "RECOMMENDATION", "REC"),
    )
    for alias in aliases
]

# The three sections that end up in the output CSVs, in output order.
ALLOWED_SECTIONS = ["EXAMINATION", "FINDINGS", "IMPRESSIONS"]

def extract_sections(text: str) -> Dict[str, str]:
    """Extract sections from a medical report text.

    Header keywords from SECTION_HEADERS are matched case-insensitively at a
    word boundary when followed by a colon or whitespace, so a keyword that
    appears inside a sentence also starts a new section. The text between one
    match and the next is that section's content.

    Args:
        text: full report text; empty or missing input yields an empty dict.

    Returns:
        Dict mapping canonical header (e.g. "IMPRESSIONS", "RECOMMENDATION")
        to its content. When the same header occurs more than once, the
        chunks are joined with a space in order of appearance instead of the
        last chunk replacing the earlier ones.
    """
    if not text or pd.isna(text):
        return {}
    text = str(text).strip()

    header_pattern = r'\b(' + '|'.join(SECTION_HEADERS) + r')(?:\s*:|\s*\n|\s+)'
    matches = list(re.finditer(header_pattern, text, re.IGNORECASE))

    # Abbreviations and plural/singular variants mapped to one canonical name each.
    canonical = {
        "EXAM": "EXAMINATION",
        "HIST": "HISTORY",
        "IND": "INDICATION",
        "TECH": "TECHNIQUE",
        "COMP": "COMPARISON",
        "FIND": "FINDINGS",
        "IMPRESSION": "IMPRESSIONS",
        "IMP": "IMPRESSIONS",
        "NOTIFY": "NOTIFICATION",
        "RECOMMENDATIONS": "RECOMMENDATION",
        "REC": "RECOMMENDATION",
    }

    sections: Dict[str, str] = {}
    for i, match in enumerate(matches):
        header = match.group(1).upper()
        header = canonical.get(header, header)
        end_pos = matches[i + 1].start() if i + 1 < len(matches) else len(text)
        content = text[match.end():end_pos].strip()
        if not content:
            continue
        if header in sections:
            # Keep earlier text for a repeated header rather than overwriting it.
            sections[header] = sections[header] + " " + content
        else:
            sections[header] = content
    return sections

def get_report_text(df: pd.DataFrame) -> str:
    """Return the report text: all non-missing "Sentence" values joined by spaces.

    A frame without a "Sentence" column yields an empty string.
    """
    has_sentences = 'Sentence' in df.columns
    if not has_sentences:
        return ""
    kept = df['Sentence'].dropna()
    return ' '.join(kept.tolist())

def text_to_sentences(text: str) -> List[str]:
    """Split text into sentences at whitespace that follows '.', '!' or '?'.

    Blank pieces are dropped; empty input returns an empty list.
    """
    result: List[str] = []
    if text:
        for chunk in re.split(r'(?<=[.!?])\s+', text.strip()):
            cleaned = chunk.strip()
            if cleaned:
                result.append(cleaned)
    return result

def merge_sections_labeled(sections_list: List[Dict[str, str]], labels: List[int]) -> Dict[str, List[Tuple[str, int]]]:
    """Collect, per section, the non-blank contents of every report with its label.

    Args:
        sections_list: one section dict per report (header -> content).
        labels: one label per report, paired position-wise with `sections_list`.

    Returns:
        Dict in canonical section order mapping each section present in at
        least one report to a list of (content, label) tuples in report order.
    """
    canonical_order = [
        "EXAMINATION", "HISTORY", "INDICATION", "TECHNIQUE",
        "DOSE", "COMPARISON", "FINDINGS", "IMPRESSIONS",
        "NOTIFICATION", "RECOMMENDATION"
    ]
    labeled_reports = list(zip(sections_list, labels))
    merged: Dict[str, List[Tuple[str, int]]] = {}
    for name in canonical_order:
        for report_sections, label in labeled_reports:
            content = report_sections.get(name)
            if name in report_sections and content.strip():
                merged.setdefault(name, []).append((content, label))
    return merged

def flatten_exam_to_single_sentence(exam_parts: List[Tuple[str, int]]) -> Tuple[str, int]:
    """Merge all examination contents into one sentence without sentence punctuation.

    Non-blank contents are joined with spaces, whitespace runs are collapsed,
    and '.', '!', '?', ';' and ':' are removed. The input labels are ignored;
    the returned label is always 1.

    Returns:
        (sentence, 1), with sentence == "" when there is no non-blank content.
    """
    usable = []
    for text, _ in exam_parts:
        if text and text.strip():
            usable.append(text)
    if len(usable) == 0:
        return ("", 1)
    one_line = ' '.join(usable).replace('\n', ' ')
    single_spaced = re.sub(r'\s+', ' ', one_line)
    without_punct = re.sub(r'[.!?;:]+', '', single_spaced)
    return (without_punct.strip(), 1)

def create_dataframe_from_merged(merged_labeled: Dict[str, List[Tuple[str, int]]]) -> pd.DataFrame:
    """Turn merged, labeled sections into the output sentence table.

    The EXAMINATION section (if any) becomes one leading sentence labeled 1.
    FINDINGS and then IMPRESSIONS follow, split into sentences that keep the
    label of the report they came from.

    Returns:
        DataFrame with columns "Sentence", "Sentence Num" (1-based) and
        "Brain Related".
    """
    rows: List[Tuple[str, int]] = []

    if "EXAMINATION" in merged_labeled:
        exam_sentence = flatten_exam_to_single_sentence(merged_labeled["EXAMINATION"])[0]
        if exam_sentence:
            rows.append((exam_sentence, 1))

    for section in ("FINDINGS", "IMPRESSIONS"):
        if section not in merged_labeled:
            continue
        for content, lab in merged_labeled[section]:
            rows += [(sentence, lab) for sentence in text_to_sentences(content)]

    return pd.DataFrame({
        "Sentence": [sentence for sentence, _ in rows],
        "Sentence Num": range(1, len(rows) + 1),
        "Brain Related": [lab for _, lab in rows]
    })

# os.listdir() returns entries in arbitrary order. Sort before slicing and shuffling so
# that the same files are selected and the seeded shuffles give the same pairing on
# every run and machine.
brain_files = sorted(f for f in os.listdir(BRAIN_DIR) if f.endswith('.csv'))
face_files  = sorted(f for f in os.listdir(FACE_DIR)  if f.endswith('.csv'))
spine_files = sorted(f for f in os.listdir(SPINE_DIR) if f.endswith('.csv'))

print(len(brain_files))
print(len(face_files))
print(len(spine_files))
# Use at most `limit` reports from each folder; after truncation the counts must agree.
limit = 100
brain_files = brain_files[:limit]
face_files  = face_files[:limit]
spine_files = spine_files[:limit]
assert len(brain_files) == len(face_files) == len(spine_files), "Brain/Face/Spine counts must match"

# One-time shuffles, each with its own fixed seed, decide which reports get paired.
rng_b, rng_f, rng_s = random.Random(1), random.Random(2), random.Random(3)
rng_b.shuffle(brain_files)
rng_f.shuffle(face_files)
rng_s.shuffle(spine_files)

for i, (brain_file, face_file, spine_file) in enumerate(zip(brain_files, face_files, spine_files)):
    df_brain = pd.read_csv(os.path.join(BRAIN_DIR, brain_file))
    df_face = pd.read_csv(os.path.join(FACE_DIR, face_file))
    df_spine = pd.read_csv(os.path.join(SPINE_DIR, spine_file))

    # Bring all three frames to the same column layout (in place).
    for df in (df_brain, df_face, df_spine):
        df.rename(columns={"sentence": "Sentence", "sentence_num": "Sentence Num"}, inplace=True)
        df.drop(columns=["report_index"], errors="ignore", inplace=True)
        if "Brain Related" not in df.columns:
            df["Brain Related"] = -1
        df["Brain Related"] = df["Brain Related"].fillna(-1).astype(int)
        df.drop(columns=["Unnamed: 0"], errors="ignore", inplace=True)

    # Rebuild each report's text from its sentences and split it into sections.
    brain_sections = extract_sections(get_report_text(df_brain))
    face_sections = extract_sections(get_report_text(df_face))
    spine_sections = extract_sections(get_report_text(df_spine))

    # To preserve independence, each index is merged in exactly one way.
    # The variant cycles with i: brain, brain+face, brain+spine, brain+face+spine.
    variant = i % 4
    if variant == 0:
        tag, section_dicts, source_labels = "brain", [brain_sections], [1]
    elif variant == 1:
        tag, section_dicts, source_labels = "brain_face", [brain_sections, face_sections], [1, 0]
    elif variant == 2:
        tag, section_dicts, source_labels = "brain_spine", [brain_sections, spine_sections], [1, 0]
    else:
        tag, section_dicts, source_labels = (
            "brain_face_spine", [brain_sections, face_sections, spine_sections], [1, 0, 0]
        )

    merged = merge_sections_labeled(section_dicts, source_labels)
    df_out = create_dataframe_from_merged(merged)
    # SUFFIX comes from the configuration at the top of this cell; it is no longer
    # reassigned here, so changing it there takes effect.
    out_name = f"synthetic_{tag}_{i:03d}{SUFFIX}.csv"

    df_out.to_csv(os.path.join(OUTPUT_DIR, out_name), index=False)
    print(f"[variant {variant}] wrote {out_name} for brain report {i+1}/{len(brain_files)}")


print("All authentic bundled reports generated in", OUTPUT_DIR)
