In [None]:
!pip -q install reportlab

Imports, paths, helpers

In [None]:
import os, json, math, textwrap, random
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Input CSVs, looked up relative to the notebook's working directory
# (the Colab sidebar root by default); edit these lists if the files live elsewhere.
ROND_FILES = [
    "./ROND_train.csv",
    "./ROND_val.csv",
    "./ROND_test.csv",
]
BLUESCRUBS_FILES = [
    "./bluescrubs_train_clean.csv",
    "./bluescrubs_val_clean.csv",
    "./bluescrubs_test_clean.csv",
]

# Every figure is written into this folder; it is created if it does not exist yet.
PLOT_DIR = "./eda_figs"
os.makedirs(name=PLOT_DIR, exist_ok=True)

# Every logged line is kept here (as text) so the PDF cell can replay it later.
EDA_LOG = []


def log(line=""):
    """Print ``line`` and append its ``str()`` form to ``EDA_LOG``.

    ``line`` may be any object (the notebook also passes pandas objects);
    calling ``log()`` with no argument records an empty line.
    """
    print(line)
    as_text = str(line)
    EDA_LOG.append(as_text)


Load & sanity-check ROND

In [None]:
# Read each split and stack them into one frame with a fresh 0..n-1 index
rond_parts = [pd.read_csv(csv_path) for csv_path in ROND_FILES]
rond = pd.concat(rond_parts, ignore_index=True)

# Schema check: columns actually present vs. the three this notebook works with
expected_cols = {"instruction","input","output"}
rond_colset = set(rond.columns)
missing_cols = expected_cols.difference(rond_colset)
extra_cols = rond_colset.difference(expected_cols)

# Everything below goes through log() so it is printed and also recorded in EDA_LOG
log("=== ROND overview ===")
log(f"Shape: {rond.shape}")
log(f"Columns: {list(rond.columns)}")
log(f"Missing expected columns: {missing_cols}")
log(f"Extra columns: {extra_cols}")

# Per-column count of missing cells
log("Nulls per column:")
rond_null_counts = rond.isna().sum()
log(rond_null_counts)

# Rows that are exact repeats of an earlier row
rond_dup_count = rond.duplicated().sum()
log(f"Duplicate rows: {rond_dup_count}")


ROND: lengths, ratios, distributions, plots

In [None]:
# Word counts via a simple whitespace split.
# Missing cells are blanked first: astype(str) on its own turns NaN into the
# literal "nan", which would be reported as a one-word text instead of an empty one.
rond["input_len"]  = rond["input"].astype(str).mask(rond["input"].isna(), "").str.split().str.len()
rond["output_len"] = rond["output"].astype(str).mask(rond["output"].isna(), "").str.split().str.len()
# Zero-length inputs are mapped to NaN so the ratio is undefined rather than inf
rond["length_ratio"] = rond["output_len"] / rond["input_len"].replace(0, np.nan)

# Summary stats (logged so they are also recorded in EDA_LOG)
log("\n=== ROND length stats ===")
for col in ["input_len","output_len","length_ratio"]:
    desc = rond[col].describe(percentiles=[.1,.25,.5,.75,.9,.95]).to_string()
    log(f"\n{col}:\n{desc}")


def _save_rond_hist(col, title, xlabel, filename):
    """Draw a 40-bin histogram of ``rond[col]`` and save it as ``PLOT_DIR/filename``.

    NaN values are dropped and the data is clipped at its own 99th percentile so
    a handful of extreme rows does not flatten the rest of the distribution.
    """
    values = rond[col].dropna()
    fig, ax = plt.subplots()
    values.clip(upper=values.quantile(0.99)).hist(bins=40, ax=ax)
    ax.set_title(title)
    ax.set_xlabel(xlabel); ax.set_ylabel("frequency")
    fig.tight_layout(); fig.savefig(f"{PLOT_DIR}/{filename}"); plt.show()
    plt.close(fig)  # release the figure once it has been saved and shown


# Histograms
_save_rond_hist("input_len", "ROND: Input length (words)", "words", "rond_input_len_hist.png")
_save_rond_hist("output_len", "ROND: Summary length (words)", "words", "rond_output_len_hist.png")
_save_rond_hist("length_ratio", "ROND: summary/input length ratio", "ratio", "rond_ratio_hist.png")

# Scatter (at most 2000 rows, fixed seed so the figure is reproducible)
sample = rond.sample(min(2000, len(rond)), random_state=42)
fig, ax = plt.subplots()
ax.scatter(sample["input_len"], sample["output_len"], s=6, alpha=0.5)
ax.set_title("ROND: input vs summary length (sample)")
ax.set_xlabel("input_len"); ax.set_ylabel("output_len")
fig.tight_layout(); fig.savefig(f"{PLOT_DIR}/rond_len_scatter.png"); plt.show()
plt.close(fig)


Load & sanity-check BlueScrubs (clean CSVs)

In [None]:
# Read each cleaned split and stack them into one frame with a fresh 0..n-1 index
bs_parts = [pd.read_csv(csv_path) for csv_path in BLUESCRUBS_FILES]
bs = pd.concat(bs_parts, ignore_index=True)

# Schema check: columns actually present vs. the three this notebook works with
expected_cols_bs = {"instruction","input","output"}
bs_colset = set(bs.columns)
missing_cols_bs = expected_cols_bs.difference(bs_colset)
extra_cols_bs = bs_colset.difference(expected_cols_bs)

# Everything below goes through log() so it is printed and also recorded in EDA_LOG
log("\n=== BlueScrubs overview ===")
log(f"Shape: {bs.shape}")
log(f"Columns: {list(bs.columns)}")
log(f"Missing expected columns: {missing_cols_bs}")
log(f"Extra columns: {extra_cols_bs}")

# Per-column count of missing cells
log("Nulls per column:")
bs_null_counts = bs.isna().sum()
log(bs_null_counts)

# Rows that are exact repeats of an earlier row
bs_dup_count = bs.duplicated().sum()
log(f"Duplicate rows: {bs_dup_count}")


BlueScrubs: label balance, lengths, plots

In [None]:
# ensure label is integer 0/1 (it was saved as string earlier)
bs["label"] = bs["output"].astype(str).str.strip().astype(int)
# Word count via whitespace split. Missing cells are blanked first: astype(str)
# on its own turns NaN into the literal "nan" and would count it as one word.
bs["input_len"] = bs["input"].astype(str).mask(bs["input"].isna(), "").str.split().str.len()

# Class balance (counts ordered by label value; a label that never occurs counts as 0)
counts = bs["label"].value_counts().sort_index()
total = len(bs)
pos = int(counts.get(1,0)); neg = int(counts.get(0,0))
log("\n=== BlueScrubs class balance ===")
log(f"0 (not cancer-related): {neg} ({neg/total:.2%})")
log(f"1 (cancer-related):     {pos} ({pos/total:.2%})")

# Length stats overall & by class
log("\nBlueScrubs input length stats (overall):")
log(bs["input_len"].describe(percentiles=[.1,.25,.5,.75,.9,.95]).to_string())

log("\nBlueScrubs input length stats by class:")
grp = bs.groupby("label")["input_len"].describe()
log(grp.to_string())

# Plots — each one is drawn on an explicit Axes so it lands on the figure that gets saved
fig, ax = plt.subplots()
counts.plot(kind="bar", ax=ax)
ax.set_title("BlueScrubs: class counts (0/1)")
ax.set_xlabel("label"); ax.set_ylabel("count")
fig.tight_layout(); fig.savefig(f"{PLOT_DIR}/bs_label_counts.png"); plt.show()
plt.close(fig)

# Clipped at the 99th percentile so a few very long documents do not flatten the histogram
fig, ax = plt.subplots()
bs["input_len"].clip(upper=bs["input_len"].quantile(0.99)).hist(bins=40, ax=ax)
ax.set_title("BlueScrubs: Input length (words)")
ax.set_xlabel("words"); ax.set_ylabel("frequency")
fig.tight_layout(); fig.savefig(f"{PLOT_DIR}/bs_input_len_hist.png"); plt.show()
plt.close(fig)

# With `by=`, DataFrame.boxplot opens a figure of its own unless an Axes is passed,
# which used to leave the preceding plt.figure() behind as a blank extra figure.
fig, ax = plt.subplots()
bs.boxplot(column="input_len", by="label", ax=ax)
# pandas adds its own "Boxplot grouped by ..." suptitle; replace it with a single title
ax.set_title("BlueScrubs: input length by class"); fig.suptitle("")
ax.set_xlabel("label"); ax.set_ylabel("words")
fig.tight_layout(); fig.savefig(f"{PLOT_DIR}/bs_len_by_label_box.png"); plt.show()
plt.close(fig)


Quick data-quality flags (both datasets)

In [None]:
def quality_flags(df, name, text_col="input", out_col="output"):
    """Compute, log and return a few coarse data-quality counters for ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding at least ``text_col`` and ``out_col``.
    name : str
        Dataset label used in the logged section header.
    text_col, out_col : str
        Names of the text column and the output/target column.

    Returns
    -------
    dict
        ``empty_text_rows``, ``very_short_text_<5w``, ``empty_output_rows``,
        ``text_len_p99`` and ``text_len_max`` — all plain ``int``.

    Missing cells (NaN/None) are treated as empty strings: ``astype(str)`` alone
    would turn them into the literal ``"nan"``, so they would never be counted as
    empty and would be measured as one-word texts. For a frame without rows the
    two length statistics are reported as 0.
    """
    def _as_text(series):
        # Stringify first, then blank out the positions that were missing
        return series.astype(str).mask(series.isna(), "")

    text = _as_text(df[text_col])
    output = _as_text(df[out_col])
    # Whitespace-split word count, computed once and reused by every length-based flag
    text_len = text.str.split().str.len()
    has_rows = len(text_len) > 0

    flags = {}
    # blank/very short texts or outputs
    flags["empty_text_rows"] = int((text.str.strip()=="").sum())
    flags["very_short_text_<5w"] = int((text_len<5).sum())
    flags["empty_output_rows"] = int((output.str.strip()=="").sum())
    # extreme lengths (quantile/max are NaN on an empty frame and cannot be cast to int)
    flags["text_len_p99"] = int(text_len.quantile(0.99)) if has_rows else 0
    flags["text_len_max"] = int(text_len.max()) if has_rows else 0
    log(f"\n=== {name} quality flags ===")
    for k,v in flags.items():
        log(f"{k}: {v}")
    return flags

# Same checks on both corpora; each call logs its section and returns the flag dict
rond_flags = quality_flags(rond, "ROND", text_col="input", out_col="output")
bs_flags   = quality_flags(bs, "BlueScrubs", text_col="input", out_col="output")


Build the PDF report (text + charts)

In [None]:
from xml.sax.saxutils import escape

from reportlab.lib.pagesizes import letter
from reportlab.platypus import SimpleDocTemplate, Paragraph, Spacer, Image as RLImage
from reportlab.lib.styles import getSampleStyleSheet

# Output document: letter-sized pages, default reportlab sample styles
PDF_PATH = "./EDA_OncoSummarizer_Report.pdf"
doc = SimpleDocTemplate(PDF_PATH, pagesize=letter)
styles = getSampleStyleSheet()
story = []  # flowables in the order they should appear in the PDF

# Title and short introduction
story += [
    Paragraph("OncoSummarizer — EDA Report", styles["Title"]),
    Spacer(1, 12),
    Paragraph("This report summarizes EDA for two datasets kept separate by task:", styles["BodyText"]),
    Paragraph("• ROND (summarization) — columns: instruction, input, output", styles["BodyText"]),
    Paragraph("• BlueScrubs (classification) — columns: instruction, input, output (label 0/1)", styles["BodyText"]),
    Spacer(1, 12),
]

# Add logged text
# Paragraph interprets its text as XML-like markup, so logged lines must be escaped
# first: a raw "<" (e.g. the "very_short_text_<5w" flag) or "&" would otherwise be
# read as a tag/entity. Newlines are turned into <br/> only after escaping.
for line in EDA_LOG:
    markup = escape(str(line)).replace("\n","<br/>")
    story.append(Paragraph(markup, styles["BodyText"]))
    story.append(Spacer(1, 6))

# Add plots if they exist
def add_img(path, caption):
    """Append the image at ``path`` plus an italic ``caption`` to ``story``.

    Files that do not exist are skipped silently, so the report still builds when
    a plotting cell was not run. ``kind="proportional"`` scales the picture to fit
    inside the 480x320 box while keeping its aspect ratio instead of stretching it.
    """
    if os.path.exists(path):
        story.append(Spacer(1, 10))
        story.append(RLImage(path, width=480, height=320, kind="proportional"))
        story.append(Paragraph(caption, styles["Italic"]))
        story.append(Spacer(1, 10))

add_img(f"{PLOT_DIR}/rond_input_len_hist.png", "ROND: Input length (words)")
add_img(f"{PLOT_DIR}/rond_output_len_hist.png", "ROND: Summary length (words)")
add_img(f"{PLOT_DIR}/rond_ratio_hist.png", "ROND: Summary/Input length ratio")
add_img(f"{PLOT_DIR}/rond_len_scatter.png", "ROND: Input vs Summary length (sample)")

add_img(f"{PLOT_DIR}/bs_label_counts.png", "BlueScrubs: class counts (0/1)")
add_img(f"{PLOT_DIR}/bs_input_len_hist.png", "BlueScrubs: Input length (words)")
add_img(f"{PLOT_DIR}/bs_len_by_label_box.png", "BlueScrubs: Input length by class")

# Lay out all flowables and write the file
doc.build(story)
print(f"Saved PDF: {PDF_PATH}")


BlueScrubs: document-length distribution and model context-limit cutoffs (adapted EDA code)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np

# ==============================
# LOAD DATA
# ==============================
# Relative paths (same location as the BlueScrubs file list earlier in the notebook):
# they resolve against the working directory instead of a hard-coded absolute
# /content prefix, so the cell also runs outside Colab.
train_path = "./bluescrubs_train_clean.csv"
val_path   = "./bluescrubs_val_clean.csv"
test_path  = "./bluescrubs_test_clean.csv"

train_df = pd.read_csv(train_path)
val_df   = pd.read_csv(val_path)
test_df  = pd.read_csv(test_path)

# Combine splits
df = pd.concat([train_df, val_df, test_df], ignore_index=True)

print("Dataset loaded.")
print("Total documents:", len(df))

# ==============================
# WORD COUNTS
# ==============================
# Whitespace-split word count per document, vectorised with .str instead of a
# row-wise apply. Missing cells are blanked first so NaN counts as 0 words rather
# than as the one-word string "nan" that astype(str) would produce.
df["word_count"] = df["input"].astype(str).mask(df["input"].isna(), "").str.split().str.len()

# ==============================
# BASIC STATS
# ==============================
print("\nWord count statistics:")
print(df["word_count"].describe(percentiles=[.5, .75, .9, .95, .99]))

# ==============================
# HISTOGRAM (log scale for long tail)
# ==============================
plt.figure(figsize=(10,6))
plt.hist(df["word_count"], bins=100, color="skyblue", edgecolor="black")
plt.xlabel("Word Count per Document")
plt.ylabel("Frequency")
plt.title("Distribution of Document Lengths (BlueScrubs)")
plt.yscale("log")  # log-scaled y axis keeps the sparse long-document bins visible
plt.show()

# ==============================
# CUTOFF ANALYSIS
# ==============================
def pct_below(threshold, word_counts=None):
    """Return the percentage (0-100) of documents with at most ``threshold`` words.

    Parameters
    ----------
    threshold : int or float
        Inclusive upper bound on the word count.
    word_counts : array-like, optional
        Per-document word counts. When omitted, the notebook-level
        ``df["word_count"]`` column is used, as before.

    Returns
    -------
    float
        Share of documents with ``word_count <= threshold`` times 100;
        NaN when there are no documents (returned without a NumPy warning).
    """
    if word_counts is None:
        word_counts = df["word_count"]
    values = np.asarray(word_counts)
    if values.size == 0:
        return float("nan")
    return np.mean(values <= threshold) * 100

# Model input limits to check, expressed as word counts (label -> cutoff in words)
thresholds = {
    "BERT (512 tokens ≈ 350 words)": 350,
    "Longformer (4096 tokens ≈ 3000 words)": 3000,
    "Extreme cutoff (10k words)": 10000
}

# One line per limit: share of documents whose word count is within it
print("\nPercentage of documents fitting within limits:")
for name in thresholds:
    cutoff = thresholds[name]
    print(f"{name}: {pct_below(cutoff):.2f}%")

# ------------------------------
# Zoomed view: only documents under 5000 words
# ------------------------------
subset = df.loc[df["word_count"] < 5000]
plt.figure(figsize=(10,6))
plt.hist(subset["word_count"], bins=100, color="orange", edgecolor="black")
plt.title("Zoomed-In Document Length Distribution")
plt.xlabel("Word Count per Document (<5000 words)")
plt.ylabel("Frequency")
plt.show()
