In [1]:
# Cell 1: Project paths + directory setup

import sys
from pathlib import Path

def find_repo_root(start: Path) -> Path:
    """
    Return the nearest directory at or above `start` that looks like the repo root.

    A directory qualifies when it contains a `scripts/` directory, a `data/`
    directory, and a `README.md` file. `start` is resolved to an absolute path
    first; then `start` itself and each of its parents are checked, innermost
    first.

    Args:
        start: Path (or path string) to begin the upward search from.

    Returns:
        The first qualifying directory.

    Raises:
        RuntimeError: if no directory on the way up to the filesystem root
            contains all three markers.
    """
    start = Path(start).resolve()
    for candidate in (start, *start.parents):
        # All three markers must be present; a partial match is not enough.
        if (
            (candidate / "scripts").is_dir()
            and (candidate / "data").is_dir()
            and (candidate / "README.md").is_file()
        ):
            return candidate
    raise RuntimeError(
        "Could not locate repo root containing scripts/, data/, and README.md.\n"
        f"Start path was: {start}"
    )

# Locate the repository root by walking upward from the current working directory.
# NOTE: the import from scripts.config.project_paths further down also imports a
# REPO_ROOT, which rebinds this name.
REPO_ROOT = find_repo_root(Path.cwd())

# Put the repo root at the front of sys.path (only once) so that the `scripts`
# package imported below can be resolved.
if str(REPO_ROOT) not in sys.path:
    sys.path.insert(0, str(REPO_ROOT))

print("Repo root detected:", REPO_ROOT)

from scripts.config.project_paths import (
    REPO_ROOT, ensure_dirs,
    EXPR_RAW, CLIN_RAW,
    PREPROC_OUTPUT_DIR, META_PREPROC, EXPR_PREPROC
)

# ensure_dirs is imported from scripts.config.project_paths; presumably it creates
# the project's working/output directories — confirm in that module.
ensure_dirs()

# Fail fast if either raw input file is absent: list what is missing, point at
# the download script, then abort the cell with FileNotFoundError.
missing = [p for p in (EXPR_RAW, CLIN_RAW) if not p.exists()]
if missing:
    print("Missing required raw input files:")
    for p in missing:
        print(" -", p)
    print("\nRun this from the repo root to see download instructions:")
    print("  ./scripts/setup/download_xena_data.sh")
    raise FileNotFoundError("Raw input data not found. See instructions above.")

# Echo the resolved locations so the run is self-describing.
print("Repo root:", REPO_ROOT)
print("Expression input:", EXPR_RAW)
print("Clinical input:", CLIN_RAW)
print("Preproc outputs dir:", PREPROC_OUTPUT_DIR)

Repo root detected: /Users/tommyrucinski/dev/repos/tcga-brca-luminalA-deg-gsea
Repo root: /Users/tommyrucinski/dev/repos/tcga-brca-luminalA-deg-gsea
Expression input: /Users/tommyrucinski/dev/repos/tcga-brca-luminalA-deg-gsea/data/raw/preprocessing_inputs/HiSeqV2_PANCAN.gz
Clinical input: /Users/tommyrucinski/dev/repos/tcga-brca-luminalA-deg-gsea/data/raw/preprocessing_inputs/BRCA_clinicalMatrix.tsv
Preproc outputs dir: /Users/tommyrucinski/dev/repos/tcga-brca-luminalA-deg-gsea/data/processed/preprocessing_outputs


In [2]:
# Cell 2: File loaders and utility functions

from pathlib import Path
import pandas as pd

# -----------------------------
# File Loaders
# -----------------------------

def _assert_file_exists(path: Path) -> None:
    path = Path(path)
    if not path.exists():
        raise FileNotFoundError(f"File not found: {path}")

def load_expression(path: Path) -> pd.DataFrame:
    """
    Read the expression matrix from a tab-delimited file.

    The first column becomes the row index. The table is assumed (not checked)
    to be laid out as genes x samples.

    Raises:
        FileNotFoundError: if `path` does not exist.
    """
    expr_file = Path(path)
    _assert_file_exists(expr_file)
    expr_table = pd.read_table(expr_file, index_col=0)
    return expr_table

def load_clinical(path: Path) -> pd.DataFrame:
    """
    Read the clinical/phenotype table from a tab-delimited file.

    The first column becomes the row index. The table is assumed (not checked)
    to be laid out as samples x fields, or similar.

    Raises:
        FileNotFoundError: if `path` does not exist.
    """
    clinical_file = Path(path)
    _assert_file_exists(clinical_file)
    clinical_table = pd.read_table(clinical_file, index_col=0)
    return clinical_table

# -----------------------------
# Harmonize TCGA IDs
# -----------------------------

_TCGA_SAMPLE_LEN = 16  # length of a sample-level barcode such as TCGA-XX-YYYY-01A

def harmonize_ids(idx: pd.Index, n: int = _TCGA_SAMPLE_LEN) -> pd.Index:
    """
    Map TCGA barcodes onto one consistent key format.

    Steps, applied to every label:
      1. cast to string
      2. replace each '.' with '-'
      3. keep only the first `n` characters (the default length retains the
         sample-type suffix, e.g. -01A vs -11A)

    Returns a new pd.Index; labels shorter than `n` are left at their own length.
    """
    as_text = pd.Index(idx).astype(str)
    dashed = as_text.str.replace(r"\.", "-", regex=True)
    return dashed.str[:n]

def harmonize_expression(expr: pd.DataFrame, duplicate_policy: str = "mean") -> pd.DataFrame:
    """
    Harmonize the sample IDs in the columns of an expression matrix and
    resolve any duplicate IDs that result.

    Column labels are passed through `harmonize_ids` with
    n=_TCGA_SAMPLE_LEN. The input frame is copied, never modified.

    duplicate_policy:
      - "fail": raise if duplicate sample IDs appear
      - "first": keep first occurrence
      - "mean": average duplicate columns (recommended)

    Returns:
        A new DataFrame with harmonized column labels. When duplicates were
        collapsed with "mean", the columns come back sorted by sample ID
        (groupby's default ordering).

    Raises:
        ValueError: if `duplicate_policy` is not one of the options above, or
            if duplicates are present and the policy is "fail".
    """
    valid_policies = ("fail", "first", "mean")
    # Reject typos up front: an unknown policy would otherwise leave duplicate
    # columns in place without any error.
    if duplicate_policy not in valid_policies:
        raise ValueError(
            f"Unknown duplicate_policy={duplicate_policy!r}; expected one of {valid_policies}."
        )

    expr = expr.copy()
    expr.columns = harmonize_ids(expr.columns, n=_TCGA_SAMPLE_LEN)

    dup_count = expr.columns.duplicated().sum()
    if not dup_count:
        return expr

    print(f"[warn] {dup_count} duplicate sample IDs after harmonization (n={_TCGA_SAMPLE_LEN}). Policy={duplicate_policy}")

    if duplicate_policy == "fail":
        raise ValueError("Duplicate sample IDs detected after harmonization. Increase n or change duplicate_policy.")
    if duplicate_policy == "first":
        return expr.loc[:, ~expr.columns.duplicated(keep="first")]

    # "mean": groupby(axis=1) is deprecated since pandas 2.1 and removed in
    # pandas 3.0, so group the transposed frame by its index and transpose back.
    return expr.T.groupby(level=0).mean().T

def harmonize_clinical(clinical: pd.DataFrame) -> pd.DataFrame:
    """
    Harmonize the sample IDs in the index of the clinical table.

    Works on a copy. After the index is rewritten with `harmonize_ids`, only
    the first row for each ID is kept (clinical duplicates usually reflect
    repeated rows).
    """
    harmonized = clinical.copy()
    harmonized.index = harmonize_ids(harmonized.index, n=_TCGA_SAMPLE_LEN)
    repeated = harmonized.index.duplicated(keep="first")
    return harmonized[~repeated]

# -----------------------------
# Display Helpers
# -----------------------------

def print_df_sample(df: pd.DataFrame, title: str, head_rows: int = 10) -> None:
    """
    Show a titled preview of `df`.

    Prints a banner containing `title` and the frame's shape, then renders the
    first `head_rows` rows with the notebook's `display`.
    """
    rule = "-" * 70
    print(rule)
    print(f"{title}  |  shape={df.shape}")
    print(rule)
    preview = df.head(head_rows)
    display(preview)

In [3]:
# Cell 3: Metadata Filtering Logic

# Metadata-file-forming functions

def build_metadata(expr: pd.DataFrame, clinical: pd.DataFrame) -> pd.DataFrame:
    """
    Assemble a per-sample metadata table for samples present in both inputs.

    Args:
        expr: expression matrix whose columns are sample IDs.
        clinical: clinical table indexed by sample ID.

    Returns:
        DataFrame indexed by the shared sample IDs with columns
        Sample, molecular_subtype, sample_type, histological_type and
        tcga_sample_code (first two characters of the last '-' field of Sample).

    Raises:
        KeyError: if a required clinical column is absent.
        ValueError: if the two inputs share no sample IDs.
    """
    # Output column -> clinical source column. Insertion order determines both
    # the required-column check order and the output column order.
    source_by_output = {
        "molecular_subtype": "PAM50Call_RNAseq",
        "sample_type": "sample_type",
        "histological_type": "histological_type",
    }

    # Fail fast when the clinical table lacks a needed field.
    absent = [col for col in source_by_output.values() if col not in clinical.columns]
    if absent:
        raise KeyError(f"Clinical table missing required columns: {absent}")

    shared = expr.columns.intersection(clinical.index)
    if shared.empty:
        raise ValueError("No overlapping samples between expression and clinical after harmonization.")

    matched = clinical.loc[shared]

    # Report how much of the overlap lacks a subtype call or histology.
    n_na_pam50 = matched["PAM50Call_RNAseq"].isna().sum()
    n_na_histology = matched["histological_type"].isna().sum()
    print(
        f"[meta] common samples: {len(shared)} | "
        f"NA PAM50: {n_na_pam50} | "
        f"NA histology: {n_na_histology}"
    )

    meta = pd.DataFrame(index=shared)
    meta["Sample"] = meta.index
    for out_col, src_col in source_by_output.items():
        meta[out_col] = matched[src_col]

    # Sample-type code taken from the barcode itself, kept for sanity checks.
    meta["tcga_sample_code"] = meta["Sample"].str.split("-").str[-1].str[:2]

    return meta

def filter_LumA_IDC_Tumor_vs_AllNormals(meta: pd.DataFrame) -> pd.DataFrame:
    """
    Select and label the two arms of the contrast.

      - Tumor:  subtype is "Luminal A"/"LumA", histology mentions "Ductal",
                and sample type mentions "Primary Tumor"
      - Normal: every sample whose type mentions "Solid Tissue Normal"
                (no subtype or histology restriction)

    Text matches are case-insensitive; missing values never match.

    Returns:
        Rows of both arms with columns Sample, Group, molecular_subtype,
        sample_type, histological_type, sorted by Sample.

    Raises:
        ValueError: if either arm is empty.
    """
    frame = meta.copy()

    # Boolean masks, one per criterion.
    luminal_a = frame["molecular_subtype"].isin(["Luminal A", "LumA"])
    ductal = frame["histological_type"].str.contains("Ductal", case=False, na=False)
    primary = frame["sample_type"].str.contains("Primary Tumor", case=False, na=False)
    solid_normal = frame["sample_type"].str.contains("Solid Tissue Normal", case=False, na=False)

    tumor_rows = frame.loc[luminal_a & ductal & primary]
    normal_rows = frame.loc[solid_normal]

    if tumor_rows.empty:
        raise ValueError("No LumA-IDC primary tumors found in intersecting samples.")
    if normal_rows.empty:
        raise ValueError("No Solid Tissue Normal samples found in intersecting samples.")

    # Tumors first, then normals; the final sort decides the returned order.
    output_columns = ["Sample", "Group", "molecular_subtype", "sample_type", "histological_type"]
    labelled = pd.concat(
        [tumor_rows.assign(Group="Tumor"), normal_rows.assign(Group="Normal")],
        axis=0,
    )[output_columns]

    print(f"[filter] Tumor n={len(tumor_rows)} | Normal n={len(normal_rows)}")

    return labelled.sort_values("Sample")

In [4]:
# Cell 4: Main Driver function
from pathlib import Path

def generate_metadata_and_expr(
    expr_path,
    clinical_path,
    meta_out_path,
    expr_out_path=None,
    display_output_sample: bool = False,
):
    """
    Run the preprocessing pipeline end to end and write its outputs.

    Loads the expression and clinical tables, harmonizes their sample IDs,
    builds metadata for the shared samples, restricts it to the
    LumA-IDC-tumor vs all-normals contrast, and subsets the expression matrix
    to those samples in metadata order.

    Args:
        expr_path: tab-delimited expression matrix to read.
        clinical_path: tab-delimited clinical table to read.
        meta_out_path: where the metadata is written (tab-separated, no index).
        expr_out_path: optional destination for the matched expression matrix
            (tab-separated); skipped when None.
        display_output_sample: when True, preview both output tables.

    Raises:
        ValueError: if the filtered metadata has duplicate Sample IDs, or lists
            samples that are not expression columns.
    """
    # Coerce every location to a Path; the expression output stays optional.
    expr_path = Path(expr_path)
    clinical_path = Path(clinical_path)
    meta_out_path = Path(meta_out_path)
    if expr_out_path is not None:
        expr_out_path = Path(expr_out_path)

    # Read both inputs before touching either.
    expr = load_expression(expr_path)
    clinical = load_clinical(clinical_path)

    # Bring sample IDs onto a common format ("fail" is the stricter alternative).
    expr = harmonize_expression(expr, duplicate_policy="mean")
    clinical = harmonize_clinical(clinical)

    # Metadata for every shared sample, then the contrast of interest.
    meta_full = build_metadata(expr, clinical)
    meta = filter_LumA_IDC_Tumor_vs_AllNormals(meta_full)

    # Guardrail: a sample may appear only once in the metadata.
    repeated = meta["Sample"].duplicated()
    if repeated.any():
        dups = meta.loc[repeated, "Sample"].unique()[:10]
        raise ValueError(f"Duplicate Sample IDs in metadata (showing up to 10): {dups}")

    # Guardrail: every metadata sample must be an expression column.
    samples = pd.Index(meta["Sample"])
    missing_in_expr = samples.difference(expr.columns)
    if len(missing_in_expr) > 0:
        raise ValueError(
            f"{len(missing_in_expr)} metadata samples missing from expression columns. "
            f"Example: {missing_in_expr[:5].tolist()}"
        )

    # Expression columns in exactly the metadata row order.
    expr_sub = expr.loc[:, samples]

    # Preview happens before the index reset below, so the metadata preview
    # still carries the sample IDs as its index.
    if display_output_sample:
        print_df_sample(meta, title="Final Metadata Output")
        print_df_sample(expr_sub.T, title="Final Expression Matrix (Transposed)", head_rows=3)

    print(meta["Group"].value_counts())

    # Write the metadata with a plain positional index dropped from the file.
    meta_out_path.parent.mkdir(parents=True, exist_ok=True)
    meta = meta.reset_index(drop=True)
    meta.to_csv(meta_out_path, sep="\t", index=False)
    print(f"Saved metadata with {len(meta)} samples → {meta_out_path}")

    # Write the matched expression matrix only when a destination was given.
    if expr_out_path is None:
        return
    expr_out_path.parent.mkdir(parents=True, exist_ok=True)
    expr_sub.to_csv(expr_out_path, sep="\t")
    print(f"Saved expression matrix with {expr_sub.shape[1]} samples → {expr_out_path}")

In [5]:
# Cell 5: Pipeline execution

# Run the pipeline: writes the metadata table and the matched expression matrix
# to the configured output paths, and previews both tables in the cell output.
generate_metadata_and_expr(
    expr_path=EXPR_RAW,
    clinical_path=CLIN_RAW,
    meta_out_path=META_PREPROC,
    expr_out_path=EXPR_PREPROC,
    display_output_sample=True,  # Set to False to skip the table previews
)

[meta] common samples: 1218 | NA PAM50: 262 | NA histology: 4
[filter] Tumor n=299 | Normal n=114
----------------------------------------------------------------------
Final Metadata Output  |  shape=(413, 5)
----------------------------------------------------------------------


Unnamed: 0,Sample,Group,molecular_subtype,sample_type,histological_type
TCGA-A1-A0SD-01,TCGA-A1-A0SD-01,Tumor,LumA,Primary Tumor,Infiltrating Ductal Carcinoma
TCGA-A1-A0SF-01,TCGA-A1-A0SF-01,Tumor,LumA,Primary Tumor,Infiltrating Ductal Carcinoma
TCGA-A1-A0SH-01,TCGA-A1-A0SH-01,Tumor,LumA,Primary Tumor,Infiltrating Ductal Carcinoma
TCGA-A1-A0SJ-01,TCGA-A1-A0SJ-01,Tumor,LumA,Primary Tumor,Infiltrating Ductal Carcinoma
TCGA-A1-A0SM-01,TCGA-A1-A0SM-01,Tumor,LumA,Primary Tumor,Infiltrating Ductal Carcinoma
TCGA-A1-A0SQ-01,TCGA-A1-A0SQ-01,Tumor,LumA,Primary Tumor,Infiltrating Ductal Carcinoma
TCGA-A2-A04N-01,TCGA-A2-A04N-01,Tumor,LumA,Primary Tumor,Infiltrating Ductal Carcinoma
TCGA-A2-A04V-01,TCGA-A2-A04V-01,Tumor,LumA,Primary Tumor,Infiltrating Ductal Carcinoma
TCGA-A2-A0CP-01,TCGA-A2-A0CP-01,Tumor,LumA,Primary Tumor,Infiltrating Ductal Carcinoma
TCGA-A2-A0CQ-01,TCGA-A2-A0CQ-01,Tumor,LumA,Primary Tumor,Infiltrating Ductal Carcinoma


----------------------------------------------------------------------
Final Expression Matrix (Transposed)  |  shape=(413, 20530)
----------------------------------------------------------------------


sample,ARHGEF10L,HIF3A,RNF17,RNF10,RNF11,RNF13,GTF2IP1,REM1,MTVR2,RTN4RL2,...,TULP2,NPY5R,GNGT2,GNGT1,TULP3,PTRF,BCL6B,GSTK1,SELP,SELS
Sample,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
TCGA-A1-A0SD-01,-0.698292,-3.061226,-0.531035,0.009228,0.045222,0.03249,0.318006,0.351354,-0.423399,-0.108372,...,-0.748878,2.027383,-0.115733,-0.86909,0.164023,0.782814,1.491273,-0.384495,3.227367,0.409488
TCGA-A1-A0SF-01,-0.005892,-0.641526,-0.531035,-0.295872,0.102122,-0.21821,0.270806,-0.880646,-0.423399,1.130328,...,-0.748878,-0.694317,-0.073033,0.55471,0.098823,0.606914,0.586973,0.382605,2.243967,0.113688
TCGA-A1-A0SH-01,-0.997092,-3.891626,-0.086135,0.100328,0.884722,0.38539,-0.160394,-0.697746,-0.423399,0.791528,...,0.540722,-1.142217,-0.525433,-1.28139,0.355023,2.018314,0.073873,0.208405,-0.789933,-0.456112


Group
Tumor     299
Normal    114
Name: count, dtype: int64
Saved metadata with 413 samples → /Users/tommyrucinski/dev/repos/tcga-brca-luminalA-deg-gsea/data/processed/preprocessing_outputs/metadata_LumA_IDC_Tumor_vs_AllNormals.tsv
Saved expression matrix with 413 samples → /Users/tommyrucinski/dev/repos/tcga-brca-luminalA-deg-gsea/data/processed/preprocessing_outputs/expr_LumA_IDC_Tumor_vs_AllNormals.tsv
