In [12]:
# Cell 1: Initialize libraries 

import pandas as pd
from pathlib import Path

In [13]:
# Cell 2: File loaders and utility functions

# File Loaders 

def load_expression(path: str) -> pd.DataFrame:
    # genes x samples, 
    return pd.read_table(path, index_col=0)

def load_clinical(path: str) -> pd.DataFrame:
    return pd.read_table(path, index_col=0)

# Harmonize ID Preprocessing

def harmonize_ids(idx):
    # Removes the trailing '.' and numbers (e.g., '.01', '.11') and limits to 15 characters
    # (e.g., 'TCGA-A1-A0SD-01' becomes 'TCGA-A1-A0SD')
    return (
        idx.astype(str)
        .str.replace(r"\.", "-", regex=True)
        .str[:15]
    )

def harmonize_expression(expr: pd.DataFrame) -> pd.DataFrame:
    expr = expr.copy()
    expr.columns = harmonize_ids(expr.columns)
    expr = expr.loc[:, ~expr.columns.duplicated()]
    return expr

def harmonize_clinical(clinical: pd.DataFrame) -> pd.DataFrame:
    clinical = clinical.copy()
    clinical.index = harmonize_ids(clinical.index)
    clinical = clinical[~clinical.index.duplicated()]
    return clinical

# Helper function for printing DataFrame previews
def print_df_sample(df: pd.DataFrame, title: str, head_rows: int = 10) -> None:
    """Prints a formatted sample (head and shape) of a DataFrame."""
    print("-" * 50)
    print(f" {title} Sample (Shape: {df.shape})")
    print("-" * 50)
    
    print(df.head(head_rows).to_markdown(index=df.index.name is not None))
    print("\n")

In [14]:
# Cell 3: Metadata Filtering Logic

# Metadata-file-forming functions

def build_metadata(expr: pd.DataFrame,
                   clinical: pd.DataFrame) -> pd.DataFrame:
    """Combines harmonized expression columns and clinical index into a metadata frame."""
    # Only keep samples that exist in both expression and clinical phenotype datasets
    common_ids = sorted(set(expr.columns) & set(clinical.index))
    if not common_ids:
        raise ValueError("No overlapping samples between expression and clinical after harmonization.")

    clinical_sub = clinical.loc[common_ids]

    meta = pd.DataFrame(index=common_ids)
    meta["Sample"] = meta.index

    # Use the actual columns from TCGA BRCA Clinical Matrix
    meta["molecular_subtype"] = clinical_sub["PAM50Call_RNAseq"]
    meta["sample_type"] = clinical_sub["sample_type"]
    meta["histological_type"] = clinical_sub["histological_type"]

    return meta

def filter_LumA_IDC_Tumor_vs_AllNormals(meta: pd.DataFrame) -> pd.DataFrame:
    """
    Build contrast:
      - Tumor: LumA/Luminal A + IDC (ductal) + Primary Tumor
      - Normal: ALL Solid Tissue Normal (no subtype / histology restriction)
    """
    meta = meta.copy()

    # Luminal A definition
    is_luma = meta["molecular_subtype"].isin(["Luminal A", "LumA"])

    # IDC / ductal histology
    is_idc = meta["histological_type"].str.contains("Ductal", case=False, na=False)

    # Sample types
    is_primary = meta["sample_type"].str.contains("Primary Tumor", case=False, na=False)
    is_normal  = meta["sample_type"].str.contains("Solid Tissue Normal", case=False, na=False)

    # Tumors: LumA + IDC + Primary
    tumor  = meta[is_luma & is_idc & is_primary]

    # Normals: ALL solid tissue normals (no LumA/IDC restriction)
    normal = meta[is_normal]

    if tumor.empty:
        raise ValueError("No LumA-IDC primary tumors found in intersecting samples.")
    if normal.empty:
        raise ValueError("No Solid Tissue Normal samples found in intersecting samples.")

    tumor  = tumor.assign(Group="Tumor")
    normal = normal.assign(Group="Normal")

    out = pd.concat([tumor, normal], axis=0)
    out = out[["Sample", "Group", "molecular_subtype", "sample_type", "histological_type"]]

    return out.sort_values("Sample")

In [15]:
# Cell 4: Main Driver Function

def generate_metadata_and_expr(
    expr_path: str,
    clinical_path: str,
    meta_out_path: str,
    expr_out_path: str = None,
    display_output_sample: bool = False,
):
    # Load
    expr = load_expression(expr_path)
    clinical = load_clinical(clinical_path)

    # Harmonize
    expr = harmonize_expression(expr)
    clinical = harmonize_clinical(clinical)

    # Build metadata for intersecting samples
    meta_full = build_metadata(expr, clinical)

    # Restrict to LumA-IDC Tumor vs ALL Normals
    meta = filter_LumA_IDC_Tumor_vs_AllNormals(meta_full)

    # Display Metadata Sample
    if display_output_sample:
        print_df_sample(meta, title="Final Metadata Output")

    # Save metadata
    meta_out_path = Path(meta_out_path)
    meta_out_path.parent.mkdir(parents=True, exist_ok=True)
    meta.to_csv(meta_out_path, index=False)
    print(f"Saved metadata with {len(meta)} samples → {meta_out_path}")

    # Optionally: save matched expression matrix
    if expr_out_path is not None:
        samples = meta["Sample"].tolist()
        expr_sub = expr.loc[:, samples]
        
        # Display Expression Sample
        if display_output_sample:
            # Transpose (T) for better readability in the console
            print_df_sample(expr_sub.T, title="Final Expression Matrix (Transposed)", head_rows=3)
        
        expr_out_path = Path(expr_out_path)
        expr_out_path.parent.mkdir(parents=True, exist_ok=True)
        expr_sub.to_csv(expr_out_path, sep="\t")
        print(f"Saved expression matrix with same {len(samples)} samples → {expr_out_path}")

In [16]:
# Cell 5: Pipeline execution

# Define path to input files
EXPR_INPUT_PATH = "/data/Bio2025/Thomas/BIOL616-FinalProject-Repo/datasets/preprocessing_inputs/TCGA.BRCA.sampleMap%2FHiSeqV2_PANCAN.gz"
CLINICAL_INPUT_PATH = "/data/Bio2025/Thomas/BIOL616-FinalProject-Repo/datasets/preprocessing_inputs/TCGA.BRCA.sampleMap%2FBRCA_clinicalMatrix"

# Use the following bash commands in terminal to download the files to the repo if needed:
# wget -P ~ https://tcga-xena-hub.s3.us-east-1.amazonaws.com/download/TCGA.BRCA.sampleMap%2FHiSeqV2_PANCAN.gz
# wget -P ~ https://tcga-xena-hub.s3.us-east-1.amazonaws.com/download/TCGA.BRCA.sampleMap%2FBRCA_clinicalMatrix

# Define path to output files
META_OUTPUT_PATH = "/data/Bio2025/Thomas/BIOL616-FinalProject-Repo/datasets/preprocessing_outputs/metadata_LumA_IDC_Tumor_vs_AllNormals.tsv"
EXPR_OUTPUT_PATH = "/data/Bio2025/Thomas/BIOL616-FinalProject-Repo/datasets/preprocessing_outputs/expr_LumA_IDC_Tumor_vs_AllNormals.tsv"

# Run the pipeline and display the output samples
generate_metadata_and_expr(
    expr_path=EXPR_INPUT_PATH,
    clinical_path=CLINICAL_INPUT_PATH,
    meta_out_path=META_OUTPUT_PATH,
    expr_out_path=EXPR_OUTPUT_PATH,
    display_output_sample=True, #Set to true to display the table preview
)

--------------------------------------------------
 Final Metadata Output Sample (Shape: (413, 5))
--------------------------------------------------
| Sample          | Group   | molecular_subtype   | sample_type   | histological_type             |
|:----------------|:--------|:--------------------|:--------------|:------------------------------|
| TCGA-A1-A0SD-01 | Tumor   | LumA                | Primary Tumor | Infiltrating Ductal Carcinoma |
| TCGA-A1-A0SF-01 | Tumor   | LumA                | Primary Tumor | Infiltrating Ductal Carcinoma |
| TCGA-A1-A0SH-01 | Tumor   | LumA                | Primary Tumor | Infiltrating Ductal Carcinoma |
| TCGA-A1-A0SJ-01 | Tumor   | LumA                | Primary Tumor | Infiltrating Ductal Carcinoma |
| TCGA-A1-A0SM-01 | Tumor   | LumA                | Primary Tumor | Infiltrating Ductal Carcinoma |
| TCGA-A1-A0SQ-01 | Tumor   | LumA                | Primary Tumor | Infiltrating Ductal Carcinoma |
| TCGA-A2-A04N-01 | Tumor   | LumA                

In [19]:
import pandas as pd

path = "/data/Bio2025/Thomas/BIOL616-FinalProject-Repo/datasets/preprocessing_outputs/expr_LumA_IDC_Tumor_vs_AllNormals.tsv"
df = pd.read_csv(path, sep="\t")

df.iloc[:10, :5]

Unnamed: 0,sample,TCGA-A1-A0SD-01,TCGA-A1-A0SF-01,TCGA-A1-A0SH-01,TCGA-A1-A0SJ-01
0,ARHGEF10L,-0.698292,-0.005892,-0.997092,-0.097692
1,HIF3A,-3.061226,-0.641526,-3.891626,0.621774
2,RNF17,-0.531035,-0.531035,-0.086135,-0.531035
3,RNF10,0.009228,-0.295872,0.100328,-0.275572
4,RNF11,0.045222,0.102122,0.884722,0.472622
5,RNF13,0.03249,-0.21821,0.38539,0.53399
6,GTF2IP1,0.318006,0.270806,-0.160394,0.189706
7,REM1,0.351354,-0.880646,-0.697746,0.112654
8,MTVR2,-0.423399,-0.423399,-0.423399,-0.423399
9,RTN4RL2,-0.108372,1.130328,0.791528,-1.274872


In [20]:
import pandas as pd

path = "/data/Bio2025/Thomas/BIOL616-FinalProject-Repo/datasets/preprocessing_outputs/metadata_LumA_IDC_Tumor_vs_AllNormals.tsv"
df = pd.read_csv(path, sep="\t")

df.iloc[:10, :5]

Unnamed: 0,"Sample,Group,molecular_subtype,sample_type,histological_type"
0,"TCGA-A1-A0SD-01,Tumor,LumA,Primary Tumor,Infil..."
1,"TCGA-A1-A0SF-01,Tumor,LumA,Primary Tumor,Infil..."
2,"TCGA-A1-A0SH-01,Tumor,LumA,Primary Tumor,Infil..."
3,"TCGA-A1-A0SJ-01,Tumor,LumA,Primary Tumor,Infil..."
4,"TCGA-A1-A0SM-01,Tumor,LumA,Primary Tumor,Infil..."
5,"TCGA-A1-A0SQ-01,Tumor,LumA,Primary Tumor,Infil..."
6,"TCGA-A2-A04N-01,Tumor,LumA,Primary Tumor,Infil..."
7,"TCGA-A2-A04V-01,Tumor,LumA,Primary Tumor,Infil..."
8,"TCGA-A2-A0CP-01,Tumor,LumA,Primary Tumor,Infil..."
9,"TCGA-A2-A0CQ-01,Tumor,LumA,Primary Tumor,Infil..."
