<a href="https://colab.research.google.com/github/vyshnaviracha/GI_TissueSpecific_Transcriptomics/blob/main/analysis/heatmap_analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# ==============================
# Input files (relative to the notebook's working directory)
# ==============================
EXPR_PATH = "GI_1396_Annotated_TPM.tsv"
PHENO_PATH = "TcgaTargetGTEX_phenotype.txt"

# ==============================
# Load expression
# ==============================
# Tab-separated table; the "symbol" column becomes the row index.
# NOTE(review): symbols are not checked for uniqueness here - confirm the
# input has one row per symbol if downstream steps rely on that.
expr = pd.read_csv(EXPR_PATH, sep="\t").set_index("symbol")

# Keep numeric columns only (any remaining text/annotation columns are dropped)
expr = expr.select_dtypes(include=[np.number])

# ==============================
# Load phenotype (fix encoding)
# ==============================
# Read with latin1 instead of pandas' default UTF-8 decoding.
pheno = pd.read_csv(PHENO_PATH, sep="\t", encoding="latin1")

# ==============================
# Match samples
# ==============================
# Sample IDs present in both tables, kept in the column order of the
# expression file. A plain set intersection yields the same IDs but in an
# order that is not stable across kernel restarts, which makes the column
# order of `expr` (and anything derived from it) non-reproducible.
shared_mask = expr.columns.isin(pheno["sample"])
common_samples = expr.columns[shared_mask].unique().tolist()

# Restrict both tables to the shared samples.
expr = expr[common_samples]
pheno = pheno[pheno["sample"].isin(common_samples)]

print("Expression matrix:", expr.shape)
print("Phenotype matrix:", pheno.shape)

In [None]:
# Display name -> keywords looked up (case-insensitively, joined with "|" as a
# regular expression) in the phenotype column `_primary_site`.
gi_sites = {
    "Colorectum": ["Colon", "Rectum"],
    "Stomach": ["Stomach"],
    "Liver": ["Liver"],
    "Pancreas": ["Pancreas"],
    "Esophagus": ["Esophagus"]
}

# A tissue is kept only when BOTH groups contain more than this many samples.
MIN_SAMPLES_PER_GROUP = 20

# Column name ("<tissue>_Normal" / "<tissue>_Tumor") -> per-row mean expression.
result = {}

for tissue, keywords in gi_sites.items():

    # Phenotype rows whose primary site mentions any of the tissue keywords
    # (rows with a missing primary site never match).
    site_mask = pheno["_primary_site"].str.contains(
        "|".join(keywords), case=False, na=False
    )
    tissue_samples = pheno[site_mask]

    # Only samples that actually have an expression column can be averaged.
    has_expression = tissue_samples["sample"].isin(expr.columns)

    # Tumor group: rows labelled "Primary Tumor"; normal group: rows from the
    # GTEX study.
    is_tumor = tissue_samples["_sample_type"] == "Primary Tumor"
    is_normal = tissue_samples["_study"] == "GTEX"

    # Boolean masks keep the IDs in phenotype-table order, so the selected
    # columns (and therefore the averaged values) are reproducible run to run,
    # unlike a set intersection whose order is not stable.
    tumor_ids = (
        tissue_samples.loc[is_tumor & has_expression, "sample"]
        .drop_duplicates()
        .tolist()
    )
    normal_ids = (
        tissue_samples.loc[is_normal & has_expression, "sample"]
        .drop_duplicates()
        .tolist()
    )

    # Report tissues that are left out instead of dropping them silently.
    if (
        len(tumor_ids) <= MIN_SAMPLES_PER_GROUP
        or len(normal_ids) <= MIN_SAMPLES_PER_GROUP
    ):
        print(
            f"Skipping {tissue}: {len(tumor_ids)} tumor / {len(normal_ids)} "
            f"normal samples (need more than {MIN_SAMPLES_PER_GROUP} of each)"
        )
        continue

    # Average across samples (axis=1) to get one value per row for each group.
    result[f"{tissue}_Normal"] = expr[normal_ids].mean(axis=1)
    result[f"{tissue}_Tumor"] = expr[tumor_ids].mean(axis=1)

# Rows follow the expression matrix index; columns are the group means, in the
# order the tissues appear in `gi_sites` (Normal before Tumor for each tissue).
heatmap_df = pd.DataFrame(result)

print("Final heatmap shape:", heatmap_df.shape)

In [None]:
# Row-wise z-score: centre every row on its own mean and divide by its own
# standard deviation, so colours compare groups within a row rather than
# absolute expression levels between rows.
row_center = heatmap_df.mean(axis=1)
row_spread = heatmap_df.std(axis=1)

heatmap_scaled = (
    heatmap_df
    .sub(row_center, axis=0)
    .div(row_spread, axis=0)
    # Division by a zero spread gives inf/NaN; turn inf into NaN and then
    # drop every row that contains any NaN.
    .replace([np.inf, -np.inf], np.nan)
    .dropna()
)

print("Scaled matrix:", heatmap_scaled.shape)

In [None]:
sns.set(style="white")

# Clustered heatmap of the row-scaled matrix:
# - rows are hierarchically clustered, columns keep their existing order;
# - the diverging colour map is centred on 0 and clipped to [-2, 2].
g = sns.clustermap(
    heatmap_scaled,
    cmap="RdBu_r",
    center=0,
    vmin=-2,
    vmax=2,
    figsize=(12, 24),
    row_cluster=True,
    col_cluster=False,
    xticklabels=True,
    yticklabels=False  # Hide gene names for journal cleanliness
)

# Use a figure-level title. plt.title() only titles the *current* Axes, which
# is not a title for the whole clustermap figure, and a large `pad` pushes the
# text far away from that Axes. The dash is a real em dash (the previous
# literal contained mis-encoded bytes).
g.figure.suptitle(
    "Figure 3 — Tissue-Specific Tumor vs Normal Expression (GI Cancers)",
    y=1.02,  # just above the top edge of the grid
)

plt.show()

In [None]:
# Export the clustermap held in `g` as an SVG file, written relative to the
# current working directory.
g.savefig("Figure3_GI_TissueSpecific_TumorVsNormal.svg", format="svg")