In [None]:
import pandas as pd
import glob
import os
from tqdm import tqdm
import json
import requests
from lifelines import CoxPHFitter
from sklearn.preprocessing import StandardScaler

In [None]:
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Build a gene x sample TPM table from the per-sample quantification files
# and cache it as a CSV.
files = glob.glob("../TCGA/TCGA_PAAD/*/*.tsv")
if not files:
    raise FileNotFoundError("No .tsv files found under ../TCGA/TCGA_PAAD/*/")

dfs = []              # one TPM Series per sample, named after the sample folder
gene_name_parts = []  # gene_id -> gene_name lookups, merged after the loop
for fn in tqdm(files, desc='Reading samples'):
    try:
        # The sample identifier is the name of the file's parent folder.
        # os.path is separator-agnostic, unlike splitting on "/".
        sample = os.path.basename(os.path.dirname(fn))
        df = pd.read_table(fn, comment="#", usecols=["gene_id","gene_name", "tpm_unstranded"])
        # Drop the summary rows whose gene_id starts with "N_"
        df = df[~df["gene_id"].str.startswith("N_")]
        df = df.set_index("gene_id")
        gene_name_parts.append(df["gene_name"])
        # Keep only the TPM values per sample: carrying gene_name along in every
        # frame would produce one duplicated gene_name column per sample after
        # the concat below.
        dfs.append(df["tpm_unstranded"].rename(sample))
    except Exception as e:
        print(f"Error processing file {fn}: {e}")
        raise

expr = pd.concat(dfs, axis=1)

# A single gene_name column, placed in front of the sample columns.
gene_names = pd.concat(gene_name_parts)
gene_names = gene_names[~gene_names.index.duplicated()]
expr.insert(0, "gene_name", gene_names.reindex(expr.index))

# Strip the trailing ".<digits>" version suffix from the gene ids.
expr.index = expr.index.str.replace(r"\.\d+$", "", regex=True)

# Written as [row number, gene_id, gene_name, <samples...>]: later cells pick
# the sample columns positionally with iloc[:, 3:], so the file keeps exactly
# three leading non-sample columns.
expr.rename_axis("gene_id").reset_index().to_csv("TCGA_PAAD_TPM_unstranded.csv")

In [None]:
# df_all = pd.read_csv('TCGA_PAAD_TPM_unstranded.csv')
# cols_remove = df_all.columns[df_all.columns.str.contains('gene_name')].tolist()
# cols_remove.remove('gene_name')
# df_all.drop(cols_remove, axis=1, inplace=True)
# df_all.to_csv("TCGA_PAAD_TPM_unstranded.csv")

In [None]:
df_all = pd.read_csv('TCGA_PAAD_TPM_unstranded.csv')

In [None]:
df_all.head()

In [None]:
# log2(TPM + 1) of the sample columns only.
# Sample columns are chosen by name instead of by position, so the result does
# not depend on how many id / annotation columns precede them in the CSV
# (pandas names a saved index "Unnamed: N" and repeated headers "gene_name.N").
non_sample = df_all.columns.str.match(r"(?:Unnamed: [\d.]+|gene_id|gene_name(?:\.\d+)?)$")
expr_log = np.log2(df_all.loc[:, ~non_sample] + 1.0)

In [None]:
expr_log

In [None]:
# Index the log-expression matrix by gene symbol.
# set_index takes the Series values as-is (no index alignment) and returns a
# new frame, so this cell can be re-run safely; adding a column and then
# calling set_index(inplace=True) misbehaves on a second run because the
# column assignment would align against the already-replaced index.
expr_log = expr_log.set_index(df_all['gene_name'])


In [None]:
expr_log.head()

In [None]:
# 1) Collect all your file UUIDs from the download folders
# 1) Collect the file UUIDs from the download folders.
#    os.path.basename is separator-agnostic, and only directories are kept so
#    that a stray file next to the sample folders is not sent to the API as an id.
folders = glob.glob("../TCGA/TCGA_PAAD/*")  # each named by file UUID
file_ids = [os.path.basename(os.path.normpath(f)) for f in folders if os.path.isdir(f)]

# 2) Query the GDC API in batches to get the matching case ids.
#    Chunks of 100 keep the GET URL at a manageable length.
mappings = []
for i in range(0, len(file_ids), 100):
    batch = file_ids[i:i+100]
    filters = {
        "op": "in",
        "content": {
            "field": "files.file_id",
            "value": batch
        }
    }
    params = {
        "filters": json.dumps(filters),
        "fields": "file_id,cases.case_id,cases.submitter_id",
        "format": "JSON",
        "size": len(batch)
    }
    # Without a timeout, requests waits indefinitely on a stalled connection.
    r = requests.get("https://api.gdc.cancer.gov/files", params=params, timeout=60)
    r.raise_for_status()
    hits = r.json()["data"]["hits"]
    for h in hits:
        # Only the first case listed for a file is recorded.
        mappings.append({
            "file_id":       h["file_id"],
            "case_id":       h["cases"][0]["case_id"],
            "submitter_id":  h["cases"][0]["submitter_id"]
        })

# One row per API hit: file_id, case_id, submitter_id
map_df = pd.DataFrame(mappings)


In [None]:
map_df

In [None]:
# Give every file a unique observation id.
# Files are ordered by (submitter_id, file_id); the first file of a submitter
# keeps the bare submitter_id, later ones get "_rep2", "_rep3", ...
map_df = map_df.sort_values(by=['submitter_id', 'file_id'])

rep = map_df.groupby('submitter_id').cumcount()   # 0 for the first file of each submitter
suffixed_id = map_df['submitter_id'] + '_rep' + (rep + 1).astype(str)
map_df['obs_id'] = map_df['submitter_id'].where(rep == 0, suffixed_id)

In [None]:
map_df.sort_values(by='obs_id', ascending=False)

In [None]:
# Relabel the expression columns from file id to submitter id.
# Submitters with several files end up with repeated column labels.
id_map = map_df.set_index('file_id')['submitter_id'].to_dict()
expr_log = expr_log.rename(columns=id_map)

In [None]:
# Same log2(TPM + 1) matrix as expr_log, but with columns relabelled by the
# unique obs_id so that replicate files stay distinguishable.
# Sample columns are picked by name, not position, so the cell does not depend
# on how many id / annotation columns precede them in the CSV.
non_sample = df_all.columns.str.match(r"(?:Unnamed: [\d.]+|gene_id|gene_name(?:\.\d+)?)$")
expr_log_rep = np.log2(df_all.loc[:, ~non_sample] + 1.0)
# set_index uses the Series values as-is and returns a new frame (re-runnable).
expr_log_rep = expr_log_rep.set_index(df_all['gene_name'])
id_map = dict(zip(map_df.file_id, map_df.obs_id))
expr_log_rep = expr_log_rep.rename(columns=id_map)

In [None]:
expr_log_rep.shape

In [None]:
expr_log.head()

In [None]:
# Point `files` at the clinical TSV files and reset `dfs` to an empty list.
# NOTE(review): the next cell repeats the same glob, and `dfs` is not read
# again in the visible part of the notebook - this cell looks removable; confirm.
files = glob.glob("../TCGA/Clinical_Data/*.tsv")
dfs = []    

In [None]:
# Load every clinical TSV into a dict keyed by the file name up to its first
# "." (the keys 'clinical' and 'follow_up' are looked up further down).
files = glob.glob("../TCGA/Clinical_Data/*.tsv")
metadata = {}
for fn in tqdm(files, desc='Reading samples'):
    try:
        # os.path.basename works with any path separator; splitting the path
        # on "/" would keep the directory part where glob returns "\\".
        sample = os.path.basename(fn).split('.')[0]
        df = pd.read_table(fn)
        metadata[sample] = df
    except Exception as e:
        print(f"Error processing file {fn}: {e}")
        raise

In [None]:
# Working copy of the expression matrix, so later steps leave expr_log untouched
expr_df = expr_log.copy(deep=True)

# Clinical tables loaded above, keyed by file name
clin0 = metadata['clinical']
followup = metadata['follow_up']

In [None]:
# Column labels that contain an upper-case "T".
# NOTE(review): presumably meant to keep the relabelled "TCGA-..." ids and skip
# columns still named by a raw file id - confirm that "T" is a safe marker.
expr_df_ids = [label for label in expr_df.columns if "T" in label]

In [None]:
# Limit notebook output to 10 columns / 10 rows.
# Full option names are used: abbreviated keys only work through pandas'
# pattern matching and stop resolving if another option ever matches too.
pd.set_option('display.max_columns', 10)
pd.set_option('display.max_rows', 10)

In [None]:
# Clinical columns kept for the analysis, in alphabetical order
interesting_cols = sorted([
    # case-level
    'cases.days_to_lost_to_followup',
    'cases.lost_to_followup',
    'cases.submitter_id',
    # demographics
    'demographic.cause_of_death',
    'demographic.days_to_death',
    'demographic.gender',
    'demographic.race',
    'demographic.vital_status',
    # diagnoses
    'diagnoses.age_at_diagnosis',
    'diagnoses.classification_of_tumor',
    'diagnoses.days_to_best_overall_response',
    'diagnoses.days_to_diagnosis',
    'diagnoses.days_to_last_follow_up',
    'diagnoses.days_to_last_known_disease_status',
    'diagnoses.days_to_recurrence',
    'diagnoses.metastasis_at_diagnosis',
    'diagnoses.progression_or_recurrence',
    'diagnoses.sites_of_involvement',
    'diagnoses.tumor_grade',
    'diagnoses.tumor_grade_category',
    'diagnoses.tumor_regression_grade',
    'diagnoses.uicc_clinical_stage',
    # treatments
    'treatments.chemo_concurrent_to_radiation',
    'treatments.drug_category',
    'treatments.initial_disease_status',
    'treatments.treatment_dose',
    'treatments.treatment_frequency',
    'treatments.treatment_intent_type',
    'treatments.treatment_or_therapy',
    'treatments.treatment_outcome',
])

In [None]:
clin0 = clin0[interesting_cols]

In [None]:
clin0.reset_index(drop=True, inplace=True)

In [None]:
clin0.head()

In [None]:
import numpy as np

In [None]:
def summarize(group):
    """Collapse all clinical rows of one case into a survival time and event flag.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows belonging to a single case. Must contain the columns
        'demographic.days_to_death', 'diagnoses.days_to_last_follow_up'
        and 'demographic.vital_status'.

    Returns
    -------
    pandas.Series
        'time'  : the largest days-to-death value if any is present, otherwise
                  the largest days-to-last-follow-up value, otherwise NaN.
        'event' : 1 if any row has vital_status == 'Dead', else 0.
    """
    def _as_days(column):
        # These columns can hold text (the table marks missing values with
        # "'--"), and max() on text compares lexicographically ("95" > "1200").
        # Convert to numbers first; anything non-numeric becomes NaN and is dropped.
        cleaned = group[column].astype(str).str.replace("'", "", regex=False)
        return pd.to_numeric(cleaned, errors='coerce').dropna()

    time_death = _as_days('demographic.days_to_death')
    time_follow = _as_days('diagnoses.days_to_last_follow_up')

    if len(time_death):
        time = time_death.max()
    elif len(time_follow):
        time = time_follow.max()
    else:
        time = np.nan

    # event: 1 if any row says 'Dead'
    event = int((group['demographic.vital_status'] == 'Dead').any())

    return pd.Series({'time': time, 'event': event})
# Turn the "'--" placeholder into real missing values, then reduce the table to
# one (time, event) row per case; cases without a usable time are dropped.
clin0 = clin0.replace(to_replace="'--", value=np.nan)
clin_df = (
    clin0
    .groupby('cases.submitter_id')
    .apply(summarize)
    .dropna(subset=['time'])
)

In [None]:
clin_tab = clin_df.reset_index().rename(columns={'cases.submitter_id':'submitter_id'})

In [None]:
clin_tab

In [None]:
# Keep only the cases that have an expression column.
# .copy() makes the result an independent frame: columns are added to it in the
# following cells, and writing into a filtered slice triggers pandas'
# chained-assignment warning.
clin_tab = clin_tab[clin_tab.submitter_id.isin(expr_df_ids)].copy()

In [None]:
# Attach gender per case and a numeric encoding of it (female = 1, male = 0).
# When a case has several clinical rows, the last row's gender wins in the lookup.
gender_by_case = dict(zip(clin0['cases.submitter_id'], clin0['demographic.gender']))
clin_tab['gender'] = clin_tab['submitter_id'].map(gender_by_case)

gender_codes = {'female': 1, 'male': 0}
clin_tab['gender_encoded'] = clin_tab['gender'].map(gender_codes)

In [None]:
clin_tab #.shape

In [None]:
expr_log_t = expr_log.T.reset_index()

In [None]:
expr_log_t.rename(columns={'index': 'submitter_id'}, inplace=True)

In [None]:
expr_log_t.head()

In [None]:
clin_tab.shape

In [None]:
expr_log_t.shape

In [None]:
df_tcga = pd.merge(clin_tab, expr_log_t, on='submitter_id')

In [None]:
df_tcga.shape

In [None]:
# Normalise the survival time to integers: strip stray apostrophes from the
# text form, parse as numbers (unparseable -> NaN), then cast to int.
time_text = df_tcga['time'].astype(str).str.replace("'", "")
df_tcga['time'] = pd.to_numeric(time_text, errors='coerce').astype(int)

In [None]:
df_tcga

In [None]:
df_tcga.to_csv('df_tcga.csv')

In [None]:
pd.set_option('display.max_columns', 10)
pd.set_option('display.max_rows', 10)

In [None]:
expr_df_filtered = expr_df.loc[:,~expr_df.columns.duplicated()].copy()

In [None]:
expr_df_filtered.shape

# Mes Drivers

In [None]:
mes_genes = pd.read_csv('mes_genes.csv', index_col='Unnamed: 0')

In [None]:
mes_drivers = mes_genes['0'].tolist()

In [None]:
len(mes_drivers)

In [None]:
mes_drivers = mes_drivers[:50]

In [None]:
def summarize(group):
    """Collapse all clinical rows of one case into a survival time and event flag.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows belonging to a single case. Must contain the columns
        'demographic.days_to_death', 'diagnoses.days_to_last_follow_up'
        and 'demographic.vital_status'.

    Returns
    -------
    pandas.Series
        'time'  : the largest days-to-death value if any is present, otherwise
                  the largest days-to-last-follow-up value, otherwise NaN.
        'event' : 1 if any row has vital_status == 'Dead', else 0.
    """
    def _as_days(column):
        # These columns can hold text (the table marks missing values with
        # "'--"), and max() on text compares lexicographically ("95" > "1200").
        # Convert to numbers first; anything non-numeric becomes NaN and is dropped.
        cleaned = group[column].astype(str).str.replace("'", "", regex=False)
        return pd.to_numeric(cleaned, errors='coerce').dropna()

    time_death = _as_days('demographic.days_to_death')
    time_follow = _as_days('diagnoses.days_to_last_follow_up')

    if len(time_death):
        time = time_death.max()
    elif len(time_follow):
        time = time_follow.max()
    else:
        time = np.nan

    # event: 1 if any row says 'Dead'
    event = int((group['demographic.vital_status'] == 'Dead').any())

    return pd.Series({'time': time, 'event': event})
# Turn the "'--" placeholder into real missing values, then reduce the table to
# one (time, event) row per case; cases without a usable time are dropped.
clin0 = clin0.replace(to_replace="'--", value=np.nan)
clin_df = (
    clin0
    .groupby('cases.submitter_id')
    .apply(summarize)
    .dropna(subset=['time'])
)

In [None]:
clin_tab = clin_df.reset_index().rename(columns={'cases.submitter_id':'submitter_id'})

In [None]:
clin_tab

In [None]:
# Keep only the cases that have an expression column.
# .copy() makes the result an independent frame: gene and gender columns are
# added to it below, and writing into a filtered slice triggers pandas'
# chained-assignment warning.
clin_tab = clin_tab[clin_tab.submitter_id.isin(expr_df_ids)].copy()

In [None]:
# Attach gender per case and a numeric encoding of it (female = 1, male = 0).
# When a case has several clinical rows, the last row's gender wins in the lookup.
gender_by_case = dict(zip(clin0['cases.submitter_id'], clin0['demographic.gender']))
clin_tab['gender'] = clin_tab['submitter_id'].map(gender_by_case)

gender_codes = {'female': 1, 'male': 0}
clin_tab['gender_encoded'] = clin_tab['gender'].map(gender_codes)

In [None]:
expr_df_filtered.head()

In [None]:
# Add one expression column per driver gene to the clinical table.
# Membership is tested against the row index directly; going through
# `.T.columns` would transpose the whole expression matrix on every iteration.
# NOTE(review): if a gene symbol occurs more than once in the index, .loc
# returns a DataFrame and .tolist() fails - confirm the symbols are unique.
available_genes = expr_df_filtered.index
for gene in mes_drivers:
    if gene not in available_genes:
        print(f'Not present: {gene}')
        continue
    clin_tab[gene] = expr_df_filtered.loc[gene, clin_tab.submitter_id].tolist()

In [None]:
clin_tab.shape

In [None]:
# Normalise the survival time to integers: strip stray apostrophes from the
# text form, parse as numbers (unparseable -> NaN), then cast to int.
time_text = clin_tab['time'].astype(str).str.replace("'", "")
clin_tab['time'] = pd.to_numeric(time_text, errors='coerce').astype(int)

In [None]:
clin_tab.shape

In [None]:
gene_cols = clin_tab.columns[clin_tab.columns.isin(mes_drivers)]

In [None]:
from sklearn.feature_selection import VarianceThreshold

# Discard gene columns whose variance does not exceed 1e-6, then assemble the
# modelling table: surviving genes plus gender, time and event.
vt = VarianceThreshold(threshold=1e-6)
vt.fit(clin_tab[gene_cols])
mask = vt.get_support()  # boolean array, one entry per column in gene_cols
kept_genes = [name for name, is_kept in zip(gene_cols, mask) if is_kept]

n_dropped = len(gene_cols) - len(kept_genes)
print(f"Dropped {n_dropped} zero‐variance genes.")

model_columns = kept_genes + ['gender_encoded','time','event']
clin_tab_filtered = clin_tab[model_columns].copy()

In [None]:
clin_tab_filtered.index = clin_tab.submitter_id

In [None]:
# Show all columns and rows in notebook output.
# Full option names are used: abbreviated keys only work through pandas'
# pattern matching and stop resolving if another option ever matches too.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [None]:
clin_tab_filtered.head()

In [None]:
# clin_tab_filtered.drop('gender_enc

In [None]:
# Fit a penalized Cox proportional-hazards model (penalizer = 0.1) on every
# column of clin_tab_filtered other than the duration and event columns.
from lifelines import CoxPHFitter

cph = CoxPHFitter(penalizer=0.1)
cph.fit(clin_tab_filtered, duration_col='time', event_col='event')

# Per-covariate result table; note this rebinds the notebook-level name `df`.
df = cph.summary

In [None]:
df.sort_values('p') #.columns

In [None]:
df[df.index.str.contains('HIC')]

In [None]:
# Hazard ratios, their 95% interval bounds and p-values for the kept genes
summary = cph.summary.loc[kept_genes, ['exp(coef)','exp(coef) lower 95%','exp(coef) upper 95%','p']]

# 5) Forest plot: one row per gene, square marker at the hazard ratio,
#    horizontal bar spanning the 95% interval, dashed reference line at HR = 1.
fig, ax = plt.subplots(figsize=(10,20))
y = np.arange(len(kept_genes))
hr    = summary['exp(coef)']
cil   = summary['exp(coef) lower 95%']
ciu   = summary['exp(coef) upper 95%']
ps    = summary['p']

ax.hlines(y, cil, ciu, color='black')
ax.scatter(hr, y, marker='s', color='black', s=50)
ax.vlines(1, -1, len(kept_genes), linestyles='dashed', color='gray')

ax.set_yticks(y)
ax.set_yticklabels(kept_genes)
ax.set_xlabel('Hazard Ratio (95% CI)')
ax.set_title('Multivariate CoxPH (adjusted for gender)')

# annotate p-values just right of each interval's upper bound.
# The Series are indexed by gene name, so an integer key such as ciu[i] only
# works through pandas' deprecated positional fallback; iterate over the
# values by position instead.
for y_pos, upper, p in zip(y, ciu.to_numpy(), ps.to_numpy()):
    ax.text(upper*1.02, y_pos, f"p={p:.3f}", va='center', fontsize=8)

plt.tight_layout()
plt.show()

In [None]:
clin_tab_filtered

In [None]:

from sklearn.preprocessing import StandardScaler
from lifelines import CoxPHFitter

# Fixed 50-gene signature used for the composite score
mes50 = [
    'NDN', 'C1R', 'MGP', 'MFAP4', 'COL14A1',
    'CCDC80', 'TCF4', 'CXCL12', 'KCNE4', 'ASPN',
    'THY1', 'SERPING1', 'OLFML3', 'NEXN', 'HIC1',
    'TCEAL7', 'OLFML1', 'DCN', 'FBN1', 'C1S',
    'ADGRA2', 'SERPINF1', 'SPARCL1', 'COL6A2', 'EMILIN1',
    'FRZB', 'IGFBP7', 'ANGPT1', 'JAM3', 'CALD1',
    'NLGN2', 'BGN', 'PCOLCE', 'TIMP3', 'PTH1R',
    'PCDH18', 'CPXM2', 'GXYLT2', 'COX7A1', 'ACVRL1',
    'COPZ2', 'PODN', 'COL3A1', 'PYGO1', 'A2M',
    'PRRX1', 'SPARC', 'C7', 'GSTM5', 'TCF21',
]

# Works on clin_tab_filtered (indexed by submitter_id; columns = gene
# expression values plus gender_encoded, time and event).
# Score = mean of the z-scored expression over the signature genes that are
# actually present as columns.
present = [gene for gene in mes50 if gene in clin_tab_filtered.columns]
scaler = StandardScaler()
Z = pd.DataFrame(
    scaler.fit_transform(clin_tab_filtered[present]),
    columns=present,
    index=clin_tab_filtered.index,
)
clin_tab_filtered["Mes50_score"] = Z.mean(axis=1)

# Unpenalized Cox model: composite score plus gender as the only covariates
covars = ["Mes50_score", "gender_encoded"]
cph = CoxPHFitter()
cph.fit(
    clin_tab_filtered[["time","event"] + covars],
    duration_col="time",
    event_col="event",
)
score_stats = ["coef","exp(coef)","p","coef lower 95%","coef upper 95%"]
print(cph.summary.loc["Mes50_score", score_stats])

# Radiotherapy vs. no radiotherapy

In [None]:
# Gene list analysed in this section (50 symbols, order preserved)
radio_dge = [
    'ZFAND3', 'ZBTB20', 'RORA', 'PARD3B', 'NAALADL2',
    'PRKG1', 'TRIO', 'FTX', 'LDLRAD4', 'FNDC3B',
    'MACF1', 'KIF26B', 'SMYD3', 'SPIDR', 'CDK14',
    'WWOX', 'EXT1', 'LPP', 'PARD3', 'RASAL2',
    'CUX1', 'DLG2', 'ZSWIM6', 'PRKCA', 'AGAP1',
    'TSHZ2', 'MYO1D', 'GLIS3', 'BNC2', 'LAMA2',
    'MAML2', 'BTBD9', 'PLXDC2', 'FAM155A', 'DOCK1',
    'FBXL7', 'PTPRG', 'VPS13B', 'PTPRM', 'UBE2E2',
    'PDE4D', 'TCF12', 'DIP2C', 'CACNA1C', 'PTK2',
    'ZFPM2', 'MYO1E', 'BICC1', 'FOXO1', 'AUTS2',
]

In [None]:
clin_tab

In [None]:
clin_tab.to_csv('TCGA_Radiotherapy/clin_tab.csv')

In [None]:
def summarize(group):
    """Collapse all clinical rows of one case into a survival time and event flag.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows belonging to a single case. Must contain the columns
        'demographic.days_to_death', 'diagnoses.days_to_last_follow_up'
        and 'demographic.vital_status'.

    Returns
    -------
    pandas.Series
        'time'  : the largest days-to-death value if any is present, otherwise
                  the largest days-to-last-follow-up value, otherwise NaN.
        'event' : 1 if any row has vital_status == 'Dead', else 0.
    """
    def _as_days(column):
        # These columns can hold text (the table marks missing values with
        # "'--"), and max() on text compares lexicographically ("95" > "1200").
        # Convert to numbers first; anything non-numeric becomes NaN and is dropped.
        cleaned = group[column].astype(str).str.replace("'", "", regex=False)
        return pd.to_numeric(cleaned, errors='coerce').dropna()

    time_death = _as_days('demographic.days_to_death')
    time_follow = _as_days('diagnoses.days_to_last_follow_up')

    if len(time_death):
        time = time_death.max()
    elif len(time_follow):
        time = time_follow.max()
    else:
        time = np.nan

    # event: 1 if any row says 'Dead'
    event = int((group['demographic.vital_status'] == 'Dead').any())

    return pd.Series({'time': time, 'event': event})
# Turn the "'--" placeholder into real missing values, then reduce the table to
# one (time, event) row per case; cases without a usable time are dropped.
clin0 = clin0.replace(to_replace="'--", value=np.nan)
clin_df = (
    clin0
    .groupby('cases.submitter_id')
    .apply(summarize)
    .dropna(subset=['time'])
)

In [None]:
clin_tab = clin_df.reset_index().rename(columns={'cases.submitter_id':'submitter_id'})

In [None]:
clin_tab

In [None]:
# Keep only the cases that have an expression column.
# .copy() makes the result an independent frame: gene and gender columns are
# added to it below, and writing into a filtered slice triggers pandas'
# chained-assignment warning.
clin_tab = clin_tab[clin_tab.submitter_id.isin(expr_df_ids)].copy()

In [None]:
# Attach gender per case and a numeric encoding of it (female = 1, male = 0).
# When a case has several clinical rows, the last row's gender wins in the lookup.
gender_by_case = dict(zip(clin0['cases.submitter_id'], clin0['demographic.gender']))
clin_tab['gender'] = clin_tab['submitter_id'].map(gender_by_case)

gender_codes = {'female': 1, 'male': 0}
clin_tab['gender_encoded'] = clin_tab['gender'].map(gender_codes)

In [None]:
expr_df_filtered = expr_df.loc[:,~expr_df.columns.duplicated()].copy()
expr_df_filtered.shape

In [None]:
expr_df_filtered.head()

In [None]:
# Add one expression column per gene of interest to the clinical table.
# Membership is tested against the row index directly; going through
# `.T.columns` would transpose the whole expression matrix on every iteration.
# NOTE(review): if a gene symbol occurs more than once in the index, .loc
# returns a DataFrame and .tolist() fails - confirm the symbols are unique.
available_genes = expr_df_filtered.index
for gene in radio_dge:
    if gene not in available_genes:
        print(f'Not present: {gene}')
        continue
    clin_tab[gene] = expr_df_filtered.loc[gene, clin_tab.submitter_id].tolist()

In [None]:
clin_tab.shape

In [None]:
# Normalise the survival time to integers: strip stray apostrophes from the
# text form, parse as numbers (unparseable -> NaN), then cast to int.
time_text = clin_tab['time'].astype(str).str.replace("'", "")
clin_tab['time'] = pd.to_numeric(time_text, errors='coerce').astype(int)

In [None]:
clin_tab.shape

In [None]:
gene_cols = clin_tab.columns[clin_tab.columns.isin(radio_dge)]

In [None]:
from sklearn.feature_selection import VarianceThreshold

# Discard gene columns whose variance does not exceed 1e-6, then assemble the
# modelling table: surviving genes plus gender, time and event.
vt = VarianceThreshold(threshold=1e-6)
vt.fit(clin_tab[gene_cols])
mask = vt.get_support()  # boolean array, one entry per column in gene_cols
kept_genes = [name for name, is_kept in zip(gene_cols, mask) if is_kept]

n_dropped = len(gene_cols) - len(kept_genes)
print(f"Dropped {n_dropped} zero‐variance genes.")

model_columns = kept_genes + ['gender_encoded','time','event']
clin_tab_filtered = clin_tab[model_columns].copy()

In [None]:
clin_tab_filtered.index = clin_tab.submitter_id

In [None]:
# Show all columns and rows in notebook output.
# Full option names are used: abbreviated keys only work through pandas'
# pattern matching and stop resolving if another option ever matches too.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [None]:
clin_tab_filtered.head()

In [None]:
# clin_tab_filtered.drop('gender_enc

In [None]:
# Fit a penalized Cox proportional-hazards model (penalizer = 0.1) on every
# column of clin_tab_filtered other than the duration and event columns.
from lifelines import CoxPHFitter

cph = CoxPHFitter(penalizer=0.1)
cph.fit(clin_tab_filtered, duration_col='time', event_col='event')

# Per-covariate result table; note this rebinds the notebook-level name `df`.
df = cph.summary

In [None]:
df.sort_values('p') #.columns

In [None]:
clin_tab.to_csv('TCGA_Radiotherapy/clin_tab_genes.csv')

# Save

In [None]:
# Save the Cox summary next to the other outputs of this section. The reload
# cell below reads it back from 'TCGA_Radiotherapy/radio_dge_tcga.csv', so
# writing it to the working directory would leave that read pointing at a
# file this notebook never produced.
df.to_csv('TCGA_Radiotherapy/radio_dge_tcga.csv')

# Reload

In [None]:
pwd

In [None]:
df = pd.read_csv('TCGA_Radiotherapy/radio_dge_tcga.csv')
clin_tab = pd.read_csv('TCGA_Radiotherapy/clin_tab_genes.csv')

In [None]:
df.columns

In [None]:
df.sort_values('p')

In [None]:
# Show the summary row(s) for TSHZ2.
# After reloading from CSV the covariate names live in a 'covariate' column and
# the index is a plain row number (which has no .str accessor); for the
# in-memory summary they are the index. Handle both layouts.
covariate_names = df['covariate'] if 'covariate' in df.columns else df.index.to_series()
df[covariate_names.str.contains('TSHZ2').to_numpy()]

In [None]:
# Hazard ratios, their 95% interval bounds and p-values for the kept genes.
# NOTE(review): `cph` and `kept_genes` come from the fitting cells earlier in
# the notebook; the reload cell above only restores `df` and `clin_tab`, so
# this cell needs those earlier cells to have run in the same kernel.
summary = cph.summary.loc[kept_genes, ['exp(coef)','exp(coef) lower 95%','exp(coef) upper 95%','p']]

# 5) Forest plot: one row per gene, square marker at the hazard ratio,
#    horizontal bar spanning the 95% interval, dashed reference line at HR = 1.
fig, ax = plt.subplots(figsize=(10,20))
y = np.arange(len(kept_genes))
hr    = summary['exp(coef)']
cil   = summary['exp(coef) lower 95%']
ciu   = summary['exp(coef) upper 95%']
ps    = summary['p']

ax.hlines(y, cil, ciu, color='black')
ax.scatter(hr, y, marker='s', color='black', s=50)
ax.vlines(1, -1, len(kept_genes), linestyles='dashed', color='gray')

ax.set_yticks(y)
ax.set_yticklabels(kept_genes)
ax.set_xlabel('Hazard Ratio (95% CI)')
ax.set_title('Multivariate CoxPH (adjusted for gender)')

# annotate p-values just right of each interval's upper bound.
# The Series are indexed by gene name, so an integer key such as ciu[i] only
# works through pandas' deprecated positional fallback; iterate over the
# values by position instead.
for y_pos, upper, p in zip(y, ciu.to_numpy(), ps.to_numpy()):
    ax.text(upper*1.02, y_pos, f"p={p:.3f}", va='center', fontsize=8)

plt.tight_layout()
plt.show()

In [None]:
from lifelines import KaplanMeierFitter
from lifelines.statistics import logrank_test

def km_plot_gene(df, gene, cut="median", ax=None):
    """Draw Kaplan-Meier curves for a High/Low split on one gene.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the column named by `gene` plus "time" and "event".
    gene : str
        Column used to split the rows.
    cut : "median" or number
        "median" splits at the column median; any other value is converted
        with float() and used as the threshold.
    ax : matplotlib Axes, optional
        Axes to draw on; a new 5x4 inch figure is created when omitted.

    Returns
    -------
    (ax, p, tmp)
        The Axes drawn on, the log-rank p-value of High vs Low, and a copy of
        `df` with an added "group" column ("High" where value > threshold,
        otherwise "Low").
    """
    values = df[gene]
    # Only the threshold differs between the two modes; the split rule is shared.
    thresh = values.median() if cut == "median" else float(cut)
    tmp = df.assign(group=np.where(values > thresh, "High", "Low"))

    if ax is None:
        _, ax = plt.subplots(figsize=(5,4), dpi=120)

    # One fitted curve (with confidence band) per group, drawn on the same Axes
    kmf = KaplanMeierFitter()
    members = {}
    for label in ("High", "Low"):
        in_group = tmp["group"] == label
        members[label] = in_group
        kmf.fit(durations=tmp.loc[in_group, "time"],
                event_observed=tmp.loc[in_group, "event"],
                label=f"{label} {gene} (n={in_group.sum()})")
        kmf.plot(ax=ax, ci_show=True)

    # log-rank test between the two groups
    hi, lo = members["High"], members["Low"]
    res = logrank_test(tmp.loc[hi, "time"], tmp.loc[lo, "time"],
                       event_observed_A=tmp.loc[hi, "event"],
                       event_observed_B=tmp.loc[lo, "event"])
    p = res.p_value

    ax.set_title(f"{gene}: High vs Low (cut={thresh:.3g})")
    ax.set_xlabel("Time")
    ax.set_ylabel("Survival probability")
    ax.legend(frameon=False)
    # p-value in the lower-left corner, in axes coordinates
    ax.text(0.02, 0.02, f"log-rank p = {p:.3g}", transform=ax.transAxes)

    return ax, p, tmp

In [None]:
fig, axes = plt.subplots(1, 1, figsize=(10,4), dpi=300, sharey=True)
ax, p, tmp = km_plot_gene(clin_tab, "MYO1E", ax=axes)
fig.savefig('myo1e_tcga.png')

In [None]:
fig, axes = plt.subplots(1, 1, figsize=(10,4), dpi=120, sharey=True)
ax, p, tmp = km_plot_gene(clin_tab, "TSHZ2", ax=axes)

In [None]:
worse_survival = df[df["coef"] > 0].sort_values(by='p')

In [None]:
worse_survival[:4].covariate.tolist()

In [None]:
df.sort_values(by='p')[:4].index.tolist()

In [None]:
# 2x2 grid of Kaplan-Meier plots, one panel per gene, saved as a PNG
genes = ['TSHZ2', 'MYO1E', 'LPP', 'CDK14']

fig, axs = plt.subplots(nrows=2, ncols=2, figsize=(10, 8), dpi=300, sharey=True)
axs = axs.flatten()

for panel, gene in zip(axs, genes):
    ax, p, tmp = km_plot_gene(clin_tab, gene, ax=panel)
    ax.set_title(f"{gene}")  # replace the default title with just the gene name

fig.tight_layout()
plt.savefig('TCGA_Radiotherapy/top_four.png', dpi=300)

In [None]:
# 2x2 grid of Kaplan-Meier plots, one panel per gene, saved as a PNG
genes = ['MYO1E', 'LPP', 'CDK14', 'ZBTB20']

fig, axs = plt.subplots(nrows=2, ncols=2, figsize=(10, 8), dpi=300, sharey=True)
axs = axs.flatten()

for panel, gene in zip(axs, genes):
    ax, p, tmp = km_plot_gene(clin_tab, gene, ax=panel)
    ax.set_title(f"{gene}")  # replace the default title with just the gene name

fig.tight_layout()
plt.savefig('TCGA_Radiotherapy/four_worse.png', dpi=300)