In [None]:
import os
import numpy as np
import pandas as pd
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn.model_selection import StratifiedKFold, cross_val_predict
import matplotlib.pyplot as plt

# Region-set selector: decides at which column the regional slice starts.
regions = 'dkt95'
# regions = 'dkt62'

# First column label of the regional slice for each supported `regions` value;
# every slice ends at 'VA/2035' (inclusive).
_REGION_FIRST_COL = {'dkt62': 'VA/1002', 'dkt95': 'VA/2'}
_REGION_LAST_COL = 'VA/2035'

if regions not in _REGION_FIRST_COL:
    # An unsupported value used to skip every if/elif branch and only surface
    # later as a NameError; fail immediately with a readable message instead.
    raise ValueError(
        f"Unknown regions={regions!r}; expected one of {sorted(_REGION_FIRST_COL)}"
    )


def _select_region_cols(df):
    """Return the column labels of `df` from the configured first region column
    through 'VA/2035' (label-based slice, both ends inclusive)."""
    return df.loc[:, _REGION_FIRST_COL[regions]:_REGION_LAST_COL].columns


def _reference_stats(ref_df, cols, label):
    """Compute per-column reference statistics from `ref_df[cols]`.

    Rows with a missing value in any of `cols` are excluded, because a single
    NaN would otherwise turn that column's mean/std (and every score derived
    from them) into NaN.

    Prints "<label>: <n> subjects" for the rows actually used.

    Returns
    -------
    (X_ref, ref_mean, ref_std) where `ref_mean` and `ref_std` have shape
    (1, len(cols)) and `ref_std` already includes a 1e-8 offset.

    Raises
    ------
    KeyError   if `ref_df` lacks any of `cols`.
    ValueError if no complete reference rows remain.
    """
    X_ref = ref_df[cols].dropna().values.astype(float)
    print(f"{label}: {X_ref.shape[0]} subjects")
    if X_ref.shape[0] == 0:
        raise ValueError(f"No complete {label} rows available to compute reference statistics")
    ref_mean = X_ref.mean(axis=0, keepdims=True)
    ref_std = X_ref.std(axis=0, keepdims=True) + 1e-8  # avoid divide-by-zero
    return X_ref, ref_mean, ref_std


def _deficit_scores(X, ref_mean, ref_std):
    """Return (Z, scores): Z is `X` standardised against the reference stats;
    scores is max(-Z, 0), i.e. values below the reference mean become positive
    and values at or above it become 0."""
    Z = (X - ref_mean) / ref_std
    # np.maximum already clamps at 0, so no further masking of negatives is needed.
    return Z, np.maximum(-Z, 0.0)


## WSEV data prep
new_wsev = pd.read_csv("C:/Users/WooSikKim/Desktop/Research/projects/co_pathology/scripts/stage_copath/data/260108_wsev_final_df.csv")
hc_df = new_wsev[new_wsev['DX'] == 'HC']

df_wsev = pd.read_csv('C:/Users/WooSikKim/Desktop/Research/projects/co_pathology/scripts/stage_copath/data/wsev_old_100_data.csv')
df_wsev = df_wsev.dropna()
print(df_wsev['DX'].value_counts())

region_cols = _select_region_cols(df_wsev)
# Select the HC columns with the *same* labels as the subject matrix so the
# reference mean/std line up column-for-column with X_wsev.
X_hc, hc_mean, hc_std = _reference_stats(hc_df, region_cols, "HC")

# HC rows from the newer file are appended to the cohort table before scoring.
df_wsev = pd.concat([df_wsev, hc_df], axis=0, ignore_index=True)
X_wsev = df_wsev[region_cols].values.astype(float)
print(f"Patients: {X_wsev.shape[0]} subjects")

Z_wsev, X_wsev = _deficit_scores(X_wsev, hc_mean, hc_std)

# Remember the WSEV column list: `region_cols` is reassigned below.
wsev_region_cols = list(region_cols)

## SMC data prep
df_smc = pd.read_csv('C:/Users/WooSikKim/Desktop/Research/projects/co_pathology/scripts/stage_copath/data//SMC_AD_FTD_VA_final.csv')
df_nc = df_smc[df_smc['DX'] == 'NC']
# df_smc_pat = df_smc[df_smc['DX'] != 'NC']
df_smc_pat = df_smc # include NC
print(df_smc_pat['DX'].value_counts())

region_cols = _select_region_cols(df_smc_pat)
if list(region_cols) != wsev_region_cols:
    # After this cell only one `region_cols` survives, and it is used to label
    # both X_wsev and X_smc; differing lists would mislabel the WSEV columns.
    raise ValueError(
        "WSEV and SMC select different region columns "
        f"({len(wsev_region_cols)} vs {len(region_cols)}); align the two files first"
    )

X_nc, nc_mean, nc_std = _reference_stats(df_nc, region_cols, "NC")

X_smc = df_smc_pat[region_cols].values.astype(float)
print(f"Patients: {X_smc.shape[0]} subjects")

Z_smc, X_smc = _deficit_scores(X_smc, nc_mean, nc_std)


In [None]:
## combine cohorts (WSEV and SMC)
# Both score matrices are labelled with the `region_cols` left behind by the
# preceding cell, then stacked into a single table.
meta_cols = ["SUBJ_ID", "DX"]
roi_cols = list(region_cols)


def _label_scores(score_matrix, cohort_df):
    """Wrap `score_matrix` in a DataFrame with `region_cols` as column names and
    prepend SUBJ_ID (taken from `cohort_df['PTID']`) and DX as the first two columns."""
    labelled = pd.DataFrame(score_matrix, columns=region_cols)
    labelled.insert(0, "SUBJ_ID", cohort_df["PTID"].values)
    labelled.insert(1, "DX", cohort_df["DX"].values)
    return labelled


# WSEV cohort: enforce the meta-then-ROI column order explicitly.
df_wsev_x = _label_scores(X_wsev, df_wsev)[meta_cols + roi_cols]

# SMC cohort: same layout.
# Optional (disabled): df_smc_x = downsample_to_n_per_class(df_smc)
df_smc_x = _label_scores(X_smc, df_smc_pat)[meta_cols + roi_cols]

# Stack the cohorts with a fresh 0..n-1 index, then discard every row that
# still contains a missing value in any column.
df_combined = pd.concat(
    [df_wsev_x, df_smc_x], axis=0, ignore_index=True
).dropna()

print("Combined shape:", df_combined.shape)
print(df_combined['DX'].value_counts())


In [None]:
# Write the combined table to CSV without the row index.
combined_csv_path = 'C:/Users/WooSikKim/Desktop/Research/projects/co_pathology/scripts/stage_copath/data/260128_wsev_smc_combined_cn_included.csv'
df_combined.to_csv(combined_csv_path, index=False)