In [7]:
import pandas as pd
import numpy as np
from sklearn.decomposition import NMF
import os


data_path = 'C:/Users/WooSikKim/Desktop/Research/projects/co_pathology/scripts/stage_copath/data'

inp_df = pd.read_csv(os.path.join(data_path,'train_data/260128_wsev_smc_combined_cn_included.csv'))
inp_df = inp_df[inp_df['DX']!='HC'] # EXCLUDE WSEV HC

## DOWNSAMPLE LARGE DX 
N = 25
dx_col = "DX"
balanced_parts = []

for dx, g in inp_df.groupby(dx_col):
    if dx == 'AD':
        N=50
    elif dx == 'NC':
        N=50
    else: 
        N=25
    if len(g) > N:
        g = g.sample(n=N, replace=False, random_state=42)
    balanced_parts.append(g)

train_df = pd.concat(balanced_parts).reset_index(drop=True)


#### add mci to AD ####
# train_df['DX'] = train_df['DX'].replace({'MCI' : 'AD'})

print(train_df[dx_col].value_counts())
region_cols = train_df.loc[:, 'VA/2':'VA/2035'].columns

DX
AD        50
NC        50
DLB       25
bvFTD     25
nfvPPA    25
svPPA     25
PD        24
SVAD      24
Name: count, dtype: int64


In [None]:
n_topics = 8  # start small, tune later

X = train_df[region_cols]
dx = train_df['DX']
print(X.shape)

nmf = NMF(
    n_components=n_topics,
    init="nndsvda",
    solver="cd",
    max_iter=2000,
    random_state=42
)
W = nmf.fit_transform(X)    # subjects × topics
H = nmf.components_         # topics × ROIs
Hn = H / (H.sum(axis=1, keepdims=True) + 1e-12)  # row-L1 normalize

W_df = pd.DataFrame(
    W,
    columns=[f"Topic_{k+1}" for k in range(n_topics)]
)

H_df = pd.DataFrame(
    H,
    columns=region_cols,
    index=[f"Topic_{k+1}" for k in range(n_topics)]
)
print(W_df.shape)
print(H_df.shape)


In [None]:
W_df["DX"] = dx.values

# mean topic expression per diagnosis
topic_means = W_df.groupby("DX").mean()

# simple AD vs non-AD contrast
ad_mean = topic_means.loc["AD"]
non_ad_mean = topic_means.drop(index="AD").mean()

ad_contrast = ad_mean - non_ad_mean
ad_contrast = ad_contrast.sort_values(ascending=False)

print(ad_contrast)

ad_topics = ad_contrast.index[:2].tolist()
# ad_topics = ['Topic_8','Topic_7']
print("AD-related topics:", ad_topics)

ad_topic_idx = [int(t.split("_")[1]) - 1 for t in ad_topics]

print(ad_topic_idx)

In [None]:
H_df.loc[ad_topics].T.plot(kind="bar", figsize=(18, 4))
H_df.loc[ad_contrast.index.tolist()].T.plot(kind="bar", figsize=(18, 4))
W_df.boxplot(column=ad_topics, by="DX", figsize=(15, 4))
W_df.boxplot(column=ad_contrast.index.tolist(), by="DX", figsize=(15, 10))


In [1]:
## RADAR PLOT ## 
import numpy as np
import matplotlib.pyplot as plt

topic_cols = [c for c in W_df.columns if c.startswith("Topic_")]
dx_means = W_df.groupby("DX")[topic_cols].mean()
def plot_radar_subplots(dx_means, n_rows=3):
    labels = dx_means.columns.tolist()
    n_topics = len(labels)

    angles = np.linspace(0, 2 * np.pi, n_topics, endpoint=False)
    angles = np.concatenate([angles, [angles[0]]])

    dx_list = dx_means.index.tolist()
    n_plots = len(dx_list) + 1  # +1 for composite

    # grid layout
    n_cols = int(np.ceil(n_plots / n_rows))

    # consistent colors per DX
    cmap = plt.get_cmap("tab10")
    dx_colors = {dx: cmap(i) for i, dx in enumerate(dx_list)}

    fig, axes = plt.subplots(
        n_rows, n_cols,
        figsize=(4 * n_cols, 4 * n_rows),
        subplot_kw=dict(polar=True)
    )

    axes = axes.flatten()

    # individual DX plots
    for i, dx in enumerate(dx_list):
        ax = axes[i]
        values = dx_means.loc[dx].values
        values = np.concatenate([values, [values[0]]])

        ax.plot(
            angles, values,
            linewidth=2,
            color=dx_colors[dx]
        )
        ax.fill(
            angles, values,
            color=dx_colors[dx],
            alpha=0.25
        )

        ax.set_title(dx, y=1.1)
        ax.set_thetagrids(angles[:-1] * 180 / np.pi, labels)
        ax.grid(True)

    # composite plot (last)
    ax = axes[len(dx_list)]
    for dx in dx_list:
        values = dx_means.loc[dx].values
        values = np.concatenate([values, [values[0]]])

        ax.plot(
            angles, values,
            linewidth=2,
            color=dx_colors[dx],
            label=dx
        )

    ax.set_title("Composite", y=1.1)
    ax.set_thetagrids(angles[:-1] * 180 / np.pi, labels)
    ax.legend(bbox_to_anchor=(1.3, 1.1))
    ax.grid(True)

    # turn off unused axes
    for j in range(len(dx_list) + 1, len(axes)):
        fig.delaxes(axes[j])

    plt.tight_layout()
    plt.show()


plot_radar_subplots(dx_means)



NameError: name 'W_df' is not defined

In [2]:
## Topic Barplot ##
dkt_labels = pd.read_csv('C:/Users/WooSikKim/Desktop/Research/projects/co_pathology/scripts/stage_copath/data/dkt_labels.csv')
rois = dkt_labels.iloc[0].tolist()

def plot_all_topics_top_rois(H_df, region_names, top_n=10, n_cols=4):
    topics = H_df.index.tolist()
    n_topics = len(topics)
    n_rows = int(np.ceil(n_topics / n_cols))

    cmap = plt.get_cmap("tab10")

    fig, axes = plt.subplots(
        n_rows, n_cols,
        figsize=(4 * n_cols, 3 * n_rows),
        squeeze=False
    )

    for i, topic in enumerate(topics):
        row = i // n_cols
        col = i % n_cols
        ax = axes[row, col]

        color = cmap(i % cmap.N)

        weights = H_df.loc[topic]
        indices = np.argsort(weights)[::-1][:top_n]
        regions = [region_names[i] for i in indices]
        top_rois = weights.sort_values(ascending=False).head(top_n)

        top_rois[::-1].plot(
            kind="barh",
            ax=ax,
            color=color
        )

        ax.set_title(topic)
        ax.set_yticklabels(regions[::-1], fontsize=9)
        # ax.set_xlabel("NMF Weight")

    # remove empty axes
    for j in range(i + 1, n_rows * n_cols):
        fig.delaxes(axes[j // n_cols, j % n_cols])

    plt.tight_layout()
    plt.show()


plot_all_topics_top_rois(H_df, region_names=rois, top_n=10, n_cols=4)



NameError: name 'pd' is not defined

**NACC Correction**

In [None]:
from LDA_XGB.data_processor import *
nacc_df_orig = pd.read_csv(data_path+'/nacc/nacc_stage_external/NACC_external_260206.csv')
nacc_df_param = pd.read_csv(data_path+'/nacc/nacc_stage_external/NACC_external_260206.LDA.topic_added_k_8_param.csv')
nacc_df_correct = pd.read_csv(data_path+'/nacc/nacc_stage_external/NACC_external_260206.LDA.topic_added_k_8_a_1_b_01.csv')
# print(nacc_df_orig.shape)

inf_df = nacc_df_orig.dropna(subset=region_cols)
print(inf_df.shape)

prep = DataProcessor(region_cols=region_cols, dx_col='DX', subject_col='FULL_ID')
cn_df = inf_df[inf_df['DX']=='CN']
prep.fit_baseline(hc_data=cn_df)
X_new_z = prep.compute_atrophy_scores(data=inf_df)
print(X_new_z.shape)
X_new_raw = inf_df[region_cols].values

W_new = nmf.transform(X_new_z)
print(W_new.shape)

X_new_raw_weighted = []

for i in range(X_new_raw.shape[0]):
    x_i = X_new_raw[i]          # (95,)
    w_i = W_new[i]          # (K,)

    # subject-specific AD weight vector
    ad_weight_i = np.zeros_like(x_i)

    for k in ad_topic_idx:
        # ad_weight_i += w_i[k] * H[k]
        ad_weight_i += w_i[k] * Hn[k]
        # print(ad_weight_i)

    # # optional stabilization (recommended)
    # ad_weight_i = ad_weight_i / (ad_weight_i.mean() + 1e-8)

    # reweight original atrophy
    x_i_weighted = x_i * ad_weight_i
    X_new_raw_weighted.append(x_i_weighted)

X_new_raw_weighted = np.vstack(X_new_raw_weighted)
print(X_new_raw_weighted.shape)

In [None]:
# assume you already have CN baseline mean/std per ROI from the SAME definition used for z_inv
mu = prep._hc_mean          # (n_roi,) if your DataProcessor exposes it
sigma = prep._hc_std    # (n_roi,)
sigma = np.asarray(sigma).reshape(-1)   # (95,)
mu = np.asarray(mu).reshape(-1)         # (95,)

# 1) AD reconstruction in z_inv space
H = nmf.components_                 # (K, n_roi)
W_new = nmf.transform(X_new_z)      # (n_subj, K)

ad_recon_z = W_new[:, ad_topic_idx] @ H[ad_topic_idx, :]   # (n_subj, n_roi)

# 2) convert to raw loss
ad_loss_raw = ad_recon_z * sigma[None, :]                  # (n_subj, n_roi)

# 3) make raw-space features
ad_loss_cols = [c.replace("VA/", "VA_adloss/") for c in region_cols]
inf_df[ad_loss_cols] = ad_loss_raw

# optional: AD-expected volume map
vol_ad = mu[None, :] - ad_loss_raw
vol_ad_cols = [c.replace("VA/", "VA_ADpred/") for c in region_cols]
inf_df[vol_ad_cols] = vol_ad

In [None]:
weighted_cols = [c.replace("VA/", "VA_nmf/") for c in region_cols]

X_new_weighted_df = pd.DataFrame(
    X_new_raw_weighted,
    columns=weighted_cols,
    index=inf_df.index
)

inf_df_weighted = pd.concat(
    [inf_df, X_new_weighted_df],
    axis=1
)

print(inf_df_weighted.shape)


**RAW VA NMF**

In [5]:
import pandas as pd
df_wsev = pd.read_csv("C:/Users/WooSikKim/Desktop/Research/projects/co_pathology/scripts/stage_copath/data/260108_wsev_final_df.csv")
df_smc = pd.read_csv('C:/Users/WooSikKim/Desktop/Research/projects/co_pathology/scripts/stage_copath/data//SMC_AD_FTD_VA_final.csv')
region_cols = df_smc.loc[:, 'VA/2':'VA/2035'].columns.to_list()

raw_va_df = pd.concat(
    [df_wsev, df_smc],
    axis=0,
    ignore_index=True
)

raw_va_df = raw_va_df[['PTID', 'DX'] + region_cols]

raw_va_df = raw_va_df.dropna()
print("Combined shape:", raw_va_df.shape)
print(raw_va_df['DX'].value_counts())

## DOWNSAMPLE LARGE DX 
N = 25
dx_col = "DX"
balanced_parts = []

for dx, g in raw_va_df.groupby(dx_col):
    if dx == 'AD':
        N=50
    elif dx == 'NC':
        N=50
    else: 
        N=25
    if len(g) > N:
        g = g.sample(n=N, replace=False, random_state=42)
    balanced_parts.append(g)

raw_va_train_df = pd.concat(balanced_parts).reset_index(drop=True)
raw_va_train_df = raw_va_train_df[raw_va_train_df['DX']!='HC']
raw_va_train_df = raw_va_train_df[raw_va_train_df['DX']!='NC']

print(raw_va_train_df[dx_col].value_counts())
region_cols = raw_va_train_df.loc[:, 'VA/2':'VA/2035'].columns

Combined shape: (581, 97)
DX
NC        166
AD        103
svPPA      59
PD         56
bvFTD      53
HC         49
nfvPPA     46
DLB        28
SVAD       21
Name: count, dtype: int64
DX
AD        50
DLB       25
PD        25
bvFTD     25
nfvPPA    25
svPPA     25
SVAD      21
Name: count, dtype: int64


In [11]:
n_topics = 8  # start small, tune later

X = raw_va_train_df[region_cols]
dx = raw_va_train_df['DX']
print(X.shape)

nmf = NMF(
    n_components=n_topics,
    init="nndsvda",
    solver="cd",
    max_iter=2000,
    random_state=42
)
W = nmf.fit_transform(X)    # subjects × topics
H = nmf.components_         # topics × ROIs
Hn = H / (H.sum(axis=1, keepdims=True) + 1e-12)  # row-L1 normalize

W_df = pd.DataFrame(
    W,
    columns=[f"Topic_{k+1}" for k in range(n_topics)]
)

H_df = pd.DataFrame(
    Hn,
    columns=region_cols,
    index=[f"Topic_{k+1}" for k in range(n_topics)]
)
print(W_df.shape)
print(H_df.shape)


(196, 95)
(196, 8)
(8, 95)


In [12]:
W_df["DX"] = dx.values

# mean topic expression per diagnosis
topic_means = W_df.groupby("DX").mean()

# simple AD vs non-AD contrast
ad_mean = topic_means.loc["AD"]
non_ad_mean = topic_means.drop(index="AD").mean()

ad_contrast = ad_mean - non_ad_mean
ad_contrast = ad_contrast.sort_values(ascending=True)

print(ad_contrast)

# ad_topics = ad_contrast.index[-2:].tolist()
ad_topics = ad_contrast[ad_contrast<0].index.to_list()
print("AD-related topics:", ad_topics)

ad_topic_idx = [int(t.split("_")[1]) - 1 for t in ad_topics]

print(ad_topic_idx)

Topic_8   -0.023353
Topic_6   -0.022481
Topic_5   -0.003467
Topic_1    0.000880
Topic_2    0.001471
Topic_4    0.003086
Topic_3    0.004882
Topic_7    0.042390
dtype: float64
AD-related topics: ['Topic_8', 'Topic_6', 'Topic_5']
[7, 5, 4]


In [10]:
topic_cols = [c for c in W_df.columns if c.startswith("Topic_")]
dx_means = W_df.groupby("DX")[topic_cols].mean()
plot_radar_subplots(dx_means)
plot_all_topics_top_rois(H_df, region_names=rois, top_n=10, n_cols=4)

NameError: name 'plot_radar_subplots' is not defined

In [29]:
## RAW VA NMF INFERENCE - NACC ##
from LDA_XGB.data_processor import *
nacc_df_orig = pd.read_csv(data_path+'/nacc/nacc_stage_external/NACC_external_260206.csv')
nacc_df_param = pd.read_csv(data_path+'/nacc/nacc_stage_external/NACC_external_260206.LDA.topic_added_k_8_param.csv')
nacc_df_correct = pd.read_csv(data_path+'/nacc/nacc_stage_external/NACC_external_260206.LDA.topic_added_k_8_a_1_b_01.csv')
# print(nacc_df_orig.shape)

inf_df = nacc_df_correct
inf_df = inf_df.dropna(subset=region_cols)

print(inf_df.shape)
X_new = inf_df[region_cols].values
print(X_new.shape)
W_new = nmf.transform(X_new)
print(W_new.shape)

X_ad_topic = W_new[:, ad_topic_idx] @ Hn[ad_topic_idx, :]
print("AD-topic reconstruction shape:", X_ad_topic.shape)

ad_weights = X_ad_topic / (
    X_ad_topic.mean(axis=1, keepdims=True) + 1e-8
)

print("AD weights min/max:",
      ad_weights.min(),
      ad_weights.max())

X_new_va_weighted = X_new * ad_weights
print("Weighted VA shape:", X_new_va_weighted.shape)

weighted_cols = [c.replace("VA/", "VA_nmf/") for c in region_cols]

X_new_va_weighted_df = pd.DataFrame(
    X_new_va_weighted,
    columns=weighted_cols,
    index=inf_df.index
)

inf_df_nmf = pd.concat(
    [inf_df, X_new_va_weighted_df],
    axis=1
)
print(inf_df_nmf.shape)


(7795, 125)
(7795, 95)
(7795, 8)
AD-topic reconstruction shape: (7795, 95)
AD weights min/max: 0.0 3.298457902368474
Weighted VA shape: (7795, 95)
(7795, 220)




In [None]:
inf_df_nmf.to_csv('C:/Users/WooSikKim/Desktop/Research/projects/co_pathology/scripts/stage_copath/data/nacc/nacc_stage_external/NACC_external_260206.LDA.topic_added_k_8_a_1_b_01_nmf_adjusted.csv', index=False)
# inf_df_nmf.to_csv('C:/Users/WooSikKim/Desktop/Research/projects/co_pathology/scripts/stage_copath/data/nacc/nacc_stage_external/NACC_external_260206.LDA.topic_added_k_8_param_nmf_adjusted.csv', index=False)
# inf_df_nmf.to_csv('C:/Users/WooSikKim/Desktop/Research/projects/co_pathology/scripts/stage_copath/data/nacc/nacc_stage_external/NACC_external_260206_nmf_adjusted.csv', index=False)

In [None]:
## RAW VA PCA ##
from sklearn.decomposition import PCA

pca = PCA(n_components=8, svd_solver="full", random_state=42)
scores = pca.fit_transform(X)     # subjects × 8
loadings = pca.components_.T        # ROIs × 8

pc_names = [f"PC{i+1}" for i in range(8)]

scores_df = pd.DataFrame(
    scores,
    columns=pc_names,
)

loadings_df = pd.DataFrame(
    loadings,
    columns=pc_names,
)

print(scores_df.shape)
print(loadings_df.shape)

scores_df['DX'] = dx.values

In [None]:
scores_df["DX"] = dx.values

# mean topic expression per diagnosis
topic_means = scores_df.groupby("DX").mean()

# simple AD vs non-AD contrast
ad_mean = topic_means.loc["AD"]
non_ad_mean = topic_means.drop(index="AD").mean()

ad_contrast = ad_mean - non_ad_mean
ad_contrast = ad_contrast.sort_values(ascending=True)

print(ad_contrast)

# ad_topics = ad_contrast.index[-2:].tolist()
ad_topics = ad_contrast[ad_contrast<0].index.to_list()
print("AD-related topics:", ad_topics)

ad_topic_idx = [int(t.split("_")[1]) - 1 for t in ad_topics]

print(ad_topic_idx)

In [None]:
topic_cols = [c for c in scores_df.columns if c.startswith("PC")]
dx_means = scores_df.groupby("DX")[topic_cols].mean()
plot_radar_subplots(dx_means)
plot_all_topics_top_rois(loadings_df.T, region_names=rois, top_n=10, n_cols=4)