## CREATE AND SAVE LDA+XGB MODELS

In [None]:
import pandas as pd
import numpy as np
from LDA_XGB.data_processor import *
from LDA_XGB.lda_model import LDATopicModel
from LDA_XGB.classifier import TopicClassifier
from LDA_XGB.visualizer import *
from LDA_XGB.pipeline import CopathologyPipeline
from LDA_XGB.brain_visualizer import *
import matplotlib.pyplot as plt

def class_balance(inp_df, class_col, n=25, special_care=('AD', 'NC'), special_n=50,
                  random_state=42):
    """Down-sample every class of ``inp_df`` to at most a fixed number of rows.

    Rows are grouped by ``class_col``. A group whose label appears in
    ``special_care`` is capped at ``special_n`` rows; every other group is
    capped at ``n`` rows. Groups at or under their cap are kept whole, larger
    groups are reduced by sampling without replacement.

    Args:
        inp_df: DataFrame with one row per sample.
        class_col: Name of the column holding the class label.
        n: Row cap for ordinary classes.
        special_care: Labels that use ``special_n`` as their cap. Any
            container supporting ``in`` works (list, tuple, set).
        special_n: Row cap for the ``special_care`` classes.
        random_state: Seed forwarded to ``DataFrame.sample``. The default is
            the value that used to be hard-coded, so existing calls draw the
            same rows as before.

    Returns:
        A new DataFrame with the retained rows, concatenated group by group
        and re-indexed from 0. Rows with a missing class label are not part
        of any group (pandas ``groupby`` default) and are therefore absent.
        When no group exists at all, an empty frame with the input's columns
        is returned instead of letting ``pd.concat`` fail on an empty list.
    """
    kept_groups = []
    for label, group in inp_df.groupby(class_col):
        cap = special_n if label in special_care else n
        if len(group) > cap:
            group = group.sample(n=cap, replace=False, random_state=random_state)
        kept_groups.append(group)

    if not kept_groups:
        # Empty input (or only missing labels): pd.concat([]) would raise.
        return inp_df.iloc[0:0].reset_index(drop=True)

    return pd.concat(kept_groups).reset_index(drop=True)

def lda_k_plots(k_list):
    """Unfinished placeholder; currently does nothing useful.

    The body only creates an empty ``perplexities`` list and then implicitly
    returns ``None``. ``k_list`` is not read yet.

    NOTE(review): presumably ``k_list`` is meant to hold candidate topic
    counts and the function is meant to plot a per-K diagnostic such as
    perplexity - confirm the intent before completing or removing it.
    """
    perplexities = []


**Model Initialization and Train Data**

In [None]:
## WSEV, SMC TRAIN DATA - NC INCLUDED ##
# Builds the shared training frames and LDA hyper-parameters used by the
# model cells further down.

# `os.path.join` is used here and in the model cells; import it explicitly
# instead of depending on it leaking in through one of the wildcard imports.
import os

data_path = 'C:/Users/WooSikKim/Desktop/Research/projects/co_pathology/scripts/stage_copath/data'

# ROI names handed to the visualizations: the first data row of dkt_labels.csv.
dkt_labels = pd.read_csv(os.path.join(data_path, 'dkt_labels.csv'))
dkt_rois = dkt_labels.iloc[0].tolist()

train_df = pd.read_csv(os.path.join(data_path,'train_data/260128_wsev_smc_combined_cn_included.csv'))
train_df = train_df[train_df['DX']!='HC'] # EXCLUDE WSEV HC

# All-diagnosis training set, balanced with class_balance's default caps.
train_all_dx = class_balance(train_df, class_col='DX')
print(train_all_dx['DX'].value_counts())

# Three-class training set: AD and NC keep their own label (capped at 100
# rows each), every other diagnosis is capped at 25 rows and relabelled
# 'non-AD' in the new AD_label column.
train_ad_non_ad = class_balance(train_df, class_col='DX', n=25, special_care=['AD','NC'], special_n=100)
train_ad_non_ad['AD_label'] = np.where(train_ad_non_ad['DX'].isin(['AD','NC']), train_ad_non_ad['DX'], 'non-AD')
print(train_ad_non_ad['AD_label'].value_counts())

# Number of topics, and the alpha/beta values (both 1/K) passed to
# CopathologyPipeline by the model cells.
K_TOPICS = 18 ###############
ALPHA = 1/K_TOPICS
BETA = 1/K_TOPICS

In [None]:
## ALL DX MDL - K=18 ## - OUTDATED
# Legacy run pinned to 18 topics. alpha/beta are derived from the pinned topic
# count rather than from the notebook-wide ALPHA/BETA (both 1/K_TOPICS), so
# changing K_TOPICS cannot silently pair n_topics=18 with priors computed for
# a different K. With K_TOPICS == 18 the values are unchanged.
# NOTE(review): when K_TOPICS == 18 the "ALL DX MDL" cell writes to the same
# results folder (.../wsev_smc_all_dx/18) - confirm which run should own it.
LEGACY_K_TOPICS = 18
mdl_pipeline = CopathologyPipeline(
    n_topics = LEGACY_K_TOPICS,
    alpha = 1/LEGACY_K_TOPICS,
    beta = 1/LEGACY_K_TOPICS,
    output_dir = 'C:/Users/WooSikKim/Desktop/Research/projects/co_pathology/scripts/stage_copath/results/wsev_smc_all_dx/18'
)

# Fit on the balanced all-diagnosis frame, using every column from 'VA/2'
# through 'VA/2035' as the region features.
mdl_results = mdl_pipeline.fit(
    train_df = train_all_dx,
    region_cols = train_all_dx.loc[:,'VA/2':'VA/2035'].columns,
    standardize = False,
    subject_col = 'SUBJ_ID'
)

# Persist the fit outputs, the pipeline's own figures and the pickled pipeline.
mdl_pipeline.save_results(mdl_results['theta'], mdl_results['labels'], mdl_results['subject_ids'])
mdl_pipeline.generate_internal_visualizations(mdl_results['theta'], mdl_results['labels'], region_names=dkt_rois)
mdl_pipeline.save('./LDA_XGB/models/lda_xgb_dx_with_cn_k_18.pkl')

# Surface maps go into a 'surface_maps' folder under the pipeline's output
# directory; the topic dataframe is transposed before plotting.
surface_mapper = BrainVisualizer(
    output_dir=os.path.join(mdl_pipeline.output_dir, 'surface_maps'),
)
surface_mapper.plot_all_topics(topic_patterns=mdl_pipeline.lda_model.get_topic_dataframe(region_names=dkt_rois).values.T)


In [None]:
## ALL DX - NO CN ## 260219
# Fits, saves and visualizes a pipeline on the balanced all-diagnosis frame
# with the NC rows removed.
print('## ALL DX - NO CN ## 260219')
print('K-topics = ', K_TOPICS)

# Keep every row whose diagnosis is not 'NC'.
temp_train = train_all_dx.loc[train_all_dx['DX'].ne('NC')]
print(temp_train['DX'].value_counts())

# Destinations for this run.
no_cn_results_dir = f'C:/Users/WooSikKim/Desktop/Research/projects/co_pathology/scripts/stage_copath/results/wsev_smc_all_dx_no_cn/{K_TOPICS}/'
no_cn_model_file = f'./LDA_XGB/models/wsev_smc_all_dx_no_cn_k_{K_TOPICS}.pkl'

mdl_pipeline = CopathologyPipeline(
    n_topics=K_TOPICS,
    alpha=ALPHA,
    beta=BETA,
    output_dir=no_cn_results_dir,
)

# Region features: every column from 'VA/2' through 'VA/2035'.
no_cn_region_cols = temp_train.loc[:, 'VA/2':'VA/2035'].columns
mdl_results = mdl_pipeline.fit(
    train_df=temp_train,
    region_cols=no_cn_region_cols,
    standardize=False,
    subject_col='SUBJ_ID',
)

# Pull the three fit outputs once and reuse them for saving and plotting.
no_cn_theta = mdl_results['theta']
no_cn_labels = mdl_results['labels']
no_cn_subjects = mdl_results['subject_ids']
mdl_pipeline.save_results(no_cn_theta, no_cn_labels, no_cn_subjects)
mdl_pipeline.generate_internal_visualizations(no_cn_theta, no_cn_labels, region_names=dkt_rois)
mdl_pipeline.save(no_cn_model_file)

# Surface maps land in 'surface_maps' under the pipeline's output directory;
# the topic dataframe is transposed before being plotted.
no_cn_surface_dir = os.path.join(mdl_pipeline.output_dir, 'surface_maps')
surface_mapper = BrainVisualizer(output_dir=no_cn_surface_dir)
no_cn_topic_frame = mdl_pipeline.lda_model.get_topic_dataframe(region_names=dkt_rois)
surface_mapper.plot_all_topics(topic_patterns=no_cn_topic_frame.values.T)

In [None]:
## ALL DX MDL ##
# Fits, saves and visualizes a pipeline on the balanced all-diagnosis frame
# (NC included), labelled by the pipeline's default diagnosis column.
print('## ALL DX Class Model ##')
print('K-topics = ', K_TOPICS)

# Destinations for this run.
all_dx_results_dir = f'C:/Users/WooSikKim/Desktop/Research/projects/co_pathology/scripts/stage_copath/results/wsev_smc_all_dx/{K_TOPICS}/'
all_dx_model_file = f'./LDA_XGB/models/wsev_smc_all_dx_with_cn_k_{K_TOPICS}.pkl'

mdl_pipeline = CopathologyPipeline(
    n_topics=K_TOPICS,
    alpha=ALPHA,
    beta=BETA,
    output_dir=all_dx_results_dir,
)

# Region features: every column from 'VA/2' through 'VA/2035'.
all_dx_region_cols = train_all_dx.loc[:, 'VA/2':'VA/2035'].columns
mdl_results = mdl_pipeline.fit(
    train_df=train_all_dx,
    region_cols=all_dx_region_cols,
    standardize=False,
    subject_col='SUBJ_ID',
)

# Pull the three fit outputs once and reuse them for saving and plotting.
all_dx_theta = mdl_results['theta']
all_dx_labels = mdl_results['labels']
all_dx_subjects = mdl_results['subject_ids']
mdl_pipeline.save_results(all_dx_theta, all_dx_labels, all_dx_subjects)
mdl_pipeline.generate_internal_visualizations(all_dx_theta, all_dx_labels, region_names=dkt_rois)
mdl_pipeline.save(all_dx_model_file)

# Surface maps land in 'surface_maps' under the pipeline's output directory;
# the topic dataframe is transposed before being plotted.
all_dx_surface_dir = os.path.join(mdl_pipeline.output_dir, 'surface_maps')
surface_mapper = BrainVisualizer(output_dir=all_dx_surface_dir)
all_dx_topic_frame = mdl_pipeline.lda_model.get_topic_dataframe(region_names=dkt_rois)
surface_mapper.plot_all_topics(topic_patterns=all_dx_topic_frame.values.T)

In [None]:
## AD NON-AD MDL ##
# Fits, saves and visualizes a pipeline on the three-class frame, using the
# AD_label column (AD / NC / non-AD) as the diagnosis label.
print('## NC, AD, non-AD Three Class Model ##')
print('K-topics = ', K_TOPICS)

# Destinations for this run.
ad_non_ad_results_dir = f'C:/Users/WooSikKim/Desktop/Research/projects/co_pathology/scripts/stage_copath/results/wsev_smc_ad_non_ad/{K_TOPICS}'
ad_non_ad_model_file = f'./LDA_XGB/models/wsev_smc_ad_non_ad_with_cn_k_{K_TOPICS}.pkl'

mdl_pipeline = CopathologyPipeline(
    n_topics=K_TOPICS,
    alpha=ALPHA,
    beta=BETA,
    output_dir=ad_non_ad_results_dir,
)

# Region features: every column from 'VA/2' through 'VA/2035'.
ad_non_ad_region_cols = train_ad_non_ad.loc[:, 'VA/2':'VA/2035'].columns
mdl_results = mdl_pipeline.fit(
    train_df=train_ad_non_ad,
    region_cols=ad_non_ad_region_cols,
    standardize=False,
    dx_col='AD_label',
    subject_col='SUBJ_ID',
)

# Pull the three fit outputs once and reuse them for saving and plotting.
ad_non_ad_theta = mdl_results['theta']
ad_non_ad_labels = mdl_results['labels']
ad_non_ad_subjects = mdl_results['subject_ids']
mdl_pipeline.save_results(ad_non_ad_theta, ad_non_ad_labels, ad_non_ad_subjects)
mdl_pipeline.generate_internal_visualizations(ad_non_ad_theta, ad_non_ad_labels, region_names=dkt_rois)
mdl_pipeline.save(ad_non_ad_model_file)

# Surface maps land in 'surface_maps' under the pipeline's output directory;
# the topic dataframe is transposed before being plotted.
ad_non_ad_surface_dir = os.path.join(mdl_pipeline.output_dir, 'surface_maps')
surface_mapper = BrainVisualizer(output_dir=ad_non_ad_surface_dir)
ad_non_ad_topic_frame = mdl_pipeline.lda_model.get_topic_dataframe(region_names=dkt_rois)
surface_mapper.plot_all_topics(topic_patterns=ad_non_ad_topic_frame.values.T)

**RAW VA TOPIC MODEL** - OUTDATED

In [None]:
# Raw-VA variant: fits the pipeline on the combined WSEV + SMC region values
# after flipping them with `7 - value`. Relies on K_TOPICS, ALPHA, BETA,
# dkt_rois and class_balance defined in earlier cells.
df_wsev = pd.read_csv("C:/Users/WooSikKim/Desktop/Research/projects/co_pathology/scripts/stage_copath/data/260108_wsev_final_df.csv")
df_smc = pd.read_csv('C:/Users/WooSikKim/Desktop/Research/projects/co_pathology/scripts/stage_copath/data//SMC_AD_FTD_VA_final.csv')
# Region features: every SMC column from 'VA/1002' through 'VA/2035'.
region_cols = df_smc.loc[:, 'VA/1002':'VA/2035'].columns.to_list()

# Stack both cohorts row-wise with a fresh 0..N-1 index.
raw_va_df = pd.concat(
    [df_wsev, df_smc],
    axis=0,
    ignore_index=True
)
# Keep only the subject id, diagnosis and region columns, then drop any row
# with a missing value in one of them, then drop the 'HC' rows.
raw_va_df = raw_va_df[['PTID', 'DX'] + region_cols]
raw_va_df = raw_va_df.dropna()
raw_va_df = raw_va_df[raw_va_df['DX']!='HC']
print("Combined shape:", raw_va_df.shape)
# print(raw_va_df['DX'].value_counts())

# Balance classes: 25 rows per diagnosis, with class_balance's default cap
# for its special-care labels.
raw_va_train_df = class_balance(raw_va_df, class_col='DX', n=25)
print(raw_va_train_df['DX'].value_counts())
# Flip the scale so that larger raw values become smaller ones.
# NOTE(review): 7 is presumably the top of the raw VA scale - confirm.
raw_va_train_df[region_cols] = 7-raw_va_train_df[region_cols]
# raw_va_train_df[region_cols] = (raw_va_train_df[region_cols].max()- raw_va_train_df[region_cols])*10
# NC subset of the flipped frame; in this cell it is only referenced by the
# commented-out `ref_df` argument below.
cn_df = raw_va_train_df[raw_va_train_df['DX']=='NC']

mdl_pipeline = CopathologyPipeline(
    n_topics = K_TOPICS,
    alpha = ALPHA,
    beta = BETA,
    output_dir = f'C:/Users/WooSikKim/Desktop/Research/projects/co_pathology/scripts/stage_copath/results/wsev_smc_raw_va/{K_TOPICS}'
)
mdl_results = mdl_pipeline.fit(
    train_df = raw_va_train_df,
    region_cols = region_cols,
    standardize = False,
    # ref_df = cn_df,
    dx_col='DX',
    subject_col = 'PTID'
)
mdl_pipeline.save_results(mdl_results['theta'], mdl_results['labels'], mdl_results['subject_ids'])
# Only the last 62 ROI names are passed here.
# NOTE(review): presumably 62 == len(region_cols) - confirm before reuse.
mdl_pipeline.generate_internal_visualizations(mdl_results['theta'], mdl_results['labels'], region_names=dkt_rois[-62:])
mdl_pipeline.save(f'./LDA_XGB/models/wsev_smc_7-raw_va_k_{K_TOPICS}.pkl')