In [1]:
# Melinda Kleczynski
# Data from Christina Bergonzo 

# Finalized March 13, 2025 

# Hyperparameter tuning for classification 
# Three possible types of topological summaries: 
    # GCCD Matrices
    # Gaussian Betti Curves
    # Concatenated Normalized 0D, 1D, 2D Gaussian Betti Curves

In [None]:
# Choose starting frame and type of topological summary 

start_frame = 200  # 200, 350, or 500 

# 'gccd'     -> GCCD matrices
# 'betti'    -> Gaussian Betti curves
# 'betti012' -> concatenated normalized 0D, 1D, 2D Gaussian Betti curves
summary_type = 'gccd'  # 'gccd', 'betti', or 'betti012' 

# Fail fast on a mistyped option; otherwise the error only surfaces in a later
# cell as an undefined summary size.
if summary_type not in ('gccd', 'betti', 'betti012'):
    raise ValueError("summary_type must be 'gccd', 'betti', or 'betti012', got " + repr(summary_type))

In [3]:
import os

import numpy as np 
import pandas as pd 
from sklearn import neighbors 

In [4]:
def get_gccd_fpath(class_label, trajectory, sigma_exp, frame):
    """Return the relative path of the GCCD csv file for one frame.

    The file lives at
    gccds/<class_label>/traj<trajectory>/<class_label>_traj<trajectory>_frame<frame>_sigmaexp<sigma_exp>_gccd.csv

    Parameters
    ----------
    class_label : str
        Class name, e.g. 'Fc_glycans'.
    trajectory, sigma_exp, frame
        Values inserted into the path via str().

    Returns
    -------
    str
        Path built with the separator of the current operating system
        (identical to the previous hard-coded backslash form on Windows).
    """

    file_name = '{0}_traj{1}_frame{2}_sigmaexp{3}_gccd.csv'.format(class_label, trajectory, frame, sigma_exp)
    # os.path.join keeps the notebook runnable on Linux/macOS as well as Windows
    return os.path.join('gccds', class_label, 'traj' + str(trajectory), file_name)

def get_betti012_fpath(class_label, trajectory, sigma_exp, frame):
    """Return the relative path of the betti012 csv file for one frame.

    The file lives at
    betti012s/<class_label>/traj<trajectory>/<class_label>_traj<trajectory>_frame<frame>_sigmaexp<sigma_exp>_betti012.csv

    Parameters
    ----------
    class_label : str
        Class name, e.g. 'Fc_glycans'.
    trajectory, sigma_exp, frame
        Values inserted into the path via str().

    Returns
    -------
    str
        Path built with the separator of the current operating system
        (identical to the previous hard-coded backslash form on Windows).
    """

    file_name = '{0}_traj{1}_frame{2}_sigmaexp{3}_betti012.csv'.format(class_label, trajectory, frame, sigma_exp)
    # os.path.join keeps the notebook runnable on Linux/macOS as well as Windows
    return os.path.join('betti012s', class_label, 'traj' + str(trajectory), file_name)

In [5]:
# classes and trajectory indices used to locate the summary files
class_labels = ['Fc_glycans', 'Fc_noglycans']
trajectories = list(range(4))

# frames from the chosen starting frame up to (and including) frame 999
frames = list(range(start_frame, 1000))

# epsilon grid: 10 steps per unit between first_eps and last_eps, endpoints included
first_eps = 1 
last_eps = 30 
n_eps = 10*(last_eps - first_eps) + 1
epsilons = np.linspace(first_eps, last_eps, n_eps)

# hyperparameters 
sigma_exponents = list(range(1, 7))
k_vals = [15, 25, 35, 45]

In [6]:
# use any summary to get array size 
# (the first class / trajectory / sigma exponent / frame is read as a representative example)

if summary_type in ['gccd', 'betti']:
    ex_gccd = np.array(pd.read_csv(get_gccd_fpath(class_labels[0], trajectories[0], sigma_exponents[0], frames[0])))  
    if summary_type == 'gccd':
        # the GCCD matrix is flattened later, so count every entry
        summary_n_elmnts = np.shape(ex_gccd)[0]*np.shape(ex_gccd)[1]
    else:
        # the Betti curve is the row sum of the GCCD matrix: one value per row
        summary_n_elmnts = np.shape(ex_gccd)[0]

elif summary_type == 'betti012':
    ex_betti012 = np.array(pd.read_csv(get_betti012_fpath(class_labels[0], trajectories[0], sigma_exponents[0], frames[0]))['0'])
    summary_n_elmnts = len(ex_betti012) 

else:
    # without this branch summary_n_elmnts stays undefined and a later cell fails with a NameError
    raise ValueError("summary_type must be 'gccd', 'betti', or 'betti012', got " + repr(summary_type))

In [7]:
# number of sigma exponents to tune over
n_sigmas = len(sigma_exponents)

# one summary per (class, trajectory, frame) combination
n_summaries = len(frames) * len(trajectories) * len(class_labels)

In [8]:
# set up dataframes 
# One dataframe per sigma exponent: columns 0 .. summary_n_elmnts-1 hold the summary values,
# followed by the class_label, trajectory and frame of each row.

summary_dfs = []

for sigma_exp in sigma_exponents:

    summary_data = np.zeros((n_summaries, summary_n_elmnts))

    summary_class_labels = ['']*n_summaries
    summary_trajectories = np.zeros(n_summaries, int)
    summary_frames = np.zeros(n_summaries, int)

    # every (class, trajectory, frame) combination, class varying slowest and frame fastest
    row_keys = [(class_label, trajectory, frame)
                for class_label in class_labels
                for trajectory in trajectories
                for frame in frames]

    for row, (class_label, trajectory, frame) in enumerate(row_keys):

        if summary_type == 'betti012':
            betti012_df = pd.read_csv(get_betti012_fpath(class_label, trajectory, sigma_exp, frame))
            summary_data[row, :] = np.array(betti012_df['0'])

        elif summary_type in ['gccd', 'betti']:
            gccd_matrix = np.array(pd.read_csv(get_gccd_fpath(class_label, trajectory, sigma_exp, frame)))
            if summary_type == 'gccd':
                # full matrix, flattened row by row
                summary_data[row, :] = gccd_matrix.flatten()
            else:
                # Betti curve: sum over the columns of the GCCD matrix
                summary_data[row, :] = gccd_matrix.sum(axis = 1)

        summary_class_labels[row] = class_label
        summary_trajectories[row] = trajectory
        summary_frames[row] = frame

    metadata_df = pd.DataFrame({'class_label': summary_class_labels, 'trajectory': summary_trajectories, 'frame': summary_frames})
    summary_dfs.append(pd.concat([pd.DataFrame(summary_data), metadata_df], axis = 1))

In [9]:
# hyperparameter tuning 
#
# Outer loop: every pairing of one held-out 'Fc_glycans' trajectory with one held-out
# 'Fc_noglycans' trajectory. The held-out pair is never touched here; it is only recorded.
# Inner loop: on the remaining trajectories, hold out one trajectory per class in turn,
# fit a k-nearest-neighbors classifier on the rest, and score it on the held-out pair.
# The (sigma exponent, k) combination with the highest mean inner score is stored.

# one outer split per (glyc_test_traj, aglyc_test_traj) pair
n_splits = len(trajectories)**2

all_glyc_train_trajs = [[] for _ in range(n_splits)]
all_aglyc_train_trajs = [[] for _ in range(n_splits)]
all_glyc_test_trajs = np.zeros(n_splits, int)
all_aglyc_test_trajs = np.zeros(n_splits, int)
best_sigma_exps = np.zeros(n_splits, int)
best_ks = np.zeros(n_splits, int)
best_mean_accs = np.zeros(n_splits)
worst_mean_accs = np.zeros(n_splits)

# the summary values are stored in the integer-named columns 0 .. summary_n_elmnts-1
feature_cols = list(range(summary_n_elmnts))

hyper_iter = 0

for glyc_test_traj in trajectories:
    glyc_train_trajs = [traj for traj in trajectories if traj != glyc_test_traj]

    for aglyc_test_traj in trajectories:
        aglyc_train_trajs = [traj for traj in trajectories if traj != aglyc_test_traj]

        # accumulated (then averaged) inner-fold accuracy for each (sigma exponent, k)
        cval_scores = np.zeros((n_sigmas, len(k_vals)))

        for s_iter in range(n_sigmas):

            summary_df = summary_dfs[s_iter]

            glyc_summary_df = summary_df[summary_df.class_label == 'Fc_glycans']
            aglyc_summary_df = summary_df[summary_df.class_label == 'Fc_noglycans']

            # drop the outer held-out trajectories
            glyc_tune = glyc_summary_df[glyc_summary_df.trajectory.isin(glyc_train_trajs)]
            aglyc_tune = aglyc_summary_df[aglyc_summary_df.trajectory.isin(aglyc_train_trajs)]

            for glyc_tune_traj in glyc_train_trajs:

                glyc_tune_train = glyc_tune[glyc_tune.trajectory != glyc_tune_traj]
                glyc_tune_test = glyc_tune[glyc_tune.trajectory == glyc_tune_traj]

                for aglyc_tune_traj in aglyc_train_trajs:

                    aglyc_tune_train = aglyc_tune[aglyc_tune.trajectory != aglyc_tune_traj]
                    aglyc_tune_test = aglyc_tune[aglyc_tune.trajectory == aglyc_tune_traj]

                    # the fold data does not depend on k, so build it once per fold
                    tune_train = pd.concat([glyc_tune_train, aglyc_tune_train])
                    X_tune_train = tune_train[feature_cols]
                    y_tune_train = tune_train.class_label

                    tune_test = pd.concat([glyc_tune_test, aglyc_tune_test])
                    X_tune_test = tune_test[feature_cols]
                    y_tune_test = tune_test.class_label

                    for k_iter in range(len(k_vals)):

                        tune_classifier = neighbors.KNeighborsClassifier(n_neighbors = k_vals[k_iter], p = 2).fit(X_tune_train, y_tune_train) 
                        tune_score = tune_classifier.score(X_tune_test, y_tune_test) 
                        cval_scores[s_iter, k_iter] += tune_score

        # mean over the inner folds (3 x 3 = 9 with four trajectories per class)
        cval_scores /= len(glyc_train_trajs)*len(aglyc_train_trajs)

        # np.argmax returns the first maximum, so ties go to the smallest sigma exponent, then the smallest k
        best_param_indices = np.unravel_index(np.argmax(cval_scores), np.shape(cval_scores))

        all_glyc_train_trajs[hyper_iter] = glyc_train_trajs
        all_aglyc_train_trajs[hyper_iter] = aglyc_train_trajs
        all_glyc_test_trajs[hyper_iter] = glyc_test_traj
        all_aglyc_test_trajs[hyper_iter] = aglyc_test_traj
        best_sigma_exps[hyper_iter] = sigma_exponents[best_param_indices[0]]
        best_ks[hyper_iter] = k_vals[best_param_indices[1]]
        best_mean_accs[hyper_iter] = np.max(cval_scores)
        worst_mean_accs[hyper_iter] = np.min(cval_scores)
        hyper_iter += 1

tuning_df = pd.DataFrame({'glyc_train_trajs': all_glyc_train_trajs,
                          'aglyc_train_trajs': all_aglyc_train_trajs,
                          'glyc_test_trajs': all_glyc_test_trajs,
                          'aglyc_test_trajs': all_aglyc_test_trajs,
                          'sigma_exp': best_sigma_exps,
                          'k': best_ks,
                          'best_mean_acc': best_mean_accs,
                          'worst_mean_acc': worst_mean_accs})

In [10]:
# save the tuning table; the file name records the summary type and the starting frame
results_fname = summary_type + '_hyperparams_start_frame_' + str(start_frame) + '.csv'
tuning_df.to_csv(results_fname)