In [None]:
# Melinda Kleczynski
# Data from Christina Bergonzo 

# Finalized March 13, 2025 

# Classification test accuracy 
# Three possible types of topological summaries: 
    # GCCD Matrices
    # Gaussian Betti Curves
    # Concatenated Normalized 0D, 1D, 2D Gaussian Betti Curves

In [None]:
# Choose starting frame and type of topological summary 
#   'gccd'     -> GCCD matrices
#   'betti'    -> Gaussian Betti curves
#   'betti012' -> concatenated normalized 0D, 1D, 2D Gaussian Betti curves

start_frame = 200  # 200, 350, or 500 
summary_type = 'gccd'  # 'gccd' or 'betti' or 'betti012' 

# Fail fast on a mistyped summary type: the later cells branch on this value with
# if/elif chains that have no fallback, so a typo would otherwise surface much later
# (or not at all when stale variables are still alive in the kernel).
if summary_type not in ('gccd', 'betti', 'betti012'):
    raise ValueError("summary_type must be 'gccd', 'betti', or 'betti012', got " + repr(summary_type))

In [None]:
import os

import numpy as np 
import pandas as pd 
from sklearn import neighbors 

In [None]:
def get_gccd_fpath(class_label, trajectory, sigma_exp, frame):
    """Return the relative path of the CSV file holding one GCCD matrix.

    The path is built from the arguments only (no file access) and has the form
    gccds/<class_label>/traj<trajectory>/<class_label>_traj<trajectory>_frame<frame>_sigmaexp<sigma_exp>_gccd.csv
    with the components joined by the path separator of the current platform.

    Parameters
    ----------
    class_label : str
        Class folder name; also used as the file name prefix.
    trajectory, sigma_exp, frame
        Values inserted into the path via str().

    Returns
    -------
    str
        The relative file path.
    """
    traj_tag = 'traj' + str(trajectory)
    file_name = class_label + '_' + traj_tag + '_frame' + str(frame) + '_sigmaexp' + str(sigma_exp) + '_gccd.csv'

    # os.path.join instead of a hard-coded '\\' so the path also resolves outside Windows
    return os.path.join('gccds', class_label, traj_tag, file_name)

def get_betti012_fpath(class_label, trajectory, sigma_exp, frame):
    """Return the relative path of the CSV file holding one 'betti012' summary.

    The path is built from the arguments only (no file access) and has the form
    betti012s/<class_label>/traj<trajectory>/<class_label>_traj<trajectory>_frame<frame>_sigmaexp<sigma_exp>_betti012.csv
    with the components joined by the path separator of the current platform.

    Parameters
    ----------
    class_label : str
        Class folder name; also used as the file name prefix.
    trajectory, sigma_exp, frame
        Values inserted into the path via str().

    Returns
    -------
    str
        The relative file path.
    """
    traj_tag = 'traj' + str(trajectory)
    file_name = class_label + '_' + traj_tag + '_frame' + str(frame) + '_sigmaexp' + str(sigma_exp) + '_betti012.csv'

    # os.path.join instead of a hard-coded '\\' so the path also resolves outside Windows
    return os.path.join('betti012s', class_label, traj_tag, file_name)

In [None]:
# the two classes, and the trajectory indices used for each of them
trajectories = list(range(4))
class_labels = ['Fc_glycans', 'Fc_noglycans']

# every frame from the chosen starting frame up to and including frame 999
frames = list(range(start_frame, 1000))

# possible hyperparameter values
k_vals = [15, 25, 35, 45]
sigma_exponents = list(range(1, 7))

In [None]:
# use any summary to get array size 
# (the file for the first class / trajectory / sigma exponent / frame serves as the example)

if summary_type in ['gccd', 'betti']:
    ex_gccd = np.array(pd.read_csv(get_gccd_fpath(class_labels[0], trajectories[0], sigma_exponents[0], frames[0])))
    if summary_type == 'gccd':
        # the full matrix is flattened into one feature vector
        summary_n_elmnts = ex_gccd.size
    else:
        # 'betti': the matrix is summed along axis 1, leaving one value per row
        summary_n_elmnts = ex_gccd.shape[0]

elif summary_type == 'betti012':
    # the vector is stored in the CSV column named '0'
    ex_betti012 = np.array(pd.read_csv(get_betti012_fpath(class_labels[0], trajectories[0], sigma_exponents[0], frames[0]))['0'])
    summary_n_elmnts = len(ex_betti012)

else:
    # without this branch summary_n_elmnts would stay undefined (or stale from an earlier run)
    raise ValueError("summary_type must be 'gccd', 'betti', or 'betti012', got " + repr(summary_type))

In [None]:
# number of sigma exponents to consider, and number of summaries loaded per sigma
# exponent (one for every class / trajectory / frame combination)
n_sigmas = len(sigma_exponents)
n_summaries = len(frames) * len(trajectories) * len(class_labels)

In [None]:
# set up dataframes 
# summary_dfs[i] holds every summary computed with sigma_exponents[i]: one row per
# (class, trajectory, frame), feature columns labeled 0 .. summary_n_elmnts-1, followed by
# the metadata columns 'class_label', 'trajectory', 'frame'.

if summary_type not in ('gccd', 'betti', 'betti012'):
    # an unknown type would match no branch below and silently leave every feature row at zero
    raise ValueError("summary_type must be 'gccd', 'betti', or 'betti012', got " + repr(summary_type))

# built by appending, so the list never contains one shared placeholder object repeated
summary_dfs = []

for s_iter, sigma_exp in enumerate(sigma_exponents):

    summary_data = np.zeros((n_summaries, summary_n_elmnts))

    summary_class_labels = n_summaries*['']
    summary_trajectories = np.zeros(n_summaries, int)
    summary_frames = np.zeros(n_summaries, int)

    # row index into the preallocated arrays
    summary_iter = 0

    for class_label in class_labels:
        for trajectory in trajectories:
            for frame in frames:

                if summary_type in ['gccd', 'betti']:
                    gccd_matrix = np.array(pd.read_csv(get_gccd_fpath(class_label, trajectory, sigma_exp, frame)))
                    if summary_type == 'gccd':
                        # whole matrix, flattened row by row
                        summary_data[summary_iter, :] = gccd_matrix.flatten()
                    else:
                        # 'betti': sum along axis 1, one value per matrix row
                        summary_data[summary_iter, :] = np.sum(gccd_matrix, axis = 1)
                else:
                    # 'betti012': the vector is stored in the CSV column named '0'
                    betti012_vec = np.array(pd.read_csv(get_betti012_fpath(class_label, trajectory, sigma_exp, frame))['0'])
                    summary_data[summary_iter, :] = betti012_vec

                summary_class_labels[summary_iter] = class_label
                summary_trajectories[summary_iter] = trajectory
                summary_frames[summary_iter] = frame

                summary_iter += 1

    metadata_df = pd.DataFrame({'class_label': summary_class_labels, 'trajectory': summary_trajectories, 'frame': summary_frames})
    # both frames carry the default 0..n-1 index, so the column-wise concat lines rows up one to one
    summary_df = pd.concat([pd.DataFrame(summary_data), metadata_df], axis = 1)

    summary_dfs.append(summary_df)

In [None]:
# read in hyperparameter choices 
# Each row is used later as one evaluation: the test trajectory of each class plus the
# sigma exponent and k to use for it.
# os.path.join instead of a hard-coded '\\' so the path also resolves outside Windows.
hyper_fpath = os.path.join('hyperparams', summary_type + '_hyperparams_start_frame_' + str(start_frame) + '.csv')
hyper_df = pd.read_csv(hyper_fpath)[['glyc_test_trajs', 'aglyc_test_trajs', 'sigma_exp', 'k']]

In [None]:
# Test accuracy of a k-nearest-neighbors classifier, once per row of hyper_df.
# For each row: hold out one trajectory per class as test data, train on the remaining
# trajectories using the summaries for the row's sigma exponent, and record the score.

test_mean_accs = np.zeros(len(hyper_df))

# the summary (feature) columns are labeled 0 .. summary_n_elmnts-1
feature_cols = list(range(summary_n_elmnts))

for row_iter in range(len(hyper_df)):

    current_row = hyper_df.iloc[row_iter]

    glyc_test_traj = current_row['glyc_test_trajs']
    aglyc_test_traj = current_row['aglyc_test_trajs']
    # Cast to plain ints: a row extracted with iloc shares one dtype across all columns,
    # so these can arrive as floats; the classifier needs an integer number of neighbors.
    best_sigma_exp = int(current_row['sigma_exp'])
    best_k = int(current_row['k'])

    # training trajectories
    glyc_train_trajs = [traj for traj in trajectories if traj != glyc_test_traj]
    aglyc_train_trajs = [traj for traj in trajectories if traj != aglyc_test_traj]

    # best hyperparameters 
    # list.index instead of np.where(list == scalar): comparing a Python list with a plain
    # scalar yields a single False rather than an elementwise mask.
    best_sigma_exp_index = sigma_exponents.index(best_sigma_exp)

    # dataframe for best sigma hyperparameter
    summary_df = summary_dfs[best_sigma_exp_index]
    glyc_summary_df = summary_df[summary_df.class_label == 'Fc_glycans']
    aglyc_summary_df = summary_df[summary_df.class_label == 'Fc_noglycans']

    # training data 
    glyc_train_df = glyc_summary_df[glyc_summary_df.trajectory.isin(glyc_train_trajs)]
    aglyc_train_df = aglyc_summary_df[aglyc_summary_df.trajectory.isin(aglyc_train_trajs)]
    train_df = pd.concat([glyc_train_df, aglyc_train_df])
    X_train = train_df[feature_cols]
    y_train = train_df.class_label

    # fit classifier 
    knn_classifier = neighbors.KNeighborsClassifier(n_neighbors = best_k, p = 2).fit(X_train, y_train)

    # testing data 
    glyc_test_df = glyc_summary_df[glyc_summary_df.trajectory == glyc_test_traj]
    aglyc_test_df = aglyc_summary_df[aglyc_summary_df.trajectory == aglyc_test_traj]
    # named fold_test_df so it is not confused with the results table test_df built below
    fold_test_df = pd.concat([glyc_test_df, aglyc_test_df])
    X_test = fold_test_df[feature_cols]
    y_test = fold_test_df.class_label
    test_mean_accs[row_iter] = knn_classifier.score(X_test, y_test)

# results table: the test trajectories of each row next to the accuracy obtained for them
test_df = pd.concat([hyper_df[['glyc_test_trajs', 'aglyc_test_trajs']], pd.DataFrame({'mean_test_accuracy': test_mean_accs})], axis = 1)

# create the output folder if needed so the write cannot fail after all the work above
os.makedirs('test_results', exist_ok = True)
test_df.to_csv(os.path.join('test_results', 'test_results_' + summary_type + '_start_frame_' + str(start_frame) + '.csv'))