<h1>APLoD Clustering</h1>

In [None]:
import os,sys
import numpy as np
#The path for HK_DataMiner library that implements APLoD Clustering
HK_DataMiner_Path = os.path.relpath("./APLoD_clustering/HK_DataMiner/hkdataminer/")                                     
sys.path.append(HK_DataMiner_Path)
from cluster import APLoD
from utils import  split_assignments
from multiprocessing import Pool
import pandas as pd


# Hyperparameter sets to be scanned with GMRQ.
# Every entry is a list laid out as:
# [no. of features, no. of tICA dimensions, no. of kNN, tICA lag time,
#  name of the parameter being tested, value of that parameter]
# Within each table only the tested parameter varies; the other three
# stay at 1000 features, 4 tICA dimensions, 250 neighbours and lag time 300.
no_of_features = [[n, 4, 250, 300, 'no_of_features', n]
                  for n in (800, 1000, 'Full')]
no_components = [[1000, dim, 250, 300, 'no_components', dim]
                 for dim in (3, 5)]
tica_lagtime = [[1000, 4, 250, lag, 'tica_lagtime', lag]
                for lag in (250,)]
kNN = [[1000, 4, k, 300, 'kNN', k]
       for k in (200, 300)]

def run_APLoD(no_of_feature,no_component,kNN,tica_lagtime,dir1,dir2,
              aplod_dir="./APLoD_clustering/",tica_dir="./TICA/",n_trajs=84):
    """
    Run APLoD clustering on tICA-projected trajectories and save the results.

    Trajectories are read from
    ``{tica_dir}{no_of_feature}/tica_lag{tica_lagtime}/{i}.npy`` for
    ``i`` in ``range(n_trajs)``; only the first ``no_component`` columns of
    each file are used. The cluster assignments and cluster centers are
    written to ``{aplod_dir}{dir1}/{dir2}/clustering_assignments.npy`` and
    ``{aplod_dir}{dir1}/{dir2}/clustering_centers.npy``.

    Parameters
    ----------
    no_of_feature: int or str
        No of features used for APLoD clustering (selects the input directory)

    no_component: int
        No of tICA dimensions used for APLoD clustering

    kNN: int
        No of k-nearest neighbours for APLoD clustering

    tica_lagtime: int
        tICA lag time for APLoD clustering (selects the input directory)

    dir1: string
        Parameter to be tested in GMRQ (first level of the output directory)

    dir2: int or str
        Value of the parameter tested (second level of the output directory)

    aplod_dir: string
        Root directory for the clustering output

    tica_dir: string
        Root directory holding the tICA-projected trajectories

    n_trajs: int
        Number of trajectory files to load (files are named 0.npy ... n_trajs-1.npy)
    """

    # Loading features, keeping only the leading tICA components
    tica_trajs = []
    for i in range(n_trajs):
        traj_file = "{}{}/tica_lag{}/{}.npy".format(tica_dir, no_of_feature,
                                                    tica_lagtime, i)
        tica_trajs.append(np.load(traj_file)[:, 0:no_component])
    traj_len = [len(traj) for traj in tica_trajs]

    # np.concatenate takes the list of arrays directly. Wrapping it in
    # np.array first fails on recent NumPy when trajectories differ in length.
    trajs = np.concatenate(tica_trajs).tolist()

    # Performing APLoD clustering
    clustering = APLoD(rho_cutoff=1.0, delta_cutoff=1.0, n_neighbors=kNN,
                       metric="euclidean", algorithm="kd_tree")
    clustering.fit(trajs)

    # Outputting APLoD clustering results into a per-parameter directory
    outdir = "{}{}/{}".format(aplod_dir, dir1, dir2)
    os.makedirs(outdir, exist_ok=True)
    aplod_labels = clustering.labels_
    # presumably splits the flat label array back into one sequence per
    # trajectory using the recorded lengths -- verify against utils
    aplod_sequences = split_assignments(aplod_labels, traj_len)
    aplod_centers = clustering.cluster_centers_
    # NOTE(review): if split_assignments returns sequences of unequal length,
    # np.save may reject the ragged input on recent NumPy -- confirm
    np.save(outdir+"/clustering_assignments.npy", aplod_sequences)
    np.save(outdir+"/clustering_centers.npy", aplod_centers)

def run_APLoD_parallel(no_of_features,no_components,kNN,tica_lagtime):
    """
    Run APLoD clustering for every hyperparameter set in a process pool.

    Each entry of the four input lists must have the layout
    [no. of features, no. of tICA dimensions, no. of kNN, tICA lag time,
    parameter tested, value of the parameter]; it is passed positionally to
    ``run_APLoD``.

    Parameters
    ----------
    no_of_features: list of lists
        a list containing list of GMRQ hyperparameters to be tested at different feature size

    no_components: list of lists
        a list containing list of GMRQ hyperparameters to be tested at different tICA components

    kNN: list of lists
        a list containing list of GMRQ hyperparameters to be tested at different number of k-nearest neighbours

    tica_lagtime: list of lists
        a list containing list of GMRQ hyperparameters to be tested at different tICA lag time

    Raises
    ------
    ValueError
        If a hyperparameter entry does not contain exactly 6 values.
    """
    # Jobs are submitted in this order: features, components, lag time, kNN
    jobs = []
    for group in (no_of_features, no_components, tica_lagtime, kNN):
        for params in group:
            if len(params) != 6:
                raise ValueError(
                    "each hyperparameter entry must have 6 values, got {!r}".format(params))
            jobs.append(tuple(params))
    with Pool() as p:
        p.starmap(run_APLoD, jobs)

# Launch the full hyperparameter scan only when this code runs as the main module
if __name__ == "__main__":
    run_APLoD_parallel(
        no_of_features=no_of_features,
        no_components=no_components,
        kNN=kNN,
        tica_lagtime=tica_lagtime,
    )

In [None]:
import os
import subprocess

# Creating shortcuts so that you don't have to run the same set of parameters twice:
# the no_of_features/1000 run used 4 tICA dimensions, lag time 300 and 250
# neighbours, so it also serves as no_components/4, tica_lagtime/300 and kNN/250.
_reference_run = './APLoD_clustering/no_of_features/1000'
for _shortcut in ('./APLoD_clustering/no_components/4',
                  './APLoD_clustering/tica_lagtime/300',
                  './APLoD_clustering/kNN/250'):
    # check=True raises if `ln` fails, instead of ignoring its exit status
    subprocess.run(['ln', '-fsr', _reference_run, _shortcut], check=True)