In [8]:
import os,sys
import numpy as np
from msmbuilder.cluster import KCenters
import pandas as pd
from multiprocessing import Pool


# Hyperparameter grid scanned with GMRQ.
# Every row is laid out as
#   [no. of features, no. of tICA dimensions, no. of clusters, tICA lag time,
#    name of the scanned parameter, value of the scanned parameter]
# Only the scanned parameter departs from the baseline row
# (24 features, 3 tICA dimensions, 800 clusters, lag time 2).
no_of_features = [[n_feat, 3, 800, 2, 'no_features', n_feat] for n_feat in (20, 24, 28)]
no_components = [[24, n_dim, 800, 2, 'no_components', n_dim] for n_dim in (2, 4)]
tica_lagtime = [[24, 3, 800, lag, 'tica_lagtime', lag] for lag in (4, 6)]
no_clusters = [[24, 3, n_clust, 2, 'no_clusters', n_clust] for n_clust in (700, 900)]

def run_KCenters(no_of_feature,no_component,no_clusters,tica_lagtime,dir1,dir2,
                 cluster_dir="./clustering/",tica_dir="./TICA/",n_trajs=100):
    """
    Cluster tICA-projected trajectories with KCenters and save the result.

    Loads ``<tica_dir>/<no_of_feature>/tica_lag<tica_lagtime>/<i>.npy`` for
    ``i`` in ``range(n_trajs)``, keeps the first ``no_component`` columns of
    each array, fits a single KCenters model on all of them and writes
    ``clustering_assignments.npy`` and ``clustering_centers.npy`` into
    ``<cluster_dir>/<dir1>/<dir2>/``.

    Parameters
    ----------
    no_of_feature: int or str
        Name of the feature-set sub-directory under ``tica_dir``
        (the number of features used to build the tICA projection)

    no_component: int
        No of leading tICA dimensions kept for clustering

    no_clusters: int
        No of cluster centers requested from KCenters (``n_clusters``)

    tica_lagtime: int
        tICA lag time; selects the ``tica_lag<tica_lagtime>`` sub-directory

    dir1: string
        Parameter to be tested in GMRQ (first level of the output directory)

    dir2: int or str
        Value of the parameter tested (second level of the output directory)

    cluster_dir: string
        Root directory for the clustering output

    tica_dir: string
        Root directory holding the tICA-projected trajectories

    n_trajs: int
        No of trajectory files to load, numbered ``0 .. n_trajs-1``

    Raises
    ------
    ValueError
        If ``n_trajs`` is smaller than 1.
    FileNotFoundError
        If one of the expected trajectory files is missing (raised by ``np.load``).
    """
    if n_trajs < 1:
        raise ValueError("n_trajs must be at least 1, got {}".format(n_trajs))

    # Loading features: one array per trajectory, truncated to the requested
    # number of tICA dimensions.
    feature_dir = os.path.join(str(tica_dir), str(no_of_feature),
                               "tica_lag{}".format(tica_lagtime))
    # Kept as a list (not stacked with np.array) so that trajectories of
    # different lengths are handled as well; ascontiguousarray turns each
    # column slice into a compact copy of its own.
    ttrajs = [np.ascontiguousarray(
                  np.load(os.path.join(feature_dir, "{}.npy".format(i)))[:, :no_component])
              for i in range(n_trajs)]

    # Performing KCenters clustering
    clustering = KCenters(n_clusters=no_clusters)
    cluster_sequences = clustering.fit_predict(ttrajs)

    # Outputting KCenters clustering results into a per-parameter directory.
    # os.makedirs replaces the former `mkdir -p` shell call, so a failure to
    # create the directory raises instead of being silently ignored.
    outdir = os.path.join(str(cluster_dir), str(dir1), str(dir2))
    os.makedirs(outdir, exist_ok=True)
    np.save(os.path.join(outdir, "clustering_assignments.npy"), cluster_sequences)
    np.save(os.path.join(outdir, "clustering_centers.npy"), clustering.cluster_centers_)

def run_KCenters_parallel(no_of_features,no_components,no_clusters,tica_lagtime):
    """
    Run ``run_KCenters`` for every hyperparameter row in a process pool.

    Each row must have the six entries
    ``[no. of features, no. of tICA dimensions, no. of clusters, tICA lag time,
    parameter tested, value of the parameter]``; they are passed positionally
    to ``run_KCenters``.

    Note the positional order of the arguments:
    ``(no_of_features, no_components, no_clusters, tica_lagtime)``.

    Parameters
    ----------
    no_of_features: list of lists
        a list containing list of GMRQ hyperparameters to be tested at different feature size

    no_components: list of lists
        a list containing list of GMRQ hyperparameters to be tested at different tICA components

    no_clusters: list of lists
        a list containing list of GMRQ hyperparameters to be tested at different number of clusters

    tica_lagtime: list of lists
        a list containing list of GMRQ hyperparameters to be tested at different tICA lag time

    Raises
    ------
    ValueError
        If a row does not contain exactly six entries.
    """
    # Jobs are built straight from the rows (no DataFrame round-trip) so every
    # value reaches run_KCenters with its original type; a column-wise table
    # could upcast e.g. an int to a float and thereby change directory names.
    jobs = []
    for group in (no_of_features, no_components, tica_lagtime, no_clusters):
        for row in group:
            job = tuple(row)
            if len(job) != 6:
                raise ValueError(
                    "each hyperparameter row needs 6 entries, got {}: {!r}".format(len(job), row))
            jobs.append(job)

    if not jobs:
        # Nothing to cluster; avoid starting worker processes for no work.
        return

    with Pool() as p:
        p.starmap(run_KCenters, jobs)

if __name__ == "__main__":
    # Start the full hyperparameter scan only when this code is the entry point.
    run_KCenters_parallel(
        no_of_features=no_of_features,
        no_components=no_components,
        no_clusters=no_clusters,
        tica_lagtime=tica_lagtime,
    )

In [9]:
import os

# Creating shortcuts so that the same set of parameters is not run twice:
# the run stored under no_features/24 is also the no_components=3,
# tica_lagtime=2 and no_clusters=800 point of the other scans.
shared_result = "./clustering/no_features/24"
for alias in ("./clustering/no_components/3",
              "./clustering/tica_lagtime/2",
              "./clustering/no_clusters/800"):
    alias_parent = os.path.dirname(alias)
    os.makedirs(alias_parent, exist_ok=True)
    if os.path.islink(alias):
        # Drop a link left by a previous run so that re-running this cell
        # replaces it instead of creating a new link inside the shared directory.
        os.remove(alias)
    elif os.path.exists(alias):
        raise FileExistsError(
            "{} already exists and is not a symbolic link".format(alias))
    # Relative target (like `ln -r`) keeps the link valid if ./clustering is moved.
    # As with `ln -s`, the link is created even if the shared result is missing.
    os.symlink(os.path.relpath(shared_result, start=alias_parent), alias,
               target_is_directory=True)