<h1>Performing GMRQ</h1>

In [None]:
import os

import numpy as np
from msmbuilder.msm import MarkovStateModel
from sklearn.pipeline import Pipeline

try:
    # scikit-learn >= 0.18 provides KFold in sklearn.model_selection.
    from sklearn.model_selection import KFold
except ImportError:
    # Older scikit-learn releases only ship the legacy module.
    from sklearn.cross_validation import KFold

# Create the GMRQ output directory up front; an already existing one is fine.
os.makedirs(name='./GMRQ', exist_ok=True)

def _ordinal(number):
    """Return *number* followed by its English ordinal suffix (1 -> '1st')."""
    if 10 <= number % 100 <= 20:
        suffix = 'th'
    else:
        suffix = {1: 'st', 2: 'nd', 3: 'rd'}.get(number % 10, 'th')
    return '{}{}'.format(number, suffix)


def GMRQ(no_of_features,no_of_components,no_clusters,tica_lagtime,
         parameter=['no_features','no_components','no_clusters','tica_lagtime'],
         gmrq_dir="./GMRQ/",clustering_dir="./clustering/"):
    """
    Wrapper for running GMRQ with scikit-learn KFold cross-validation.

    For every tested value of every hyperparameter, the cluster assignments
    are loaded from
    ``<clustering_dir><parameter name>/<value>/clustering_assignments.npy``.
    A MarkovStateModel pipeline is fitted on the training folds and scored on
    both the training and the test folds. A shuffled 6-fold split is repeated
    10 times, and all collected scores are written to
    ``<gmrq_dir><parameter name>/<value>/gmrq_train_score.txt`` and
    ``gmrq_test_score.txt`` in the same directory.

    Parameters
    ----------
    no_of_features: list
        A list containing different feature sizes

    no_of_components: list
        A list containing different numbers of tICA components

    no_clusters: list
        A list containing different numbers of clusters

    tica_lagtime: list
        A list containing different tICA lag times

    parameter: list, Default=['no_features','no_components','no_clusters','tica_lagtime']
        Names of the parameters to be tested. They are paired by position
        with the four value lists above and are used as sub-directory names.
        The default list is never modified.

    gmrq_dir: str, Default="./GMRQ/"
        Directory for saving output from running GMRQ (with trailing slash)

    clustering_dir: str, Default="./clustering/"
        Directory for loading data from KCenters clustering (with trailing
        slash)

    Notes
    -----
    Requires scikit-learn >= 0.18, which provides ``sklearn.model_selection``.
    """
    # Imported locally so the split below always uses the current KFold API
    # (``KFold(n_splits=...).split(X)``), independent of the module import.
    from sklearn.model_selection import KFold

    n_cycles = 10   # number of independently shuffled cross-validation rounds
    n_splits = 6    # folds per round

    #Setting up Scikit-learn pipeline
    model = Pipeline([
        ('msm', MarkovStateModel(n_timescales=3, lag_time=100,
                                 reversible_type='transpose', verbose=False))
    ])

    value_lists = (no_of_features, no_of_components, no_clusters, tica_lagtime)
    for name, values in zip(parameter, value_lists):
        for value in values:
            # loading Clustering result
            print("Running GMRQ for {}:{}".format(name, value))
            # NOTE(review): if the assignments were saved as an object array,
            # newer NumPy needs allow_pickle=True here -- confirm against the
            # clustering step that wrote the file.
            trajectories = np.load(
                "{}{}/{}/clustering_assignments.npy".format(clustering_dir, name, value))
            train_score_collection = []
            test_score_collection = []

            # Running GMRQ
            for cycle in range(1, n_cycles + 1):
                print('Running the {} cycle'.format(_ordinal(cycle)))
                # split the dataset into training set and test set
                cv = KFold(n_splits=n_splits, shuffle=True)
                for train_index, test_index in cv.split(trajectories):
                    train_data = [trajectories[idx] for idx in train_index]
                    test_data = [trajectories[idx] for idx in test_index]
                    model.fit(train_data)
                    train_score_collection.append(model.score(train_data))
                    test_score_collection.append(model.score(test_data))

            # Saving GMRQ test and training scores once all cycles are done
            out_dir = "{}{}/{}".format(gmrq_dir, name, value)
            os.makedirs(out_dir, exist_ok=True)
            np.savetxt("{}/gmrq_train_score.txt".format(out_dir), train_score_collection)
            np.savetxt("{}/gmrq_test_score.txt".format(out_dir), test_score_collection)
            print('Gmrq score saved')
                
# Candidate values for each hyperparameter scanned by the GMRQ run
no_of_features = [20, 24, 28]
no_of_components = [2, 3, 4]
no_clusters = [700, 800, 900]
tica_lagtime = [2, 4, 6]

GMRQ(
    no_of_features=no_of_features,
    no_of_components=no_of_components,
    no_clusters=no_clusters,
    tica_lagtime=tica_lagtime,
)

<font size="5">Plotting GMRQ scores</font>

In [None]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt

import numpy as np
import matplotlib.pyplot as plt

def GMRQ_plot(no_of_features,no_of_components,no_clusters,tica_lagtime,
              parameter=['no_features','no_components','no_clusters','tica_lagtime'],
              gmrq_dir="./GMRQ/"):
    """
    Plotting GMRQ results, one figure per tested hyperparameter.

    For each parameter name, the scores stored under
    ``<gmrq_dir><parameter name>/<value>/gmrq_train_score.txt`` and
    ``gmrq_test_score.txt`` are loaded for every tested value. Test scores
    are drawn as box plots, and training scores as mean +/- standard
    deviation error bars on the same axes. The median test scores are
    printed and the figure is saved as ``<gmrq_dir><parameter name>.png``.

    Parameters
    ----------
    no_of_features: list
        A list containing different feature sizes

    no_of_components: list
        A list containing different numbers of tICA components

    no_clusters: list
        A list containing different numbers of clusters

    tica_lagtime: list
        A list containing different tICA lag times

    parameter: list, Default=['no_features','no_components','no_clusters','tica_lagtime']
        Names of the parameters that were tested. They are paired by position
        with the four value lists above and are used as sub-directory names.
        The default list is never modified.

    gmrq_dir: str, Default="./GMRQ/"
        Directory holding the saved GMRQ scores (with trailing slash); the
        figures are written here as well

    """
    value_lists = (no_of_features, no_of_components, no_clusters, tica_lagtime)
    for name, values in zip(parameter, value_lists):
        test_array = []
        train_ave_array = []
        train_std_array = []
        for value in values:
            score_dir = "{}{}/{}".format(gmrq_dir, name, value)
            # atleast_1d: a file holding a single score loads as a 0-d array,
            # which would otherwise turn into a bare float instead of a list.
            train = np.atleast_1d(
                np.loadtxt("{}/gmrq_train_score.txt".format(score_dir))).tolist()
            test = np.atleast_1d(
                np.loadtxt("{}/gmrq_test_score.txt".format(score_dir))).tolist()
            test_array.append(test)
            train_ave_array.append(np.average(train))
            train_std_array.append(np.std(train))

        fig, ax = plt.subplots(1, 1, figsize=(6, 5))
        # Boxes are vertical by default, so no orientation argument is passed.
        bplot = ax.boxplot(test_array, patch_artist=True, widths=0.25,
                           showfliers=False, sym='.',
                           capprops={'linewidth': 3},
                           whiskerprops={'linewidth': 3},
                           medianprops={'linewidth': 3, 'color': 'k'},
                           boxprops={'facecolor': 'tab:orange', 'linewidth': 3})
        # Tick labels are set explicitly instead of through boxplot's
        # ``labels`` keyword, which newer Matplotlib releases deprecate in
        # favour of ``tick_labels``; this form works with both.
        ax.set_xticklabels([str(value) for value in values])

        medians = [line.get_ydata()[0] for line in bplot['medians']]
        print("test score median for {}: {}\n".format(name, values), medians)

        # Box plots sit at x = 1..N, so the training error bars use the same positions.
        ax.errorbar(np.arange(len(train_ave_array)) + 1, train_ave_array,
                    yerr=train_std_array, color="r", elinewidth=3, markersize=10,
                    fmt='o', capthick=3, capsize=10, label="Train set")
        ax.legend(loc=0, fontsize=16)
        ax.set_ylabel("GMRQ score", fontsize=20)
        ax.set_xlabel(str(name), fontsize=20)
        ax.tick_params(labelsize=20)
        plt.tight_layout()
        plt.savefig("{}{}.png".format(gmrq_dir, name), transparent=True)

# Hyperparameter values whose saved GMRQ scores are plotted
no_of_features = [20, 24, 28]
no_of_components = [2, 3, 4]
no_clusters = [700, 800, 900]
tica_lagtime = [2, 4, 6]

GMRQ_plot(
    no_of_features=no_of_features,
    no_of_components=no_of_components,
    no_clusters=no_clusters,
    tica_lagtime=tica_lagtime,
)

In [None]:
import os
# The same parameter set appears in several scans, so link its result
# directory into the other scans instead of running it again.
for shortcut in ('./GMRQ/no_components/3',
                 './GMRQ/tica_lagtime/2',
                 './GMRQ/no_clusters/800'):
    os.system('ln -fsr ./GMRQ/no_features/24  {}'.format(shortcut))