In [None]:
#Remember to load another conda environment with pyEMMA 2.5.7
%matplotlib inline
import os
import numpy as np
import pandas as pd
from pyemma.coordinates.transform._tica_base import *
from pyemma.coordinates.transform.nystroem_tica import *
from pyemma.coordinates import tica
import matplotlib.pyplot  as plt
import pickle
from multiprocessing import Pool
import itertools

<h1> Spectral oASIS </h1>

In [None]:
# Input/output locations, then read the 100 pre-computed feature arrays
# (features/0.npy ... features/99.npy) into a list, in index order.
featdir="./Featurization/"
spectraldir="./SpectralOasis/"
os.makedirs(spectraldir,exist_ok=True)

input_feature_data=[
    np.load(featdir+("features/{}.npy").format(idx))
    for idx in range(100)
]



In [None]:
# Build the grid of (number of selected features, tICA lag) pairs that
# run_SpectraloASIS() is evaluated on in parallel.
dt=0.1
lt=[0.2,0.4,0.6]
# Lag times expressed as an integer number of steps of length dt
lts_in_steps=[int(round(lag/dt)) for lag in lt]
# Size of the full feature set, read off the second axis of the first array
num_features = input_feature_data[0].shape[1]
columns=[4,8,12,16,20,24,28]
# Same ordering as a nested loop: outer over columns, inner over lags
parameters=list(itertools.product(columns,lts_in_steps))
print("no. of features tested: ", columns)

In [None]:
def run_SpectraloASIS(max_columns,lt,dt=dt,num_features=num_features,
                      input_feature_data=input_feature_data,
                      spectraldir="./SpectralOasis/",seed=None):
    """
    Run Spectral oASIS (NystroemTICA) for one (feature count, lag) pair
    and save the selected feature indices and the timescales to disk.

    Parameters
    ----------
    max_columns : int
        The number of features to be selected

    lt : int
        tICA lag time, handed to NystroemTICA as given. It is also used
        (via int(lt)) to name the output sub-directory and files.

    dt : float
        Not used inside this function; kept so existing calls that pass
        it keep working.

    num_features : int
        The number of features in the full set; the random initial column
        is drawn from range(num_features).

    input_feature_data : list of ndarray
        Feature arrays passed to NystroemTICA.estimate(). The second axis
        is expected to index features (that is how num_features is derived
        in this notebook).

    spectraldir : str, default="./SpectralOasis/"
        The directory to save output (with or without a trailing slash)

    seed : int or None, default=None
        Seed for drawing the initial column. None takes fresh entropy on
        every call; pass an int for a reproducible starting column.

    Returns
    -------
    lt : int
        The lag time that was passed in
    max_columns : int
        The number of features that was requested
    timescales : ndarray
        tICA timescales obtained with this number of features
    """
    # Use a private generator instead of the global np.random state: forked
    # Pool workers start from identical copies of the global state and would
    # otherwise draw the same "random" initial column.
    rng = np.random.RandomState(seed)
    initial_columns = rng.choice(num_features,1,replace=False)

    t = NystroemTICA(lt, max_columns, initial_columns=initial_columns, nsel=1)
    # You may want to increase the variables nsel to a higher number e.g. 50  when dealing with large numbers of features in your dataset
    # The alanine dipeptide dataset just has too few features....
    t.estimate(input_feature_data)   ##running oasis_tica

    # One sub-directory per lag; os.path.join keeps the paths valid whether
    # or not spectraldir ends with a separator.
    lag_label = int(lt)
    outdir = os.path.join(spectraldir,str(lag_label))
    os.makedirs(outdir,exist_ok=True)

    timescales = t.timescales
    np.savetxt(os.path.join(outdir,"feature_column{}_ticalag_{}.txt".format(max_columns,lag_label)), t.column_indices, fmt='%d')
    np.savetxt(os.path.join(outdir,"timescales_column{}_ticalag_{}.txt".format(max_columns,lag_label)), timescales)
    return lt,max_columns,timescales

# Evaluate every (max_columns, lag) pair from `parameters` on a worker pool.
# starmap unpacks each tuple into the positional arguments of
# run_SpectraloASIS and returns the results in the order of `parameters`.
with Pool() as workers:
    results = workers.starmap(run_SpectraloASIS,parameters)

# One row per run: column 0 = lag, column 1 = max_columns, column 2 = timescales.
df = pd.DataFrame(results)
df.to_pickle("{}timescales.pickl".format(spectraldir))



In [None]:
# x-axis values: every tested subset size followed by the full feature count
columns_=list(columns)+[num_features]

data=pd.read_pickle("{}timescales.pickl".format(spectraldir))
for n in lts_in_steps:
    # Column 0 of the table holds the lag, column 2 the timescales array of each run
    t_timescales=data.loc[data[0] == n][2].values
    # First timescale of each Spectral oASIS run at this lag
    timescales=[t_timescales[k][0] for k in range(0,len(columns_)-1)]
    # tICA on the full feature set provides the last point and the reference line
    TICA=tica(input_feature_data, lag=n)
    timescales.append(TICA.timescales[0])
    # Convert from steps to time units
    timescales=np.array(timescales)*dt

    # Timescale against number of features; the feature set is picked where
    # the curve has converged onto the dotted full-feature reference.
    lag_time=n*dt
    full_set_timescale=timescales[-1]
    x_lo,x_hi=-1,num_features+1

    f,ax=plt.subplots(figsize=(8,4))
    ax.plot(columns_, timescales,"-o", color="b", lw=4)
    ax.plot([x_lo,x_hi], [full_set_timescale, full_set_timescale], color="k", lw=4, linestyle=":")
    ax.set_xlim(x_lo,x_hi)
    ax.set_xlabel("No. of Features", fontsize=16)
    ax.set_ylabel("Timescale (ps)", fontsize=16)
    ax.set_title("Lagtime={0:.1f}ps".format(lag_time),fontsize=18)
    ax.tick_params(axis='both',labelsize=16)
    f.tight_layout()

    plt.savefig(spectraldir+"Timescale_vs_FeatureNo_ticalag_{0:.1f}.png".format(lag_time))
