In [None]:
#Remember to switch to a separate conda environment that has pyEMMA 2.5.7 installed before running this notebook
%matplotlib inline
import os
import numpy as np
import pandas as pd
from pyemma.coordinates.transform.tica import *
from pyemma.coordinates.transform._tica_base import *
from pyemma.coordinates.transform.nystroem_tica import *
from pyemma.coordinates import tica
import matplotlib
import matplotlib.pyplot  as plt
import matplotlib.ticker as mticker
import pickle
from multiprocessing import Pool

<h1> Spectral oASIS </h1>

In [None]:
#Loading features
featdir="./Featurization/"
spectraldir="./SpectralOasis/"
# Create the output directory up front; reuse the variable so the path is spelled only once
os.makedirs(spectraldir,exist_ok=True)

# Number of feature files read here: features/0.npy ... features/83.npy
num_feature_files=84
input_feature_data=[
    np.load(os.path.join(featdir,"features","{}.npy".format(i)))
    for i in range(num_feature_files)
]

#Setting up parameters to run run_SpectraloASIS() in parallel
lag_time=300   # lag time in the same time unit as dt (presumably ns -- TODO confirm)
dt=0.2         # presumably the time between consecutive frames -- TODO confirm
# Lag expressed in frames. round() before int(): plain truncation would drop a frame
# whenever the floating-point quotient lands just below an integer.
lt=int(round(lag_time/dt))
# Width of the feature matrix, taken from the first loaded array
num_features = input_feature_data[0].shape[1]


In [None]:
#Preparing a list for the number of features tested
# tens = (smallest k with 10**k >= num_features) - 1, counted with exact integer arithmetic
tens=0
magnitude=1
while magnitude<num_features:
    magnitude*=10
    tens+=1
tens=tens-1

# Step between tested feature counts: 2 * 10**(tens-1).
# For num_features <= 10 that expression evaluates to 0, and a zero step is invalid,
# so the step is clamped to at least 2.
interval=max(int(10**(tens-1))*2, 2)

# Candidates: 10, half a step, then every full step below num_features
candidates=[10, interval//2]
candidates.extend(range(interval, num_features, interval))

# Remove duplicates (e.g. half a step can equal 10, which would repeat the same run),
# keep the counts in ascending order for plotting, and discard any count that is not
# strictly smaller than the full feature set.
columns=sorted({c for c in candidates if 0<c<num_features})
print("no. of features tested: ", columns)



In [None]:
def run_SpectraloASIS(max_columns,lt=lt,num_features=num_features,
                      input_feature_data=input_feature_data,
                      spectraldir="./SpectralOasis/"):
    """
    Run Spectral oASIS (NystroemTICA) for one feature budget and save the result.

    Parameters
    ----------
    max_columns : int
        The number of features to be selected

    lt : int
        Lag time handed to NystroemTICA. Defaults to the notebook-level ``lt``
        captured when this function is defined.

    num_features : int
        The number of features in the full set; the single initial column is
        drawn at random from ``range(num_features)``

    input_feature_data : list of ndarray
        Feature data handed to ``NystroemTICA.estimate``. In this notebook it is
        the list of arrays loaded from the feature files, whose second axis has
        ``num_features`` entries.

    spectraldir : str, default="./SpectralOasis/"
        The directory to save output

    Returns
    -------
    max_columns : int
        The feature budget that was requested, returned so that results which
        arrive out of order can still be matched to their input.

    timescales :
        ``t.timescales`` of the estimated NystroemTICA object for this budget

    Notes
    -----
    Two text files are written to ``spectraldir``: the selected column indices
    and the timescales. Their names contain ``max_columns`` and ``lt``.
    """
    # NOTE(review): the initial column is drawn from the global NumPy RNG and no seed
    # is set in the visible cells, so the selection is presumably not reproducible
    # between runs -- confirm whether that is intended.
    initial_column = np.random.choice(num_features,1,replace=False)

    t = NystroemTICA(lt, max_columns, initial_columns=initial_column, nsel=10)
    t.estimate(input_feature_data)   # run the oASIS column selection + TICA

    # Put the actual lag in the file names (previously hard-coded as 1500, which is
    # only correct for lt=1500); os.path.join also copes with a directory argument
    # that has no trailing slash.
    feature_file = os.path.join(
        spectraldir, "feature_column{}_ticalag_{}.txt".format(max_columns, lt))
    timescale_file = os.path.join(
        spectraldir, "timescales_column{}_ticalag_{}.txt".format(max_columns, lt))
    np.savetxt(feature_file, t.column_indices, fmt='%d')
    np.savetxt(timescale_file, t.timescales)
    return max_columns,t.timescales

# Run one Spectral oASIS job per tested feature count in a worker pool.
# Pool() is called without a process count, so the pool size is the library default.
# Results arrive in completion order; each is a (max_columns, timescales) pair,
# so dict() keys them by feature count regardless of arrival order.
with Pool() as pool:
    finished_jobs = pool.imap_unordered(run_SpectraloASIS, columns)
    t_timescales = dict(finished_jobs)

# Persist the {feature count: timescales} mapping for the analysis cells
pickle_path = "{}t_timescales.pickl".format(spectraldir)
with open(pickle_path, 'wb') as fp:
    pickle.dump(t_timescales, fp, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Reload the {feature count: timescales} mapping saved by the Spectral oASIS run
t_timescales=pd.read_pickle("{}t_timescales.pickl".format(spectraldir))

# x values: every tested feature count, followed by the full feature set
columns_=list(columns)
# y values: first entry of the timescales stored for each tested feature count
slowest=[t_timescales[n_cols][0] for n_cols in columns]

# Reference point: tICA on the full feature set at the same lag
columns_.append(num_features)
TICA=tica(input_feature_data, lag=lt)
slowest.append(TICA.timescales[0])

# Scale by dt (presumably converting from frames to time units -- confirm)
timescales=np.array(slowest)*dt


In [None]:
#Plotting the tICA timescales against number of features. We will pick the feature set when the tICA timescale is converged.
x_min, x_max = -1, num_features+1
# Last entry of timescales is the one appended for the full feature set
full_set_timescale = timescales[-1]

f,ax=plt.subplots(figsize=(8,4))

# Timescale curve with a marker at every tested feature count
ax.plot(columns_, timescales, color="b", lw=4)
ax.scatter(columns_, timescales, color="b", s=30, marker="o")
# Dotted horizontal reference at the full-feature-set timescale
ax.plot([x_min,x_max], [full_set_timescale, full_set_timescale], color="k", lw=4, linestyle=":")

ax.set_xlabel("No. of Features", fontsize=16)
ax.set_ylabel("Slowest tICA timescale (ns)", fontsize=16)
ax.set_xlim(x_min,x_max)

# Log axis first, then the explicit tick positions
ax.set_yscale('log')
y_ticks=list(range(2000,10000,1000))
ax.set_yticks(y_ticks)
ax.tick_params(axis='both',labelsize=16)

plt.tight_layout()
plt.savefig(spectraldir+"SlowestTicaTimescale_vs_FeatureNumbers.png")
