In [None]:
# change working directory to be repo base
# NOTE(review): this is a hardcoded absolute path on one machine; the notebook stops
# here on any other system. The relative open() of 'scaling_const.float' below depends
# on this working directory, and presumably so does the import of Yukawa_SINDy.
import os
os.chdir('/Users/zbh0005/Library/CloudStorage/OneDrive-AuburnUniversity/Documents/Code/yukawa-sindy')
# import function file
import Yukawa_SINDy as ys
# import libraries
from sklearn.model_selection import KFold
from sklearn.metrics import root_mean_squared_error
import numpy as np
import pysindy as ps
import matplotlib.pyplot as plt
# ignore warnings generated from using LaTeX coding in matplotlib label strings
from warnings import filterwarnings
filterwarnings('ignore', message = 'invalid escape sequence')
# import scaling constant from working directory and declare as global variable
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only load a
# 'scaling_const.float' file that was produced by this project.
from pickle import load
with open('scaling_const.float','rb') as f:
    A = load(f)

In [None]:
def same_times(list_of_sims:list):
    '''
    Description: Helper function to check if all simulations in a list of 
    Yukawa_SINDy.Yukawa_simulation objs have the same time grid. Returns 
    True or False.

    Two time grids count as the same only when they have the same shape and
    the same values element by element. Every simulation is compared against
    the first one in the list. A list with zero or one simulation returns
    True, since there is nothing to disagree with.
    '''
    if len(list_of_sims) == 0:
        return True
    t_check = list_of_sims[0].t
    # np.array_equal returns False on a shape mismatch instead of broadcasting
    # (or raising), which a plain '==' comparison of the two arrays would do
    return all(np.array_equal(t_check, sim.t) for sim in list_of_sims[1:])


def kfold_training(x_train:np.ndarray, t_data:np.ndarray, n_folds:int, SINDy_model:ps.SINDy, verbose=False):
    '''
    Description: Runs k-fold cross-validation over the trajectories in 'x_train'. For every
    fold a new 'pysindy.SINDy' model is built from the optimizer, feature library and feature
    names of 'SINDy_model', fit on the training trajectories of that fold, and scored on the
    held-out trajectories with root mean squared error.

    Args:
        x_train: rank 3 array. Axis 0 is what gets split into folds (one trajectory per
            entry) and axis 1 must have the same length as 't_data'.
        t_data: time grid shared by all trajectories.
        n_folds: number of splits handed to 'sklearn.model_selection.KFold'.
        SINDy_model: model whose settings are reused. 'get_feature_names()' is called on
            it, so it must already be in a state where that call works.
        verbose: if True, print the model found in each fold.

    Returns:
        all_rmse: array of shape (n_folds,) with the score of each fold.
        all_coefs: array of shape (n_folds, x_train.shape[2], n_features) with the
            coefficients of each fold's model.

    Raises:
        ValueError: if 'x_train' is not rank 3, or its axis 1 does not match 't_data'.
    '''
    # check dimension of training_data and t_data arrays
    if x_train.ndim != 3:
        raise ValueError('training data has wrong dimensions')
    if x_train.shape[1] != t_data.shape[0]:
        raise ValueError('time data has wrong dimensions')

    # get SINDy parameters from input SINDy model
    # note: the same optimizer and feature library objects are handed to every fold's
    # model below; they are not copied
    n_features = len(SINDy_model.get_feature_names())
    feature_library = SINDy_model.feature_library
    feature_names = SINDy_model.feature_names
    opt = SINDy_model.optimizer

    # check if feature_library is weak or strong, don't need
    # to pass time as an arg into the 'fit' method of ps.SINDy
    t_to_fit = None if isinstance(feature_library, ps.WeakPDELibrary) else t_data

    # preallocate one slot per fold instead of growing arrays inside the loop
    all_rmse = np.empty(n_folds)
    all_coefs = np.empty((n_folds, x_train.shape[2], n_features))

    # perform KFold CV
    kf = KFold(n_splits=n_folds)
    for fold, (train, test) in enumerate(kf.split(x_train)):
        # split training data into lists of trajectories
        x_train_kf = list(x_train[train])
        x_test_kf  = list(x_train[test])

        # fit a fresh SINDy model on this fold's training trajectories
        mdl = ps.SINDy(optimizer=opt, feature_library=feature_library, feature_names=feature_names)
        mdl.fit(x_train_kf, t_to_fit, multiple_trajectories=True)
        if verbose: mdl.print()

        # slice assignment copies the values, so later fits cannot alter stored results
        all_coefs[fold] = mdl.coefficients()

        # validate model against test data
        # NOTE(review): score always receives t_data, even when fit was given t=None for
        # a weak library -- confirm this is intended
        all_rmse[fold] = mdl.score(x_test_kf, t=t_data, multiple_trajectories=True, metric=root_mean_squared_error)

    return all_rmse, all_coefs


def cross_validate(all_data:list, threshold:float, feature_library, feature_names, n_folds=10):
    '''
    Description: This function performs k-fold cross-validation (cv) with k specified by the 'n_folds'
    (default 10) argument. Gets help from the 'sklearn.model_selection.KFold' object. Takes a list 
    of Yukawa_SINDy.Yukawa_simulation objects, a SINDy STLSQ threshold, a feature library 
    ('pysindy.BaseFeatureLibrary' child objs), and feature names as args.

    A fixed-seed random 25% of the simulations (rounded down) is withheld and takes no part in
    the k-fold procedure; this function does not use the withheld simulations for anything else.
    The remaining simulations are passed to 'kfold_training'.

    Returns:
        rmse: 1D numpy array with the root mean squared error of each fold.
        coef: rank 3 numpy array with the coefficients of each fold's model; the first axis
            runs over the folds.

    Raises:
        TypeError: if an item of 'all_data' is not a 'Yukawa_SINDy.Yukawa_simulation'.
        ValueError: if 'all_data' is empty, the simulations do not share one time grid, or
            the stacked trajectory data is not rank 3.
    '''
    if len(all_data) == 0:
        raise ValueError("Argument 'all_data' is empty.")
    # check if list of sim objects
    for item in all_data:
        if not isinstance(item, ys.Yukawa_simulation):
            raise TypeError("Argument 'all_data' should be list of 'Yukawa_SINDy.Yukawa_simulation' objects")
    # check if all time grids are the same
    if not same_times(all_data):
        raise ValueError("All simulations do not have the same time grid.")

    # extract data from sim objects
    x_data = np.array([sim.x for sim in all_data])
    if x_data.ndim != 3:
        raise ValueError('training data has wrong dimensions')
    # take the time grid from the argument, not from a notebook-level variable
    t_data = all_data[0].t

    # split data into withhold(testing) and training data
    # the seed is fixed so the same simulations are withheld on every call
    n_trajectories = len(all_data)
    rng = np.random.default_rng(seed=10235783)
    withhold_idxs = rng.choice(n_trajectories, n_trajectories // 4, replace=False)
    withhold_idxs.sort()
    train_idxs = np.delete(np.arange(n_trajectories), withhold_idxs)
    x_train = x_data[train_idxs]

    # declare optimizer with given threshold
    opt = ps.STLSQ(threshold=threshold)

    # get number of terms in library: fit a throwaway model on random data that has as many
    # columns as the real trajectories, so that 'kfold_training' can read the feature names
    rand_data = rng.random((5000, x_data.shape[-1]))
    test_mdl = ps.SINDy(optimizer=opt, feature_library=feature_library, feature_names=feature_names)
    test_mdl.fit(rand_data)

    # perform kfold cv
    rmse, coef = kfold_training(x_train, t_data, n_folds, test_mdl)

    return rmse, coef


In [None]:
# generate the simulations used in the rest of the notebook
# (presumably a list of ys.Yukawa_simulation objects, which cross_validate requires -- TODO confirm)
sims = ys.generate_training_data(noise_level=0.01,scaled=True)

In [None]:
# look at a single simulation as a visual check
# NOTE(review): index 77 looks like an arbitrary pick -- confirm or name it as a constant
sims[77].plot()

In [None]:
# settings for this cross-validation run
threshold = 0.4
t_data = sims[0].t
weak_lib, strong_lib = ys.generate_libraries(t_data)
feature_names = ['x', 'v']

# k-fold cross-validation with the first library returned above
rmse, coef = cross_validate(sims, threshold, weak_lib, feature_names)

# one marker per fold, so outlier folds are easy to spot
fig, ax = plt.subplots()
ax.plot(rmse,'o')
ax.set_xlabel("Model no.")
ax.set_ylabel("Root mean squared error")
ax.set_title(f"K-fold cross-validation error (threshold = {threshold})")
fig.tight_layout()

In [None]:
# normalization matrix with one row per equation and one column per library term:
# the first row is all ones, the second row is all A
# the width is read from coef so that coef/norm_mat keeps broadcasting if the library changes
n_lib_terms = coef.shape[-1]
norm_mat = np.array( [ n_lib_terms*[1], n_lib_terms*[A] ] )

In [None]:
# STLSQ threshold divided by the scaling constant A
# NOTE(review): presumably the threshold in the same units as the rescaled coefficients -- confirm
threshold/A

In [None]:
# coefficients of the model from fold index 5, divided element-wise by norm_mat
coef[5]/norm_mat

In [None]:
# coefficients of the model from fold index 1, divided element-wise by norm_mat
coef[1]/norm_mat

In [None]:
# reference coefficients with shape (1, 2, 10), so they broadcast along the first axis of coef
# all entries are zero except the three set to one below
# NOTE(review): which library terms columns 1, 2 and 4 correspond to is not visible here -- confirm
correct_coefs = np.zeros((1, 2, 10))
correct_coefs[0, 0, 1] = 1.
correct_coefs[0, 1, [2, 4]] = 1.
# element-wise absolute difference between the rescaled coefficients and the reference
rescaled_coef = coef/norm_mat
coef_diff = np.abs(rescaled_coef - correct_coefs)

In [None]:
# absolute difference from correct_coefs for the model from fold index 5
coef_diff[5]

In [None]:
# absolute difference from correct_coefs for the model from fold index 6
coef_diff[6]

In [None]:
# largest absolute difference over every fold, equation and library term
coef_diff.max()

In [None]:
# coefficients averaged over the folds (first axis), then divided element-wise by norm_mat
coef.mean(axis=0)/norm_mat