In [1]:
# change working directory to be repo base
import os
os.chdir('/Users/zbh0005/Library/CloudStorage/OneDrive-AuburnUniversity/Documents/Code/yukawa-sindy')
# import function file
import Yukawa_SINDy as ys
# import libraries
from sklearn.model_selection import KFold
from sklearn.metrics import root_mean_squared_error
import numpy as np
import pysindy as ps
import matplotlib.pyplot as plt
# ignore warnings generated from using LaTeX coding in matplotlib label strings
from warnings import filterwarnings
filterwarnings('ignore', message = 'invalid escape sequence')

In [None]:
X = np.array([['traj0-19', 'traj20-39', 'traj40-59', 'traj60-79', 'traj80-99', 'traj100-119', 'traj120-139', 'traj140-159', 'traj160-179', 'traj180-199'],
              ['traj0-19', 'traj20-39', 'traj40-59', 'traj60-79', 'traj80-99', 'traj100-119', 'traj120-139', 'traj140-159', 'traj160-179', 'traj180-199']]).T
kf = KFold(n_splits=10)
for train, test in kf.split(X):
    print('train:', X[train])
    print('test:', X[test])
    print(60*'-')

In [None]:
X.T.shape

In [None]:
X = ['traj0-19', 'traj20-39', 'traj40-59', 'traj60-79', 'traj80-99', 'traj100-119', 'traj120-139', 'traj140-159', 'traj160-179', 'traj180-199']
              
kf = KFold(n_splits=10)
for train, test in kf.split(X):
    for idx in train:
        print('train:', X[idx])
    for idx in test:
        print('test:', X[idx])
    print(60*'-')

In [None]:
kf.get_n_splits()

In [None]:
X = np.array([[0., 0.], [1., 1.], [-1., -1.], [2., 2.]])
y = np.array([0, 1, 0, 1])
kf = KFold(n_splits=2)
for train, test in kf.split(X):
    print(train,test)
    X_train, X_test, y_train, y_test = X[train], X[test], y[train], y[test]

In [24]:
sims = ys.generate_training_data()

In [105]:
def same_times(list_of_sims:list):
    '''
    Description: Helper function to check if all simulations in a list of 
    Yukawa_SINDy.Yukawa_simulation objs have the same time grid. returns 
    True or False.
    '''
    same_times:bool = True
    t_check = list_of_sims[0].t
    for sim in list_of_sims[1:]:
        if not np.all(t_check == sim.t):
            same_times = False
            break
    return same_times


def cross_validate(all_data:list, threshold:float, feature_library, feature_names, n_folds=10):
    # check if list of sim objects
    for item in all_data:
        if not isinstance(item, ys.Yukawa_simulation):
            raise TypeError("Argument 'all_data' should be list of 'Yukawa_SINDy.Yukawa_simulation' objects")
    # check if all time grids are the same
    if not same_times(all_data):
        raise Exception("All simulations do not have the same time grid.")
    
    # extract data from sim objects
    x_data = np.array([sim.x for sim in all_data])
    t_data = sims[0].t
    n_timesteps = t_data.shape[0]

    # split data into withhold(testing) and training data
    n_trajectories = len(all_data)
    rng = np.random.default_rng(seed=10235783)
    withhold_idxs = rng.choice(x_data.shape[0], np.floor(0.25 * n_trajectories).astype(int), replace=False)
    withhold_idxs.sort()
    train_idxs = np.delete(np.arange(len(all_data)), withhold_idxs)
    x_train = x_data[train_idxs]
    x_withhold = x_data[withhold_idxs]

    # declare optimizer with given threshold
    opt = ps.STLSQ(threshold=threshold)

    # get number of terms in library
    rand_data = np.random.random((5000,2))
    test_mdl = ps.SINDy(optimizer=opt, feature_library=feature_library, feature_names=feature_names)
    test_mdl.fit(rand_data)
    feature_list = test_mdl.get_feature_names()
    n_features = len(feature_list)
    del test_mdl, rand_data

    # perform KFold CV
    rmse_kf = []
    all_coefs = np.empty((0,x_data.shape[2],n_features))
    kf = KFold(n_splits=n_folds)
    for train, test in kf.split(train_idxs):
        # split training data
        x_train_kf = [traj for traj in x_train[train]]
        x_test_kf  = [traj for traj in x_train[test]]
        # fit SINDy model using given threshold
        mdl = ps.SINDy(optimizer=opt, feature_library=feature_library, feature_names=feature_names)
        mdl.fit(x_train_kf, t_data, multiple_trajectories=True)

        # get coefs and append to all_coefs
        coefs = mdl.coefficients()
        coefs.resize((1,*coefs.shape))
        all_coefs = np.vstack((all_coefs,coefs))

        # validate model against test data
        rmse = mdl.score(x_test_kf,t_data,multiple_trajectories=True)#,metric=root_mean_squared_error)
        rmse_kf.append(rmse)
        # # old code
        # x_dot_predicted = mdl.predict(x_test_kf)
        # fd = ps.FiniteDifference()
        # x_dot_calculated = fd._differentiate(x_test_kf)
        # rmse = root_mean_squared_error(x_dot_calculated, x_dot_predicted)
        # rmse_kf.append(rmse)
        

    return x_train, x_withhold


In [106]:
threshold = 0.5
feature_library = ys.generate_Yukawa_library()
feature_names = ['x', 'v']

x_train, x_withhold = cross_validate(sims, threshold, feature_library, feature_names)



ValueError: matmul: Input operand 1 has a mismatch in its core dimension 0, with gufunc signature (n?,k),(k,m?)->(n?,m?) (size 2 is different from 10)

In [39]:

# extract data from sim objects
x_data = np.array([sim.x for sim in sims])
t_data = sims[0].t
n_timesteps = t_data.shape[0]

# split data into withhold(testing) and training data
n_trajectories = len(sims)
rng = np.random.default_rng(seed=10235783)
withhold_idxs = rng.choice(n_trajectories, np.floor(0.25 * n_trajectories).astype(int), replace=False)
withhold_idxs.sort()
train_idxs = np.delete(np.arange(len(sims)), withhold_idxs)
x_train = x_data[train_idxs]
x_withhold = x_data[withhold_idxs]

In [41]:
print(f"training: {x_train.shape}, withhold: {x_withhold.shape}")

training: (150, 5000, 2), withhold: (50, 5000, 2)


In [9]:
emp = np.empty((0,5000,2))
append = np.random.random((5000,2))
append.resize((1,5000,2))
np.vstack((emp,append))

array([[[0.41356834, 0.11340827],
        [0.39035098, 0.34350152],
        [0.50735309, 0.71549869],
        ...,
        [0.20607009, 0.78325177],
        [0.40298175, 0.13817783],
        [0.50899538, 0.7800266 ]]])

In [14]:
a = (5,2)
b = (10,*a)

In [15]:
b

(10, 5, 2)