In [71]:
import numpy as np
import time

from load_dataset import load_dataset, print_info, preprocessing
# NVARk
from model.NVARk import NVARk
from model.emb_pars import SSR_parameters

# # different tasks
# from sklearn.svm import SVC

# internal imports
import utils
import tasks

import warnings

# Blanket filter: silences every warning category for the rest of the session,
# deprecation notices included. Narrow or drop it while debugging.
warnings.filterwarnings("ignore")


# Datasets to run; currently disabled candidates: 'EmoDB', 'RAVDESS'.
datasets_list = ["CinCECGTorso"]

"""global variables"""
# Preprocessing mode, one of 'none' / 'zero_padding' / 'interpolate'.
# 'zero_padding' matches the longest series in the dataset.
prepr_option = "zero_padding"

# Experiment to run, one of: SVM_NVARk, SVM_NVARk* , time_NVARk
experiment = "SVM_NVARk"

random_iterations = 10

# Candidate SVM C values: 7 log-spaced points from 1e-3 to 1e3.
svm_C_list = np.logspace(-3, 3, num=7)

# Solver, 'svd' or 'cholesky'.
# ('cholesky' is used in the paper, is faster but can be unstable for
# matrices with high collinearity)
solver = "svd"



In [72]:
# Load the raw train/test splits for each configured dataset.
# NOTE: the four *_raw names (and `dataset_name`) are rebound on every pass, so
# once the loop finishes only the last entry of `datasets_list` is still held.
for dataset_name in datasets_list:
    TRAIN_x_raw, TRAIN_y_raw, TEST_x_raw, TEST_y_raw = load_dataset(dataset_name)

In [73]:
import pandas as pd

def check_df_for_weird_values(df):
    """
    Scan a nested DataFrame and print suspicious entries, one report per row.

    Each cell in the first column is expected to hold a pd.Series. A value is
    reported as "weird" when it is NaN/None, cannot be parsed as a number, is
    an empty string, or is parsed by pd.to_numeric as +/- infinity.

    Parameters
    ----------
    df : pd.DataFrame
        Frame whose first column holds one pd.Series per row. Only the first
        column is inspected; any further columns are ignored.

    Returns
    -------
    None
        Findings are printed to stdout; nothing is returned.
    """
    print("Checking DataFrame for weird values...")

    def print_weird_values(series, row_index):
        """
        Print the entries of `series` that are NaN, non-numeric, empty strings
        or infinite, prefixed with the row label they came from.
        """
        # Anything that cannot be parsed as a number becomes NaN here.
        numeric_series = pd.to_numeric(series, errors='coerce')

        is_nan_or_non_numeric = numeric_series.isna() | series.isna()
        is_empty_string = series == ""
        # Test the coerced values so that the check is vectorised and also
        # covers entries that only become infinite after numeric parsing.
        is_infinity = numeric_series.isin([np.inf, -np.inf])

        weird_values = series[is_nan_or_non_numeric | is_empty_string | is_infinity]

        if not weird_values.empty:
            print(f"Row {row_index} contains weird values: {weird_values.tolist()}")

    for row_index, row in df.iterrows():
        # Positional access: `row[0]` on a label-indexed row (e.g. "dim_0")
        # depends on the deprecated integer-key fallback of Series.__getitem__.
        print_weird_values(row.iloc[0], row_index)

# Example usage for checking the datasets
#check_df_for_weird_values(TEST_x_raw)
#check_df_for_weird_values(TRAIN_x_raw)

In [74]:
def clean_and_crop(_dataset, max_len=2000):
    """
    Crop every nested series to `max_len` samples and coerce it to numeric.

    Parameters
    ----------
    _dataset : pd.DataFrame
        Frame whose cells each hold a pd.Series. It is not modified; a new
        frame is returned.
    max_len : int, optional
        Number of leading samples kept per series (``series.iloc[:max_len]``).

    Returns
    -------
    pd.DataFrame
        Frame with the same rows and columns, where each cell is the cropped
        series passed through ``pd.to_numeric(..., errors='coerce')``, so
        entries that cannot be parsed become NaN.
    """
    def _crop_to_numeric(cell):
        # Crop first so the numeric conversion only touches the kept samples.
        return pd.to_numeric(cell.iloc[:max_len], errors='coerce')

    # DataFrame.applymap is deprecated in favour of DataFrame.map (pandas 2.1+).
    # The check is on the class, not the instance, so that a column literally
    # named "map" cannot be picked up through attribute access.
    if hasattr(pd.DataFrame, "map"):
        return _dataset.map(_crop_to_numeric)
    return _dataset.applymap(_crop_to_numeric)

# Crop every series to the default max_len and coerce its values to numeric
# (entries that cannot be parsed become NaN). The raw frames stay bound to
# their own names, so this cell can be re-run safely.
TRAIN_clean = clean_and_crop(TRAIN_x_raw)
TEST_clean = clean_and_crop(TEST_x_raw)

In [75]:
# Sanity check on the cleaned frames: only the header line is printed for a
# frame in which no row contains a flagged value.
check_df_for_weird_values(TRAIN_clean)
check_df_for_weird_values(TEST_clean)

Checking DataFrame for weird values...
Checking DataFrame for weird values...


In [76]:
# Peek at the first training series stored in column "dim_0".
print(TRAIN_clean["dim_0"].iloc[0])

0      -1.100932
1      -1.136304
2      -0.269700
3      -0.287386
4      -0.252014
          ...   
1634   -0.588044
1635   -0.658788
1636   -0.747217
1637   -0.871017
1638   -1.030189
Name: 0, Length: 1639, dtype: float64


In [77]:
# Summarise the cleaned splits. NOTE: `dataset_name` is the variable left over
# from the dataset-loading loop, i.e. the last dataset that was loaded.
info = print_info(dataset_name, TRAIN_clean, TEST_clean, y=TRAIN_y_raw)


### CinCECGTorso ###                  N       T_min|T_max     D       N_class 
------------------------------------------------------------------------------------------
CinCECGTorso train                     40       1639|1639      1        4         nans = False
CinCECGTorso test                      1380     1639|1639      1        4         nans = False
