In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import os

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [None]:
dataset_folder = '/kaggle/input/gpvs-fault/GPVS-Faults/GPVS-Faults'
# os.listdir returns entries in arbitrary order; sorting makes the order in
# which files are processed (and hence the row order of anything built from
# this list) the same on every run.
filenames = sorted(os.listdir(dataset_folder))

# The first three characters of each file name are used as its class label.
class_names = [filename[:3] for filename in filenames]

le = LabelEncoder()
le.fit(class_names)
le.classes_

In [None]:
def sliding_window(df: pd.DataFrame, seq_len: int = 100, stride: int = 1) -> np.ndarray:
    """Cut a DataFrame into fixed-length windows of consecutive rows.

    Window ``i`` holds the rows at positions ``i*stride`` up to (but not
    including) ``i*stride + seq_len``, across all columns. Every window that
    fits completely inside the frame is returned, including the one that ends
    exactly on the last row.

    Args:
        df: Source frame. Its values are copied into a float64 array, so all
            columns must be convertible to float.
        seq_len: Number of rows per window (must be >= 1).
        stride: Row offset between the starts of consecutive windows
            (must be >= 1).

    Returns:
        Array of shape ``(n_windows, seq_len, n_columns)`` with
        ``n_windows = (len(df) - seq_len) // stride + 1``, or an empty array
        with zero windows when the frame has fewer than ``seq_len`` rows.

    Raises:
        ValueError: If ``seq_len`` or ``stride`` is smaller than 1.
    """
    if seq_len < 1 or stride < 1:
        raise ValueError(
            f"seq_len and stride must be >= 1, got seq_len={seq_len}, stride={stride}"
        )

    values = df.to_numpy()
    n_rows, n_cols = values.shape

    # The last valid window starts at row n_rows - seq_len, hence the "+ 1".
    # A frame that is too short yields zero windows rather than a negative
    # dimension.
    n_windows = (n_rows - seq_len) // stride + 1 if n_rows >= seq_len else 0

    ts = np.empty(shape=(n_windows, seq_len, n_cols))
    for index in tqdm(range(n_windows)):
        start = index * stride
        ts[index, :, :] = values[start:start + seq_len, :]
    return ts

In [None]:
seq_len = 200
stride = 15

# Rows whose index label is below this value are dropped from every file.
# NOTE(review): presumably the index column is time and this skips an initial
# transient — confirm against the dataset description.
min_index_label = 6.7

if not filenames:
    raise FileNotFoundError(f"No files found in {dataset_folder!r}")

# Collect the per-file pieces and concatenate once at the end; concatenating
# inside the loop would re-copy everything accumulated so far on each file.
window_parts = []
label_parts = []
for filename in filenames:
    # The first three characters of the file name are the class label.
    label = filename[:3]
    print(label)
    # index_col=0 makes the first CSV column the index; .loc slices it by label.
    df = pd.read_csv(os.path.join(dataset_folder, filename), index_col=0).loc[min_index_label:]
    x = sliding_window(df, seq_len, stride)
    window_parts.append(x)
    # One label row per window, shape (n_windows, 1).
    label_parts.append(np.full((x.shape[0], 1), label))

X = np.concatenate(window_parts)
y = np.concatenate(label_parts)

In [None]:
# Display the shapes of the stacked windows and of their labels.
tuple(arr.shape for arr in (X, y))

In [None]:
# Shuffle and hold out 33% of the windows for testing; the fixed seed keeps the
# split repeatable for a given ordering of X and y.
# NOTE(review): consecutive windows overlap whenever stride < seq_len, so a
# shuffled split can place near-identical windows in both sets — confirm this
# is acceptable for the evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

In [None]:
# Print the shapes of the four split arrays on one line.
print(*(arr.shape for arr in (X_train, y_train, X_test, y_test)))

In [None]:
output_folder = ''

# An empty string means the current working directory. For any other value,
# create the folder first, because np.save does not create missing directories.
if output_folder:
    os.makedirs(output_folder, exist_ok=True)

# Each array is written to "<name>.npy" inside output_folder.
for array_name, array in (
    ('X_train', X_train),
    ('y_train', y_train),
    ('X_test', X_test),
    ('y_test', y_test),
):
    np.save(os.path.join(output_folder, array_name + '.npy'), array)