In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import os

In [None]:
def sliding_window(df: pd.DataFrame, seq_len = 100, stride = 1) -> np.ndarray:
    """Cut the rows of ``df`` into fixed-length, possibly overlapping windows.

    Window ``i`` covers rows ``i*stride`` to ``i*stride + seq_len - 1``. Every
    window that fits entirely inside the frame is returned, including the one
    that ends on the last row.

    Parameters
    ----------
    df : pd.DataFrame
        Frame whose rows are consecutive observations; all columns must be
        convertible to float.
    seq_len : int
        Number of rows per window (>= 1).
    stride : int
        Row offset between the starts of consecutive windows (>= 1).

    Returns
    -------
    np.ndarray
        float64 array of shape ``(n_windows, seq_len, n_columns)`` where
        ``n_windows = (len(df) - seq_len) // stride + 1``, or 0 when the frame
        has fewer than ``seq_len`` rows.

    Raises
    ------
    ValueError
        If ``seq_len`` or ``stride`` is smaller than 1.
    """
    if seq_len < 1 or stride < 1:
        raise ValueError(f"seq_len and stride must be >= 1, got seq_len={seq_len}, stride={stride}")

    values = df.to_numpy()
    n_rows = values.shape[0]

    # The "+ 1" keeps the final full window; a frame shorter than one window yields none.
    n_windows = (n_rows - seq_len) // stride + 1 if n_rows >= seq_len else 0

    # row_idx[i, j] is the source row of step j in window i, so one fancy-index
    # lookup gathers every window at once.
    starts = np.arange(n_windows) * stride
    row_idx = starts[:, None] + np.arange(seq_len)[None, :]

    # Fancy indexing already returns a fresh array, so copy=False only avoids a
    # second copy when the data is float64 to begin with.
    return values[row_idx].astype(np.float64, copy=False)

def ndarray_to_sktime_df(a: np.ndarray, columns) -> pd.DataFrame:
    """Convert a 3-D window array into a nested frame with one row per window.

    Parameters
    ----------
    a : np.ndarray
        Array of shape ``(m, n, r)``: ``m`` windows, ``n`` steps per window,
        ``r`` variables.
    columns : iterable
        ``r`` column names, one per variable.

    Returns
    -------
    pd.DataFrame
        Frame with ``m`` rows indexed by ``'Seq'`` (the window number) and one
        column per variable; each cell is a Python list holding that variable's
        ``n`` values for the window. An input without any rows (``m*n == 0``)
        gives an empty frame with the same columns.
    """
    m, n, r = a.shape

    # One row per (window, step), prefixed with the window number it belongs to.
    # The explicit width ``r`` (rather than -1) keeps the reshape valid for
    # zero-size input.
    seq_ids = np.repeat(np.arange(m), n)
    out_arr = np.column_stack((seq_ids, a.reshape(m * n, r)))
    value_columns = list(columns)

    if out_arr.shape[0] == 0:
        # Nothing to group: return the empty result directly.
        return pd.DataFrame(columns=value_columns, index=pd.Index(out_arr[:, 0], name='Seq'))

    out_df = pd.DataFrame(out_arr, columns=['Seq'] + value_columns)
    # Collapse the n rows of every window into one row of per-variable lists.
    out_df = out_df.groupby('Seq').agg(pd.Series.tolist)

    return out_df

def pd_df_to_sktime_df(df: pd.DataFrame, seq_len: int, stride: int) -> pd.DataFrame:
    """Window ``df`` and return the windows as a nested frame.

    Composes ``sliding_window`` (rows -> 3-D window array) with
    ``ndarray_to_sktime_df`` (3-D array -> one row per window), carrying the
    column names of ``df`` over to the result.
    """
    windows = sliding_window(df, seq_len, stride)
    return ndarray_to_sktime_df(windows, df.columns)

In [None]:
# Location of the per-scenario CSV files and the windowing parameters.
dataset_folder = '/kaggle/input/gpvs-fault/GPVS-Faults/GPVS-Faults'
seq_len = 200
stride = 15
# Rows whose index label is below this value are dropped before windowing.
# NOTE(review): presumably the first CSV column is a time axis and this trims
# the start of each recording — confirm against the dataset description.
start_label = 6.7

# Collect the per-file frames and concatenate once: growing a frame with
# pd.concat inside the loop re-copies every earlier row on each iteration.
window_frames = []
# sorted() makes the row order of result_df independent of the arbitrary
# order in which os.listdir returns directory entries.
for filename in sorted(os.listdir(dataset_folder)):
    # The first three characters of the file name are used as the class label.
    label = filename[:3]
    print(label)
    raw_df = pd.read_csv(os.path.join(dataset_folder, filename), index_col = 0)
    # .loc makes the label-based (not positional) slice explicit.
    windows_df = pd_df_to_sktime_df(raw_df.loc[start_label:], seq_len, stride)
    windows_df['Fault_type'] = label
    window_frames.append(windows_df)

if not window_frames:
    raise FileNotFoundError(f'no input files found in {dataset_folder}')

result_df = pd.concat(window_frames, ignore_index = True)
# Cast once on the combined column: concatenating per-file categoricals whose
# category sets differ falls back to object dtype, so a cast inside the loop
# does not survive.
result_df['Fault_type'] = result_df['Fault_type'].astype('category')

In [None]:
# Preview the combined frame: one row per window, plus its 'Fault_type' label.
result_df

In [None]:
# Number of windows per fault label across the whole dataset.
result_df['Fault_type'].value_counts()

In [None]:
from sklearn.model_selection import train_test_split

# Fixed seed so the three splits (and the pickles written from them) are
# identical on every run.
split_seed = 42

# 65 % train; the remaining 35 % is halved into validation and test
# (17.5 % each). Both steps are stratified on the label so every split keeps
# the class proportions of the full dataset.
# NOTE(review): when stride < seq_len neighbouring windows share most of their
# samples, so a random row-wise split can put near-duplicate windows into
# different splits — consider splitting by recording/time range before windowing.
train_df, holdout_df = train_test_split(
    result_df,
    test_size = 0.35,
    stratify = result_df['Fault_type'],
    random_state = split_seed,
)
val_df, test_df = train_test_split(
    holdout_df,
    test_size = 0.5,
    stratify = holdout_df['Fault_type'],
    random_state = split_seed,
)

In [None]:
# Label distribution of the training split.
train_df['Fault_type'].value_counts()

In [None]:
# Label distribution of the validation split.
val_df['Fault_type'].value_counts()

In [None]:
# Label distribution of the test split.
test_df['Fault_type'].value_counts()

In [None]:
# Write each split to its own pickle inside a folder named after the
# windowing parameters, e.g. gpvs_sl200_s15/gpvs_sl200_s15_TRAIN.pkl.
dataset_name = f"gpvs_sl{seq_len}_s{stride}"
# exist_ok=True lets the cell be re-run without failing when the folder
# already exists.
os.makedirs(dataset_name, exist_ok=True)
output_path = f'{dataset_name}/{dataset_name}'
# The index is reset so each saved split starts again at row 0.
train_df.reset_index(drop=True).to_pickle(f'{output_path}_TRAIN.pkl')
val_df.reset_index(drop=True).to_pickle(f'{output_path}_VALI.pkl')
test_df.reset_index(drop=True).to_pickle(f'{output_path}_TEST.pkl')