# GPVS dataset with GAN augmentation

## Import libraries

In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import os
from sklearn.model_selection import train_test_split

## Read dataset

In [None]:
# The 13 column names handed to npy_to_df when the arrays are converted to
# DataFrames further down.
feature_columns = [
    'Ipv', 'Vpv', 'Vdc',
    'ia', 'ib', 'ic',
    'va', 'vb', 'vc',
    'Iabc', 'If', 'Vabc', 'Vf',
]
dataset_folder = '/kaggle/input/gpvs-ts-npy'

# Load the four .npy files of the pre-split dataset in one pass; the tuple
# order below matches the order of the unpacked names.
x_train, y_train, x_test, y_test = (
    np.load(os.path.join(dataset_folder, file_name))
    for file_name in ('X_train.npy', 'y_train.npy', 'X_test.npy', 'y_test.npy')
)

In [None]:
# Re-point dataset_folder at the input that holds the generated samples and
# load both arrays from it (data first, then the matching labels).
dataset_folder = '/kaggle/input/generateddata'

generated_data, labels = (
    np.load(os.path.join(dataset_folder, file_name))
    for file_name in ('generated_data.npy', 'generated_labels.npy')
)

In [None]:
# Turn the generated labels into a single-column 2-D array before stacking.
# NOTE(review): presumably this matches the layout of y_train — confirm.
generated_labels_column = labels.reshape(len(labels), 1)

# Generated samples are appended after the original training samples, so the
# row order of data and labels stays aligned.
data_concat = np.concatenate((x_train, generated_data), axis=0)
labels_concat = np.concatenate((y_train, generated_labels_column), axis=0)

print(data_concat.shape, labels_concat.shape)

## Conversion from numpy to pandas

In [None]:
def ndarray_to_sktime_df(a: np.ndarray, columns) -> pd.DataFrame:
    """Convert a 3-D array into a nested DataFrame with one row per sequence.

    Axis 0 of ``a`` becomes the rows, axis 2 becomes the columns, and every
    cell holds a Python list with the values found along axis 1 for that
    row/column pair. The index is named ``'Seq'`` and holds the position of
    each sequence along axis 0.

    Args:
        a: Array of shape ``(m, n, r)``.
        columns: Iterable of ``r`` column names, one per entry of the last
            axis of ``a``.

    Returns:
        A DataFrame with ``m`` rows and one list-valued column per name in
        ``columns``.

    Raises:
        ValueError: If ``a`` is not 3-D, or if the number of names in
            ``columns`` differs from ``r``.
    """
    if a.ndim != 3:
        raise ValueError(
            f"expected a 3-D array of shape (m, n, r), got shape {a.shape}"
        )
    m, n, r = a.shape

    feature_names = list(columns)
    if len(feature_names) != r:
        raise ValueError(
            f"got {len(feature_names)} column names for an array with "
            f"{r} features on its last axis"
        )

    if a.size == 0:
        # Nothing to group: return one row per sequence with empty lists
        # instead of failing inside reshape/groupby. The index is built from
        # np.arange here, so it is integer-typed in this branch.
        return pd.DataFrame(
            {name: [[] for _ in range(m)] for name in feature_names},
            index=pd.Index(np.arange(m), name='Seq'),
        )

    # Flatten (m, n, r) to (m * n, r) and prepend the id of the sequence each
    # flattened row came from, so the rows can be grouped back together.
    seq_ids = np.repeat(np.arange(m), n)
    out_arr = np.column_stack((seq_ids, a.reshape(m * n, r)))
    out_df = pd.DataFrame(out_arr, columns=['Seq'] + feature_names)

    # Collapse the n rows of every sequence into one list per column.
    return out_df.groupby('Seq').agg(pd.Series.tolist)

In [None]:
def npy_to_df(data, labels, columns):
    """Build a nested DataFrame from ``data`` and attach its class labels.

    ``data`` is converted with ``ndarray_to_sktime_df`` using ``columns`` as
    the column names; ``labels`` is then stored in a ``'Fault_type'`` column,
    which is cast to the pandas ``category`` dtype.

    Args:
        data: Array accepted by ``ndarray_to_sktime_df``.
        labels: One label per row of the converted frame.
        columns: Column names for the converted frame.

    Returns:
        The converted DataFrame with the extra categorical ``'Fault_type'``
        column.
    """
    label_column = 'Fault_type'
    frame = ndarray_to_sktime_df(data, columns)
    frame[label_column] = labels
    return frame.astype({label_column: 'category'})

In [None]:
# Convert every array/label pair into a nested DataFrame with a categorical
# 'Fault_type' column: the augmented training set, the original training set
# and the held-out set.
gan_train = npy_to_df(data_concat, labels_concat, feature_columns)
train = npy_to_df(x_train, y_train, feature_columns)
test = npy_to_df(x_test, y_test, feature_columns)

# Split the held-out frame 50/50 into validation and test parts, keeping the
# 'Fault_type' proportions equal in both halves. The seed is fixed so that
# re-running the notebook writes the same VALI/TEST files every time.
val_df, test_df = train_test_split(
    test,
    test_size=0.5,
    stratify=test['Fault_type'],
    random_state=42,
)

In [None]:
# Number of samples per 'Fault_type' in the augmented training frame.
gan_train.loc[:, 'Fault_type'].value_counts()

In [None]:
# Number of samples per 'Fault_type' in the original training frame.
train.loc[:, 'Fault_type'].value_counts()

## Save dataset

In [None]:
# NOTE(review): in this cell seq_len and stride are only used to build the
# output names; presumably they describe how the input windows were cut —
# confirm against the preprocessing that produced the .npy files.
seq_len = 200
stride = 15

# The same name is used for the output folder and as the file-name prefix.
dataset_name = f"gpvs_sl{seq_len}_s{stride}"

# exist_ok=True lets the cell be re-run without failing on the folder that
# the previous run already created.
os.makedirs(dataset_name, exist_ok=True)
output_path = f'{dataset_name}/{dataset_name}'

# Write every split with a fresh 0..n-1 index, one pickle per split.
for split_suffix, split_frame in (
    ('TRAIN', train),
    ('GAN', gan_train),
    ('VALI', val_df),
    ('TEST', test_df),
):
    split_frame.reset_index(drop=True).to_pickle(f'{output_path}_{split_suffix}.pkl')