In [2]:
import pandas as pd 
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from torch.utils.data import TensorDataset
import torch
from torch.utils.data import DataLoader

## Train Data

In [15]:
# Read the training CSV; DATETIME is parsed as day-first dates.
# NOTE(review): the test cell reads its CSV from './dataset/' while this path
# has no directory prefix — confirm both files are where the notebook expects.
data_path = 'BATADAL_dataset03.csv'
data_df = pd.read_csv(
    data_path,
    parse_dates=['DATETIME'],
    dayfirst=True,
)

In [16]:
# Keep the timestamps aside; they are not used as model features.
dates_train = data_df['DATETIME']

# Every column except the timestamp and the attack flag is treated as a feature.
sensor_cols = data_df.columns[~data_df.columns.isin(['DATETIME', 'ATT_FLAG'])].tolist()

# Fit a min-max scaler on the feature columns and rebuild a DataFrame that
# keeps the original row index and column names.
scaler = MinMaxScaler()
X = pd.DataFrame(
    scaler.fit_transform(data_df[sensor_cols]),
    index=data_df.index,
    columns=sensor_cols,
)

In [17]:
# Random 80/20 split of the scaled features into train and validation sets.
# X is passed once: the rows are the only thing being split, and supplying it a
# second time only produced duplicate frames that were thrown away (and
# overwrote IPython's `_` last-output variable). The fixed random_state keeps
# the split reproducible.
X_train, X_val = train_test_split(X, test_size=0.2, random_state=42)

In [18]:
# Report the (rows, columns) shape of each split, one per line.
print(
    f'Train dataset size: {X_train.values.shape}',
    f'Validation dataset size: {X_val.values.shape}',
    sep='\n',
)

Train dataset size: (7008, 43)
Validation dataset size: (1753, 43)


In [29]:

def _to_float_tensor(frame):
    """Return the frame's underlying array as a float32 torch tensor."""
    return torch.from_numpy(frame.values).float()


# Each dataset pairs the features with a second, separately built tensor of the
# same values, so every sample yields (features, same features as target).
train_dataset = TensorDataset(_to_float_tensor(X_train), _to_float_tensor(X_train))
val_dataset = TensorDataset(_to_float_tensor(X_val), _to_float_tensor(X_val))

# Shape of the input tensor held by each dataset.
print(
    f'Train dataset size: {train_dataset.tensors[0].shape}',
    f'Validation dataset size: {val_dataset.tensors[0].shape}',
    sep='\n',
)

Train dataset size: torch.Size([7008, 43])
Validation dataset size: torch.Size([1753, 43])


In [32]:
# torch.save(train_dataset.tensors[0], "BATADAL_train_dataset.pt")
# torch.save(val_dataset.tensors[0], "BATADAL_val_dataset.pt")

In [13]:
# Batched loaders: training batches are reshuffled, validation keeps dataset order.
BATCH_SIZE = 64
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)

# Shape of the input tensor of the dataset wrapped by each loader.
print(
    f'Train dataset size: {train_loader.dataset.tensors[0].shape}',
    f'Validation dataset size: {val_loader.dataset.tensors[0].shape}',
    sep='\n',
)


Train dataset size: torch.Size([7008, 43])
Validation dataset size: torch.Size([1753, 43])


## Test Data

In [30]:
# Load the labelled test CSV, scale its features with the scaler fitted on the
# training data, and store the (features, labels) tensors on disk.
# NOTE(review): the recorded output of this cell echoes the read_csv line, which
# looks like a pandas date-parsing warning — consider an explicit date format
# once the DATETIME layout is confirmed.
test_data_path = './dataset/BATADAL_dataset04.csv'
test_df = pd.read_csv(test_data_path, parse_dates = ['DATETIME'], dayfirst=True)
dates_test = test_df['DATETIME']
sensor_cols = [col for col in test_df.columns if col not in ['DATETIME','ATT_FLAG']]

# Reuse the scaler fitted on the training data (`scaler`, from the train
# section) and only call transform here. Fitting a new scaler on the test set
# would map test features with different min/max values than the ones used for
# training, and would leak test-set statistics into preprocessing.
# Scaled test values can therefore fall outside [0, 1].
X_test = pd.DataFrame(
    scaler.transform(test_df[sensor_cols]),
    index=test_df.index,
    columns=sensor_cols,
)

# Map the -999 placeholder in ATT_FLAG to 0; all other label values are kept.
# NOTE(review): presumably -999 marks unlabeled rows that are treated as
# non-attack — confirm against the dataset description.
y_test = test_df['ATT_FLAG'].replace(-999, 0)
print(X_test.shape)
print(y_test.shape)

# Build the tensors once and reuse them for both the dataset and the saved file.
test_dataset = TensorDataset(torch.from_numpy(X_test.values).float(),
                             torch.from_numpy(y_test.values).float())

# Persist the (features, labels) tuple of float32 tensors.
torch.save(test_dataset.tensors, "./dataset/BATADAL_test_dataset.pt")


(4177, 43)
(4177,)


  test_df = pd.read_csv(test_data_path, parse_dates = ['DATETIME'], dayfirst=True)


In [28]:
# Loader over the test set; order is preserved so batches line up with the rows.
TEST_BATCH_SIZE = 64
test_loader = DataLoader(test_dataset, batch_size=TEST_BATCH_SIZE, shuffle=False)

# Sanity check: show the feature and label shapes of the first batch only.
for batch_X, batch_y in test_loader:
    print(batch_X.shape, batch_y.shape, sep='\n')
    break


torch.Size([64, 43])
torch.Size([64])
