In [1]:
import os
import sys
import pandas as pd
import pickle
import numpy as np

import torch
from torch.utils.data import Dataset, Subset

# Put the parent of the current working directory on the import path so that
# the `EDP_Model_Testing` package imported below can be found.
# NOTE(review): this depends on the directory the kernel was started in — the
# printed cwd is the thing to check if the import fails.
sys.path.append(os.path.dirname(os.getcwd()))
print(os.getcwd())

# Project helpers used below: the dataset class and the tensor loader.
from EDP_Model_Testing.test_lstm import WindTurbineDataset, get_aggregate_data

/sfs/gpfs/tardis/home/ujx4ab/ondemand/dissecting_dist_inf/WF_Data/EDP/EDP_Model_Testing


In [2]:
def get_data_splits(dataset, split_timestamp, batch_size=32):
    """Partition ``dataset`` chronologically around ``split_timestamp``.

    Args:
        dataset: Sized object exposing a ``timestamps`` attribute. Timestamps
            are paired positionally with indices ``0 .. len(dataset) - 1``.
        split_timestamp: Cut-off value. A string is converted with
            ``pd.Timestamp``; any other value is compared as given.
        batch_size: Accepted for signature compatibility; not used here.

    Returns:
        ``(test_dataset, train_val_dataset)`` — note the order. Both are
        ``Subset`` views of ``dataset``: the first holds samples whose
        timestamp is ``>=`` the cut-off, the second those strictly before it.
    """
    cutoff = pd.Timestamp(split_timestamp) if isinstance(split_timestamp, str) else split_timestamp

    before_cutoff, from_cutoff = [], []
    for position, stamp in zip(range(len(dataset)), dataset.timestamps):
        if stamp < cutoff:
            before_cutoff.append(position)
        elif stamp >= cutoff:
            # Explicit second comparison: values that satisfy neither test
            # (e.g. NaT) end up in neither split.
            from_cutoff.append(position)

    return Subset(dataset, from_cutoff), Subset(dataset, before_cutoff)

In [3]:
# Prepared tensors are read from two sibling folders: the base one and a
# variant whose name carries the "_faults_included" suffix.
DATA_FOLDER = "/home/ujx4ab/ondemand/dissecting_dist_inf/WF_Data/EDP/EDP_Model_Testing/data_prep"
FAULTS_DATA_FOLDER = f"{DATA_FOLDER}_faults_included"

# Each call returns a (features, targets, timestamps) triple for one folder.
X_tensor, y_tensor, timestamps_array = get_aggregate_data(DATA_FOLDER)

(
    faults_X_tensor,
    faults_y_tensor,
    faults_timestamps_array,
) = get_aggregate_data(FAULTS_DATA_FOLDER)

Aggregating data ...
EDP_WT_01.csv
EDP_WT_01_X.pt
EDP_WT_01_timestamp.pkl
EDP_WT_01_y.pt
EDP_WT_06.csv
EDP_WT_06_X.pt
EDP_WT_06_timestamp.pkl
EDP_WT_06_y.pt
EDP_WT_07.csv
EDP_WT_07_X.pt
EDP_WT_07_timestamp.pkl
EDP_WT_07_y.pt
EDP_WT_11.csv
EDP_WT_11_X.pt
EDP_WT_11_timestamp.pkl
EDP_WT_11_y.pt
accumulated_timestamps.csv
X_tensor shape: torch.Size([420477, 145, 22])
y_tensor shape: torch.Size([420477, 1])
Number of timestamps: 420477
Aggregating data ...
EDP_WT_01.csv
EDP_WT_01_X.pt
EDP_WT_01_timestamp.pkl
EDP_WT_01_y.pt
EDP_WT_06.csv
EDP_WT_06_X.pt
EDP_WT_06_timestamp.pkl
EDP_WT_06_y.pt
EDP_WT_07.csv
EDP_WT_07_X.pt
EDP_WT_07_timestamp.pkl
EDP_WT_07_y.pt
EDP_WT_11.csv
EDP_WT_11_X.pt
EDP_WT_11_timestamp.pkl
EDP_WT_11_y.pt
X_tensor shape: torch.Size([420477, 145, 22])
y_tensor shape: torch.Size([420477, 1])
Number of timestamps: 420477


In [7]:
# Display the target tensor loaded from DATA_FOLDER.
y_tensor

tensor([[0.2763],
        [0.2761],
        [0.2364],
        ...,
        [0.0954],
        [0.1434],
        [0.1499]])

In [5]:
# Dataset over the tensors loaded from DATA_FOLDER.
full_dataset = WindTurbineDataset(
    X=X_tensor,
    y=y_tensor,
    timestamps=timestamps_array
)

# Dataset over the tensors loaded from FAULTS_DATA_FOLDER.
# This must be built from the faults_* arrays: passing the base tensors here
# (as the earlier copy-pasted version did) makes `faults_full_dataset` a
# duplicate of `full_dataset`, so the exported "faults" test set would never
# contain the faults-included data.
faults_full_dataset = WindTurbineDataset(
    X=faults_X_tensor,
    y=faults_y_tensor,
    timestamps=faults_timestamps_array
)

In [6]:
# Sanity check: are the base and faults-included loads actually different?
shapes_match = (
    X_tensor.shape == faults_X_tensor.shape
    and y_tensor.shape == faults_y_tensor.shape
    and timestamps_array.shape == faults_timestamps_array.shape
)

# np.array_equal always returns a plain bool and is False on a shape mismatch,
# whereas `(a == b).all()` can raise when the two arrays cannot be broadcast.
timestamps_match = np.array_equal(timestamps_array, faults_timestamps_array)

if not shapes_match:
    print("Datasets have different shapes.")
elif (timestamps_match
      and torch.equal(X_tensor, faults_X_tensor)
      and torch.equal(y_tensor, faults_y_tensor)):
    print("Datasets are identical.")
else:
    print("Datasets are different.")

if not timestamps_match:
    # Report only what differs (count plus a few examples) instead of dumping
    # every timestamp of both folders into the cell output.
    base_only = set(timestamps_array) - set(faults_timestamps_array)
    faults_only = set(faults_timestamps_array) - set(timestamps_array)
    print(f"Timestamps only in DATA_FOLDER: {len(base_only)} (e.g. {sorted(base_only)[:5]})")
    print(f"Timestamps only in FAULTS_DATA_FOLDER: {len(faults_only)} (e.g. {sorted(faults_only)[:5]})")

Datasets are different.


In [7]:
# One cut-off shared by both calls: train/val takes base samples strictly
# before it, the faults test set takes faults-included samples from it onward.
SPLIT_TIMESTAMP = "2017-06-01"

# get_data_splits returns (test_dataset, train_val_dataset). The unused half is
# bound to a descriptive throwaway name rather than `_`, which IPython also
# uses for the last cell output.
_base_test_dataset, train_val_dataset = get_data_splits(
    full_dataset, split_timestamp=SPLIT_TIMESTAMP
)
faults_test_dataset, _faults_train_val_dataset = get_data_splits(
    faults_full_dataset, split_timestamp=SPLIT_TIMESTAMP
)

In [46]:
def save_data(dataset, directory, set, n_bp_bins=4, n_power_bins=2):
    """Write ``dataset`` to ``.npy`` files and bin two per-sample averages.

    Every sample yielded by ``dataset`` must be a 4-tuple
    ``(data, y, mask, timestamp)`` where ``data`` and ``mask`` expose
    ``.numpy()``. The stacked arrays are saved under ``directory`` as
    ``{set}X.npy``, ``{set}y.npy``, ``{set}mask.npy``, ``{set}timestamps.npy``
    and ``{set}binned_feature_avgs.npy``.

    Two averages are computed per sample: the mean over the sequence axis of
    the last feature column of ``X`` (called "Bp bin" in this code) and the
    mean of ``y`` over axis 1. Each is then assigned to an equal-width bin
    between -0.01 and 1.01. Values outside that range receive the index ``-1``
    (below) or the bin count (above); they are not clipped.

    Args:
        dataset: Iterable of ``(data, y, mask, timestamp)`` samples.
        directory: Output folder; created if missing.
        set: File-name prefix. The name shadows the builtin but is kept
            because callers pass it by keyword.
        n_bp_bins: Number of bins for the feature average (default 4).
        n_power_bins: Number of bins for the target average (default 2).

    Returns:
        ``(averages_for_indexing, binned_feature_avgs)``, both of shape
        ``(num_samples, 2)`` with the feature average/bin in column 0 and the
        target average/bin in column 1.

    Raises:
        ValueError: If ``dataset`` yields no samples.
    """
    prefix = set  # local alias so the builtin-shadowing name is not used below

    features, targets, masks, stamps = [], [], [], []
    for data, y, mask, timestamp in dataset:
        features.append(data.numpy())
        targets.append(np.asarray(y))  # explicit conversion, like data and mask
        masks.append(mask.numpy())
        stamps.append(timestamp)

    if not features:
        raise ValueError("save_data received an empty dataset; nothing to save")

    os.makedirs(directory, exist_ok=True)

    X_array = np.stack(features)    # (num_samples, sequence_length, num_features)
    y_array = np.stack(targets)     # (num_samples, ...)
    mask_array = np.stack(masks)    # (num_samples, ...)
    stamps_array = np.stack(stamps) # (num_samples, ...)

    average_Bp_bin_values = X_array[:, :, -1].mean(axis=1)
    average_power_values = y_array.mean(axis=1)
    averages_for_indexing = np.stack((average_Bp_bin_values, average_power_values), axis=1)

    # n bins need n + 1 edges; the small margin keeps 0.0 and 1.0 inside the
    # outermost bins.
    bp_edges = np.linspace(-.01, 1.01, n_bp_bins + 1)
    power_edges = np.linspace(-.01, 1.01, n_power_bins + 1)
    binned_Bp = np.digitize(averages_for_indexing[:, 0], bins=bp_edges, right=False) - 1
    binned_power = np.digitize(averages_for_indexing[:, 1], bins=power_edges, right=False) - 1
    binned_feature_avgs = np.stack((binned_Bp, binned_power), axis=1)

    # Save as .npy files
    np.save(os.path.join(directory, f'{prefix}X.npy'), X_array)
    np.save(os.path.join(directory, f'{prefix}y.npy'), y_array)
    np.save(os.path.join(directory, f'{prefix}mask.npy'), mask_array)
    np.save(os.path.join(directory, f'{prefix}timestamps.npy'), stamps_array)
    np.save(os.path.join(directory, f'{prefix}binned_feature_avgs.npy'), binned_feature_avgs)

    print(f"Data saved to {directory}")
    print(f"X shape: {X_array.shape}")
    print(f"y shape: {y_array.shape}")
    print(f"mask shape: {mask_array.shape}")
    print(f"timestamps shape: {stamps_array.shape}")
    print(f"averages for indexing: {binned_feature_avgs.shape}")

    return averages_for_indexing, binned_feature_avgs


In [47]:
# Both exports go to the same folder; the file-name prefix tells them apart.
LSTM_WTS_DIR = "/home/ujx4ab/ondemand/dissecting_dist_inf/datasets/LSTM_WTs/"

averages_for_indexing1, binned_feature_avgs1 = save_data(
    train_val_dataset, directory=LSTM_WTS_DIR, set="train_val_dataset_"
)
averages_for_indexing2, binned_feature_avgs2 = save_data(
    faults_test_dataset, directory=LSTM_WTS_DIR, set="faults_test_dataset_"
)

Data saved to /home/ujx4ab/ondemand/dissecting_dist_inf/datasets/LSTM_WTs/
X shape: (264921, 145, 22)
y shape: (264921, 1)
mask shape: (264921, 145)
timestamps shape: (264921,)
averages for indexing: (264921, 2)
Data saved to /home/ujx4ab/ondemand/dissecting_dist_inf/datasets/LSTM_WTs/
X shape: (105145, 145, 22)
y shape: (105145, 1)
mask shape: (105145, 145)
timestamps shape: (105145,)
averages for indexing: (105145, 2)


In [48]:
# Train/val split: raw per-sample averages next to their binned counterparts.
this_df1 = pd.DataFrame(data=averages_for_indexing1, columns=['Bp_bin', 'Power'])
this_df2 = pd.DataFrame(data=binned_feature_avgs1, columns=['Bp_bin', 'Power'])

# Frequency of each 'Power' value, first for the raw averages, then the bins.
for power_counts in (this_df1['Power'].value_counts(), this_df2['Power'].value_counts()):
    print(power_counts)



Power
0.000000    74093
0.996615       62
0.996719       57
0.996725       55
0.996743       52
            ...  
0.114926        1
0.086484        1
0.076630        1
0.072542        1
0.010734        1
Name: count, Length: 166214, dtype: int64
Power
0    201327
1     63594
Name: count, dtype: int64


In [31]:
# Display the per-sample (feature average, target average) pairs returned by
# save_data for the faults test export.
averages_for_indexing2

array([[0.9655172 , 0.18498261],
       [0.9724138 , 0.15083884],
       [0.9724138 , 0.21377672],
       ...,
       [0.6275862 , 0.0954418 ],
       [0.63448274, 0.14342895],
       [0.6413793 , 0.14989664]], dtype=float32)