In [26]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import IsolationForest
from sklearn.utils import shuffle

def normalize_data(data, max_values):
    """Scale the leading columns of a 2-D array by per-column divisors.

    Args:
        data: 2-D array-like object exposing ``astype``; column ``i`` is
            divided by ``max_values[i]``.
        max_values: Sequence of divisors, one per leading column. Columns
            beyond ``len(max_values)`` are left unscaled.

    Returns:
        A new ``float32`` array; the caller's array is not modified because
        ``astype`` returns a copy.
    """
    scaled = data.astype(np.float32)
    for column, divisor in enumerate(max_values):
        scaled[:, column] /= divisor
    return scaled

def detect_anomalies(data, random_state=42):
    """Flag which samples an Isolation Forest considers normal.

    Args:
        data: 2-D array-like of shape ``(n_samples, n_features)``.
        random_state: Seed forwarded to ``IsolationForest`` so the mask is
            reproducible between runs. Pass ``None`` to get a different
            forest on every call.

    Returns:
        Boolean array of length ``n_samples``; ``True`` where
        ``fit_predict`` returned ``1`` (inlier), ``False`` otherwise.
        An empty input yields an empty mask instead of an estimator error.
    """
    if len(data) == 0:
        # IsolationForest cannot be fitted on zero samples; nothing to flag.
        return np.zeros(0, dtype=bool)
    iso_forest = IsolationForest(
        n_estimators=100,
        contamination='auto',
        random_state=random_state,
    )
    return iso_forest.fit_predict(data) == 1

def _extract_windows(table, max_values, window_size, step, sliding):
    """Cut one recording into normalized, flattened windows of ``window_size`` rows."""
    n_rows = table.shape[0]
    if n_rows < window_size:
        # Too short to yield even a single full window.
        return []
    # Sliding mode emits a window every ``step`` rows; otherwise only the
    # leading window of the recording is used.
    starts = range(0, n_rows - window_size + 1, step) if sliding else (0,)
    # Column 0 is dropped before scaling (presumably an index/time column -
    # TODO confirm against the CSV layout).
    return [
        normalize_data(table[start:start + window_size, 1:], max_values).flatten()
        for start in starts
    ]


def load_and_process_data(paths, max_values, anomaly_detection=False, data_augmentation=False, use_one_hot_encoding=False, shuffle_data=False, window_size=180, step=1, random_state=None):
    """Load per-category CSV recordings and build flattened feature/label arrays.

    Args:
        paths: List of lists of CSV paths. The position of each inner list is
            used as the integer class label for all files it contains.
        max_values: Per-column divisors forwarded to ``normalize_data``.
        anomaly_detection: If True, samples of each category that
            ``detect_anomalies`` does not mark as normal are dropped.
        data_augmentation: If True, every file contributes all sliding
            windows taken at stride ``step``; otherwise only its first
            ``window_size`` rows are used.
        use_one_hot_encoding: If True, labels are one-hot encoded.
        shuffle_data: If True, X and Y are shuffled together and the first
            five labels are printed.
        window_size: Number of rows per sample. Files with fewer rows are
            skipped.
        step: Stride between windows; only used when ``data_augmentation``
            is True.
        random_state: Optional seed forwarded to ``sklearn.utils.shuffle``
            for a reproducible ordering.

    Returns:
        Tuple ``(X, Y)``. ``X`` holds one flattened window per row. ``Y`` has
        shape ``(n_samples, 1)`` with integer labels, or is the one-hot
        matrix when ``use_one_hot_encoding`` is True.
    """
    X, Y = [], []
    for category, path_list in enumerate(paths):
        category_data = []
        for path in path_list:
            table = pd.read_csv(path, header=0).to_numpy()
            category_data.extend(
                _extract_windows(table, max_values, window_size, step, data_augmentation)
            )

        category_data = np.array(category_data)
        # Skip filtering for an empty category: there is nothing to fit on.
        if anomaly_detection and len(category_data) > 0:
            normal_samples = detect_anomalies(category_data)
            category_data = category_data[normal_samples]

        # Going through Python lists keeps the historical float64 dtype of X.
        X.extend(category_data.tolist())
        Y.extend([category] * len(category_data))

    X = np.array(X)
    Y = np.array(Y).reshape(-1, 1)

    if use_one_hot_encoding:
        try:
            # scikit-learn >= 1.2 spells the dense-output switch ``sparse_output``.
            encoder = OneHotEncoder(sparse_output=False)
        except TypeError:
            # Older releases only know the legacy ``sparse`` keyword.
            encoder = OneHotEncoder(sparse=False)
        Y = encoder.fit_transform(Y)

    if shuffle_data:
        X, Y = shuffle(X, Y, random_state=random_state)
        print("数据已洗牌。前几个样本的标签：", Y[:5])
    return X, Y


# Per-column divisors handed to load_and_process_data for normalization.
max_values_list = [
    1200, 1200, 1200, 1200,
    0.5, 700, 12, 15,
    10, 200, 20, 10,
    10, 10, 30, 40,
]

# Build the file paths.
# Category name -> number of CSV files; the insertion order fixes the
# integer label each category receives in load_and_process_data.
_train_file_counts = {
    'AddWeight': 214,
    'Normal': 146,
    'PressureGain_constant': 213,
    'PropellerDamage_bad': 199,
    'PropellerDamage_slight': 208,
}
_test_file_counts = {
    'AddWeight': 54,
    'Normal': 36,
    'PressureGain_constant': 53,
    'PropellerDamage_bad': 50,
    'PropellerDamage_slight': 52,
}

train_paths = [
    [f'Dataset/train/{_name}/{_name}_{_idx}.csv' for _idx in range(_count)]
    for _name, _count in _train_file_counts.items()
]

test_paths = [
    [f'Dataset/test/{_name}/{_name}_{_idx}.csv' for _idx in range(_count)]
    for _name, _count in _test_file_counts.items()
]

# step=4 is passed for the training split, but step is only consulted when
# data_augmentation=True, so it has no effect on this call.
Xtrain, Ytrain = load_and_process_data(
    train_paths,
    max_values_list,
    anomaly_detection=False,
    data_augmentation=False,
    use_one_hot_encoding=False,
    shuffle_data=False,
    window_size=180,
    step=4,
)
Xtest, Ytest = load_and_process_data(
    test_paths,
    max_values_list,
    anomaly_detection=False,
    data_augmentation=False,
    use_one_hot_encoding=False,
    shuffle_data=False,
)


In [27]:
# Save the training arrays first, then the test arrays, as .npy files.
for _filename, _array in (
    ('Xtrain_test.npy', Xtrain),
    ('Ytrain_test.npy', Ytrain),
    ('Xtest_test.npy', Xtest),
    ('Ytest_test.npy', Ytest),
):
    np.save(_filename, _array)