In [1]:
# 模型
from keras.layers import Input, Dense, LSTM, merge, Conv1D, Dropout, Bidirectional, Multiply
from keras.layers.core import *
from keras.layers.recurrent import LSTM
from keras.models import *

import pandas as pd
import numpy as np
import openpyxl
import matplotlib.pyplot as plt
import csv

# Build sliding-window samples for the sequence model
def create_dataset(dataset, time_steps):
    """Turn a 2-D table into centred sliding windows and their targets.

    Column 0 of ``dataset`` is the target; the remaining columns are the
    features. For every row ``i`` that has ``time_steps // 2`` neighbours on
    both sides, the sample is the feature rows ``i - step .. i + step`` and
    the label is ``[dataset[i, 0]]``.

    Note: the window length is ``2 * (time_steps // 2) + 1``, which equals
    ``time_steps`` only when ``time_steps`` is odd.

    Args:
        dataset: array-like of shape (n_rows, 1 + n_features).
        time_steps: requested window length.

    Returns:
        ``(TrainX, TrainY)`` with shapes ``(n_samples, window, n_features)``
        and ``(n_samples, 1)``. When a 2-D ``dataset`` has too few rows for a
        single window, both arrays are empty but keep those ranks.
    """
    dataset = np.asarray(dataset)
    step = time_steps // 2
    window = 2 * step + 1
    centers = range(step, len(dataset) - step)

    # Without this guard np.array([]) would yield rank-1 (0,) arrays, which
    # cannot be concatenated with, or fed like, regular window batches.
    if len(centers) == 0 and dataset.ndim == 2:
        n_features = max(dataset.shape[1] - 1, 0)
        empty_x = np.empty((0, window, n_features), dtype=dataset.dtype)
        empty_y = np.empty((0, 1), dtype=dataset.dtype)
        return empty_x, empty_y

    TrainX = np.array([dataset[(c - step):(c + step + 1), 1:] for c in centers])
    TrainY = np.array([[dataset[c, 0]] for c in centers])
    return TrainX, TrainY


# Column-wise min-max normalisation; returns the data and each column's (min, max)
def NormalizeMult(data):
    """Scale every column of a 2-D array to the range [0, 1].

    Columns whose minimum equals their maximum are left untouched (there is
    nothing to scale by), but their bounds are still recorded.

    Args:
        data: 2-D numpy array of shape (n_rows, n_columns). Floating-point
            arrays are modified in place. Any other dtype is first converted
            to a float64 copy, because writing fractions back into an integer
            array would truncate them.

    Returns:
        ``(data, normalize)`` where ``normalize`` is a float64 array of shape
        (n_columns, 2) holding ``[min, max]`` per column, as expected by
        ``FNormalizeMult``.
    """
    if not np.issubdtype(data.dtype, np.floating):
        data = data.astype('float64')

    # The 0th and 100th percentiles are the column minimum and maximum.
    low, high = np.percentile(data, [0, 100], axis=0)
    normalize = np.column_stack((low, high)).astype('float64')

    delta = high - low
    scalable = delta != 0  # constant columns would divide by zero
    data[:, scalable] = (data[:, scalable] - low[scalable]) / delta[scalable]

    return data, normalize


# Inverse of the column-wise min-max normalisation
def FNormalizeMult(data, normalize):
    """Map min-max scaled columns back to their original range.

    Column ``i`` of ``data`` is restored with row ``i`` of ``normalize``
    (``[min, max]``); ``normalize`` may have more rows than ``data`` has
    columns, in which case the extra rows are ignored. Columns whose recorded
    min equals max are returned unchanged.

    Args:
        data: array-like of shape (n_rows, n_columns). It is copied, never
            modified. Non-floating input is converted to float64 so the
            restored values are not truncated.
        normalize: array of shape (>= n_columns, 2) as produced by
            ``NormalizeMult``.

    Returns:
        A new array with the same shape as ``data``.

    Raises:
        IndexError: if ``normalize`` has fewer rows than ``data`` has columns.
    """
    data = np.array(data)
    if not np.issubdtype(data.dtype, np.floating):
        data = data.astype('float64')

    normalize = np.asarray(normalize)
    n_columns = data.shape[1]
    if normalize.shape[0] < n_columns:
        raise IndexError(
            "normalize has %d rows but data has %d columns"
            % (normalize.shape[0], n_columns)
        )

    low = normalize[:n_columns, 0]
    high = normalize[:n_columns, 1]
    delta = high - low
    scalable = delta != 0  # constant columns were never scaled
    data[:, scalable] = data[:, scalable] * delta[scalable] + low[scalable]

    return data


# Build the model
def lstm_model():
    """Assemble the bidirectional-LSTM network.

    Reads the module-level settings ``TIME_STEPS``, ``INPUT_DIMS``,
    ``lstm_units`` and ``drop``. The input of shape (TIME_STEPS, INPUT_DIMS)
    goes through a bidirectional LSTM that returns the full sequence, then
    dropout, then a flatten, and finally a single sigmoid unit.

    Returns:
        An uncompiled ``Model`` with one input and one output.
    """
    sequence_in = Input(shape=(TIME_STEPS, INPUT_DIMS))

    # A unidirectional variant would apply LSTM(lstm_units, return_sequences=True) directly.
    recurrent_layer = Bidirectional(LSTM(lstm_units, return_sequences=True))
    encoded = recurrent_layer(sequence_in)
    regularised = Dropout(drop)(encoded)
    flat_features = Flatten()(regularised)

    prediction = Dense(1, activation='sigmoid')(flat_features)
    return Model(inputs=[sequence_in], outputs=prediction)

# Hyper-parameters read by lstm_model() and passed to create_dataset()
INPUT_DIMS = 6  # last axis of the model's Input shape (features per time step)
TIME_STEPS = 5  # first axis of the model's Input shape (rows per window)
lstm_units = 64  # units argument given to the LSTM layer
drop = 0.2  # rate given to the Dropout layer

ModuleNotFoundError: No module named 'keras'

In [2]:
# 读训练集数据
import os
import pandas as pd
import numpy as np

# Folder that holds one .xlsx workbook per well
folder_path = 'E:\\code\\BiLSTM\\BiLSTM\\trainA1_2'

# Header labels to extract, in output order. The first label becomes column 0
# of train_data, which create_dataset() uses as the target.
TRAIN_COLUMNS = ["DMZKMD", "DZL", "ZRGM", "DMZKGG", "DSSC", "ZRDW", "GG"]


def _load_well(file_path, column_names):
    """Read the second sheet of one workbook into a (n_columns, n_samples) float array.

    The sheet is read without a header: row index 2 carries the column labels
    and the data starts at row index 3. Samples with a missing or non-numeric
    value in any requested column are dropped, then leading and trailing
    samples that contain a 0 in any column are trimmed.

    Returns:
        The cleaned float64 array, or None when the workbook has fewer than
        two sheets, no data rows, or no usable samples after cleaning.

    Raises:
        KeyError: if a requested label is missing from the header row.
    """
    # The context manager closes the workbook handle; the file is opened once.
    with pd.ExcelFile(file_path) as xls:
        if len(xls.sheet_names) < 2:
            return None
        df = pd.read_excel(xls, sheet_name=xls.sheet_names[1], header=None)

    if len(df) <= 3:
        return None

    header = df.iloc[2]
    rows = []
    for name in column_names:
        matches = header[header == name].index
        if len(matches) == 0:
            raise KeyError("column %r not found in header row of %s" % (name, file_path))
        # Anything that is not a number becomes NaN and is filtered below.
        rows.append(pd.to_numeric(df.iloc[3:, matches[0]].values, errors='coerce'))

    # Force float so all-integer columns are not truncated when normalised later.
    well = np.array(rows, dtype='float64')
    well = well[:, ~np.isnan(well).any(axis=0)]

    # Trim leading/trailing samples where any column is exactly 0.
    usable = np.flatnonzero(~(well == 0).any(axis=0))
    if usable.size == 0:
        return None
    return well[:, usable[0]:usable[-1] + 1].copy()


all_data = []

# Sorted for a reproducible row order; '~$' files are Excel lock files.
for filename in sorted(os.listdir(folder_path)):
    if not filename.endswith('.xlsx') or filename.startswith('~$'):
        continue
    well = _load_well(os.path.join(folder_path, filename), TRAIN_COLUMNS)
    if well is not None:
        all_data.append(well)

# Stack the wells row-wise: one row per sample, one column per label.
if all_data:
    train_data = np.concatenate([well.T for well in all_data], axis=0)
else:
    train_data = np.array([])

# Print the final result
print("============================================")
print(train_data)
print(train_data.shape)

FileNotFoundError: [WinError 3] 系统找不到指定的路径。: 'E:\\code\\BiLSTM\\BiLSTM\\trainA1_2'

In [3]:
# 模型训练
from keras.models import Model

import os

print(drop)

# Min-max scale every column. The per-column (min, max) table is kept in
# train_normalize instead of being discarded, so the training scaling stays
# available for inspection or reuse.
train_data, train_normalize = NormalizeMult(train_data)

# Window each well on its own. train_data is the row-wise concatenation of the
# wells in all_data, so windowing it in one pass would create samples that mix
# the end of one well with the start of the next.
well_lengths = [well.shape[1] for well in all_data if well.ndim == 2]
if sum(well_lengths) != len(train_data):
    raise RuntimeError("all_data and train_data are out of sync; re-run the data-loading cell")

min_rows = 2 * (TIME_STEPS // 2) + 1  # rows needed for at least one window
window_sets = [
    create_dataset(well_rows, TIME_STEPS)
    for well_rows in np.split(train_data, np.cumsum(well_lengths)[:-1])
    if len(well_rows) >= min_rows
]
if not window_sets:
    raise ValueError("no well has enough rows to build a single training window")
train_X = np.concatenate([x for x, _ in window_sets], axis=0)
train_Y = np.concatenate([y for _, y in window_sets], axis=0)

# Create the output folder before training so a missing directory cannot
# discard the trained model at save time.
os.makedirs("../model", exist_ok=True)

m = lstm_model()
m.summary()
m.compile(optimizer='adam', loss='mse')
m.fit([train_X], train_Y, epochs=20, batch_size=32, validation_split=0.2)
m.save("../model/model.h5")

ModuleNotFoundError: No module named 'keras'

In [4]:
# 读测试集数据
import os
import pandas as pd
import numpy as np

# Workbook of the well used for testing
file_path = 'E:\\code\\BiLSTM\\BiLSTM\\testA1\\ZK003.xlsx'

# Header labels to extract, in row order of new_well_data. Row 0 (DMZKSD) is
# kept only for the export; rows 1.. become the columns of test_data.
TEST_COLUMNS = ["DMZKSD", "DMZKMD", "DZL", "ZRGM", "DMZKGG", "DSSC", "ZRDW", "GG"]

# Read the second sheet without a header; the context manager closes the file.
with pd.ExcelFile(file_path) as xls:
    sheet_names = xls.sheet_names
    if len(sheet_names) < 2:
        # Fail here instead of continuing with an undefined or stale new_well_data.
        raise ValueError("%s has no second sheet to read" % file_path)
    df = pd.read_excel(xls, sheet_name=sheet_names[1], header=None)

if len(df) <= 3:
    raise ValueError("%s has no data rows below the header row" % file_path)

# Row index 2 carries the column labels; data starts at row index 3.
header = df.iloc[2]
column_values = []
for name in TEST_COLUMNS:
    matches = header[header == name].index
    if len(matches) == 0:
        raise KeyError("column %r not found in header row of %s" % (name, file_path))
    # Anything that is not a number becomes NaN and is filtered below.
    column_values.append(pd.to_numeric(df.iloc[3:, matches[0]].values, errors='coerce'))

# One row per label, one column per sample; float so scaling cannot truncate.
well_data = np.array(column_values, dtype='float64')

# Drop samples with a missing value in any row.
mask = np.isnan(well_data).any(axis=0)
well_data = well_data[:, ~mask]

# Trim leading/trailing samples where any row is exactly 0.
usable = np.flatnonzero(~(well_data == 0).any(axis=0))
if usable.size == 0:
    raise ValueError("%s contains no usable samples after cleaning" % file_path)
start_indices, end_indices = usable[0], usable[-1]
new_well_data = well_data[:, start_indices:end_indices + 1].copy()

test_data = new_well_data[1:].T
print(test_data)
print(test_data.shape)

# Normalise
# NOTE(review): this scales the test well with its own min/max, not with the
# bounds computed on the training data, and the later de-normalisation uses
# the test target's own range. Confirm whether the training bounds should be
# reused here instead.
test_data, normalize = NormalizeMult(np.array(test_data))
np.save("../normalize.npy", normalize)
# Build the test windows
test_X, test_Y = create_dataset(test_data, TIME_STEPS)

FileNotFoundError: [Errno 2] No such file or directory: 'E:\\code\\BiLSTM\\BiLSTM\\testA1\\ZK003.xlsx'

In [5]:
# 预测
import tensorflow as tf

# Optional check of the in-memory model on the test windows:
# scores_test = m.evaluate([test_X], test_Y)
# print(scores_test)

# Reload the saved network and predict one value per test window
model = tf.keras.models.load_model('../model/model.h5')
predictions = model.predict([test_X])
# Undo the min-max scaling
results = FNormalizeMult(predictions, normalize)
# The first and last TIME_STEPS // 2 samples have no window of their own, so
# repeat the first and last predictions to cover them.
first_data = results[0]
end_data = results[results.shape[0] - 1]
pad_rows = TIME_STEPS // 2
results = np.pad(results, ((pad_rows, pad_rows), (0, 0)), mode='edge')
print(results)

ModuleNotFoundError: No module named 'tensorflow'

In [6]:
import pandas as pd

import os

# Flatten the (n_samples, 1) predictions to one value per sample
results = results.ravel()
print(len(results))
print(len(new_well_data[1]))

# Save the predictions next to the measured values and the depth column
excel_file = '../res/GGBiLSTM_predict_data.xlsx'
df = pd.DataFrame({'Predict': results, 'True': new_well_data[1], 'Depth': new_well_data[0]})
# to_excel does not create missing folders, so make sure the target exists.
os.makedirs(os.path.dirname(excel_file), exist_ok=True)
df.to_excel(excel_file, index=False)

NameError: name 'results' is not defined