In [2]:
import csv
from typing import Optional

import arrow
import numpy as np
import pandas as pd
import tensorflow as tf
from keras import Sequential
from keras.src.layers import LSTM, Dropout, Dense
from pandas import DatetimeIndex
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import os
import pathlib
import xml.etree.ElementTree as ET
import xarray as xr
import codecs
import datetime

In [3]:

def _align_to_issue_times(columns_by_time: dict, issue_times_index: DatetimeIndex) -> pd.DataFrame:
    """Build a frame from ``{timestamp string: column}`` and reindex its columns onto ``issue_times_index``."""
    frame = pd.DataFrame.from_dict(columns_by_time)
    # Convert the timestamp strings used as column labels into datetimes.
    frame.columns = pd.to_datetime(frame.columns)
    # reindex returns a new frame (the original is not modified); issue times that are
    # absent from the input show up as all-NaN columns.
    return frame.reindex(columns=issue_times_index)


def get_test_array(test_read_path: str, training_read_path: str, issue_times_index: DatetimeIndex,
                   verbose: bool = True) -> Optional[tuple[pd.DataFrame, pd.DataFrame]]:
    """
        Read the forecast wind CSV and split it into a u frame and a v frame whose
        columns are aligned to a standard issue-time index.

        The CSV columns are expected to be named ``<timestamp>_u`` / ``<timestamp>_v``.
        Any column that does not follow that pattern (for example the ``Unnamed: 0``
        index column written by ``to_csv``) is skipped and reported, not raised.
    :param test_read_path: path of the forecast (test) CSV that is read.
    :param training_read_path: path of the observation CSV. It is only checked for
        existence here; this function never reads it.
    :param issue_times_index: standard time axis; both result frames have exactly
        these column labels, in this order.
    :param verbose: when True (default) print one status line per CSV column.
    :return: ``(df_u, df_v)``, or ``None`` when either path does not exist.
    """
    if not (pathlib.Path(test_read_path).exists() and pathlib.Path(training_read_path).exists()):
        return None

    df_test: pd.DataFrame = pd.read_csv(test_read_path)
    u_data_dict = {}
    v_data_dict = {}
    # Route each column to its component dict by the suffix after the first underscore.
    targets = {'u': u_data_dict, 'v': v_data_dict}
    for col_name in df_test.columns:
        # parts[0] is the timestamp string, parts[1] the component name (u or v).
        parts = str(col_name).split('_')
        target = targets.get(parts[1]) if len(parts) >= 2 else None
        if target is None:
            # Not a "<timestamp>_<u|v>" column: skip it explicitly instead of relying on a
            # swallowed exception, and do not report it as successfully processed.
            if verbose:
                print(f'当前列:{col_name}处理错误!')
            continue
        target[parts[0]] = df_test[col_name]
        if verbose:
            print(f'当前列:{col_name}处理成功~')

    df_u = _align_to_issue_times(u_data_dict, issue_times_index)
    df_v = _align_to_issue_times(v_data_dict, issue_times_index)
    return df_u, df_v

In [4]:
# Standard forecast issue-time axis: one entry every 12 hours between the two bounds.
start_time, end_time = '2024-01-01 00:00:00', '2024-12-31 23:00:00'
issue_times_index = pd.date_range(start_time, end_time, freq='12h')
    

In [5]:
# step3: load the forecast (test) data used for training.
# Legacy entry point, kept for reference:
# traning_ws(r'G:\05DATA\01TRAINING_DATA\WIND\merge.csv', r'G:\05DATA\01TRAINING_DATA\FUB\MF01001\2024_local.csv')
wind_frames = get_test_array(r'G:\05DATA\01TRAINING_DATA\WIND\merge.csv',
                             r'G:\05DATA\01TRAINING_DATA\FUB\MF01001\2024_local.csv', issue_times_index)
if wind_frames is None:
    # get_test_array returns None when either input file is missing; unpacking None would
    # only produce an opaque "cannot unpack" TypeError, so fail with a clear message instead.
    raise FileNotFoundError('get_test_array returned None: the forecast CSV or the observation CSV does not exist')
# Per the original note, each frame has shape (25, 732).
df_u, df_v = wind_frames


当前列:Unnamed: 0.10处理错误!
当前列:20240101000000_u处理成功~
当前列:20240101000000_v处理成功~
当前列:20240101120000_u处理成功~
当前列:20240101120000_v处理成功~
当前列:20240102000000_u处理成功~
当前列:20240102000000_v处理成功~
当前列:20240102120000_u处理成功~
当前列:20240102120000_v处理成功~
当前列:20240103000000_u处理成功~
当前列:20240103000000_v处理成功~
当前列:20240103120000_u处理成功~
当前列:20240103120000_v处理成功~
当前列:20240104000000_u处理成功~
当前列:20240104000000_v处理成功~
当前列:20240104120000_u处理成功~
当前列:20240104120000_v处理成功~
当前列:20240105000000_u处理成功~
当前列:20240105000000_v处理成功~
当前列:20240105120000_u处理成功~
当前列:20240105120000_v处理成功~
当前列:20240106000000_u处理成功~
当前列:20240106000000_v处理成功~
当前列:20240106120000_u处理成功~
当前列:20240106120000_v处理成功~
当前列:20240107000000_u处理成功~
当前列:20240107000000_v处理成功~
当前列:20240107120000_u处理成功~
当前列:20240107120000_v处理成功~
当前列:20240108000000_u处理成功~
当前列:20240108000000_v处理成功~
当前列:20240108120000_u处理成功~
当前列:20240108120000_v处理成功~
当前列:20240109000000_u处理成功~
当前列:20240109000000_v处理成功~
当前列:20240109120000_u处理成功~
当前列:20240109120000_v处理成功~
当前列:20240110000000_u处理成功~
当前列:20240110000

In [6]:
# Preview the first five rows of the u-component frame.
df_u.head()

Unnamed: 0,2024-01-01 00:00:00,2024-01-01 12:00:00,2024-01-02 00:00:00,2024-01-02 12:00:00,2024-01-03 00:00:00,2024-01-03 12:00:00,2024-01-04 00:00:00,2024-01-04 12:00:00,2024-01-05 00:00:00,2024-01-05 12:00:00,...,2024-12-27 00:00:00,2024-12-27 12:00:00,2024-12-28 00:00:00,2024-12-28 12:00:00,2024-12-29 00:00:00,2024-12-29 12:00:00,2024-12-30 00:00:00,2024-12-30 12:00:00,2024-12-31 00:00:00,2024-12-31 12:00:00
0,2.766523,9.075234,5.00274,2.67506,-2.128356,5.750553,5.524846,6.92952,-4.396342,4.942668,...,,,,,,,,,,
1,6.289143,10.067031,3.94926,2.952109,-1.010674,7.330463,6.257961,6.028646,-3.273404,6.492066,...,,,,,,,,,,
2,8.247787,9.048562,3.358205,2.34316,2.870877,6.602916,8.165231,-2.08642,-0.03399,7.097127,...,,,,,,,,,,
3,8.072305,6.563037,4.347986,0.012131,3.517645,7.347518,8.678781,-3.71093,3.385559,0.824375,...,,,,,,,,,,
4,8.055267,3.616818,2.872303,0.777438,5.865928,6.892735,8.249434,-2.451965,6.310976,-0.92349,...,,,,,,,,,,


获取第一行的行向量数据（`loc[0]` 按行标签取值，返回行标签为 0 的那一行，其索引为各时间戳）

In [7]:
# .loc[0] selects by row label, so this returns the row labelled 0
# (one value per issue time), not a column.
df_u.head().loc[0]

2024-01-01 00:00:00    2.766523
2024-01-01 12:00:00    9.075234
2024-01-02 00:00:00    5.002740
2024-01-02 12:00:00    2.675060
2024-01-03 00:00:00   -2.128356
                         ...   
2024-12-29 12:00:00         NaN
2024-12-30 00:00:00         NaN
2024-12-30 12:00:00         NaN
2024-12-31 00:00:00         NaN
2024-12-31 12:00:00         NaN
Freq: 12h, Name: 0, Length: 732, dtype: float64

#### 取出第一列的列向量方式一：

In [8]:
# .iloc[:, 0] selects the first column by position; because of .head()
# only its first five rows are returned.
df_u.head().iloc[:,0]

0    2.766523
1    6.289143
2    8.247787
3    8.072305
4    8.055267
Name: 2024-01-01 00:00:00, dtype: float64

#### 取出第一列的列向量方式二（按列标签即时间戳取值）：

In [9]:
# 取出列title的第一个对应的时间戳
ts_str_temp=df_u.columns[0]
ts_temp=pd.Timestamp(ts_str_temp)
ts_temp

Timestamp('2024-01-01 00:00:00')

In [10]:
# Select the whole column whose label equals ts_temp (all rows, unlike the .head() examples).
df_u[ts_temp]

0     2.766523
1     6.289143
2     8.247787
3     8.072305
4     8.055267
5     8.753942
6     8.148617
7     6.321377
8     2.050420
9    -0.808014
10   -2.366455
11   -1.871602
12    0.716504
13    2.416453
14    3.880541
15    2.583615
16    1.144564
17    2.046348
18    4.411166
19    7.166533
20    7.304152
21    6.830103
22    7.361506
23    8.052248
24    8.426732
Name: 2024-01-01 00:00:00, dtype: float64

将`u`与`v`按列方向（`axis=1`）横向拼接，拼接后的列索引仍为时间戳。  
注意：`axis=1` 时 `join='inner'` 作用于行索引，只保留`u`与`v`共有的行标签；列不会按时间戳合并，因此每个时间戳在结果中会出现两次（先`u`后`v`）。

In [11]:
# Place the u and v frames side by side (axis=1). With axis=1, join='inner' keeps only the
# row labels present in both frames; the column labels are simply appended, so every
# timestamp appears twice in the result (u first, then v).
# The variable name is misspelled ("featrues") but is kept because later cells reference it.
featrues=pd.concat([df_u,df_v],axis=1,join='inner')

In [12]:
# Preview the first five rows of the concatenated frame.
featrues.head()

Unnamed: 0,2024-01-01 00:00:00,2024-01-01 12:00:00,2024-01-02 00:00:00,2024-01-02 12:00:00,2024-01-03 00:00:00,2024-01-03 12:00:00,2024-01-04 00:00:00,2024-01-04 12:00:00,2024-01-05 00:00:00,2024-01-05 12:00:00,...,2024-12-27 00:00:00,2024-12-27 12:00:00,2024-12-28 00:00:00,2024-12-28 12:00:00,2024-12-29 00:00:00,2024-12-29 12:00:00,2024-12-30 00:00:00,2024-12-30 12:00:00,2024-12-31 00:00:00,2024-12-31 12:00:00
0,2.766523,9.075234,5.00274,2.67506,-2.128356,5.750553,5.524846,6.92952,-4.396342,4.942668,...,,,,,,,,,,
1,6.289143,10.067031,3.94926,2.952109,-1.010674,7.330463,6.257961,6.028646,-3.273404,6.492066,...,,,,,,,,,,
2,8.247787,9.048562,3.358205,2.34316,2.870877,6.602916,8.165231,-2.08642,-0.03399,7.097127,...,,,,,,,,,,
3,8.072305,6.563037,4.347986,0.012131,3.517645,7.347518,8.678781,-3.71093,3.385559,0.824375,...,,,,,,,,,,
4,8.055267,3.616818,2.872303,0.777438,5.865928,6.892735,8.249434,-2.451965,6.310976,-0.92349,...,,,,,,,,,,


In [13]:
# Selecting by a timestamp label returns a two-column frame, because the concatenated
# frame carries that label twice (the u column followed by the v column).
featrues[ts_temp]

Unnamed: 0,2024-01-01,2024-01-01.1
0,2.766523,-0.946262
1,6.289143,1.760215
2,8.247787,3.317518
3,8.072305,4.068276
4,8.055267,3.163379
5,8.753942,2.255922
6,8.148617,0.571586
7,6.321377,-1.037594
8,2.05042,-3.13208
9,-0.808014,-4.138453
