In [11]:
from OneShotSamplesGenerator import gen_multi_output_samples
import pandas as pd
import numpy as np
import os
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this is a blanket filter, so pandas deprecation/runtime
# warnings are hidden as well.
warnings.filterwarnings("ignore")

# Calibration + validation window (inclusive date strings).
calval_start, calval_end = '1972-01-01', '2014-12-31'

# Test window.
# NOTE(review): it starts in 2014, i.e. inside the calibration/validation
# window - presumably to provide lagged inputs for 2015; confirm.
test_start, test_end = '2014-01-01', '2019-12-31'

# Hydrological station -> abbreviation used in SWAT+ water-balance file names.
hydrostation_abbrs = {'Tangnaihai': 'TNH', 'Guide': 'GD', 'Xunhua': 'XH'}
# Hydrological station -> channel id used in SWAT+ river-flow file names.
hydrostation_channel = {'Tangnaihai': '3', 'Guide': '14', 'Xunhua': '9'}
# Station processing order (same order as the dict above).
hydro_stations = list(hydrostation_abbrs)

# Meteorological gauges assigned to each hydrological station. Every station
# further down the list reuses the gauges of the previous one and adds more.
_tangnaihai_gauges = ['玛多', '达日', '久治', '红原', '若尔盖', '玛曲', '玛沁', '河南', '兴海']
_guide_gauges = _tangnaihai_gauges + ['贵南', '共和', '贵德']
hydrostation_metestations = {
    'Tangnaihai': list(_tangnaihai_gauges),
    'Guide': list(_guide_gauges),
    'Xunhua': _guide_gauges + ['同仁'],
}

# Control area of every gauge; used later to weight the gauges when they are
# combined into one series per hydrological station.
metestation_controal_area = pd.read_csv('../data/MeteGaugeStationControlArea.csv')
metestation_controal_area_dict = {
    gauge: area
    for gauge, area in zip(metestation_controal_area['station'],
                           metestation_controal_area['Shape_Area'])
}

In [20]:
# ! Scenario 1 - output folder name suggests: historical meteorology + SWAT+ simulation.
sample_path = '../samples_mete_wb_vif/InputOutputSamples_hismete_swatpsim/'
# exist_ok=True creates the folder only when needed and does not fail if it
# already exists (no separate check-then-create step).
os.makedirs(sample_path, exist_ok=True)
# Columns kept for the reduced ("vif") sample files; the last entry,
# 'flow(m^3/s)', is the prediction target.
selected_features = [
    'P2020(mm)', 'MAX-TEM(C)', 'MIN-TEM(C)',
    'latq(mm)', 'eplant(mm)', 'wet_evap(mm)', 'surq_cha(mm)', 'wet_oflo(mm)',
    'surq_gen(mm)', 'snomlt(mm)', 'snofall(mm)', 'sw_change(mm)', 'snopack(mm)',
    'SWATPlusSimFlow', 'flow(m^3/s)',
]


# Folder holding one daily CSV per meteorological gauge.
# NOTE(review): absolute, machine-specific path; kept unchanged so the inputs stay the same.
mete_dir = 'D:/DataSpace/HydroMeteAnthropicDatabase/7.FilledRawMeteObsInfo/ChinaLandDailyMeteV3(InsertSolarRadiation)'
# Daily gauge column -> how it is aggregated to one value per month.
# The key order is also the column order of the saved table.
mete_monthly_agg = {
    'P2020(mm)': 'sum',
    'MAX-TEM(C)': 'mean',
    'MIN-TEM(C)': 'mean',
    'SLR(MJ/m^2)': 'sum',
    'AVG-RHU(%)': 'mean',
    'AVG-WV(m/s)': 'mean',
}
# Month-start stamps from calval_start to test_end; every series below is
# re-stamped with it, which raises if a series has a different length.
index = pd.date_range(calval_start, test_end, freq='MS')

for hydro_station in hydro_stations:
    station_names = hydrostation_metestations[hydro_station]
    # Each gauge is weighted by its share of the summed control area.
    total_area = sum(metestation_controal_area_dict[station] for station in station_names)
    weights = {station: metestation_controal_area_dict[station] / total_area for station in station_names}

    # One frame per variable, one weighted monthly column per gauge.
    weighted = {column: pd.DataFrame() for column in mete_monthly_agg}
    for station in station_names:
        station_data = pd.read_csv(f'{mete_dir}/{station}.csv',
                                   index_col=['DATE'], parse_dates=['DATE'])
        station_calval = station_data.loc[calval_start:test_end]
        for column, how in mete_monthly_agg.items():
            weighted[column][station] = station_calval[column].resample('MS').agg(how) * weights[station]

    # Summing the weighted columns gives the area-weighted value per month.
    # Note: DataFrame.sum skips NaN by default, so a gauge with a missing
    # month silently contributes 0 instead of producing a null.
    mete_avgs = []
    for column, frame in weighted.items():
        basin_avg = frame.sum(axis=1)
        basin_avg.name = column
        basin_avg.index = index
        mete_avgs.append(basin_avg)

    # SWAT+ basin water balance, without its date-part and name columns.
    wb = pd.read_csv(f'../result/SWATPlusCalValSimData/YellowRiver{hydrostation_abbrs[hydro_station]}_BasinWaterBalance_1972_2019.csv',
                     index_col=['date'], parse_dates=['date'])
    wb = wb.loc[calval_start:test_end].drop(columns=['mon', 'day', 'yr', 'name']).sort_index()
    wb.index = index

    # Monthly streamflow (contains the target column).
    flow = pd.read_csv(f'../data/{hydro_station}_natural_monthly_flow.csv',
                       index_col=['date'], parse_dates=['date'])
    flow_calval = flow.loc[calval_start:test_end].sort_index()
    flow_calval.index = index

    # SWAT+ simulated streamflow; the file is expected to hold a single value column.
    swatplus_sim_flow = pd.read_csv(f'../result/SWATPlusCalValSimData/Channel_{hydrostation_channel[hydro_station]}_Monthly_River-Flow_{hydro_station}_Sim1972_2019.csv',
                                    index_col=['Date'], parse_dates=['Date'])
    swatplus_sim_flow.columns = ['SWATPlusSimFlow']
    swatplus_sim_flow = swatplus_sim_flow.loc[calval_start:test_end].sort_index()
    swatplus_sim_flow.index = flow_calval.index
    swatplus_sim_flow.index.name = 'date'

    # Side-by-side table: meteorology, water balance, simulated flow, observed flow.
    all_data = pd.concat([*mete_avgs, wb, swatplus_sim_flow, flow_calval], axis=1)
    all_data.index.name = 'date'

    # Keep only columns that have at least one non-zero entry.
    all_data = all_data.loc[:, (all_data != 0).any(axis=0)]

    full_csv = sample_path + f'MeteAVGCalvalFeatureDataForML_FULL1972_2019_{hydro_station}.csv'
    all_data.to_csv(full_csv, index=True)

    # Report columns that contain nulls.
    null_columns = all_data.columns[all_data.isnull().any()].tolist()
    if not null_columns:
        print(f"No null values found in {hydro_station} data.")
    else:
        print(f"Columns with null values in {hydro_station} data:")
        for col in null_columns:
            print(f"  {col}: {all_data[col].isnull().sum()} null values")

    print(f"Data for {hydro_station} processed and saved.")

    # Samples are generated from the table as it was written to disk.
    all_data = pd.read_csv(full_csv, index_col=['date'], parse_dates=['date'])
    # lag/lead semantics are defined in OneShotSamplesGenerator
    # (presumably 12 months of history and a 12-month horizon - confirm there).
    feature_samples, target_samples = gen_multi_output_samples(
        timeseries=all_data.copy(),
        target_column='flow(m^3/s)',
        lag=12,
        lead=12,
    )
    feature_samples.to_csv(sample_path + f'{hydro_station}_meteavg_full_feature_samples.csv', index=True)
    target_samples.to_csv(sample_path + f'{hydro_station}_meteavg_full_target_samples.csv', index=True)
# Build the reduced ("vif") sample files: same procedure as the full samples,
# but restricted to the columns listed in `selected_features` (target included).
for hydro_station in hydro_stations:
    all_data = pd.read_csv(sample_path+f'MeteAVGCalvalFeatureDataForML_FULL1972_2019_{hydro_station}.csv',index_col=['date'],parse_dates=['date'])

    # The saved table no longer contains all-zero columns, so a selected
    # feature can be absent; fail with the names instead of an opaque lookup error.
    missing_features = [col for col in selected_features if col not in all_data.columns]
    if missing_features:
        raise KeyError(
            f"{hydro_station}: selected features not found in the saved table "
            f"(possibly dropped as all-zero columns): {missing_features}"
        )
    selected_data = all_data.loc[:, selected_features]

    feature_samples, target_samples = gen_multi_output_samples(
        timeseries=selected_data.copy(),
        target_column='flow(m^3/s)',
        lead=12,
        lag=12,
    )
    feature_samples.to_csv(sample_path+f'{hydro_station}_meteavg_vif_feature_samples.csv',index=True)
    target_samples.to_csv(sample_path+f'{hydro_station}_meteavg_vif_target_samples.csv',index=True)

# !2 ----- purely data-driven scenario
sample_path = '../samples_mete_wb_vif/InputOutputSamples_hismete/'
# exist_ok=True creates the folder only when needed and does not fail if it
# already exists (no separate check-then-create step).
os.makedirs(sample_path, exist_ok=True)
# Columns kept for the reduced ("vif") sample files; 'flow(m^3/s)' is the target.
selected_features = [
    'P2020(mm)', 'MAX-TEM(C)', 'MIN-TEM(C)', 'flow(m^3/s)']


# NOTE(review): this loop builds the same table as the SWAT+ scenario earlier in
# this cell, so the "full" files written here still contain the SWAT+ columns
# even though the scenario is labelled purely data-driven - confirm that is intended.

# Folder holding one daily CSV per meteorological gauge.
# NOTE(review): absolute, machine-specific path; kept unchanged so the inputs stay the same.
mete_dir = 'D:/DataSpace/HydroMeteAnthropicDatabase/7.FilledRawMeteObsInfo/ChinaLandDailyMeteV3(InsertSolarRadiation)'
# Daily gauge column -> how it is aggregated to one value per month.
# The key order is also the column order of the saved table.
mete_monthly_agg = {
    'P2020(mm)': 'sum',
    'MAX-TEM(C)': 'mean',
    'MIN-TEM(C)': 'mean',
    'SLR(MJ/m^2)': 'sum',
    'AVG-RHU(%)': 'mean',
    'AVG-WV(m/s)': 'mean',
}
# Month-start stamps from calval_start to test_end; every series below is
# re-stamped with it, which raises if a series has a different length.
index = pd.date_range(calval_start, test_end, freq='MS')

for hydro_station in hydro_stations:
    station_names = hydrostation_metestations[hydro_station]
    # Each gauge is weighted by its share of the summed control area.
    total_area = sum(metestation_controal_area_dict[station] for station in station_names)
    weights = {station: metestation_controal_area_dict[station] / total_area for station in station_names}

    # One frame per variable, one weighted monthly column per gauge.
    weighted = {column: pd.DataFrame() for column in mete_monthly_agg}
    for station in station_names:
        station_data = pd.read_csv(f'{mete_dir}/{station}.csv',
                                   index_col=['DATE'], parse_dates=['DATE'])
        station_calval = station_data.loc[calval_start:test_end]
        for column, how in mete_monthly_agg.items():
            weighted[column][station] = station_calval[column].resample('MS').agg(how) * weights[station]

    # Summing the weighted columns gives the area-weighted value per month.
    # Note: DataFrame.sum skips NaN by default, so a gauge with a missing
    # month silently contributes 0 instead of producing a null.
    mete_avgs = []
    for column, frame in weighted.items():
        basin_avg = frame.sum(axis=1)
        basin_avg.name = column
        basin_avg.index = index
        mete_avgs.append(basin_avg)

    # SWAT+ basin water balance, without its date-part and name columns.
    wb = pd.read_csv(f'../result/SWATPlusCalValSimData/YellowRiver{hydrostation_abbrs[hydro_station]}_BasinWaterBalance_1972_2019.csv',
                     index_col=['date'], parse_dates=['date'])
    wb = wb.loc[calval_start:test_end].drop(columns=['mon', 'day', 'yr', 'name']).sort_index()
    wb.index = index

    # Monthly streamflow (contains the target column).
    flow = pd.read_csv(f'../data/{hydro_station}_natural_monthly_flow.csv',
                       index_col=['date'], parse_dates=['date'])
    flow_calval = flow.loc[calval_start:test_end].sort_index()
    flow_calval.index = index

    # SWAT+ simulated streamflow; the file is expected to hold a single value column.
    swatplus_sim_flow = pd.read_csv(f'../result/SWATPlusCalValSimData/Channel_{hydrostation_channel[hydro_station]}_Monthly_River-Flow_{hydro_station}_Sim1972_2019.csv',
                                    index_col=['Date'], parse_dates=['Date'])
    swatplus_sim_flow.columns = ['SWATPlusSimFlow']
    swatplus_sim_flow = swatplus_sim_flow.loc[calval_start:test_end].sort_index()
    swatplus_sim_flow.index = flow_calval.index
    swatplus_sim_flow.index.name = 'date'

    # Side-by-side table: meteorology, water balance, simulated flow, observed flow.
    all_data = pd.concat([*mete_avgs, wb, swatplus_sim_flow, flow_calval], axis=1)
    all_data.index.name = 'date'

    # Keep only columns that have at least one non-zero entry.
    all_data = all_data.loc[:, (all_data != 0).any(axis=0)]

    full_csv = sample_path + f'MeteAVGCalvalFeatureDataForML_FULL1972_2019_{hydro_station}.csv'
    all_data.to_csv(full_csv, index=True)

    # Report columns that contain nulls.
    null_columns = all_data.columns[all_data.isnull().any()].tolist()
    if not null_columns:
        print(f"No null values found in {hydro_station} data.")
    else:
        print(f"Columns with null values in {hydro_station} data:")
        for col in null_columns:
            print(f"  {col}: {all_data[col].isnull().sum()} null values")

    print(f"Data for {hydro_station} processed and saved.")

    # Samples are generated from the table as it was written to disk.
    all_data = pd.read_csv(full_csv, index_col=['date'], parse_dates=['date'])
    # lag/lead semantics are defined in OneShotSamplesGenerator
    # (presumably 12 months of history and a 12-month horizon - confirm there).
    feature_samples, target_samples = gen_multi_output_samples(
        timeseries=all_data.copy(),
        target_column='flow(m^3/s)',
        lag=12,
        lead=12,
    )
    feature_samples.to_csv(sample_path + f'{hydro_station}_meteavg_full_feature_samples.csv', index=True)
    target_samples.to_csv(sample_path + f'{hydro_station}_meteavg_full_target_samples.csv', index=True)
# Build the reduced ("vif") sample files: same procedure as the full samples,
# but restricted to the columns listed in `selected_features` (target included).
for hydro_station in hydro_stations:
    all_data = pd.read_csv(sample_path+f'MeteAVGCalvalFeatureDataForML_FULL1972_2019_{hydro_station}.csv',index_col=['date'],parse_dates=['date'])

    # The saved table no longer contains all-zero columns, so a selected
    # feature can be absent; fail with the names instead of an opaque lookup error.
    missing_features = [col for col in selected_features if col not in all_data.columns]
    if missing_features:
        raise KeyError(
            f"{hydro_station}: selected features not found in the saved table "
            f"(possibly dropped as all-zero columns): {missing_features}"
        )
    selected_data = all_data.loc[:, selected_features]

    feature_samples, target_samples = gen_multi_output_samples(
        timeseries=selected_data.copy(),
        target_column='flow(m^3/s)',
        lead=12,
        lag=12,
    )
    feature_samples.to_csv(sample_path+f'{hydro_station}_meteavg_vif_feature_samples.csv',index=True)
    target_samples.to_csv(sample_path+f'{hydro_station}_meteavg_vif_target_samples.csv',index=True)


    
    

No null values found in Tangnaihai data.
Data for Tangnaihai processed and saved.
features:  ['P2020(mm)', 'MAX-TEM(C)', 'MIN-TEM(C)', 'SLR(MJ/m^2)', 'AVG-RHU(%)', 'AVG-WV(m/s)', 'precip(mm)', 'snofall(mm)', 'snomlt(mm)', 'surq_gen(mm)', 'latq(mm)', 'wateryld(mm)', 'perc(mm)', 'et(mm)', 'ecanopy(mm)', 'eplant(mm)', 'esoil(mm)', 'surq_cont(mm)', 'cn', 'sw_init(mm)', 'sw_final(mm)', 'sw_ave(mm)', 'sw_300(mm)', 'sno_init(mm)', 'sno_final(mm)', 'snopack(mm)', 'pet(mm)', 'surq_cha(mm)', 'latq_cha(mm)', 'sw_change(mm)', 'lagsurf(mm)', 'laglatq(mm)', 'wet_evap(mm)', 'wet_oflo(mm)', 'wet_stor(mm)', 'SWATPlusSimFlow']
No null values found in Guide data.
Data for Guide processed and saved.
features:  ['P2020(mm)', 'MAX-TEM(C)', 'MIN-TEM(C)', 'SLR(MJ/m^2)', 'AVG-RHU(%)', 'AVG-WV(m/s)', 'precip(mm)', 'snofall(mm)', 'snomlt(mm)', 'surq_gen(mm)', 'latq(mm)', 'wateryld(mm)', 'perc(mm)', 'et(mm)', 'ecanopy(mm)', 'eplant(mm)', 'esoil(mm)', 'surq_cont(mm)', 'cn', 'sw_init(mm)', 'sw_final(mm)', 'sw_ave(m

# 1 Generate machine-learning samples (`samples_mete_wb_vif`) using climate data from gauge stations and SWAT+ water-balance output.
- Entire period: 1972-2019, 48 years (576 months).
- Calibration period: 1972-2009, 38 years (456 months), 79.2% of the entire record.
- Validation period: 2010-2014, 5 years (60 months), 10.4% of the entire record.
- Test period (simulated prediction): 2015-2019, 5 years (60 months), 10.4% of the entire record.
生成候选训练样本：
1. 计算各雨量站加权平均值，获取整个集水区气象数据；
2. 将集水区气象数据、集水区水量平衡数据（SWAT+获得）、流量模拟数据(SARIMA,SWAT+)、实测流量数据综合形成候选学习样本。

生成学习样本（用于模型训练和模型验证）：
1. 以实测流量为预测目标，采用多输入（过去12个月与未来12个月预测信息）多输出模式（未来12个月流量），生成学习样本；
2. 所有的输入信息不做筛选，全部用来构造输入。

生成预测样本（获取预测因子）：
1. 输入包括：流域面气象信息（降水、最大气温、最小气温、相对湿度、太阳辐射、风速）、流域水量平衡数据、模拟流量数据（SARIMA,SWAT+）；
2. 输入经过多重共线性检验筛选，与训练验证阶段筛选获得的因子一致；
3. 预测阶段的气象因子从各个气象站相似年数据统计获得。

In [13]:
# Output folder for the scenario that combines similar-year meteorology with
# SWAT+ and ARIMA simulated flow (as the folder name suggests).
sample_path = '../samples_mete_wb_vif/InputOutputSamples_metesimyr_swatpsim_arimasim/'
# exist_ok=True creates the folder only when needed and does not fail if it
# already exists (no separate check-then-create step).
os.makedirs(sample_path, exist_ok=True)
# Columns kept for the reduced ("vif") sample files; the last entry,
# 'flow(m^3/s)', is the prediction target.
selected_features = [
    'P2020(mm)', 'MAX-TEM(C)', 'MIN-TEM(C)',
    'latq(mm)', 'eplant(mm)', 'wet_evap(mm)',
    'surq_cha(mm)', 'wet_oflo(mm)', 'surq_gen(mm)',
    'snomlt(mm)', 'snofall(mm)', 'sw_change(mm)', 'snopack(mm)',
    'SWATPlusSimFlow', 'ARIMASimFlow', 'flow(m^3/s)',
]

# Folder holding one daily CSV per meteorological gauge.
# NOTE(review): absolute, machine-specific path; kept unchanged so the inputs stay the same.
mete_dir = 'D:/DataSpace/HydroMeteAnthropicDatabase/7.FilledRawMeteObsInfo/ChinaLandDailyMeteV3(InsertSolarRadiation)'
# Daily gauge column -> how it is aggregated to one value per month.
# The key order is also the column order of the saved table.
mete_monthly_agg = {
    'P2020(mm)': 'sum',
    'MAX-TEM(C)': 'mean',
    'MIN-TEM(C)': 'mean',
    'SLR(MJ/m^2)': 'sum',
    'AVG-RHU(%)': 'mean',
    'AVG-WV(m/s)': 'mean',
}
# Month-start stamps for the calibration/validation window; every series below
# is re-stamped with it, which raises if a series has a different length.
index = pd.date_range(calval_start, calval_end, freq='MS')

for hydro_station in hydro_stations:
    station_names = hydrostation_metestations[hydro_station]
    # Each gauge is weighted by its share of the summed control area.
    total_area = sum(metestation_controal_area_dict[station] for station in station_names)
    weights = {station: metestation_controal_area_dict[station] / total_area for station in station_names}

    # One frame per variable, one weighted monthly column per gauge.
    weighted = {column: pd.DataFrame() for column in mete_monthly_agg}
    for station in station_names:
        station_data = pd.read_csv(f'{mete_dir}/{station}.csv',
                                   index_col=['DATE'], parse_dates=['DATE'])
        station_calval = station_data.loc[calval_start:calval_end]
        for column, how in mete_monthly_agg.items():
            weighted[column][station] = station_calval[column].resample('MS').agg(how) * weights[station]

    # Summing the weighted columns gives the area-weighted value per month.
    # Note: DataFrame.sum skips NaN by default, so a gauge with a missing
    # month silently contributes 0 instead of producing a null.
    mete_avgs = []
    for column, frame in weighted.items():
        basin_avg = frame.sum(axis=1)
        basin_avg.name = column
        basin_avg.index = index
        mete_avgs.append(basin_avg)

    # SWAT+ basin water balance taken from the "pred2019" run and cut to the
    # calibration/validation window.
    # NOTE(review): presumably the years before 2015 are identical in every
    # pred<year> file - confirm.
    wb = pd.read_csv(f'../result/SWATPlusWaterBlanceDataFromMeteSimYr/YellowRiver{hydrostation_abbrs[hydro_station]}_BasinWaterBalance_pred2019.csv',
                     index_col=['date'], parse_dates=['date'])
    wb_calval = wb.loc[calval_start:calval_end].drop(columns=['mon', 'day', 'yr', 'name']).sort_index()
    wb_calval.index = index

    # Monthly streamflow (contains the target column).
    flow = pd.read_csv(f'../data/{hydro_station}_natural_monthly_flow.csv',
                       index_col=['date'], parse_dates=['date'])
    flow_calval = flow.loc[calval_start:calval_end].sort_index()
    flow_calval.index = index

    # ARIMA in-sample simulation read from the "before_2015" file.
    arima_sim = pd.read_csv(f'../result/ARIMAPredData/seasonal_decompose_multiplicative_arima_train_sim_{hydro_station}_before_2015.csv',index_col=['date'],parse_dates=['date'])
    arima_sim = arima_sim.loc[calval_start:calval_end, 'SimFlow(m^3/s)']
    arima_sim.name = 'ARIMASimFlow'
    arima_sim = arima_sim.sort_index()
    arima_sim.index = index

    # SWAT+ simulated streamflow; the file is expected to hold a single value column.
    swatplus_sim_flow = pd.read_csv(f'../result/SWATPlusCalValSimData/Channel_{hydrostation_channel[hydro_station]}_Monthly_River-Flow_{hydro_station}_Sim1972_2019.csv',
                                    index_col=['Date'], parse_dates=['Date'])
    swatplus_sim_flow.columns = ['SWATPlusSimFlow']
    swatplus_sim_flow = swatplus_sim_flow.loc[calval_start:calval_end].sort_index()
    swatplus_sim_flow.index = flow_calval.index
    swatplus_sim_flow.index.name = 'date'

    # Side-by-side table: meteorology, water balance, SWAT+ flow, ARIMA flow, observed flow.
    all_data = pd.concat([*mete_avgs, wb_calval, swatplus_sim_flow, arima_sim, flow_calval], axis=1)
    all_data.index.name = 'date'

    # Keep only columns that have at least one non-zero entry.
    all_data = all_data.loc[:, (all_data != 0).any(axis=0)]

    calval_csv = sample_path + f'MeteAVGCalvalFeatureDataForML_CALVAL_{hydro_station}.csv'
    all_data.to_csv(calval_csv, index=True)

    # Report columns that contain nulls.
    null_columns = all_data.columns[all_data.isnull().any()].tolist()
    if not null_columns:
        print(f"No null values found in {hydro_station} data.")
    else:
        print(f"Columns with null values in {hydro_station} data:")
        for col in null_columns:
            print(f"  {col}: {all_data[col].isnull().sum()} null values")

    print(f"Data for {hydro_station} processed and saved.")

    # Samples are generated from the table as it was written to disk.
    all_data = pd.read_csv(calval_csv, index_col=['date'], parse_dates=['date'])
    # lag/lead semantics are defined in OneShotSamplesGenerator
    # (presumably 12 months of history and a 12-month horizon - confirm there).
    feature_samples, target_samples = gen_multi_output_samples(
        timeseries=all_data.copy(),
        target_column='flow(m^3/s)',
        lag=12,
        lead=12,
    )
    feature_samples.to_csv(sample_path + f'{hydro_station}_meteavg_full_feature_samples_calval.csv', index=True)
    target_samples.to_csv(sample_path + f'{hydro_station}_meteavg_full_target_samples_calval.csv', index=True)
# Build the reduced ("vif") calibration/validation sample files: same procedure
# as the full samples, but restricted to the columns listed in
# `selected_features` (target included).
for hydro_station in hydro_stations:
    all_data = pd.read_csv(sample_path+f'MeteAVGCalvalFeatureDataForML_CALVAL_{hydro_station}.csv',index_col=['date'],parse_dates=['date'])

    # The saved table no longer contains all-zero columns, so a selected
    # feature can be absent; fail with the names instead of an opaque lookup error.
    missing_features = [col for col in selected_features if col not in all_data.columns]
    if missing_features:
        raise KeyError(
            f"{hydro_station}: selected features not found in the saved table "
            f"(possibly dropped as all-zero columns): {missing_features}"
        )
    selected_data = all_data.loc[:, selected_features]

    feature_samples, target_samples = gen_multi_output_samples(
        timeseries=selected_data.copy(),
        target_column='flow(m^3/s)',
        lead=12,
        lag=12,
    )
    feature_samples.to_csv(sample_path+f'{hydro_station}_meteavg_vif_feature_samples_calval.csv',index=True)
    target_samples.to_csv(sample_path+f'{hydro_station}_meteavg_vif_target_samples_calval.csv',index=True)

# Build one input table and one reduced sample set per prediction year
# (2015..2019). In each table the gauge records of the prediction year are
# replaced by the records of a reference ("similar") year, and the SWAT+/ARIMA
# flow columns hold simulations up to the previous year followed by the
# model predictions for the prediction year.
pred_years = np.arange(2015, 2020)
start_year = 1972

# Folder holding one daily CSV per meteorological gauge.
# NOTE(review): absolute, machine-specific path; kept unchanged so the inputs stay the same.
mete_dir = 'D:/DataSpace/HydroMeteAnthropicDatabase/7.FilledRawMeteObsInfo/ChinaLandDailyMeteV3(InsertSolarRadiation)'
# Daily gauge column -> how it is aggregated to one value per month.
# The key order is also the column order of the saved table.
mete_monthly_agg = {
    'P2020(mm)': 'sum',
    'MAX-TEM(C)': 'mean',
    'MIN-TEM(C)': 'mean',
    'SLR(MJ/m^2)': 'sum',
    'AVG-RHU(%)': 'mean',
    'AVG-WV(m/s)': 'mean',
}
# Gauge name -> untouched daily table. Each file is read once and copied
# before it is modified, instead of being re-read for every station and year.
climate_cache = {}

for hydro_station in hydro_stations:
    similarity_years = pd.read_csv(f'../result/SimilarityYears/SimilarityYears_{hydro_station}.csv', index_col=['tar_year'])
    station_names = hydrostation_metestations[hydro_station]

    # Each gauge is weighted by its share of the summed control area.
    total_area = sum(metestation_controal_area_dict[station] for station in station_names)
    weights = {station: metestation_controal_area_dict[station] / total_area for station in station_names}

    # Inputs that do not depend on the prediction year are read once per station.
    flow_all = pd.read_csv(f'../data/{hydro_station}_natural_monthly_flow.csv',
                           index_col=['date'], parse_dates=['date'])
    swat_sim_all = pd.read_csv(f'../result/SWATPlusCalValSimData/Channel_{hydrostation_channel[hydro_station]}_Monthly_River-Flow_{hydro_station}_Sim1972_2019.csv',
                               index_col=['Date'], parse_dates=['Date'])['Value']
    swat_pred_all = pd.read_csv(f'../result/SWATPlusPredUsingMeteSimYearData/{hydro_station}_SWATPlus_pred_obs_2015_2019.csv',
                                index_col=['date'], parse_dates=['date'])['pred']
    arima_pred_all = pd.read_csv(f'../result/ARIMAPredData/seasonal_decompose_multiplicative_arima_pred_{hydro_station}_{pred_years[0]}_{pred_years[-1]}.csv',index_col=['date'],parse_dates=['date'])

    for pred_year in pred_years:
        period_start, period_end = f'{start_year}-01-01', f'{pred_year}-12-31'
        hist_end = f'{pred_year-1}-12-31'
        year_start = f'{pred_year}-01-01'
        pred_index = pd.date_range(period_start, period_end, freq='MS')   # whole table
        hist_index = pd.date_range(period_start, hist_end, freq='MS')     # up to the previous year
        year_index = pd.date_range(year_start, period_end, freq='MS')     # prediction year only

        # One frame per variable, one weighted monthly column per gauge,
        # all aligned on pred_index.
        weighted = {column: pd.DataFrame(index=pred_index) for column in mete_monthly_agg}

        for station_name in station_names:
            if station_name not in climate_cache:
                climate_cache[station_name] = pd.read_csv(f'{mete_dir}/{station_name}.csv',
                                                          index_col=['DATE'], parse_dates=['DATE'])
            climate_data = climate_cache[station_name].copy()

            # Replace the records of the prediction year. The reference year is
            # the year after the one listed for (prediction year - 1).
            tar_year = pred_year
            ref_year = similarity_years.loc[tar_year-1, station_name] + 1

            target_data = climate_data[climate_data.index.year == tar_year]
            reference_data = climate_data[climate_data.index.year == ref_year]

            if reference_data.empty and not target_data.empty:
                # Without this check the assignment below fails with a bare
                # shape-mismatch ValueError.
                raise ValueError(
                    f"{hydro_station}/{station_name}: no daily records for reference year "
                    f"{ref_year} (substitute for {tar_year})"
                )

            # Rows are matched by position, so the two years must have equal
            # length: drop the last reference day, or repeat it, when they differ by one.
            # NOTE(review): with a leap/non-leap pair this shifts every day after
            # Feb 28 by one position (the adjustment is made at Dec 31, not Feb 29).
            # Left unchanged because the SWAT+ water-balance inputs were produced
            # elsewhere and may rely on the same alignment - confirm.
            if len(reference_data) > len(target_data):
                reference_data = reference_data[:-1]
            elif len(reference_data) < len(target_data):
                reference_data = pd.concat([reference_data, reference_data.iloc[-1:]])
            climate_data.loc[target_data.index, :] = reference_data.values

            climate_data = climate_data.loc[period_start:period_end]
            for column, how in mete_monthly_agg.items():
                weighted[column][station_name] = climate_data[column].resample('MS').agg(how) * weights[station_name]

        # Summing the weighted columns gives the area-weighted value per month.
        # The frames were created on pred_index, so the sums are already
        # aligned and sorted; no further slicing or re-indexing is needed.
        # Note: DataFrame.sum skips NaN by default.
        mete_avgs = []
        for column, frame in weighted.items():
            basin_avg = frame.sum(axis=1)
            basin_avg.name = column
            mete_avgs.append(basin_avg)

        # SWAT+ basin water balance of the run made for this prediction year.
        wb = pd.read_csv(f'../result/SWATPlusWaterBlanceDataFromMeteSimYr/YellowRiver{hydrostation_abbrs[hydro_station]}_BasinWaterBalance_pred{pred_year}.csv',
                         index_col=['date'], parse_dates=['date'])
        wb = wb.drop(columns=['mon', 'day', 'yr', 'name']).loc[period_start:period_end].sort_index()
        wb.index = pred_index

        # Monthly streamflow (contains the target column).
        flow = flow_all.loc[period_start:period_end].sort_index()
        flow.index = pred_index

        # SWAT+ flow: historical simulation up to the previous year (can also
        # be seen as the initial condition of the catchment) ...
        SWATPlus_sim_flow = swat_sim_all.loc[period_start:hist_end].sort_index()
        SWATPlus_sim_flow.name = 'SWATPlusSimFlow'
        SWATPlus_sim_flow.index = hist_index
        # ... followed by the SWAT+ prediction for the prediction year; it is
        # included so that the SWAT+ prediction can be corrected.
        SWATPlus_pred_flow = swat_pred_all.loc[year_start:period_end].sort_index()
        SWATPlus_pred_flow.name = 'SWATPlusSimFlow'
        SWATPlus_pred_flow.index = year_index
        SWATPlus_flow = pd.concat([SWATPlus_sim_flow, SWATPlus_pred_flow], axis=0)

        # ARIMA flow: in-sample simulation (file is specific to the prediction
        # year) followed by the ARIMA prediction for the prediction year.
        arima_sim = pd.read_csv(f'../result/ARIMAPredData/seasonal_decompose_multiplicative_arima_train_sim_{hydro_station}_before_{pred_year}.csv',index_col=['date'],parse_dates=['date'])
        arima_sim = arima_sim.loc[period_start:hist_end, 'SimFlow(m^3/s)'].sort_index()
        arima_sim.name = 'ARIMASimFlow'
        arima_sim.index = hist_index

        arima_pred = arima_pred_all.loc[year_start:period_end, 'flow(m^3/s)'].sort_index()
        arima_pred.name = 'ARIMASimFlow'
        arima_pred.index = year_index

        arima_flow = pd.concat([arima_sim, arima_pred], axis=0)

        # Side-by-side table: meteorology, water balance, SWAT+ flow, ARIMA flow, observed flow.
        all_data = pd.concat([*mete_avgs, wb, SWATPlus_flow, arima_flow, flow], axis=1)

        # Keep only columns that have at least one non-zero entry.
        all_data = all_data.loc[:, (all_data != 0).any(axis=0)]
        all_data.index.name = 'date'

        all_data.to_csv(sample_path+f'{hydro_station}_MeteAVGCalvalFeatureDataForML_PRED{pred_year}.csv', index=True)

        # A selected feature can have been removed as an all-zero column;
        # fail with the names instead of an opaque lookup error.
        missing_features = [col for col in selected_features if col not in all_data.columns]
        if missing_features:
            raise KeyError(
                f"{hydro_station} {pred_year}: selected features not found in the table "
                f"(possibly dropped as all-zero columns): {missing_features}"
            )
        selected_data = all_data.loc[:, selected_features]

        # lag/lead semantics are defined in OneShotSamplesGenerator
        # (presumably 12 months of history and a 12-month horizon - confirm there).
        feature_samples, target_samples = gen_multi_output_samples(
            timeseries=selected_data.copy(),
            target_column='flow(m^3/s)',
            lead=12,
            lag=12,
        )
        feature_samples.to_csv(sample_path+f'{hydro_station}_meteavg_vif_feature_samples_pred{pred_year}.csv', index=True)
        target_samples.to_csv(sample_path+f'{hydro_station}_meteavg_vif_target_samples_pred{pred_year}.csv', index=True)

    print(f"Data for {hydro_station} processed and saved.")
    

No null values found in Tangnaihai data.
Data for Tangnaihai processed and saved.
features:  ['P2020(mm)', 'MAX-TEM(C)', 'MIN-TEM(C)', 'SLR(MJ/m^2)', 'AVG-RHU(%)', 'AVG-WV(m/s)', 'precip(mm)', 'snofall(mm)', 'snomlt(mm)', 'surq_gen(mm)', 'latq(mm)', 'wateryld(mm)', 'perc(mm)', 'et(mm)', 'ecanopy(mm)', 'eplant(mm)', 'esoil(mm)', 'surq_cont(mm)', 'cn', 'sw_init(mm)', 'sw_final(mm)', 'sw_ave(mm)', 'sw_300(mm)', 'sno_init(mm)', 'sno_final(mm)', 'snopack(mm)', 'pet(mm)', 'surq_cha(mm)', 'latq_cha(mm)', 'sw_change(mm)', 'lagsurf(mm)', 'laglatq(mm)', 'wet_evap(mm)', 'wet_oflo(mm)', 'wet_stor(mm)', 'SWATPlusSimFlow', 'ARIMASimFlow']
No null values found in Guide data.
Data for Guide processed and saved.
features:  ['P2020(mm)', 'MAX-TEM(C)', 'MIN-TEM(C)', 'SLR(MJ/m^2)', 'AVG-RHU(%)', 'AVG-WV(m/s)', 'precip(mm)', 'snofall(mm)', 'snomlt(mm)', 'surq_gen(mm)', 'latq(mm)', 'wateryld(mm)', 'perc(mm)', 'et(mm)', 'ecanopy(mm)', 'eplant(mm)', 'esoil(mm)', 'surq_cont(mm)', 'cn', 'sw_init(mm)', 'sw_final

In [14]:
# !1 Output folder for the scenario that uses similar-year meteorology only
# (as the folder name suggests).
sample_path = '../samples_mete_wb_vif/InputOutputSamples_metesimyr/'
# exist_ok=True creates the folder only when needed and does not fail if it
# already exists (no separate check-then-create step).
os.makedirs(sample_path, exist_ok=True)
# Columns kept for the reduced ("vif") sample files; 'flow(m^3/s)' is the target.
selected_features = ['P2020(mm)','MAX-TEM(C)','MIN-TEM(C)','flow(m^3/s)']

# Calibration/validation data: for every hydrological station, build the
# area-weighted monthly catchment meteorology, join it with the observed
# monthly flow, save the table and derive the full-feature samples from it.
for hydro_station in hydro_stations:
    station_names = hydrostation_metestations[hydro_station]
    index = pd.date_range(calval_start, calval_end, freq='MS')

    # One empty frame per meteorological variable; each receives one column per station.
    pcp_data, maxtmp_data, mintmp_data, slr_data, hmd_data, wnd_data = (
        pd.DataFrame() for _unused in range(6)
    )
    # daily column -> (per-station frame, monthly aggregation of the daily values)
    monthly_spec = {
        'P2020(mm)': (pcp_data, 'sum'),
        'MAX-TEM(C)': (maxtmp_data, 'mean'),
        'MIN-TEM(C)': (mintmp_data, 'mean'),
        'SLR(MJ/m^2)': (slr_data, 'sum'),
        'AVG-RHU(%)': (hmd_data, 'mean'),
        'AVG-WV(m/s)': (wnd_data, 'mean'),
    }

    # Each station is weighted by its share of the summed control area.
    total_area = sum(metestation_controal_area_dict[station] for station in station_names)
    weights = {station: metestation_controal_area_dict[station] / total_area for station in station_names}

    for station in station_names:
        # Daily observations of this meteorological station, cut to the calval period.
        station_data = pd.read_csv(
            f'D:/DataSpace/HydroMeteAnthropicDatabase/7.FilledRawMeteObsInfo/ChinaLandDailyMeteV3(InsertSolarRadiation)/{station}.csv',
            index_col=['DATE'],
            parse_dates=['DATE'],
        )
        station_calval = station_data.loc[calval_start:calval_end]
        for column, (frame, how) in monthly_spec.items():
            frame[station] = station_calval[column].resample('MS').agg(how) * weights[station]

    # Summing the pre-weighted station columns yields the catchment average.
    averages = []
    for column, (frame, _how) in monthly_spec.items():
        catchment_avg = frame.sum(axis=1)
        catchment_avg.name = column
        catchment_avg.index = index
        averages.append(catchment_avg)
    pcp_avg, maxtmp_avg, mintmp_avg, slr_avg, hmd_avg, wnd_avg = averages

    # Observed monthly streamflow, relabelled with the common monthly index.
    flow = pd.read_csv(
        f'../data/{hydro_station}_natural_monthly_flow.csv',
        index_col=['date'],
        parse_dates=['date'],
    )
    flow_calval = flow.loc[calval_start:calval_end].sort_index()
    flow_calval.index = index

    # Meteorology first, target column last.
    all_data = pd.concat(averages + [flow_calval], axis=1)
    all_data.index.name = 'date'

    # Keep only columns that contain at least one non-zero value.
    has_nonzero = (all_data != 0).any(axis=0)
    all_data = all_data.loc[:, has_nonzero]

    all_data.to_csv(sample_path+f'MeteAVGCalvalFeatureDataForML_CALVAL_{hydro_station}.csv', index=True)

    # Report missing values per column (after saving, as a diagnostic only).
    null_columns = all_data.columns[all_data.isnull().any()].tolist()
    if not null_columns:
        print(f"No null values found in {hydro_station} data.")
    else:
        print(f"Columns with null values in {hydro_station} data:")
        for col in null_columns:
            null_count = all_data[col].isnull().sum()
            print(f"  {col}: {null_count} null values")

    print(f"Data for {hydro_station} processed and saved.")

    # Reload the saved table and build samples from every column
    # (lag=12 / lead=12; target is the observed flow).
    all_data = pd.read_csv(
        sample_path+f'MeteAVGCalvalFeatureDataForML_CALVAL_{hydro_station}.csv',
        index_col=['date'],
        parse_dates=['date'],
    )
    feature_samples, target_samples = gen_multi_output_samples(
        timeseries=all_data.copy(),
        target_column='flow(m^3/s)',
        lag=12,
        lead=12,
    )
    feature_samples.to_csv(sample_path+f'{hydro_station}_meteavg_full_feature_samples_calval.csv',index=True)
    target_samples.to_csv(sample_path+f'{hydro_station}_meteavg_full_target_samples_calval.csv',index=True)
# Reduced ("vif") calibration/validation samples: reload each station's saved
# table and keep only the columns named in selected_features.
for hydro_station in hydro_stations:
    calval_table = pd.read_csv(
        sample_path+f'MeteAVGCalvalFeatureDataForML_CALVAL_{hydro_station}.csv',
        index_col=['date'],
        parse_dates=['date'],
    )
    all_data = calval_table
    selected_data = all_data.loc[:, selected_features]

    # Same lag/lead configuration as the full-feature samples.
    sample_kwargs = dict(target_column='flow(m^3/s)', lead=12, lag=12)
    feature_samples, target_samples = gen_multi_output_samples(
        timeseries=selected_data.copy(),
        **sample_kwargs,
    )

    feature_samples.to_csv(sample_path+f'{hydro_station}_meteavg_vif_feature_samples_calval.csv',index=True)
    target_samples.to_csv(sample_path+f'{hydro_station}_meteavg_vif_target_samples_calval.csv',index=True)

# Prediction samples: for every prediction year the (unknown) daily meteorology
# of that year is replaced, station by station, with the data of a reference
# year taken from the similarity-year table; the catchment averages are then
# rebuilt for start_year..pred_year and turned into "vif" samples.
pred_years = np.arange(2015, 2020)
start_year = 1972

for hydro_station in hydro_stations:
    similarity_years = pd.read_csv(f'../result/SimilarityYears/SimilarityYears_{hydro_station}.csv', index_col=['tar_year'])
    station_names = hydrostation_metestations[hydro_station]

    # Area weights do not depend on the prediction year.
    total_area = sum(metestation_controal_area_dict[station] for station in station_names)
    weights = {station: metestation_controal_area_dict[station] / total_area for station in station_names}

    # Loop-invariant inputs: read each file once per hydrological station instead
    # of once per prediction year. Per-year work operates on copies.
    station_climate = {
        station_name: pd.read_csv(
            f'D:/DataSpace/HydroMeteAnthropicDatabase/7.FilledRawMeteObsInfo/ChinaLandDailyMeteV3(InsertSolarRadiation)/{station_name}.csv',
            index_col=['DATE'], parse_dates=['DATE'])
        for station_name in station_names
    }
    flow_full = pd.read_csv(f'../data/{hydro_station}_natural_monthly_flow.csv',
                            index_col=['date'], parse_dates=['date'])

    for pred_year in pred_years:
        period_start = f'{start_year}-01-01'
        period_end = f'{pred_year}-12-31'
        pred_index = pd.date_range(period_start, period_end, freq='MS')

        # One frame per variable, pre-indexed by month; one column per station.
        pcp_data, maxtmp_data, mintmp_data, slr_data, hmd_data, wnd_data = (
            pd.DataFrame(index=pred_index) for _unused in range(6)
        )
        # daily column -> (per-station frame, monthly aggregation of the daily values)
        monthly_spec = {
            'P2020(mm)': (pcp_data, 'sum'),
            'MAX-TEM(C)': (maxtmp_data, 'mean'),
            'MIN-TEM(C)': (mintmp_data, 'mean'),
            'SLR(MJ/m^2)': (slr_data, 'sum'),
            'AVG-RHU(%)': (hmd_data, 'mean'),
            'AVG-WV(m/s)': (wnd_data, 'mean'),
        }

        for station_name in station_names:
            # Work on a copy: the target year is overwritten in place below.
            climate_data = station_climate[station_name].copy()

            tar_year = pred_year  # the prediction year's data is replaced
            # Entry of the year before the target, plus one.
            # NOTE(review): presumably "the year following the most similar year" —
            # confirm against how SimilarityYears_*.csv was produced.
            ref_year = similarity_years.loc[tar_year-1, station_name] + 1

            target_data = climate_data[climate_data.index.year == tar_year]
            reference_data = climate_data[climate_data.index.year == ref_year]

            if reference_data.empty:
                # Previously this surfaced as an opaque pandas shape error.
                raise ValueError(
                    f"No daily data for reference year {ref_year} at station "
                    f"{station_name} (target year {tar_year}, {hydro_station})."
                )

            # Match the number of days: drop the reference year's last day when it
            # is longer, repeat its last day when it is shorter.
            if len(reference_data) > len(target_data):
                reference_data = reference_data[:-1]
            elif len(reference_data) < len(target_data):
                last_day = reference_data.iloc[-1:]
                reference_data = pd.concat([reference_data, last_day])
            climate_data.loc[target_data.index, :] = reference_data.values

            climate_data = climate_data.loc[period_start:period_end]

            for column, (frame, how) in monthly_spec.items():
                frame[station_name] = climate_data[column].resample('MS').agg(how) * weights[station_name]

        # Weighted sum across stations. Every frame is already indexed by
        # pred_index, so no further slicing, sorting or re-indexing is required.
        averages = []
        for column, (frame, _how) in monthly_spec.items():
            catchment_avg = frame.sum(axis=1)
            catchment_avg.name = column
            averages.append(catchment_avg)
        pcp_avg, maxtmp_avg, mintmp_avg, slr_avg, hmd_avg, wnd_avg = averages

        # Observed monthly streamflow for the same period.
        flow = flow_full.loc[period_start:period_end].sort_index()
        flow.index = pred_index

        # Concatenate all data (meteorology first, target last).
        all_data = pd.concat(averages + [flow], axis=1)

        # Drop columns with all zero values
        all_data = all_data.loc[:, (all_data != 0).any(axis=0)]

        # Set index name
        all_data.index.name = 'date'

        all_data.to_csv(sample_path+f'{hydro_station}_MeteAVGCalvalFeatureDataForML_PRED{pred_year}.csv', index=True)
        selected_data = all_data.loc[:, selected_features]

        feature_samples, target_samples = gen_multi_output_samples(
            timeseries=selected_data.copy(),
            target_column='flow(m^3/s)',
            lead=12,
            lag=12,
        )
        feature_samples.to_csv(sample_path+f'{hydro_station}_meteavg_vif_feature_samples_pred{pred_year}.csv', index=True)
        target_samples.to_csv(sample_path+f'{hydro_station}_meteavg_vif_target_samples_pred{pred_year}.csv', index=True)

    print(f"Data for {hydro_station} processed and saved.")
    

No null values found in Tangnaihai data.
Data for Tangnaihai processed and saved.
features:  ['P2020(mm)', 'MAX-TEM(C)', 'MIN-TEM(C)', 'SLR(MJ/m^2)', 'AVG-RHU(%)', 'AVG-WV(m/s)']
No null values found in Guide data.
Data for Guide processed and saved.
features:  ['P2020(mm)', 'MAX-TEM(C)', 'MIN-TEM(C)', 'SLR(MJ/m^2)', 'AVG-RHU(%)', 'AVG-WV(m/s)']
No null values found in Xunhua data.
Data for Xunhua processed and saved.
features:  ['P2020(mm)', 'MAX-TEM(C)', 'MIN-TEM(C)', 'SLR(MJ/m^2)', 'AVG-RHU(%)', 'AVG-WV(m/s)']
features:  ['P2020(mm)', 'MAX-TEM(C)', 'MIN-TEM(C)']
features:  ['P2020(mm)', 'MAX-TEM(C)', 'MIN-TEM(C)']
features:  ['P2020(mm)', 'MAX-TEM(C)', 'MIN-TEM(C)']
features:  ['P2020(mm)', 'MAX-TEM(C)', 'MIN-TEM(C)']
features:  ['P2020(mm)', 'MAX-TEM(C)', 'MIN-TEM(C)']
features:  ['P2020(mm)', 'MAX-TEM(C)', 'MIN-TEM(C)']
features:  ['P2020(mm)', 'MAX-TEM(C)', 'MIN-TEM(C)']
features:  ['P2020(mm)', 'MAX-TEM(C)', 'MIN-TEM(C)']
Data for Tangnaihai processed and saved.
features:  ['P20

# 2 Generate machine learning samples_mete_wb_vif using climate data of gauge stations and water balance output for catchment.
Entire period:1972-2019,48 years and 576 months.
Calibration period:1972-2009,38 years and 456 months, 79.2% of the entire data.
Validation period:2010-2014, 5 years and 60 months, 10.4% of the entire data.
Test period(Simulate pred):2015-2019, 5 years and 60 months, 10.4% of the entire data.
生成候选训练样本：
1. 计算各雨量站加权平均值，获取整个集水区气象数据；
2. 将集水区气象数据、集水区水量平衡数据（SWAT+获得）、流量模拟数据(SARIMA)、实测流量数据综合形成候选学习样本。

生成学习样本（用于模型训练和模型验证）：
1. 以实测流量为预测目标，采用多输入（过去12个月与未来12个月预测信息）多输出模式（未来12个月流量），生成学习样本；
2. 所有的输入信息不做筛选，全部用来构造输入。

生成预测样本（获取预测因子）：
1. 输入包括：流域面气象信息（降水、最大气温、最小气温、相对湿度、太阳辐射、风速）、流域水量平衡数据、模拟流量数据（SARIMA）；
2. 输入经过多重共线性检验筛选，与训练验证阶段筛选获得的因子一致；
3. 预测阶段的气象因子从各个气象站相似年数据统计获得。

In [15]:
# Output directory for the "metesimyr_arimasim" sample set
# (meteorology + water balance + ARIMA-simulated flow).
sample_path = '../samples_mete_wb_vif/InputOutputSamples_metesimyr_arimasim/'
# exist_ok=True creates the directory when missing and is a no-op when it already
# exists, so no separate existence check (and no check-then-create race) is needed.
os.makedirs(sample_path, exist_ok=True)
# Columns kept for the reduced ("vif") samples; 'flow(m^3/s)' is the prediction target.
selected_features = [
    'P2020(mm)', 'MAX-TEM(C)','MIN-TEM(C)',
    'latq(mm)', 'eplant(mm)', 'wet_evap(mm)', 
    'surq_cha(mm)', 'wet_oflo(mm)', 'surq_gen(mm)', 
    'snomlt(mm)', 'snofall(mm)', 'sw_change(mm)', 'snopack(mm)',
    'ARIMASimFlow', 'flow(m^3/s)']

# Calibration/validation data for the ARIMA variant: area-weighted monthly
# catchment meteorology + SWAT+ basin water balance + ARIMA-simulated flow +
# observed flow. The table is saved and the full-feature samples are derived.
for hydro_station in hydro_stations:
    station_names = hydrostation_metestations[hydro_station]

    # Month-start labels, matching the resample('MS') aggregation below and the
    # freq='MS' index of the prediction files written to the same directory.
    # (The former freq='M' produced month-end labels and is a deprecated alias
    # in recent pandas releases.)
    index = pd.date_range(calval_start, calval_end, freq='MS')

    # One empty frame per meteorological variable; each receives one column per station.
    pcp_data, maxtmp_data, mintmp_data, slr_data, hmd_data, wnd_data = (
        pd.DataFrame() for _unused in range(6)
    )
    # daily column -> (per-station frame, monthly aggregation of the daily values)
    monthly_spec = {
        'P2020(mm)': (pcp_data, 'sum'),
        'MAX-TEM(C)': (maxtmp_data, 'mean'),
        'MIN-TEM(C)': (mintmp_data, 'mean'),
        'SLR(MJ/m^2)': (slr_data, 'sum'),
        'AVG-RHU(%)': (hmd_data, 'mean'),
        'AVG-WV(m/s)': (wnd_data, 'mean'),
    }

    # Each station is weighted by its share of the summed control area.
    total_area = sum(metestation_controal_area_dict[station] for station in station_names)
    weights = {station: metestation_controal_area_dict[station] / total_area for station in station_names}

    for station in station_names:
        # Read climate data for each station
        station_data = pd.read_csv(f'D:/DataSpace/HydroMeteAnthropicDatabase/7.FilledRawMeteObsInfo/ChinaLandDailyMeteV3(InsertSolarRadiation)/{station}.csv', 
                                   index_col=['DATE'], parse_dates=['DATE'])
        station_calval = station_data.loc[calval_start:calval_end]

        # Aggregate daily values to months and apply the station weight.
        for column, (frame, how) in monthly_spec.items():
            frame[station] = station_calval[column].resample('MS').agg(how) * weights[station]

    # Summing the pre-weighted station columns yields the catchment average.
    averages = []
    for column, (frame, _how) in monthly_spec.items():
        catchment_avg = frame.sum(axis=1)
        catchment_avg.name = column
        catchment_avg.index = index
        averages.append(catchment_avg)
    pcp_avg, maxtmp_avg, mintmp_avg, slr_avg, hmd_avg, wnd_avg = averages

    # Read water balance data (the pred2019 file is used for the calval period).
    wb = pd.read_csv(f'../result/SWATPlusWaterBlanceDataFromMeteSimYr/YellowRiver{hydrostation_abbrs[hydro_station]}_BasinWaterBalance_pred2019.csv', 
                     index_col=['date'], parse_dates=['date'])
    wb_calval = wb.loc[calval_start:calval_end]
    wb_calval = wb_calval.drop(columns=['mon', 'day', 'yr', 'name'])
    wb_calval = wb_calval.sort_index()
    wb_calval.index = index

    # Read monthly streamflow data
    flow = pd.read_csv(f'../data/{hydro_station}_natural_monthly_flow.csv', 
                       index_col=['date'], parse_dates=['date'])
    flow_calval = flow.loc[calval_start:calval_end]
    flow_calval = flow_calval.sort_index()
    flow_calval.index = index

    # ARIMA-simulated flow fitted on the data before 2015.
    arima_sim =pd.read_csv(f'../result/ARIMAPredData/seasonal_decompose_multiplicative_arima_train_sim_{hydro_station}_before_2015.csv',index_col=['date'],parse_dates=['date'])
    arima_sim = arima_sim.loc[calval_start:calval_end,'SimFlow(m^3/s)']
    arima_sim.name = 'ARIMASimFlow'
    arima_sim = arima_sim.sort_index()
    arima_sim.index = index

    # The SWAT+ simulated flow is not part of this sample set, so it is no
    # longer read here (it was loaded but never used).

    # Concatenate all data
    all_data = pd.concat(averages + [wb_calval, arima_sim, flow_calval], axis=1)
    all_data.index.name = 'date'

    # Remove columns with all zero values
    all_data = all_data.loc[:, (all_data != 0).any(axis=0)]

    # Save all data
    all_data.to_csv(sample_path+f'MeteAVGCalvalFeatureDataForML_CALVAL_{hydro_station}.csv', index=True)
    
    # Check for null values in each column
    null_columns = all_data.columns[all_data.isnull().any()].tolist()
    if null_columns:
        print(f"Columns with null values in {hydro_station} data:")
        for col in null_columns:
            null_count = all_data[col].isnull().sum()
            print(f"  {col}: {null_count} null values")
    else:
        print(f"No null values found in {hydro_station} data.")
    
    print(f"Data for {hydro_station} processed and saved.")

    # Reload the saved table and build samples from every column
    # (lag=12 / lead=12; target is the observed flow).
    all_data = pd.read_csv(sample_path+f'MeteAVGCalvalFeatureDataForML_CALVAL_{hydro_station}.csv',index_col=['date'],parse_dates=['date'])
    feature_samples, target_samples = gen_multi_output_samples(
        timeseries=all_data.copy(),
        target_column='flow(m^3/s)',
        lag=12,
        lead=12,
    )
    feature_samples.to_csv(sample_path+f'{hydro_station}_meteavg_full_feature_samples_calval.csv',index=True)
    target_samples.to_csv(sample_path+f'{hydro_station}_meteavg_full_target_samples_calval.csv',index=True)
# Reduced ("vif") calibration/validation samples: reload each station's saved
# table and keep only the columns named in selected_features.
for hydro_station in hydro_stations:
    calval_table = pd.read_csv(
        sample_path+f'MeteAVGCalvalFeatureDataForML_CALVAL_{hydro_station}.csv',
        index_col=['date'],
        parse_dates=['date'],
    )
    all_data = calval_table
    selected_data = all_data.loc[:, selected_features]

    # Same lag/lead configuration as the full-feature samples.
    sample_kwargs = dict(target_column='flow(m^3/s)', lead=12, lag=12)
    feature_samples, target_samples = gen_multi_output_samples(
        timeseries=selected_data.copy(),
        **sample_kwargs,
    )

    feature_samples.to_csv(sample_path+f'{hydro_station}_meteavg_vif_feature_samples_calval.csv',index=True)
    target_samples.to_csv(sample_path+f'{hydro_station}_meteavg_vif_target_samples_calval.csv',index=True)

# Prediction samples for the ARIMA variant. For every prediction year:
#   * the daily meteorology of that year is replaced, station by station, with
#     the data of a reference year taken from the similarity-year table;
#   * the SWAT+ basin water balance produced for that prediction year is read;
#   * ARIMA simulated flow (history) and ARIMA predicted flow (prediction year)
#     are stitched into one 'ARIMASimFlow' series.
pred_years = np.arange(2015, 2020)
start_year = 1972

for hydro_station in hydro_stations:
    similarity_years = pd.read_csv(f'../result/SimilarityYears/SimilarityYears_{hydro_station}.csv', index_col=['tar_year'])
    station_names = hydrostation_metestations[hydro_station]

    # Area weights do not depend on the prediction year.
    total_area = sum(metestation_controal_area_dict[station] for station in station_names)
    weights = {station: metestation_controal_area_dict[station] / total_area for station in station_names}

    # Loop-invariant inputs: read each file once per hydrological station instead
    # of once per prediction year. Per-year work operates on copies/slices.
    station_climate = {
        station_name: pd.read_csv(
            f'D:/DataSpace/HydroMeteAnthropicDatabase/7.FilledRawMeteObsInfo/ChinaLandDailyMeteV3(InsertSolarRadiation)/{station_name}.csv',
            index_col=['DATE'], parse_dates=['DATE'])
        for station_name in station_names
    }
    flow_full = pd.read_csv(f'../data/{hydro_station}_natural_monthly_flow.csv',
                            index_col=['date'], parse_dates=['date'])
    arima_pred_full = pd.read_csv(f'../result/ARIMAPredData/seasonal_decompose_multiplicative_arima_pred_{hydro_station}_{pred_years[0]}_{pred_years[-1]}.csv',index_col=['date'],parse_dates=['date'])

    for pred_year in pred_years:
        period_start = f'{start_year}-01-01'
        period_end = f'{pred_year}-12-31'
        history_end = f'{pred_year-1}-12-31'
        pred_year_start = f'{pred_year}-01-01'
        pred_index = pd.date_range(period_start, period_end, freq='MS')

        # One frame per variable, pre-indexed by month; one column per station.
        pcp_data, maxtmp_data, mintmp_data, slr_data, hmd_data, wnd_data = (
            pd.DataFrame(index=pred_index) for _unused in range(6)
        )
        # daily column -> (per-station frame, monthly aggregation of the daily values)
        monthly_spec = {
            'P2020(mm)': (pcp_data, 'sum'),
            'MAX-TEM(C)': (maxtmp_data, 'mean'),
            'MIN-TEM(C)': (mintmp_data, 'mean'),
            'SLR(MJ/m^2)': (slr_data, 'sum'),
            'AVG-RHU(%)': (hmd_data, 'mean'),
            'AVG-WV(m/s)': (wnd_data, 'mean'),
        }

        for station_name in station_names:
            # Work on a copy: the target year is overwritten in place below.
            climate_data = station_climate[station_name].copy()

            tar_year = pred_year  # the prediction year's data is replaced
            # Entry of the year before the target, plus one.
            # NOTE(review): presumably "the year following the most similar year" —
            # confirm against how SimilarityYears_*.csv was produced.
            ref_year = similarity_years.loc[tar_year-1, station_name] + 1

            target_data = climate_data[climate_data.index.year == tar_year]
            reference_data = climate_data[climate_data.index.year == ref_year]

            if reference_data.empty:
                # Previously this surfaced as an opaque pandas shape error.
                raise ValueError(
                    f"No daily data for reference year {ref_year} at station "
                    f"{station_name} (target year {tar_year}, {hydro_station})."
                )

            # Match the number of days: drop the reference year's last day when it
            # is longer, repeat its last day when it is shorter.
            if len(reference_data) > len(target_data):
                reference_data = reference_data[:-1]
            elif len(reference_data) < len(target_data):
                last_day = reference_data.iloc[-1:]
                reference_data = pd.concat([reference_data, last_day])
            climate_data.loc[target_data.index, :] = reference_data.values

            climate_data = climate_data.loc[period_start:period_end]

            for column, (frame, how) in monthly_spec.items():
                frame[station_name] = climate_data[column].resample('MS').agg(how) * weights[station_name]

        # Weighted sum across stations. Every frame is already indexed by
        # pred_index, so no further slicing, sorting or re-indexing is required.
        averages = []
        for column, (frame, _how) in monthly_spec.items():
            catchment_avg = frame.sum(axis=1)
            catchment_avg.name = column
            averages.append(catchment_avg)
        pcp_avg, maxtmp_avg, mintmp_avg, slr_avg, hmd_avg, wnd_avg = averages

        # Read water balance data generated for this prediction year.
        wb = pd.read_csv(f'../result/SWATPlusWaterBlanceDataFromMeteSimYr/YellowRiver{hydrostation_abbrs[hydro_station]}_BasinWaterBalance_pred{pred_year}.csv', 
                         index_col=['date'], parse_dates=['date'])
        wb = wb.drop(columns=['mon', 'day', 'yr', 'name'])
        wb = wb.loc[period_start:period_end]
        wb = wb.sort_index()
        wb.index = pred_index

        # Observed monthly streamflow for the same period.
        flow = flow_full.loc[period_start:period_end].sort_index()
        flow.index = pred_index

        # The SWAT+ simulated/predicted flow is not part of this sample set, so it
        # is no longer read here (it was loaded and concatenated but never used).

        # ARIMA simulated flow for the history up to the year before pred_year.
        arima_sim =pd.read_csv(f'../result/ARIMAPredData/seasonal_decompose_multiplicative_arima_train_sim_{hydro_station}_before_{pred_year}.csv',index_col=['date'],parse_dates=['date'])
        arima_sim = arima_sim.loc[period_start:history_end,'SimFlow(m^3/s)']
        arima_sim.name = 'ARIMASimFlow'
        arima_sim = arima_sim.sort_index()
        arima_sim.index = pd.date_range(period_start, history_end, freq='MS')

        # ARIMA predicted flow for the prediction year itself.
        arima_pred = arima_pred_full.loc[pred_year_start:period_end,'flow(m^3/s)'].sort_index()
        arima_pred.name = 'ARIMASimFlow'
        arima_pred.index = pd.date_range(pred_year_start, period_end, freq='MS')

        arima_flow = pd.concat([arima_sim, arima_pred], axis=0)

        # Concatenate all data
        all_data = pd.concat(averages + [wb, arima_flow, flow], axis=1)

        # Drop columns with all zero values
        all_data = all_data.loc[:, (all_data != 0).any(axis=0)]
        
        # Set index name
        all_data.index.name = 'date'

        all_data.to_csv(sample_path+f'{hydro_station}_MeteAVGCalvalFeatureDataForML_PRED{pred_year}.csv', index=True)

        selected_data = all_data.loc[:, selected_features]

        feature_samples, target_samples = gen_multi_output_samples(
            timeseries=selected_data.copy(),
            target_column='flow(m^3/s)',
            lead=12,
            lag=12,
        )
        feature_samples.to_csv(sample_path+f'{hydro_station}_meteavg_vif_feature_samples_pred{pred_year}.csv', index=True)
        target_samples.to_csv(sample_path+f'{hydro_station}_meteavg_vif_target_samples_pred{pred_year}.csv', index=True)

    print(f"Data for {hydro_station} processed and saved.")

No null values found in Tangnaihai data.
Data for Tangnaihai processed and saved.
features:  ['P2020(mm)', 'MAX-TEM(C)', 'MIN-TEM(C)', 'SLR(MJ/m^2)', 'AVG-RHU(%)', 'AVG-WV(m/s)', 'precip(mm)', 'snofall(mm)', 'snomlt(mm)', 'surq_gen(mm)', 'latq(mm)', 'wateryld(mm)', 'perc(mm)', 'et(mm)', 'ecanopy(mm)', 'eplant(mm)', 'esoil(mm)', 'surq_cont(mm)', 'cn', 'sw_init(mm)', 'sw_final(mm)', 'sw_ave(mm)', 'sw_300(mm)', 'sno_init(mm)', 'sno_final(mm)', 'snopack(mm)', 'pet(mm)', 'surq_cha(mm)', 'latq_cha(mm)', 'sw_change(mm)', 'lagsurf(mm)', 'laglatq(mm)', 'wet_evap(mm)', 'wet_oflo(mm)', 'wet_stor(mm)', 'ARIMASimFlow']
No null values found in Guide data.
Data for Guide processed and saved.
features:  ['P2020(mm)', 'MAX-TEM(C)', 'MIN-TEM(C)', 'SLR(MJ/m^2)', 'AVG-RHU(%)', 'AVG-WV(m/s)', 'precip(mm)', 'snofall(mm)', 'snomlt(mm)', 'surq_gen(mm)', 'latq(mm)', 'wateryld(mm)', 'perc(mm)', 'et(mm)', 'ecanopy(mm)', 'eplant(mm)', 'esoil(mm)', 'surq_cont(mm)', 'cn', 'sw_init(mm)', 'sw_final(mm)', 'sw_ave(mm)'

生成预测样本（获取预测因子）：
1. 输入包括：流域面气象信息（降水、最大气温、最小气温、相对湿度、太阳辐射、风速）、流域水量平衡数据、模拟流量数据；
2. 输入经过多重共线性检验筛选，与训练验证阶段筛选获得的因子一致；
3. 预测阶段的气象因子从水文相似年各气象站数据统计获得。

In [16]:
# Output directory for the "metesimyr_swatpsim" sample set
# (meteorology + water balance + SWAT+ simulated flow).
sample_path = '../samples_mete_wb_vif/InputOutputSamples_metesimyr_swatpsim/'
# exist_ok=True creates the directory when missing and is a no-op when it already
# exists, so no separate existence check (and no check-then-create race) is needed.
os.makedirs(sample_path, exist_ok=True)
# Columns kept for the reduced ("vif") samples; 'flow(m^3/s)' is the prediction target.
selected_features = [
    'P2020(mm)', 'MAX-TEM(C)','MIN-TEM(C)',
    'latq(mm)', 'eplant(mm)', 'wet_evap(mm)', 
    'surq_cha(mm)', 'wet_oflo(mm)', 'surq_gen(mm)', 
    'snomlt(mm)', 'snofall(mm)', 'sw_change(mm)', 'snopack(mm)',
    'SWATPlusSimFlow','flow(m^3/s)']

for hydro_station in hydro_stations:
    station_names = hydrostation_metestations[hydro_station]

    index = pd.date_range(calval_start,calval_end,freq='M')
    
    # Initialize DataFrames to store aggregated data
    pcp_data = pd.DataFrame()
    maxtmp_data = pd.DataFrame()
    mintmp_data = pd.DataFrame()
    slr_data = pd.DataFrame()
    hmd_data = pd.DataFrame()
    wnd_data = pd.DataFrame()

    total_area = sum(metestation_controal_area_dict[station] for station in station_names)
    weights = {station: metestation_controal_area_dict[station] / total_area for station in station_names}

    for station in station_names:
        # Read climate data for each station
        station_data = pd.read_csv(f'D:/DataSpace/HydroMeteAnthropicDatabase/7.FilledRawMeteObsInfo/ChinaLandDailyMeteV3(InsertSolarRadiation)/{station}.csv', 
                                   index_col=['DATE'], parse_dates=['DATE'])
        station_calval = station_data.loc[calval_start:calval_end]

        # Aggregate data
        pcp_data[station] = station_calval['P2020(mm)'].resample('MS').sum() * weights[station]
        maxtmp_data[station] = station_calval['MAX-TEM(C)'].resample('MS').mean() * weights[station]
        mintmp_data[station] = station_calval['MIN-TEM(C)'].resample('MS').mean() * weights[station]
        slr_data[station] = station_calval['SLR(MJ/m^2)'].resample('MS').sum() * weights[station]
        hmd_data[station] = station_calval['AVG-RHU(%)'].resample('MS').mean() * weights[station]
        wnd_data[station] = station_calval['AVG-WV(m/s)'].resample('MS').mean() * weights[station]

    # Calculate weighted averages across stations
    pcp_avg = pcp_data.sum(axis=1)
    pcp_avg.name = 'P2020(mm)'
    pcp_avg.index = index
    maxtmp_avg = maxtmp_data.sum(axis=1)
    maxtmp_avg.name = 'MAX-TEM(C)'
    maxtmp_avg.index = index
    mintmp_avg = mintmp_data.sum(axis=1)
    mintmp_avg.name = 'MIN-TEM(C)'
    mintmp_avg.index = index
    slr_avg = slr_data.sum(axis=1)
    slr_avg.name = 'SLR(MJ/m^2)'
    slr_avg.index = index
    hmd_avg = hmd_data.sum(axis=1)
    hmd_avg.name = 'AVG-RHU(%)'
    hmd_avg.index = index
    wnd_avg = wnd_data.sum(axis=1)
    wnd_avg.name = 'AVG-WV(m/s)'
    wnd_avg.index = index


    # Read water balance data
    wb = pd.read_csv(f'../result/SWATPlusWaterBlanceDataFromMeteSimYr/YellowRiver{hydrostation_abbrs[hydro_station]}_BasinWaterBalance_pred2019.csv', 
                     index_col=['date'], parse_dates=['date'])
    wb_calval = wb.loc[calval_start:calval_end]
    wb_calval = wb_calval.drop(columns=['mon', 'day', 'yr', 'name'])
    wb_calval = wb_calval.sort_index()
    wb_calval.index = index

    # print(wb_calval.isnull().any())

    # Read monthly streamflow data
    flow = pd.read_csv(f'../data/{hydro_station}_natural_monthly_flow.csv', 
                       index_col=['date'], parse_dates=['date'])
    flow_calval = flow.loc[calval_start:calval_end]
    flow_calval = flow_calval.sort_index()
    flow_calval.index = index


    arima_sim =pd.read_csv(f'../result/ARIMAPredData/seasonal_decompose_multiplicative_arima_train_sim_{hydro_station}_before_2015.csv',index_col=['date'],parse_dates=['date'])
    arima_sim = arima_sim.loc[calval_start:calval_end,'SimFlow(m^3/s)']
    arima_sim.name = 'ARIMASimFlow'
    arima_sim = arima_sim.sort_index()
    arima_sim.index = index

    # Read simulated streamflow
    swatplus_sim_flow = pd.read_csv(f'../result/SWATPlusCalValSimData/Channel_{hydrostation_channel[hydro_station]}_Monthly_River-Flow_{hydro_station}_Sim1972_2019.csv', 
                           index_col=['Date'], parse_dates=['Date'])
    swatplus_sim_flow.columns = ['SWATPlusSimFlow']
    swatplus_sim_flow = swatplus_sim_flow.loc[calval_start:calval_end]
    swatplus_sim_flow = swatplus_sim_flow.sort_index()
    swatplus_sim_flow.index = flow_calval.index
    swatplus_sim_flow.index.name = 'date'
    
    

    # Concatenate the monthly climate averages, the water-balance terms, the SWAT+
    # simulated flow and the observed flow into one table (one row per month).
    all_data = pd.concat([pcp_avg, maxtmp_avg, mintmp_avg, slr_avg, hmd_avg, wnd_avg, wb_calval, swatplus_sim_flow, flow_calval], axis=1)
    # Name the index explicitly: the CSV written below is re-read with
    # index_col=['date'], so the header must not depend on which input series
    # happened to carry an index name.
    all_data.index.name = 'date'

    # Save all data
    all_data.to_csv(sample_path+f'MeteAVGCalvalFeatureDataForML_CALVAL_{hydro_station}.csv', index=True)

    # Report null values per column. The outcome is printed only here, after the
    # check has actually run (it used to be announced unconditionally beforehand).
    null_columns = all_data.columns[all_data.isnull().any()].tolist()
    if null_columns:
        print(f"Columns with null values in {hydro_station} data:")
        for col in null_columns:
            null_count = all_data[col].isnull().sum()
            print(f"  {col}: {null_count} null values")
    else:
        print(f"No null values found in {hydro_station} data.")

    print(f"Data for {hydro_station} processed and saved.")

    all_data = pd.read_csv(sample_path+f'MeteAVGCalvalFeatureDataForML_CALVAL_{hydro_station}.csv',index_col=['date'],parse_dates=['date'])
    feature_samples, target_samples = gen_multi_output_samples(
        timeseries=all_data.copy(),
        target_column='flow(m^3/s)',
        lag=12,
        lead=12,
    )
    feature_samples.to_csv(sample_path+f'{hydro_station}_meteavg_full_feature_samples_calval.csv',index=True)
    target_samples.to_csv(sample_path+f'{hydro_station}_meteavg_full_target_samples_calval.csv',index=True)
# Build the "vif" calibration/validation samples: reload each station's saved
# CALVAL table, keep only the columns listed in `selected_features`, and pass
# them to gen_multi_output_samples with 'flow(m^3/s)' as the target column.
for hydro_station in hydro_stations:
    calval_csv = sample_path+f'MeteAVGCalvalFeatureDataForML_CALVAL_{hydro_station}.csv'
    all_data = pd.read_csv(calval_csv, index_col=['date'], parse_dates=['date'])
    selected_data = all_data[selected_features]

    feature_samples, target_samples = gen_multi_output_samples(
        timeseries=selected_data.copy(),
        target_column='flow(m^3/s)',
        lag=12,
        lead=12,
    )

    # Persist features and targets side by side, keeping the sample index.
    feature_csv = sample_path+f'{hydro_station}_meteavg_vif_feature_samples_calval.csv'
    target_csv = sample_path+f'{hydro_station}_meteavg_vif_target_samples_calval.csv'
    feature_samples.to_csv(feature_csv, index=True)
    target_samples.to_csv(target_csv, index=True)

# Years for which a PRED dataset is generated, and the first year of every dataset.
pred_years = np.arange(2015, 2020)
start_year = 1972

# Daily climate columns and the monthly statistic applied to each of them.
mete_aggregations = {
    'P2020(mm)': 'sum',
    'MAX-TEM(C)': 'mean',
    'MIN-TEM(C)': 'mean',
    'SLR(MJ/m^2)': 'sum',
    'AVG-RHU(%)': 'mean',
    'AVG-WV(m/s)': 'mean',
}

# For every hydro station and forecast year, assemble a monthly table covering
# start_year..pred_year in which the forecast year's daily climate records are
# overwritten with those of a reference year taken from the similarity table,
# then save the table and the samples generated from `selected_features`.
for hydro_station in hydro_stations:
    similarity_years = pd.read_csv(f'../result/SimilarityYears/SimilarityYears_{hydro_station}.csv', index_col=['tar_year'])
    station_names = hydrostation_metestations[hydro_station]

    # Area weights depend only on the station list, so compute them once per hydro station.
    total_area = sum(metestation_controal_area_dict[station] for station in station_names)
    weights = {station: metestation_controal_area_dict[station] / total_area for station in station_names}

    # Inputs that are identical for every forecast year are read once here instead
    # of once per forecast year. Monthly series are sorted *before* any date
    # slicing, because label slicing is only reliable on a sorted DatetimeIndex.
    station_climate = {
        station_name: pd.read_csv(f'D:/DataSpace/HydroMeteAnthropicDatabase/7.FilledRawMeteObsInfo/ChinaLandDailyMeteV3(InsertSolarRadiation)/{station_name}.csv',
                                  index_col=['DATE'], parse_dates=['DATE'])
        for station_name in station_names
    }
    flow_record = pd.read_csv(f'../data/{hydro_station}_natural_monthly_flow.csv',
                              index_col=['date'], parse_dates=['date']).sort_index()
    swatplus_sim_record = pd.read_csv(f'../result/SWATPlusCalValSimData/Channel_{hydrostation_channel[hydro_station]}_Monthly_River-Flow_{hydro_station}_Sim1972_2019.csv',
                                      index_col=['Date'], parse_dates=['Date'])['Value'].sort_index()
    swatplus_pred_record = pd.read_csv(f'../result/SWATPlusPredUsingMeteSimYearData/{hydro_station}_SWATPlus_pred_obs_2015_2019.csv',
                                       index_col=['date'], parse_dates=['date'])['pred'].sort_index()

    for pred_year in pred_years:
        period_start = f'{start_year}-01-01'
        period_end = f'{pred_year}-12-31'
        history_end = f'{pred_year-1}-12-31'
        forecast_start = f'{pred_year}-01-01'

        pred_index = pd.date_range(period_start, period_end, freq='MS')
        history_index = pd.date_range(period_start, history_end, freq='MS')
        forecast_index = pd.date_range(forecast_start, period_end, freq='MS')

        # One frame per climate variable; each column holds one station's weighted monthly series.
        station_monthly = {column: pd.DataFrame(index=pred_index) for column in mete_aggregations}

        for station_name in station_names:
            # Work on a copy: the forecast year is overwritten below.
            climate_data = station_climate[station_name].copy()

            # Reference year whose records replace the forecast year's records.
            ref_year = similarity_years.loc[pred_year-1, station_name] + 1

            target_data = climate_data[climate_data.index.year == pred_year]
            reference_data = climate_data[climate_data.index.year == ref_year]
            if reference_data.empty and not target_data.empty:
                raise ValueError(f'No climate records for reference year {ref_year} at station {station_name}.')

            # Make the reference year as long as the target year (the lengths differ
            # e.g. when only one of them is a leap year): drop its last day, or
            # repeat its last day, before copying the values over by position.
            if len(reference_data) > len(target_data):
                reference_data = reference_data[:-1]
            elif len(reference_data) < len(target_data):
                reference_data = pd.concat([reference_data, reference_data.iloc[-1:]])
            climate_data.loc[target_data.index, :] = reference_data.values

            climate_data = climate_data.loc[period_start:period_end]

            for column, how in mete_aggregations.items():
                station_monthly[column][station_name] = climate_data[column].resample('MS').agg(how) * weights[station_name]

        # Area-weighted average across stations (the weights sum to 1, so the row sum is the average).
        mete_avg = pd.DataFrame(
            {column: monthly.sum(axis=1) for column, monthly in station_monthly.items()},
            index=pred_index,
        )

        # Water-balance terms exported for this forecast year.
        wb = pd.read_csv(f'../result/SWATPlusWaterBlanceDataFromMeteSimYr/YellowRiver{hydrostation_abbrs[hydro_station]}_BasinWaterBalance_pred{pred_year}.csv',
                         index_col=['date'], parse_dates=['date'])
        wb = wb.drop(columns=['mon', 'day', 'yr', 'name']).sort_index().loc[period_start:period_end]
        wb.index = pred_index

        # Monthly streamflow used as the target column.
        flow = flow_record.loc[period_start:period_end].copy()
        flow.index = pred_index

        # SWAT+ simulated flow up to the year before the forecast year (historical
        # simulation, which can also be regarded as the catchment's initial condition) ...
        SWATPlus_sim_flow = swatplus_sim_record.loc[period_start:history_end].rename('SWATPlusSimFlow')
        SWATPlus_sim_flow.index = history_index

        # ... followed by the SWAT+ prediction for the forecast year itself, so that
        # the SWAT+ prediction is what the downstream model corrects.
        SWATPlus_pred_flow = swatplus_pred_record.loc[forecast_start:period_end].rename('SWATPlusSimFlow')
        SWATPlus_pred_flow.index = forecast_index

        SWATPlus_flow = pd.concat([SWATPlus_sim_flow, SWATPlus_pred_flow], axis=0)

        # Concatenate all data
        all_data = pd.concat([mete_avg, wb, SWATPlus_flow, flow], axis=1)

        # Drop columns with all zero values
        all_data = all_data.loc[:, (all_data != 0).any(axis=0)]

        # Set index name
        all_data.index.name = 'date'
        all_data.to_csv(sample_path+f'{hydro_station}_MeteAVGCalvalFeatureDataForML_PRED{pred_year}.csv', index=True)

        # NOTE: a selected feature that was dropped above as all-zero raises KeyError here.
        selected_data = all_data.loc[:, selected_features]

        feature_samples, target_samples = gen_multi_output_samples(
            timeseries=selected_data.copy(),
            target_column='flow(m^3/s)',
            lead=12,
            lag=12,
        )
        feature_samples.to_csv(sample_path+f'{hydro_station}_meteavg_vif_feature_samples_pred{pred_year}.csv', index=True)
        target_samples.to_csv(sample_path+f'{hydro_station}_meteavg_vif_target_samples_pred{pred_year}.csv', index=True)

    print(f"Data for {hydro_station} processed and saved.")
    

No null values found in Tangnaihai data.
Data for Tangnaihai processed and saved.
No null values found in Tangnaihai data.
Data for Tangnaihai processed and saved.
features:  ['P2020(mm)', 'MAX-TEM(C)', 'MIN-TEM(C)', 'SLR(MJ/m^2)', 'AVG-RHU(%)', 'AVG-WV(m/s)', 'precip(mm)', 'snofall(mm)', 'snomlt(mm)', 'surq_gen(mm)', 'latq(mm)', 'wateryld(mm)', 'perc(mm)', 'et(mm)', 'ecanopy(mm)', 'eplant(mm)', 'esoil(mm)', 'surq_cont(mm)', 'cn', 'sw_init(mm)', 'sw_final(mm)', 'sw_ave(mm)', 'sw_300(mm)', 'sno_init(mm)', 'sno_final(mm)', 'snopack(mm)', 'pet(mm)', 'qtile(mm)', 'irr(mm)', 'surq_runon(mm)', 'latq_runon(mm)', 'overbank(mm)', 'surq_cha(mm)', 'surq_res(mm)', 'surq_ls(mm)', 'latq_cha(mm)', 'latq_res(mm)', 'latq_ls(mm)', 'gwsoilq(mm)', 'satex(mm)', 'satex_chan(mm)', 'sw_change(mm)', 'lagsurf(mm)', 'laglatq(mm)', 'lagsatex(mm)', 'wet_evap(mm)', 'wet_oflo(mm)', 'wet_stor(mm)', 'SWATPlusSimFlow']
No null values found in Guide data.
Data for Guide processed and saved.
No null values found in Guide

In [17]:
# Output folder for the datasets written by this cell. exist_ok=True creates it
# when missing without a separate existence check (no check-then-create race).
sample_path = '../samples_mete_wb_vif/InputOutputSamples_hydrosimyr_swatpsim_arimasim/'
os.makedirs(sample_path, exist_ok=True)

# Columns kept for the "vif" samples; 'flow(m^3/s)' is passed below as the target column.
selected_features = [
    'P2020(mm)', 'MAX-TEM(C)', 'MIN-TEM(C)',
    'latq(mm)', 'eplant(mm)', 'wet_evap(mm)',
    'surq_cha(mm)', 'wet_oflo(mm)', 'surq_gen(mm)',
    'snomlt(mm)', 'snofall(mm)', 'sw_change(mm)', 'snopack(mm)',
    'SWATPlusSimFlow', 'ARIMASimFlow', 'flow(m^3/s)',
]

# Daily climate columns and the monthly statistic applied to each of them.
mete_aggregations = {
    'P2020(mm)': 'sum',
    'MAX-TEM(C)': 'mean',
    'MIN-TEM(C)': 'mean',
    'SLR(MJ/m^2)': 'sum',
    'AVG-RHU(%)': 'mean',
    'AVG-WV(m/s)': 'mean',
}

# Build, for every hydro station, the monthly calibration/validation table
# (area-weighted climate averages, water-balance terms, SWAT+ and ARIMA
# simulated flow, observed flow), save it, and generate the "full" samples.
for hydro_station in hydro_stations:
    station_names = hydrostation_metestations[hydro_station]

    # Month-start labels: this matches the resample('MS') bins below and the
    # freq='MS' index of the PRED datasets written later in this cell. The
    # month-end alias 'M' used previously is also deprecated in recent pandas.
    index = pd.date_range(calval_start, calval_end, freq='MS')

    total_area = sum(metestation_controal_area_dict[station] for station in station_names)
    weights = {station: metestation_controal_area_dict[station] / total_area for station in station_names}

    # One frame per climate variable; each column holds one station's weighted monthly series.
    station_monthly = {column: pd.DataFrame() for column in mete_aggregations}

    for station in station_names:
        # Read climate data for each station
        station_data = pd.read_csv(f'D:/DataSpace/HydroMeteAnthropicDatabase/7.FilledRawMeteObsInfo/ChinaLandDailyMeteV3(InsertSolarRadiation)/{station}.csv',
                                   index_col=['DATE'], parse_dates=['DATE'])
        station_calval = station_data.loc[calval_start:calval_end]

        # Aggregate daily records to monthly values and apply the station's area weight
        for column, how in mete_aggregations.items():
            station_monthly[column][station] = station_calval[column].resample('MS').agg(how) * weights[station]

    # Area-weighted average across stations (the weights sum to 1, so the row sum is the average).
    mete_avg = pd.DataFrame({column: monthly.sum(axis=1) for column, monthly in station_monthly.items()})
    mete_avg.index = index  # raises if the climate record does not span every month

    # Read water balance data. Sort before slicing: label slicing is only
    # reliable on a sorted DatetimeIndex.
    wb = pd.read_csv(f'../result/SWATPlusWaterBlanceDataFromHydroSimYr/YellowRiver{hydrostation_abbrs[hydro_station]}_BasinWaterBalance_pred2019.csv',
                     index_col=['date'], parse_dates=['date'])
    wb_calval = wb.sort_index().loc[calval_start:calval_end].drop(columns=['mon', 'day', 'yr', 'name'])
    wb_calval.index = index

    # Read monthly streamflow data
    flow = pd.read_csv(f'../data/{hydro_station}_natural_monthly_flow.csv',
                       index_col=['date'], parse_dates=['date'])
    flow_calval = flow.sort_index().loc[calval_start:calval_end]
    flow_calval.index = index

    # Read the ARIMA simulation produced for the period before 2015
    arima_sim = pd.read_csv(f'../result/ARIMAPredData/seasonal_decompose_multiplicative_arima_train_sim_{hydro_station}_before_2015.csv', index_col=['date'], parse_dates=['date'])
    arima_sim = arima_sim.sort_index().loc[calval_start:calval_end, 'SimFlow(m^3/s)'].rename('ARIMASimFlow')
    arima_sim.index = index

    # Read simulated streamflow (the file's single data column is renamed)
    swatplus_sim_flow = pd.read_csv(f'../result/SWATPlusCalValSimData/Channel_{hydrostation_channel[hydro_station]}_Monthly_River-Flow_{hydro_station}_Sim1972_2019.csv',
                                    index_col=['Date'], parse_dates=['Date'])
    swatplus_sim_flow.columns = ['SWATPlusSimFlow']
    swatplus_sim_flow = swatplus_sim_flow.sort_index().loc[calval_start:calval_end]
    swatplus_sim_flow.index = index

    # Concatenate all data
    all_data = pd.concat([mete_avg, wb_calval, swatplus_sim_flow, arima_sim, flow_calval], axis=1)
    all_data.index.name = 'date'

    # Remove columns with all zero values
    all_data = all_data.loc[:, (all_data != 0).any(axis=0)]

    # Save all data
    all_data.to_csv(sample_path+f'MeteAVGCalvalFeatureDataForML_CALVAL_{hydro_station}.csv', index=True)

    # Check for null values in each column
    null_columns = all_data.columns[all_data.isnull().any()].tolist()
    if null_columns:
        print(f"Columns with null values in {hydro_station} data:")
        for col in null_columns:
            null_count = all_data[col].isnull().sum()
            print(f"  {col}: {null_count} null values")
    else:
        print(f"No null values found in {hydro_station} data.")

    print(f"Data for {hydro_station} processed and saved.")

    # Generate the "full" samples from the table exactly as it was written to disk.
    all_data = pd.read_csv(sample_path+f'MeteAVGCalvalFeatureDataForML_CALVAL_{hydro_station}.csv', index_col=['date'], parse_dates=['date'])
    feature_samples, target_samples = gen_multi_output_samples(
        timeseries=all_data.copy(),
        target_column='flow(m^3/s)',
        lag=12,
        lead=12,
    )
    feature_samples.to_csv(sample_path+f'{hydro_station}_meteavg_full_feature_samples_calval.csv', index=True)
    target_samples.to_csv(sample_path+f'{hydro_station}_meteavg_full_target_samples_calval.csv', index=True)
# Build the "vif" calibration/validation samples: reload each station's saved
# CALVAL table, keep only the columns listed in `selected_features`, and pass
# them to gen_multi_output_samples with 'flow(m^3/s)' as the target column.
for hydro_station in hydro_stations:
    calval_csv = sample_path+f'MeteAVGCalvalFeatureDataForML_CALVAL_{hydro_station}.csv'
    all_data = pd.read_csv(calval_csv, index_col=['date'], parse_dates=['date'])
    selected_data = all_data[selected_features]

    feature_samples, target_samples = gen_multi_output_samples(
        timeseries=selected_data.copy(),
        target_column='flow(m^3/s)',
        lag=12,
        lead=12,
    )

    # Persist features and targets side by side, keeping the sample index.
    feature_csv = sample_path+f'{hydro_station}_meteavg_vif_feature_samples_calval.csv'
    target_csv = sample_path+f'{hydro_station}_meteavg_vif_target_samples_calval.csv'
    feature_samples.to_csv(feature_csv, index=True)
    target_samples.to_csv(target_csv, index=True)

# Years for which a PRED dataset is generated, and the first year of every dataset.
pred_years = np.arange(2015, 2020)
start_year = 1972

# Daily climate columns and the monthly statistic applied to each of them.
mete_aggregations = {
    'P2020(mm)': 'sum',
    'MAX-TEM(C)': 'mean',
    'MIN-TEM(C)': 'mean',
    'SLR(MJ/m^2)': 'sum',
    'AVG-RHU(%)': 'mean',
    'AVG-WV(m/s)': 'mean',
}

# For every hydro station and forecast year, assemble a monthly table covering
# start_year..pred_year in which the forecast year's daily climate records are
# overwritten with those of the year the similarity file maps it to (SimYear),
# then save the table and the samples generated from `selected_features`.
for hydro_station in hydro_stations:
    similarity_years = pd.read_csv(f'../result/SimilarityYears/{hydro_station}_hydrological_similiarity_year.csv')
    similarity_dict = dict(zip(similarity_years['PredYear'], similarity_years['SimYear']))
    station_names = hydrostation_metestations[hydro_station]

    # Area weights depend only on the station list, so compute them once per hydro station.
    total_area = sum(metestation_controal_area_dict[station] for station in station_names)
    weights = {station: metestation_controal_area_dict[station] / total_area for station in station_names}

    # Inputs that are identical for every forecast year are read once here instead
    # of once per forecast year. Monthly series are sorted *before* any date
    # slicing, because label slicing is only reliable on a sorted DatetimeIndex.
    station_climate = {
        station_name: pd.read_csv(f'D:/DataSpace/HydroMeteAnthropicDatabase/7.FilledRawMeteObsInfo/ChinaLandDailyMeteV3(InsertSolarRadiation)/{station_name}.csv',
                                  index_col=['DATE'], parse_dates=['DATE'])
        for station_name in station_names
    }
    flow_record = pd.read_csv(f'../data/{hydro_station}_natural_monthly_flow.csv',
                              index_col=['date'], parse_dates=['date']).sort_index()
    swatplus_sim_record = pd.read_csv(f'../result/SWATPlusCalValSimData/Channel_{hydrostation_channel[hydro_station]}_Monthly_River-Flow_{hydro_station}_Sim1972_2019.csv',
                                      index_col=['Date'], parse_dates=['Date'])['Value'].sort_index()
    swatplus_pred_record = pd.read_csv(f'../result/SWATPlusPredUsingHydroSimYearData/{hydro_station}_SWATPlus_pred_obs_2015_2019.csv',
                                       index_col=['date'], parse_dates=['date'])['pred'].sort_index()
    arima_pred_record = pd.read_csv(f'../result/ARIMAPredData/seasonal_decompose_multiplicative_arima_pred_{hydro_station}_{pred_years[0]}_{pred_years[-1]}.csv',
                                    index_col=['date'], parse_dates=['date']).sort_index()

    for pred_year in pred_years:
        period_start = f'{start_year}-01-01'
        period_end = f'{pred_year}-12-31'
        history_end = f'{pred_year-1}-12-31'
        forecast_start = f'{pred_year}-01-01'

        pred_index = pd.date_range(period_start, period_end, freq='MS')
        history_index = pd.date_range(period_start, history_end, freq='MS')
        forecast_index = pd.date_range(forecast_start, period_end, freq='MS')

        # Reference year whose records replace the forecast year's records;
        # it is the same for every meteorological station.
        ref_year = similarity_dict[pred_year]

        # One frame per climate variable; each column holds one station's weighted monthly series.
        station_monthly = {column: pd.DataFrame(index=pred_index) for column in mete_aggregations}

        for station_name in station_names:
            # Work on a copy: the forecast year is overwritten below.
            climate_data = station_climate[station_name].copy()

            target_data = climate_data[climate_data.index.year == pred_year]
            reference_data = climate_data[climate_data.index.year == ref_year]
            if reference_data.empty and not target_data.empty:
                raise ValueError(f'No climate records for reference year {ref_year} at station {station_name}.')

            # Make the reference year as long as the target year (the lengths differ
            # e.g. when only one of them is a leap year): drop its last day, or
            # repeat its last day, before copying the values over by position.
            if len(reference_data) > len(target_data):
                reference_data = reference_data[:-1]
            elif len(reference_data) < len(target_data):
                reference_data = pd.concat([reference_data, reference_data.iloc[-1:]])
            climate_data.loc[target_data.index, :] = reference_data.values

            climate_data = climate_data.loc[period_start:period_end]

            for column, how in mete_aggregations.items():
                station_monthly[column][station_name] = climate_data[column].resample('MS').agg(how) * weights[station_name]

        # Area-weighted average across stations (the weights sum to 1, so the row sum is the average).
        mete_avg = pd.DataFrame(
            {column: monthly.sum(axis=1) for column, monthly in station_monthly.items()},
            index=pred_index,
        )

        # Water-balance terms exported for this forecast year.
        wb = pd.read_csv(f'../result/SWATPlusWaterBlanceDataFromHydroSimYr/YellowRiver{hydrostation_abbrs[hydro_station]}_BasinWaterBalance_pred{pred_year}.csv',
                         index_col=['date'], parse_dates=['date'])
        wb = wb.drop(columns=['mon', 'day', 'yr', 'name']).sort_index().loc[period_start:period_end]
        wb.index = pred_index

        # Monthly streamflow used as the target column.
        flow = flow_record.loc[period_start:period_end].copy()
        flow.index = pred_index

        # SWAT+ simulated flow up to the year before the forecast year (historical
        # simulation, which can also be regarded as the catchment's initial condition) ...
        SWATPlus_sim_flow = swatplus_sim_record.loc[period_start:history_end].rename('SWATPlusSimFlow')
        SWATPlus_sim_flow.index = history_index

        # ... followed by the SWAT+ prediction for the forecast year itself, so that
        # the SWAT+ prediction is what the downstream model corrects.
        SWATPlus_pred_flow = swatplus_pred_record.loc[forecast_start:period_end].rename('SWATPlusSimFlow')
        SWATPlus_pred_flow.index = forecast_index

        SWATPlus_flow = pd.concat([SWATPlus_sim_flow, SWATPlus_pred_flow], axis=0)

        # ARIMA counterpart: simulation from the file produced for the period before
        # the forecast year, followed by the ARIMA prediction for the forecast year.
        arima_sim = pd.read_csv(f'../result/ARIMAPredData/seasonal_decompose_multiplicative_arima_train_sim_{hydro_station}_before_{pred_year}.csv', index_col=['date'], parse_dates=['date'])
        arima_sim = arima_sim.sort_index().loc[period_start:history_end, 'SimFlow(m^3/s)'].rename('ARIMASimFlow')
        arima_sim.index = history_index

        arima_pred = arima_pred_record.loc[forecast_start:period_end, 'flow(m^3/s)'].rename('ARIMASimFlow')
        arima_pred.index = forecast_index

        arima_flow = pd.concat([arima_sim, arima_pred], axis=0)

        # Concatenate all data
        all_data = pd.concat([mete_avg, wb, SWATPlus_flow, arima_flow, flow], axis=1)

        # Drop columns with all zero values
        all_data = all_data.loc[:, (all_data != 0).any(axis=0)]

        # Set index name
        all_data.index.name = 'date'
        all_data.to_csv(sample_path+f'{hydro_station}_MeteAVGCalvalFeatureDataForML_PRED{pred_year}.csv', index=True)

        # NOTE: a selected feature that was dropped above as all-zero raises KeyError here.
        selected_data = all_data.loc[:, selected_features]

        feature_samples, target_samples = gen_multi_output_samples(
            timeseries=selected_data.copy(),
            target_column='flow(m^3/s)',
            lead=12,
            lag=12,
        )
        feature_samples.to_csv(sample_path+f'{hydro_station}_meteavg_vif_feature_samples_pred{pred_year}.csv', index=True)
        target_samples.to_csv(sample_path+f'{hydro_station}_meteavg_vif_target_samples_pred{pred_year}.csv', index=True)

    print(f"Data for {hydro_station} processed and saved.")
    

No null values found in Tangnaihai data.
Data for Tangnaihai processed and saved.
features:  ['P2020(mm)', 'MAX-TEM(C)', 'MIN-TEM(C)', 'SLR(MJ/m^2)', 'AVG-RHU(%)', 'AVG-WV(m/s)', 'precip(mm)', 'snofall(mm)', 'snomlt(mm)', 'surq_gen(mm)', 'latq(mm)', 'wateryld(mm)', 'perc(mm)', 'et(mm)', 'ecanopy(mm)', 'eplant(mm)', 'esoil(mm)', 'surq_cont(mm)', 'cn', 'sw_init(mm)', 'sw_final(mm)', 'sw_ave(mm)', 'sw_300(mm)', 'sno_init(mm)', 'sno_final(mm)', 'snopack(mm)', 'pet(mm)', 'surq_cha(mm)', 'latq_cha(mm)', 'sw_change(mm)', 'lagsurf(mm)', 'laglatq(mm)', 'wet_evap(mm)', 'wet_oflo(mm)', 'wet_stor(mm)', 'SWATPlusSimFlow', 'ARIMASimFlow']
No null values found in Guide data.
Data for Guide processed and saved.
features:  ['P2020(mm)', 'MAX-TEM(C)', 'MIN-TEM(C)', 'SLR(MJ/m^2)', 'AVG-RHU(%)', 'AVG-WV(m/s)', 'precip(mm)', 'snofall(mm)', 'snomlt(mm)', 'surq_gen(mm)', 'latq(mm)', 'wateryld(mm)', 'perc(mm)', 'et(mm)', 'ecanopy(mm)', 'eplant(mm)', 'esoil(mm)', 'surq_cont(mm)', 'cn', 'sw_init(mm)', 'sw_final

In [18]:
# Output folder for the datasets written by this cell. exist_ok=True creates it
# when missing without a separate existence check (no check-then-create race).
sample_path = '../samples_mete_wb_vif/InputOutputSamples_hydrosimyr_swatpsim/'
os.makedirs(sample_path, exist_ok=True)

# Columns kept for the "vif" samples (no ARIMA column in this variant).
selected_features = [
    'P2020(mm)', 'MAX-TEM(C)', 'MIN-TEM(C)',
    'latq(mm)', 'eplant(mm)', 'wet_evap(mm)',
    'surq_cha(mm)', 'wet_oflo(mm)', 'surq_gen(mm)',
    'snomlt(mm)', 'snofall(mm)', 'sw_change(mm)', 'snopack(mm)',
    'SWATPlusSimFlow', 'flow(m^3/s)',
]

# Assemble the calibration/validation (CALVAL) table for every hydrological
# station -- area-weighted monthly meteorology, SWAT+ basin water balance,
# SWAT+ simulated flow and the monthly streamflow target -- and generate the
# "full" (all columns) sample set from it.
for hydro_station in hydro_stations:
    station_names = hydrostation_metestations[hydro_station]

    # Month-start labels: the convention produced by resample('MS') below and
    # used by the prediction-year tables, so every saved file shares one date axis.
    index = pd.date_range(calval_start, calval_end, freq='MS')

    # Each meteorological station contributes in proportion to its control area.
    total_area = sum(metestation_controal_area_dict[station] for station in station_names)
    weights = {station: metestation_controal_area_dict[station] / total_area for station in station_names}

    # One frame per variable; each column is one station's weighted monthly series.
    pcp_data = pd.DataFrame()
    maxtmp_data = pd.DataFrame()
    mintmp_data = pd.DataFrame()
    slr_data = pd.DataFrame()
    hmd_data = pd.DataFrame()
    wnd_data = pd.DataFrame()

    for station in station_names:
        # Daily observations of one meteorological station.
        station_data = pd.read_csv(f'D:/DataSpace/HydroMeteAnthropicDatabase/7.FilledRawMeteObsInfo/ChinaLandDailyMeteV3(InsertSolarRadiation)/{station}.csv', 
                                   index_col=['DATE'], parse_dates=['DATE'])
        monthly = station_data.loc[calval_start:calval_end].resample('MS')
        weight = weights[station]

        # Precipitation and solar radiation are monthly totals; the other
        # variables are monthly means.
        pcp_data[station] = monthly['P2020(mm)'].sum() * weight
        maxtmp_data[station] = monthly['MAX-TEM(C)'].mean() * weight
        mintmp_data[station] = monthly['MIN-TEM(C)'].mean() * weight
        slr_data[station] = monthly['SLR(MJ/m^2)'].sum() * weight
        hmd_data[station] = monthly['AVG-RHU(%)'].mean() * weight
        wnd_data[station] = monthly['AVG-WV(m/s)'].mean() * weight

    # Summing the weighted station columns gives the area-weighted basin series.
    mete_avgs = []
    for column, station_frame in [
        ('P2020(mm)', pcp_data),
        ('MAX-TEM(C)', maxtmp_data),
        ('MIN-TEM(C)', mintmp_data),
        ('SLR(MJ/m^2)', slr_data),
        ('AVG-RHU(%)', hmd_data),
        ('AVG-WV(m/s)', wnd_data),
    ]:
        basin_avg = station_frame.sum(axis=1).rename(column)
        # Positional relabel; raises if the series does not have exactly one
        # row per month of the CALVAL period.
        basin_avg.index = index
        mete_avgs.append(basin_avg)

    # SWAT+ basin water balance: drop the calendar/name helper columns and
    # relabel the rows positionally with the common monthly index.
    wb = pd.read_csv(f'../result/SWATPlusWaterBlanceDataFromHydroSimYr/YellowRiver{hydrostation_abbrs[hydro_station]}_BasinWaterBalance_pred2019.csv', 
                     index_col=['date'], parse_dates=['date'])
    wb_calval = wb.loc[calval_start:calval_end].drop(columns=['mon', 'day', 'yr', 'name']).sort_index()
    wb_calval.index = index

    # Monthly streamflow (the prediction target).
    flow = pd.read_csv(f'../data/{hydro_station}_natural_monthly_flow.csv', 
                       index_col=['date'], parse_dates=['date'])
    flow_calval = flow.loc[calval_start:calval_end].sort_index()
    flow_calval.index = index

    # SWAT+ simulated streamflow; the file is expected to hold a single value
    # column, which is renamed here.
    swatplus_sim_flow = pd.read_csv(f'../result/SWATPlusCalValSimData/Channel_{hydrostation_channel[hydro_station]}_Monthly_River-Flow_{hydro_station}_Sim1972_2019.csv', 
                           index_col=['Date'], parse_dates=['Date'])
    swatplus_sim_flow.columns = ['SWATPlusSimFlow']
    swatplus_sim_flow = swatplus_sim_flow.loc[calval_start:calval_end].sort_index()
    swatplus_sim_flow.index = index

    # Column-wise merge of every source on the shared monthly index.
    all_data = pd.concat([*mete_avgs, wb_calval, swatplus_sim_flow, flow_calval], axis=1)
    all_data.index.name = 'date'

    # Remove columns with all zero values
    all_data = all_data.loc[:, (all_data != 0).any(axis=0)]

    # Save all data
    calval_csv = sample_path+f'MeteAVGCalvalFeatureDataForML_CALVAL_{hydro_station}.csv'
    all_data.to_csv(calval_csv, index=True)

    # Report columns that still contain missing values.
    null_columns = all_data.columns[all_data.isnull().any()].tolist()
    if null_columns:
        print(f"Columns with null values in {hydro_station} data:")
        for col in null_columns:
            null_count = all_data[col].isnull().sum()
            print(f"  {col}: {null_count} null values")
    else:
        print(f"No null values found in {hydro_station} data.")

    print(f"Data for {hydro_station} processed and saved.")

    # Samples are generated from the table as stored on disk.
    # NOTE(review): lag=12 / lead=12 presumably mean 12 months of lagged inputs
    # and 12 months of lead targets -- confirm in OneShotSamplesGenerator.
    all_data = pd.read_csv(calval_csv, index_col=['date'], parse_dates=['date'])
    feature_samples, target_samples = gen_multi_output_samples(
        timeseries=all_data.copy(),
        target_column='flow(m^3/s)',
        lag=12,
        lead=12,
    )
    feature_samples.to_csv(sample_path+f'{hydro_station}_meteavg_full_feature_samples_calval.csv',index=True)
    target_samples.to_csv(sample_path+f'{hydro_station}_meteavg_full_target_samples_calval.csv',index=True)
# Reduced ("vif") CALVAL samples: reload each station's saved CALVAL table and
# keep only the columns named in `selected_features` (target column included)
# before generating the lagged feature / lead target samples.
for hydro_station in hydro_stations:
    calval_csv = sample_path+f'MeteAVGCalvalFeatureDataForML_CALVAL_{hydro_station}.csv'
    all_data = pd.read_csv(calval_csv, index_col=['date'], parse_dates=['date'])
    selected_data = all_data[selected_features]

    sample_options = dict(target_column='flow(m^3/s)', lag=12, lead=12)
    feature_samples, target_samples = gen_multi_output_samples(
        timeseries=selected_data.copy(), **sample_options
    )

    feature_csv = sample_path+f'{hydro_station}_meteavg_vif_feature_samples_calval.csv'
    target_csv = sample_path+f'{hydro_station}_meteavg_vif_target_samples_calval.csv'
    feature_samples.to_csv(feature_csv, index=True)
    target_samples.to_csv(target_csv, index=True)

pred_years = np.arange(2015, 2020)
start_year = 1972

# Monthly statistic applied to each daily meteorological variable.
mete_aggregations = {
    'P2020(mm)': 'sum',
    'MAX-TEM(C)': 'mean',
    'MIN-TEM(C)': 'mean',
    'SLR(MJ/m^2)': 'sum',
    'AVG-RHU(%)': 'mean',
    'AVG-WV(m/s)': 'mean',
}

# Parsed daily station records keyed by station name. The same stations are
# shared by several hydro stations and by every prediction year, so each CSV
# is parsed once and copied before it is modified.
climate_cache = {}

# For every hydro station and prediction year, build the monthly table running
# from `start_year` to the end of the prediction year, with the prediction
# year's meteorology replaced by that of its hydrologically similar year, and
# generate the reduced ("vif") samples from it.
for hydro_station in hydro_stations:
    similarity_years = pd.read_csv(f'../result/SimilarityYears/{hydro_station}_hydrological_similiarity_year.csv')
    similarity_dict = dict(zip(similarity_years['PredYear'], similarity_years['SimYear']))
    station_names = hydrostation_metestations[hydro_station]

    # Area weights depend only on the hydro station, not on the prediction year.
    total_area = sum(metestation_controal_area_dict[station] for station in station_names)
    weights = {station: metestation_controal_area_dict[station] / total_area for station in station_names}

    for station_name in station_names:
        if station_name not in climate_cache:
            climate_cache[station_name] = pd.read_csv(f'D:/DataSpace/HydroMeteAnthropicDatabase/7.FilledRawMeteObsInfo/ChinaLandDailyMeteV3(InsertSolarRadiation)/{station_name}.csv', 
                                                      index_col=['DATE'], parse_dates=['DATE'])

    # Inputs that are identical for every prediction year are read once here.
    observed_flow = pd.read_csv(f'../data/{hydro_station}_natural_monthly_flow.csv', 
                                index_col=['date'], parse_dates=['date'])
    swat_sim_all = pd.read_csv(f'../result/SWATPlusCalValSimData/Channel_{hydrostation_channel[hydro_station]}_Monthly_River-Flow_{hydro_station}_Sim1972_2019.csv', 
                               index_col=['Date'], parse_dates=['Date'])['Value'].rename('SWATPlusSimFlow')
    swat_pred_all = pd.read_csv(f'../result/SWATPlusPredUsingHydroSimYearData/{hydro_station}_SWATPlus_pred_obs_2015_2019.csv', 
                                index_col=['date'], parse_dates=['date'])['pred'].rename('SWATPlusSimFlow')

    for pred_year in pred_years:
        period_start = f'{start_year}-01-01'
        period_end = f'{pred_year}-12-31'
        history_end = f'{pred_year-1}-12-31'
        pred_year_start = f'{pred_year}-01-01'

        pred_index = pd.date_range(period_start, period_end, freq='MS')

        # Year whose observed meteorology stands in for the prediction year.
        ref_year = similarity_dict[pred_year]

        # One frame per variable, pre-indexed by month; each column is one
        # station's weighted monthly series.
        weighted = {column: pd.DataFrame(index=pred_index) for column in mete_aggregations}

        for station_name in station_names:
            climate_data = climate_cache[station_name].copy()

            target_data = climate_data[climate_data.index.year == pred_year]
            reference_data = climate_data[climate_data.index.year == ref_year]

            # Make the reference year as long as the target year before copying
            # it over positionally: drop its last day, or repeat its last day.
            # NOTE(review): this shifts the days after Feb 28 by one when exactly
            # one of the two years is a leap year -- confirm this matches how
            # the SWAT+ inputs for the similar year were prepared.
            if len(reference_data) > len(target_data):
                reference_data = reference_data.iloc[:-1]
            elif len(reference_data) < len(target_data):
                reference_data = pd.concat([reference_data, reference_data.iloc[-1:]])
            climate_data.loc[target_data.index, :] = reference_data.values

            climate_data = climate_data.loc[period_start:period_end]
            for column, how in mete_aggregations.items():
                weighted[column][station_name] = (
                    climate_data[column].resample('MS').agg(how) * weights[station_name]
                )

        # Summing the weighted station columns gives the area-weighted basin
        # series; the frames were built on pred_index, so no relabelling is needed.
        mete_avgs = [frame.sum(axis=1).rename(column) for column, frame in weighted.items()]

        # SWAT+ basin water balance produced for this prediction year.
        wb = pd.read_csv(f'../result/SWATPlusWaterBlanceDataFromHydroSimYr/YellowRiver{hydrostation_abbrs[hydro_station]}_BasinWaterBalance_pred{pred_year}.csv', 
                         index_col=['date'], parse_dates=['date'])
        wb = wb.drop(columns=['mon', 'day', 'yr', 'name']).loc[period_start:period_end].sort_index()
        wb.index = pred_index

        # Monthly streamflow (the prediction target).
        flow = observed_flow.loc[period_start:period_end].sort_index()
        flow.index = pred_index

        # Historical SWAT+ simulation up to the year before the prediction year
        # (author's note: it can also be seen as the catchment initial condition).
        SWATPlus_sim_flow = swat_sim_all.loc[period_start:history_end].sort_index()
        SWATPlus_sim_flow.index = pd.date_range(period_start, history_end, freq='MS')

        # SWAT+ prediction for the prediction year itself (author's note: it is
        # included so that the SWAT+ prediction can be corrected).
        SWATPlus_pred_flow = swat_pred_all.loc[pred_year_start:period_end].sort_index()
        SWATPlus_pred_flow.index = pd.date_range(pred_year_start, period_end, freq='MS')

        SWATPlus_flow = pd.concat([SWATPlus_sim_flow, SWATPlus_pred_flow], axis=0)

        # Column-wise merge of every source on the shared monthly index.
        all_data = pd.concat([*mete_avgs, wb, SWATPlus_flow, flow], axis=1)

        # Drop columns with all zero values
        all_data = all_data.loc[:, (all_data != 0).any(axis=0)]

        # Set index name
        all_data.index.name = 'date'
        all_data.to_csv(sample_path+f'{hydro_station}_MeteAVGCalvalFeatureDataForML_PRED{pred_year}.csv', index=True)

        selected_data = all_data.loc[:, selected_features]

        feature_samples, target_samples = gen_multi_output_samples(
            timeseries=selected_data.copy(),
            target_column='flow(m^3/s)',
            lead=12,
            lag=12,
        )
        feature_samples.to_csv(sample_path+f'{hydro_station}_meteavg_vif_feature_samples_pred{pred_year}.csv', index=True)
        target_samples.to_csv(sample_path+f'{hydro_station}_meteavg_vif_target_samples_pred{pred_year}.csv', index=True)

    print(f"Data for {hydro_station} processed and saved.")
    

No null values found in Tangnaihai data.
Data for Tangnaihai processed and saved.
features:  ['P2020(mm)', 'MAX-TEM(C)', 'MIN-TEM(C)', 'SLR(MJ/m^2)', 'AVG-RHU(%)', 'AVG-WV(m/s)', 'precip(mm)', 'snofall(mm)', 'snomlt(mm)', 'surq_gen(mm)', 'latq(mm)', 'wateryld(mm)', 'perc(mm)', 'et(mm)', 'ecanopy(mm)', 'eplant(mm)', 'esoil(mm)', 'surq_cont(mm)', 'cn', 'sw_init(mm)', 'sw_final(mm)', 'sw_ave(mm)', 'sw_300(mm)', 'sno_init(mm)', 'sno_final(mm)', 'snopack(mm)', 'pet(mm)', 'surq_cha(mm)', 'latq_cha(mm)', 'sw_change(mm)', 'lagsurf(mm)', 'laglatq(mm)', 'wet_evap(mm)', 'wet_oflo(mm)', 'wet_stor(mm)', 'SWATPlusSimFlow']
No null values found in Guide data.
Data for Guide processed and saved.
features:  ['P2020(mm)', 'MAX-TEM(C)', 'MIN-TEM(C)', 'SLR(MJ/m^2)', 'AVG-RHU(%)', 'AVG-WV(m/s)', 'precip(mm)', 'snofall(mm)', 'snomlt(mm)', 'surq_gen(mm)', 'latq(mm)', 'wateryld(mm)', 'perc(mm)', 'et(mm)', 'ecanopy(mm)', 'eplant(mm)', 'esoil(mm)', 'surq_cont(mm)', 'cn', 'sw_init(mm)', 'sw_final(mm)', 'sw_ave(m

In [19]:
# Output directory of the meteorology-only experiment. exist_ok=True keeps the
# cell re-runnable and avoids the separate "exists?" check racing with creation.
sample_path = '../samples_mete_wb_vif/InputOutputSamples_hydrosimyr/'
os.makedirs(sample_path, exist_ok=True)

# Columns kept for the reduced ("vif") sample sets of this cell; 'flow(m^3/s)'
# is the target column and therefore has to stay in the list.
selected_features = ['P2020(mm)','MAX-TEM(C)','MIN-TEM(C)','flow(m^3/s)']

# CALVAL table of the meteorology-only experiment: area-weighted monthly
# meteorology plus the monthly streamflow target for every hydro station,
# followed by the "full" (all columns) sample set.
for hydro_station in hydro_stations:
    station_names = hydrostation_metestations[hydro_station]
    index = pd.date_range(calval_start, calval_end, freq='MS')

    # Station weights: share of each station's control area in the total.
    total_area = sum(metestation_controal_area_dict[name] for name in station_names)
    weights = {name: metestation_controal_area_dict[name] / total_area for name in station_names}

    # One frame per variable; columns are filled station by station below.
    pcp_data, maxtmp_data, mintmp_data = pd.DataFrame(), pd.DataFrame(), pd.DataFrame()
    slr_data, hmd_data, wnd_data = pd.DataFrame(), pd.DataFrame(), pd.DataFrame()

    for station in station_names:
        station_data = pd.read_csv(f'D:/DataSpace/HydroMeteAnthropicDatabase/7.FilledRawMeteObsInfo/ChinaLandDailyMeteV3(InsertSolarRadiation)/{station}.csv', 
                                   index_col=['DATE'], parse_dates=['DATE'])
        station_calval = station_data.loc[calval_start:calval_end]
        monthly = station_calval.resample('MS')
        weight = weights[station]

        # Totals for precipitation and solar radiation, means for the rest.
        pcp_data[station] = monthly['P2020(mm)'].sum() * weight
        maxtmp_data[station] = monthly['MAX-TEM(C)'].mean() * weight
        mintmp_data[station] = monthly['MIN-TEM(C)'].mean() * weight
        slr_data[station] = monthly['SLR(MJ/m^2)'].sum() * weight
        hmd_data[station] = monthly['AVG-RHU(%)'].mean() * weight
        wnd_data[station] = monthly['AVG-WV(m/s)'].mean() * weight

    # Sum of the weighted station columns = area-weighted basin series.
    pcp_avg = pcp_data.sum(axis=1).rename('P2020(mm)')
    maxtmp_avg = maxtmp_data.sum(axis=1).rename('MAX-TEM(C)')
    mintmp_avg = mintmp_data.sum(axis=1).rename('MIN-TEM(C)')
    slr_avg = slr_data.sum(axis=1).rename('SLR(MJ/m^2)')
    hmd_avg = hmd_data.sum(axis=1).rename('AVG-RHU(%)')
    wnd_avg = wnd_data.sum(axis=1).rename('AVG-WV(m/s)')
    mete_avgs = [pcp_avg, maxtmp_avg, mintmp_avg, slr_avg, hmd_avg, wnd_avg]
    for basin_avg in mete_avgs:
        # Positional relabel with the common monthly index.
        basin_avg.index = index

    # Monthly streamflow (the prediction target).
    flow = pd.read_csv(f'../data/{hydro_station}_natural_monthly_flow.csv', 
                       index_col=['date'], parse_dates=['date'])
    flow_calval = flow.loc[calval_start:calval_end].sort_index()
    flow_calval.index = index

    # Merge everything column-wise on the shared monthly index.
    all_data = pd.concat(mete_avgs + [flow_calval], axis=1)
    all_data.index.name = 'date'

    # Keep only the columns that contain at least one non-zero entry.
    keep_column = all_data.ne(0).any(axis=0)
    all_data = all_data.loc[:, keep_column]

    calval_csv = sample_path+f'MeteAVGCalvalFeatureDataForML_CALVAL_{hydro_station}.csv'
    all_data.to_csv(calval_csv, index=True)

    # Report any column that still contains missing values.
    null_columns = all_data.columns[all_data.isnull().any()].tolist()
    if not null_columns:
        print(f"No null values found in {hydro_station} data.")
    else:
        print(f"Columns with null values in {hydro_station} data:")
        for col in null_columns:
            null_count = all_data[col].isnull().sum()
            print(f"  {col}: {null_count} null values")

    print(f"Data for {hydro_station} processed and saved.")

    # Samples are generated from the table as stored on disk.
    all_data = pd.read_csv(calval_csv, index_col=['date'], parse_dates=['date'])
    feature_samples, target_samples = gen_multi_output_samples(
        timeseries=all_data.copy(),
        target_column='flow(m^3/s)',
        lag=12,
        lead=12,
    )
    feature_samples.to_csv(sample_path+f'{hydro_station}_meteavg_full_feature_samples_calval.csv',index=True)
    target_samples.to_csv(sample_path+f'{hydro_station}_meteavg_full_target_samples_calval.csv',index=True)
# Reduced ("vif") CALVAL samples of the meteorology-only experiment: reload the
# saved CALVAL table of each station and restrict it to `selected_features`
# (which includes the target column) before generating samples.
for hydro_station in hydro_stations:
    table_path = sample_path+f'MeteAVGCalvalFeatureDataForML_CALVAL_{hydro_station}.csv'
    all_data = pd.read_csv(table_path, index_col=['date'], parse_dates=['date'])
    selected_data = all_data[selected_features]

    samples = gen_multi_output_samples(
        timeseries=selected_data.copy(),
        target_column='flow(m^3/s)',
        lag=12,
        lead=12,
    )
    feature_samples, target_samples = samples

    feature_samples.to_csv(
        sample_path+f'{hydro_station}_meteavg_vif_feature_samples_calval.csv',
        index=True,
    )
    target_samples.to_csv(
        sample_path+f'{hydro_station}_meteavg_vif_target_samples_calval.csv',
        index=True,
    )

pred_years = np.arange(2015, 2020)
start_year = 1972

# Monthly statistic applied to each daily meteorological variable.
mete_aggregations = {
    'P2020(mm)': 'sum',
    'MAX-TEM(C)': 'mean',
    'MIN-TEM(C)': 'mean',
    'SLR(MJ/m^2)': 'sum',
    'AVG-RHU(%)': 'mean',
    'AVG-WV(m/s)': 'mean',
}

# Parsed daily station records keyed by station name. The same stations are
# shared by several hydro stations and by every prediction year, so each CSV
# is parsed once and copied before it is modified.
climate_cache = {}

# For every hydro station and prediction year, build the monthly
# meteorology + streamflow table running from `start_year` to the end of the
# prediction year, with the prediction year's meteorology replaced by that of
# its hydrologically similar year, and generate the reduced ("vif") samples.
for hydro_station in hydro_stations:
    similarity_years = pd.read_csv(f'../result/SimilarityYears/{hydro_station}_hydrological_similiarity_year.csv')
    similarity_dict = dict(zip(similarity_years['PredYear'], similarity_years['SimYear']))
    station_names = hydrostation_metestations[hydro_station]

    # Area weights depend only on the hydro station, not on the prediction year.
    total_area = sum(metestation_controal_area_dict[station] for station in station_names)
    weights = {station: metestation_controal_area_dict[station] / total_area for station in station_names}

    for station_name in station_names:
        if station_name not in climate_cache:
            climate_cache[station_name] = pd.read_csv(f'D:/DataSpace/HydroMeteAnthropicDatabase/7.FilledRawMeteObsInfo/ChinaLandDailyMeteV3(InsertSolarRadiation)/{station_name}.csv', 
                                                      index_col=['DATE'], parse_dates=['DATE'])

    # The streamflow file is the same for every prediction year: read it once.
    observed_flow = pd.read_csv(f'../data/{hydro_station}_natural_monthly_flow.csv', 
                                index_col=['date'], parse_dates=['date'])

    for pred_year in pred_years:
        period_start = f'{start_year}-01-01'
        period_end = f'{pred_year}-12-31'

        pred_index = pd.date_range(period_start, period_end, freq='MS')

        # Year whose observed meteorology stands in for the prediction year.
        ref_year = similarity_dict[pred_year]

        # One frame per variable, pre-indexed by month; each column is one
        # station's weighted monthly series.
        weighted = {column: pd.DataFrame(index=pred_index) for column in mete_aggregations}

        for station_name in station_names:
            climate_data = climate_cache[station_name].copy()

            target_data = climate_data[climate_data.index.year == pred_year]
            reference_data = climate_data[climate_data.index.year == ref_year]

            # Make the reference year as long as the target year before copying
            # it over positionally: drop its last day, or repeat its last day.
            # NOTE(review): this shifts the days after Feb 28 by one when exactly
            # one of the two years is a leap year -- confirm this is intended.
            if len(reference_data) > len(target_data):
                reference_data = reference_data.iloc[:-1]
            elif len(reference_data) < len(target_data):
                reference_data = pd.concat([reference_data, reference_data.iloc[-1:]])
            climate_data.loc[target_data.index, :] = reference_data.values

            climate_data = climate_data.loc[period_start:period_end]
            for column, how in mete_aggregations.items():
                weighted[column][station_name] = (
                    climate_data[column].resample('MS').agg(how) * weights[station_name]
                )

        # Summing the weighted station columns gives the area-weighted basin
        # series; the frames were built on pred_index, so no relabelling is needed.
        mete_avgs = [frame.sum(axis=1).rename(column) for column, frame in weighted.items()]

        # Monthly streamflow (the prediction target).
        flow = observed_flow.loc[period_start:period_end].sort_index()
        flow.index = pred_index

        # Column-wise merge of every source on the shared monthly index.
        all_data = pd.concat([*mete_avgs, flow], axis=1)

        # Drop columns with all zero values
        all_data = all_data.loc[:, (all_data != 0).any(axis=0)]

        # Set index name
        all_data.index.name = 'date'
        all_data.to_csv(sample_path+f'{hydro_station}_MeteAVGCalvalFeatureDataForML_PRED{pred_year}.csv', index=True)

        selected_data = all_data.loc[:, selected_features]

        feature_samples, target_samples = gen_multi_output_samples(
            timeseries=selected_data.copy(),
            target_column='flow(m^3/s)',
            lead=12,
            lag=12,
        )
        feature_samples.to_csv(sample_path+f'{hydro_station}_meteavg_vif_feature_samples_pred{pred_year}.csv', index=True)
        target_samples.to_csv(sample_path+f'{hydro_station}_meteavg_vif_target_samples_pred{pred_year}.csv', index=True)

    print(f"Data for {hydro_station} processed and saved.")
    

No null values found in Tangnaihai data.
Data for Tangnaihai processed and saved.
features:  ['P2020(mm)', 'MAX-TEM(C)', 'MIN-TEM(C)', 'SLR(MJ/m^2)', 'AVG-RHU(%)', 'AVG-WV(m/s)']
No null values found in Guide data.
Data for Guide processed and saved.
features:  ['P2020(mm)', 'MAX-TEM(C)', 'MIN-TEM(C)', 'SLR(MJ/m^2)', 'AVG-RHU(%)', 'AVG-WV(m/s)']
No null values found in Xunhua data.
Data for Xunhua processed and saved.
features:  ['P2020(mm)', 'MAX-TEM(C)', 'MIN-TEM(C)', 'SLR(MJ/m^2)', 'AVG-RHU(%)', 'AVG-WV(m/s)']
features:  ['P2020(mm)', 'MAX-TEM(C)', 'MIN-TEM(C)']
features:  ['P2020(mm)', 'MAX-TEM(C)', 'MIN-TEM(C)']
features:  ['P2020(mm)', 'MAX-TEM(C)', 'MIN-TEM(C)']
features:  ['P2020(mm)', 'MAX-TEM(C)', 'MIN-TEM(C)']
features:  ['P2020(mm)', 'MAX-TEM(C)', 'MIN-TEM(C)']
features:  ['P2020(mm)', 'MAX-TEM(C)', 'MIN-TEM(C)']
features:  ['P2020(mm)', 'MAX-TEM(C)', 'MIN-TEM(C)']
features:  ['P2020(mm)', 'MAX-TEM(C)', 'MIN-TEM(C)']
Data for Tangnaihai processed and saved.
features:  ['P20