In [None]:
#将多个csv整合到一个csv文件中

In [44]:
import pandas as pd
import os

# 定义一个函数来处理单个CSV文件
def process_csv_file(file_path):
    """Load one load-data CSV and reshape it into a wide table.

    Only the 'Time Stamp', 'Name' and 'Load' columns of the file are used.

    Args:
        file_path: Path of the CSV file to read.

    Returns:
        A DataFrame indexed by 'Time Stamp' with one column per distinct
        'Name'; each cell is the mean of the 'Load' values recorded for that
        (time stamp, name) pair, so repeated entries are averaged.
    """
    wanted_columns = ['Time Stamp', 'Name', 'Load']
    long_frame = pd.read_csv(file_path)[wanted_columns]

    # pivot_table (rather than pivot) tolerates duplicate (time stamp, name)
    # pairs by aggregating them with the mean.
    return pd.pivot_table(
        long_frame,
        index='Time Stamp',
        columns='Name',
        values='Load',
        aggfunc='mean',
    )

# Directory holding the per-file load CSVs, and the path of the merged output.
input_dir = 'F:/AAA/GNN_Gaussian/data/纽约州各区用电量数据/负荷数据'
output_file_path = 'F:/AAA/GNN_Gaussian/data/纽约州各区用电量数据/processed_data.csv'

# List every CSV file in the input directory, ordered by file name without extension.
csv_files = sorted((f for f in os.listdir(input_dir) if f.endswith('.csv')),
                   key=lambda x: os.path.splitext(x)[0])

# Pivot each file on its own and keep the pieces; they are combined once below
# instead of re-aligning an ever-growing frame on every iteration.
pivoted_frames = []
for file_name in csv_files:
    file_path = os.path.join(input_dir, file_name)
    df_pivot = process_csv_file(file_path)
    # Files that yield no rows contribute nothing to the merged table.
    if not df_pivot.empty:
        pivoted_frames.append(df_pivot)

if pivoted_frames:
    # Stack the per-file tables. A time stamp that appears in more than one file
    # is averaged (the same aggregation used inside each file); DataFrame.add
    # would instead sum the readings and inflate the load.
    # groupby sorts the (string) time stamps; sort_index(axis=1) keeps the zone
    # columns in alphabetical order even when files differ in their zones.
    result_df = (
        pd.concat(pivoted_frames)
        .groupby(level=0)
        .mean()
        .sort_index(axis=1)
    )
else:
    # No usable input: keep an empty table so the save below still runs.
    result_df = pd.DataFrame()

# Zones without a reading at a given time stamp are stored as 0.
# NOTE(review): this makes a missing reading indistinguishable from a real
# zero load - confirm that downstream consumers expect this.
result_df = result_df.fillna(0)

# Save the merged table; the 'Time Stamp' index is written as the first column.
result_df.to_csv(output_file_path)

In [45]:
# Show the merged wide table (rows: time stamps, columns: zone names) as the cell output.
result_df

Name,CAPITL,CENTRL,DUNWOD,GENESE,HUD VL,LONGIL,MHK VL,MILLWD,N.Y.C.,NORTH,WEST
Time Stamp,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
01/01/2016 00:00:00,1199.3,1642.5,602.9,987.8,954.7,2113.0,775.0,311.8,5050.9,491.8,1655.1
01/01/2016 00:05:00,1187.6,1643.8,606.2,970.8,943.3,2087.7,769.2,299.5,4987.6,504.1,1654.9
01/01/2016 00:10:00,1185.5,1625.8,600.0,968.3,931.0,2076.5,772.9,307.9,4964.3,509.2,1641.5
01/01/2016 00:15:00,1170.4,1599.8,602.1,968.9,925.1,2072.6,781.7,317.3,4954.4,485.3,1628.3
01/01/2016 00:20:00,1179.9,1567.6,582.3,952.4,842.1,2055.0,756.9,398.4,4942.5,496.5,1645.7
...,...,...,...,...,...,...,...,...,...,...,...
12/31/2019 23:35:00,1224.1,1683.1,588.6,1027.7,1016.8,2079.8,831.5,312.6,5059.8,576.9,1611.2
12/31/2019 23:40:00,1217.3,1680.9,572.1,1028.1,998.2,2076.9,847.1,307.8,5029.4,569.4,1599.4
12/31/2019 23:45:00,1218.3,1682.8,563.2,1024.7,983.4,2085.1,821.4,328.6,5027.7,570.0,1601.7
12/31/2019 23:50:00,1211.1,1649.9,585.1,1017.0,981.8,2077.1,830.2,301.9,5003.7,570.6,1597.6


In [None]:
#按照时间排序

In [46]:
import pandas as pd

# Path of the merged table to read and of the time-sorted copy to write.
merged_csv_path = 'F:/AAA/GNN_Gaussian/data/纽约州各区用电量数据/processed_data.csv'
sorted_csv_path = 'F:/AAA/GNN_Gaussian/data/纽约州各区用电量数据/processed_data2.csv'

# Read the merged table; 'Time Stamp' is loaded as text and parsed exactly once below.
df = pd.read_csv(merged_csv_path)

# Parse the time stamps with the explicit MM/DD/YYYY HH:MM:SS layout. Stating the
# format avoids per-value format inference and any month/day ambiguity.
df['Time Stamp'] = pd.to_datetime(df['Time Stamp'], format='%m/%d/%Y %H:%M:%S')

# Order the rows chronologically and renumber them 0..n-1.
df_sorted = df.sort_values(by='Time Stamp').reset_index(drop=True)

# Save the sorted table without the positional index.
df_sorted.to_csv(sorted_csv_path, index=False)

# Report the file that was actually written.
print(f'Sorted data has been saved to {sorted_csv_path}')

Sorted data has been saved to sorted_data.csv


In [47]:
# Show the chronologically sorted table as the cell output.
df_sorted

Unnamed: 0,Time Stamp,CAPITL,CENTRL,DUNWOD,GENESE,HUD VL,LONGIL,MHK VL,MILLWD,N.Y.C.,NORTH,WEST
0,2016-01-01 00:00:00,1199.3000,1642.5000,602.9000,987.8000,954.7000,2113.0000,775.0000,311.8000,5050.900,491.8000,1655.1000
1,2016-01-01 00:05:00,1187.6000,1643.8000,606.2000,970.8000,943.3000,2087.7000,769.2000,299.5000,4987.600,504.1000,1654.9000
2,2016-01-01 00:10:00,1185.5000,1625.8000,600.0000,968.3000,931.0000,2076.5000,772.9000,307.9000,4964.300,509.2000,1641.5000
3,2016-01-01 00:15:00,1170.4000,1599.8000,602.1000,968.9000,925.1000,2072.6000,781.7000,317.3000,4954.400,485.3000,1628.3000
4,2016-01-01 00:20:00,1179.9000,1567.6000,582.3000,952.4000,842.1000,2055.0000,756.9000,398.4000,4942.500,496.5000,1645.7000
...,...,...,...,...,...,...,...,...,...,...,...,...
468286,2020-06-25 20:50:00,1682.7012,1818.1097,884.3146,1221.9072,1446.4265,3347.7510,947.5031,484.8189,6697.745,556.3856,1728.1814
468287,2020-06-25 20:55:00,1679.4862,1792.6196,885.1205,1230.5212,1468.7590,3334.2761,945.3936,463.8996,6691.453,555.1653,1749.7068
468288,2020-06-25 21:00:00,1680.4424,1823.3419,884.3774,1228.0935,1439.2723,3341.1055,950.2733,483.8524,6705.533,558.4659,1768.2601
468289,2020-06-25 21:05:00,1676.5277,1821.6813,886.1188,1227.0681,1441.1370,3334.5293,953.6710,480.5135,6673.177,551.7037,1774.6190


In [None]:
#edge_attr

In [43]:
import pandas as pd

# Location of the edge-attribute table; it is read here and overwritten in place below.
edge_attr_path = 'F:/AAA/GNN_Gaussian/data/纽约州各区用电量数据/edge_attr.csv'
df = pd.read_csv(edge_attr_path)

# One row of edge-attribute values; the same row is written into every targeted record.
new_data = [270, 130, 240, 120, 200, 160, 100, 320, 90, 110, 50, 110, 90, 150, 50, 100, 160, 50, 70]

# Minimum number of rows the table must contain.
# NOTE(review): presumably tied to the number of time steps in the load data - confirm.
target_rows = 468291

# Pad the table with copies of new_data when it is shorter than target_rows.
shortfall = target_rows - len(df)
if shortfall > 0:
    padding = pd.DataFrame([new_data] * shortfall, columns=df.columns)
    df = pd.concat([df, padding], ignore_index=True)

# Overwrite every row whose label lies in 1..target_rows (label slices are inclusive).
# NOTE(review): the row labelled 0 keeps whatever the file contained - confirm
# that skipping the first data row is intended.
df.loc[1:target_rows, :] = new_data

# Write the table back to the same file without the positional index.
df.to_csv(edge_attr_path, index=False)