In [None]:
import pandas as pd
import numpy as np
import psycopg2

# Connect to the newyork database
conn = psycopg2.connect(
    dbname = 'newyork',
    user = '',
    password = '',
    host = 'localhost',
    port = '5432'
)

# Run both queries inside try/finally so the connection is released even if
# one of them raises (otherwise the session stays open until the kernel dies).
try:
    # Get the top 50 nodes by negative price occurrence
    query = "SELECT node, COUNT(*) FROM realtime_lbmp WHERE price < 0 GROUP BY node ORDER BY COUNT(*) DESC LIMIT 50;"
    df = pd.read_sql_query(query, conn)
    best_nodes = list(df.node)

    # There are 58'943'742 elements in the file, so roughly 80k points per node.
    # Only the three columns used downstream are fetched, instead of SELECT *,
    # to avoid transferring data that is dropped immediately afterwards.
    # psycopg2 adapts the Python tuple to a parenthesised SQL list for IN %s.
    query = 'SELECT "time", node, price FROM realtime_lbmp WHERE node IN %s'
    df = pd.read_sql_query(query, conn, params=(tuple(best_nodes),))[['time','node','price']]
finally:
    conn.close()

# Chronological order; done after closing since it needs no database access
df = df.sort_values(by='time')

In [None]:
# Normalise the raw columns in one pass:
#   time  -> parsed to datetime
#   hour  -> 'YYYY-MM-DD HH' label derived from the parsed time (hourly bucket key)
#   price -> numeric, with unparseable values coerced to NaN
# The lambdas are evaluated in order, so `hour` sees the already-parsed `time`.
df = df.assign(
    time=lambda frame: pd.to_datetime(frame['time']),
    hour=lambda frame: frame['time'].dt.strftime('%Y-%m-%d %H'),
    price=lambda frame: pd.to_numeric(frame['price'], errors='coerce'),
)
df

In [None]:
# Average each node's prices per hour. The filter + groupby is done once per
# node and the result kept, so it is not recomputed for the length check.
hourly_by_node = {}
for node in best_nodes:
    node_rows = df.loc[df.node == node, ['hour', 'price']]
    hourly_by_node[node] = node_rows.groupby('hour').mean()

# Find the minimum length of prices.
# This must be measured on the HOURLY series (the thing being cropped below),
# not on the raw rows: the raw row count is always >= the number of hours, so
# cropping at it would never shorten anything and nodes with different hour
# counts would produce ragged lists that pd.DataFrame(prices) rejects.
min_length = min((len(node_hourly) for node_hourly in hourly_by_node.values()), default=0)
print(f'The node with the least points has data for {min_length} time steps.')

# Set up as a dict and crop price lists at min_length
# NOTE(review): the crop is positional; it assumes the first min_length hours
# line up across nodes (no node has gaps) - confirm against the data.
prices = {
    node: [round(x, 2) for x in list(node_hourly.price)[:min_length]]
    for node, node_hourly in hourly_by_node.items()
}

# `hourly_avg` is read by the CSV-export cell for its hour index. Keep it bound
# to the last node's hourly frame as before, but cropped to the same length as
# the price lists so the index always matches.
if best_nodes:
    hourly_avg = hourly_by_node[best_nodes[-1]].iloc[:min_length]

for key, value in prices.items():
    print(f'{key}: {value[:10]}...')

In [None]:
# Save the hourly data to CSV: one row per hour, one column per node, with the
# hour labels (taken from the last computed hourly_avg) as a leading 'Time' column.
hour_index = pd.to_datetime(list(hourly_avg.index))

wide_prices = pd.DataFrame(prices)
wide_prices.index = hour_index

# reset_index() turns the unnamed index into a column called 'index'
df_for_csv = wide_prices.reset_index().rename(columns={'index': 'Time'})
df_for_csv.to_csv('hourly_data.csv', index=False)

In [None]:
# Round-trip check: read the exported file back and display it.
# Note that this rebinds `df`, replacing the frame loaded from the database.
df = pd.read_csv(filepath_or_buffer='hourly_data.csv')
df

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Pairwise correlation between the nodes' hourly price series
node_price_frame = pd.DataFrame(prices)
correlation_matrix = node_price_frame.corr()

# Draw the heatmap on an explicitly created axes
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=False, cmap='coolwarm', fmt=".2f", ax=ax)
ax.set_title('Correlation matrix')
plt.show()