In [2]:
# The star import stays first so that the explicit imports below take
# precedence over any same-named exports from utils.
from utils import *
import os
from urllib.parse import quote_plus

import pandas as pd
from sqlalchemy import create_engine

# Database connection settings.
# Each value can be overridden through an environment variable so that real
# credentials never have to be committed together with the notebook. The
# fallbacks are the local development values this notebook has always used.
username = os.environ.get("SIEMENS_DB_USER", "root")
password = os.environ.get("SIEMENS_DB_PASSWORD", "123456")  # NOTE(review): dev-only fallback
database = os.environ.get("SIEMENS_DB_NAME", "siemens_proj")

# Percent-encode the credentials: characters such as '@', ':' or '/' inside a
# user name or password would otherwise be parsed as URL structure.
engine = create_engine(
    f"mysql+mysqlconnector://{quote_plus(username)}:{quote_plus(password)}"
    f"@localhost:3306/{database}"
)

In [3]:
# train_file = "train_combined_pub_0313_0905.csv"
# df_train_raw = read_data(train_file)
# df_train_raw.shape

# Source table and the timestamp that separates training data (strictly
# before the split) from the data queried later with `>=` on the same column.
time_split = "2023-10-16 00:00:00"
table_name = "bid"

# Training query: every bid whose interval starts before the split.
query_train = (
    "\n"
    "SELECT *\n"
    f"FROM {table_name}\n"
    f"WHERE SCH_BID_TIMEINTERVALSTART < '{time_split}';\n"
)


In [4]:
# Load the full training window into memory (several million rows, so this
# can take a while) and show its (rows, columns) shape.
df_train_raw = pd.read_sql(sql=query_train, con=engine)
df_train_raw.shape

(5236702, 23)

In [5]:
# Columns removed from the raw bid table before any further processing.
# Each column is listed exactly once; the same list is reused for the
# validation frame.
columns_to_drop = [
    'MAXEOHSTATEOFCHARGE',
    'PRODUCTBID_DESC',
    'PRODUCTBID_MRID',
    'MARKETPRODUCT_DESC',
    'SCH_BID_Y2AXISDATA',
    'MINEOHSTATEOFCHARGE',
    'STARTTIME',
    'STOPTIME',
    'RESOURCE_TYPE',
    'MARKETPRODUCTTYPE',
    'SCH_BID_CURVETYPE',
]

# `columns=` already selects the axis, so no separate `axis` argument is needed.
df_train_droped = df_train_raw.drop(columns=columns_to_drop)

In [6]:
# Sanity check after dropping columns: print the frame's size, the dtype of
# each column and descriptive statistics (see the cell output).
printStats(df_train_droped)

size of data is: (5236702, 12)
type of each column is:
row_id                                int64
STARTDATE                            object
RESOURCEBID_SEQ                       int64
SCHEDULINGCOORDINATOR_SEQ             int64
SCH_BID_TIMEINTERVALSTART    datetime64[ns]
SCH_BID_TIMEINTERVALSTOP     datetime64[ns]
SCH_BID_XAXISDATA                   float64
SCH_BID_Y1AXISDATA                  float64
TIMEINTERVALSTART                    object
TIMEINTERVALEND                      object
SELFSCHEDMW                          object
row_repeat                            int64
dtype: object
Statistics in data:
              row_id  RESOURCEBID_SEQ  SCHEDULINGCOORDINATOR_SEQ  \
count  5.236702e+06     5.236702e+06               5.236702e+06   
mean   3.010585e+06     4.258356e+05               4.096662e+05   
min    1.000000e+00     1.000310e+05               1.042980e+05   
25%    1.506491e+06     1.662180e+05               1.874960e+05   
50%    3.013614e+06     3.165800e+05           

In [7]:
# Resource under study; the per-hour loops in the later cells filter on this id.
# Other values tried here: 117712, and None (presumably "no filter" -- TODO confirm
# against filter_rows).
RESOURCEBID_SEQ = 100651

# Quick check that this resource actually has rows in the training window
# before running the per-hour pipeline.
df_train_filtered_check_empty = filter_rows(
    RESOURCEBID_SEQ=RESOURCEBID_SEQ,
    df=df_train_droped,
)
df_train_filtered_check_empty.shape

(2491, 14)

In [8]:
from plotly.subplots import make_subplots
import plotly.graph_objects as go
from draw_bid_curve import find_trace_bid_curves_for_multiple_days_2d

# Subplot grid shared by the per-hour figures: 4 rows x 6 columns = 24 panels,
# one panel per hour of the day.
rows, cols = 4, 6

# Per-hour inter-/intra-cluster error curves.
fig_err = make_subplots(
    rows=rows,
    cols=cols,
    subplot_titles=[f"Hour {h}" for h in range(24)],
    horizontal_spacing=0.03,
    vertical_spacing=0.05,
)
# fig_bid_train = make_subplots(rows=rows, cols=cols, subplot_titles=[f"Hour {h}" for h in range(24)], horizontal_spacing=0.03, vertical_spacing=0.05)

# Per-hour training bid curves, drawn together with their cluster labels.
fig_clustered_bid_train = make_subplots(
    rows=rows,
    cols=cols,
    subplot_titles=[f"Hour {h}" for h in range(24)],
    horizontal_spacing=0.03,
    vertical_spacing=0.05,
)

# Range of cluster counts explored for every hour (an upper bound of 20 was
# also tried at some point).
min_clusters, max_clusters = 2, 10

# Per-hour results, filled in hour order (index == hour). The fitted scaler,
# PCA, model and thresholds are looked up again by hour for the validation data.
best_clusters_by_hour = []
list_pca_train = []
list_model_kmeans_train = []
list_cluster_thresholds = []
list_scaler_train = []
for HOUR in range(24):
    # --- Feature pipeline for this hour: filter -> features -> scaling -> PCA ---
    df_train_filtered = filter_rows(
        df=df_train_droped,
        HOUR=HOUR,
        RESOURCEBID_SEQ=RESOURCEBID_SEQ,
    )
    df_train_feat = extract_feat_from_bid(df_train_filtered)
    df_train_feat_scaled, scaler_train = scale_feat(df_train_feat)
    list_scaler_train.append(scaler_train)

    (
        df_train_feat_scaled_pca,
        pca_components_train,
        explained_variance_train,
        pca_train,
    ) = extract_pca_components(df_train_feat_scaled, goal_var=0.95)
    list_pca_train.append(pca_train)

    # --- Clustering: choose the model for this hour, then its thresholds ---
    model_kmeans_train, n_cluster = find_best_model(
        df_train_feat_scaled_pca,
        min_clusters=min_clusters,
        max_clusters=max_clusters,
    )
    cluster_thresholds = find_cluster_threshold(
        model_kmeans_train, df_train_feat_scaled_pca, n_cluster
    )
    list_cluster_thresholds.append(cluster_thresholds)

    best_clusters_by_hour.append(n_cluster)
    list_model_kmeans_train.append(model_kmeans_train)

    # Attach the fitted labels to a copy of the PCA features; the plotting
    # helper receives them as a single-column frame.
    df_train_feat_scaled_pca_w_label = df_train_feat_scaled_pca.assign(
        cluster_label=model_kmeans_train.labels_
    )
    cluster_labels = df_train_feat_scaled_pca_w_label[['cluster_label']].copy()

    # 1-based (row, col) of this hour's panel in the subplot grid.
    row, col = (offset + 1 for offset in divmod(HOUR, cols))

    # Only the first panel contributes legend entries.
    showlegend = HOUR == 0
    kwargs = dict(showlegend=showlegend)

    traces_err = find_trace_inter_intra_err(
        df_train_feat_scaled_pca, min_clusters, max_clusters, **kwargs
    )
    for trace in traces_err:
        fig_err.add_trace(trace, row=row, col=col)

    # traces_bid_train = find_trace_bid_curves_for_multiple_days_2d(df_data=df_train_filtered, hr_specified=HOUR, marker_size=5, opacity=0.8, linewidth=1)['traces']
    # for trace in traces_bid_train:
    #     fig_bid_train.add_trace(trace, row=row, col=col)

    traces_clustered_bid_train = find_trace_bid_curves_for_multiple_days_2d(
        df_data=df_train_filtered,
        hr_specified=HOUR,
        cluster_labels=cluster_labels,
        marker_size=5,
        opacity=0.8,
        linewidth=1,
    )['traces']
    for trace in traces_clustered_bid_train:
        fig_clustered_bid_train.add_trace(trace, row=row, col=col)

    # Badge with the chosen number of clusters, anchored to the top-right
    # corner of the panel (y > 1 places it slightly above the plot area).
    fig_clustered_bid_train.add_annotation(
        text=f'Clusters: {n_cluster}',
        xref='x domain',
        yref='y domain',
        x=1.0,
        y=1.11,
        xanchor='right',
        yanchor='top',
        align="right",
        showarrow=False,
        font=dict(size=12, color="#040404"),
        bgcolor="#c7c7c7",
        bordercolor="#c7c7c7",
        borderwidth=2,
        opacity=0.8,
        row=row,
        col=col,
    )
    

# Summary figure: the number of clusters selected for each hour of the day.
trace = go.Scatter(
    x=list(range(24)),
    y=best_clusters_by_hour,
    mode='lines+markers',  # draw both the connecting line and the points
    marker=dict(size=8, color='blue'),
    line=dict(color='blue'),
)
fig_best_clusters = go.Figure(data=[trace])
fig_best_clusters.update_layout(
    title='Best number of clusters by hour (for training set)',
    xaxis_title='Hour',
    yaxis_title='Number of clusters',  # axis label typo fixed (was "custers")
    height=800,  # height of the figure in pixels
    width=800,   # width of the figure in pixels
    hovermode="x unified",
)

    
# Pixel size of one subplot panel; a whole figure is panel size x grid size.
fig_height_factor = 250
fig_width_factor = 1.25 * fig_height_factor

# fig_bid_train.update_layout(
#     height=rows*fig_height_factor,
#     width=cols*fig_width_factor,
#     title_text="Bidding curves by hour (X axis: Amount (MW), Y axis: Price ($)) (for training set)",
#     showlegend=False,
#     legend=dict(
#         orientation="h",
#         yanchor="bottom",
#         # y=1.05,
#         xanchor="right",
#         # x=1
#     ),
#     hovermode="x unified", 
# )

# Layout of the clustered training figure. The legend is hidden, so the
# legend settings only matter if `showlegend` is switched back on.
fig_clustered_bid_train.update_layout(
    title_text="Clustered bidding curves by hour (X axis: Amount (MW), Y axis: Price ($)) (for training set)",
    width=cols * fig_width_factor,
    height=rows * fig_height_factor,
    showlegend=False,
    legend={
        "orientation": "h",
        "yanchor": "bottom",
        "xanchor": "right",
    },
)
   
# Layout of the per-hour error figure: overall size, title and a horizontal
# legend placed at the top-right of the figure.
fig_err.update_layout(
    height=rows*fig_height_factor,
    width=cols*fig_width_factor,
    # Title typo fixed: "sqared" -> "squared".
    title_text="Intercluster and intracluster errors by hour (X axis: Num of clusters, Y axis: Error(Sum of squared distances)) (for training set)",
    legend=dict(
        orientation="h",  # horizontal legend
        yanchor="top",
        y=1.05,
        xanchor="right",
        x=1
    ),
    hovermode="x unified",
)

# Save the figures as standalone HTML files.
# Create the target folder first: writing into a missing directory fails, so
# on a fresh checkout (no "output/" folder yet) the first write would raise.
os.makedirs("output", exist_ok=True)
# fig_bid_train.write_html("output/train_bid.html")
fig_err.write_html("output/train_intra_inter_err.html")
fig_best_clusters.write_html("output/train_best_clusters.html")
fig_clustered_bid_train.write_html("output/train_clustered_bid.html")

# Show the figure
# fig_bid_train.show()
# fig_err.show()
# fig_best_clusters.show()    
# fig_clustered_bid_train.show()

# Testing

<div style="text-align: center;">
    <img src="./img/percentile_cluster1.jpg" width="60%" />
</div>

In [9]:
# valid_file = "valid_combined_pub_0906_1023.csv"
# df_valid_raw = read_data(valid_file)
# # printStats(df_valid_raw)

# Validation window: every bid whose interval starts at or after the split
# timestamp, i.e. the complement of the training query.
query_valid = f"""
SELECT *
FROM {table_name}
WHERE SCH_BID_TIMEINTERVALSTART >= '{time_split}';
"""
df_valid_raw = pd.read_sql(query_valid, engine)

# Drop the same columns as for the training frame (`columns=` already selects
# the axis, so no separate `axis` argument is needed).
df_valid_droped = df_valid_raw.drop(columns=columns_to_drop)

# Jupyter only displays the last expression of a cell, so the shape check is
# placed at the end where it is actually shown.
df_valid_raw.shape

In [10]:
# fig_bid_valid = make_subplots(rows=rows, cols=cols, subplot_titles=[f"Hour {h}" for h in range(24)], horizontal_spacing=0.03, vertical_spacing=0.05)

# fig_clustered_bid_valid = make_subplots(rows=rows, cols=cols, subplot_titles=[f"Hour {h}" for h in range(24)], horizontal_spacing=0.03, vertical_spacing=0.05)


# Overlay the validation bids on a *copy* of the clustered training figure.
# Working on a copy keeps `fig_clustered_bid_train` untouched (training curves
# and training title only) and makes this cell safe to re-run: every run starts
# from the training figure instead of stacking another set of validation traces.
fig_clustered_bid_valid = go.Figure(fig_clustered_bid_train)

for HOUR in range(24):
    # Run the validation rows for this hour through the scaler and PCA that
    # were fitted on the training data of the same hour.
    df_valid_filtered = filter_rows(df=df_valid_droped, HOUR=HOUR, RESOURCEBID_SEQ=RESOURCEBID_SEQ)
    df_valid_feat = extract_feat_from_bid(df_valid_filtered)
    df_valid_feat_scaled, _ = scale_feat(df_valid_feat, scaler=list_scaler_train[HOUR])
    df_valid_feat_scaled_pca = convert_numpy_pca_to_df(
        list_pca_train[HOUR].transform(df_valid_feat_scaled),
        df_valid_feat_scaled,
    )

    # Label the validation points with the training model and its thresholds.
    # valid_labels = list_model_kmeans_train[HOUR].predict(df_valid_feat_scaled_pca)
    cluster_thresholds = list_cluster_thresholds[HOUR]
    valid_labels = predict_valid_label(list_model_kmeans_train[HOUR], df_valid_feat_scaled_pca, cluster_thresholds)
    df_valid_feat_scaled_pca_w_label = df_valid_feat_scaled_pca.copy()
    df_valid_feat_scaled_pca_w_label['cluster_label'] = valid_labels

    cluster_labels_valid = df_valid_feat_scaled_pca_w_label[['cluster_label']].copy()

    # 1-based (row, col) of this hour's panel in the subplot grid.
    row = (HOUR // cols) + 1
    col = (HOUR % cols) + 1

    # traces_bid_valid = find_trace_bid_curves_for_multiple_days_2d(df_valid_filtered, HOUR)['traces']
    # for trace in traces_bid_valid:
    #     fig_bid_valid.add_trace(trace, row=row, col=col)

    # Validation curves use larger triangle markers and thicker lines than the
    # training curves they are drawn on top of.
    traces_clustered_bid_valid = find_trace_bid_curves_for_multiple_days_2d(
        df_data=df_valid_filtered,
        hr_specified=HOUR,
        cluster_labels=cluster_labels_valid,
        marker_symbol='triangle-up',
        marker_size=15,
        opacity=0.8,
        linewidth=2,
    )['traces']
    for trace in traces_clustered_bid_valid:
        fig_clustered_bid_valid.add_trace(trace, row=row, col=col)

# fig_bid_valid.update_layout(
#     height=rows*fig_height_factor,
#     width=cols*fig_width_factor,
#     title_text="Bidding curves by hour (X axis: Amount (MW), Y axis: Price ($)) (for testing(or validation) set)",
#     showlegend=False,
#     legend=dict(
#         orientation="h",
#         yanchor="bottom",
#         # y=1.05,
#         xanchor="right",
#         # x=1
#     ),
#     hovermode="x unified", 
# )

fig_clustered_bid_valid.update_layout(
    height=rows*fig_height_factor,
    width=cols*fig_width_factor,
    title_text="Clustered bidding curves by hour (X axis: Amount (MW), Y axis: Price ($)) (for testing(or validation) set)",
    showlegend=False,
    legend=dict(
        orientation="h",
        yanchor="bottom",
        # y=1.05,
        xanchor="right",
        # x=1
    ),
    # hovermode="x unified", 
)

# Save the figure
# fig_bid_valid.write_html("output/valid_bid.html")
fig_clustered_bid_valid.write_html("output/valid_clustered_bid.html")

# Show the figure
# fig_bid_valid.show()
# fig_clustered_bid_valid.show()