In [None]:
from pathlib import Path
import pandas as pd
import sys
import pickle

# Add the project root to the Python path so that the `configs` and `src` packages can be imported.
# The root is taken as the parent of the current working directory, i.e. this assumes the
# notebook is run from inside 'notebooks/'.
project_root = Path.cwd().parent
# Only append once: re-running this cell would otherwise add a duplicate sys.path entry each time.
if str(project_root) not in sys.path:
    sys.path.append(str(project_root))
from configs.path_config import EXTRACTED_DATA_DIR, OUTPUT_DIR
from configs.support_coords import *

from src.clustering import clustering_preprocess #load_data, drop_columns_by_header_rules, remove_outliers, explain_variance, do_pca
from src.clustering import clustering_models # kmeans_clustering, gmm_clustering, kl_divergence, jeffreys_divergence, merge_clusters_by_divergence, streaming_dpgmm_clustering
from src.clustering import clustering_visualization #plot_clusters_over_time, plot_cluster_mean_and_std
from src.clustering import sankey_diagram #plot_sankey_diagram

### Data loading and preprocessing


In [None]:
# Run configuration: the beam to analyse, its support coordinates and the input CSV.
beam_id = 'E'
support_coords = beam_E  # presumably supplied by the star import from configs.support_coords
strain_csv_name = f'S-{beam_id}_Close_Comp_20091129120000_20210611160000_strain_distribution_04.csv'
path = EXTRACTED_DATA_DIR / 'strain_distributions' / 'alvbrodel_04' / strain_csv_name

# Pipeline: load_data -> drop_columns_by_header_rules -> remove_outliers.
df = clustering_preprocess.drop_columns_by_header_rules(
    clustering_preprocess.load_data(path),
    threshold=0,
)
# remove_outliers hands back two frames; `df` is rebound to the second one.
df_strain, df = clustering_preprocess.remove_outliers(
    df,
    threshold=7,
    individual_threshold=7,
)

### Explained Variance by Number of Principal Components

In [None]:
# Run the explained-variance analysis on the strain data.
# Presumably this is what informs the choice of n_components in the PCA cell — confirm.
clustering_preprocess.explain_variance(df_strain)

### Perform the PCA

In [None]:
# Number of principal components passed to do_pca.
# NOTE(review): 8 is hard-coded; presumably chosen from the explained-variance output — confirm.
n_components = 8
# do_pca returns two results: `normalized_pca_components` (the input to the GMM and DPGMM
# clustering cells) and `df_pca`.
normalized_pca_components, df_pca = clustering_preprocess.do_pca(n_components, df_strain, df)

### GMM Clustering

In [None]:
# Number of clusters requested from the GMM.
n_clusters = 5
# gmm_clustering returns the clustered data (its 'Cluster' column is relabelled in the next cell)
# and `cluster_color_map`, which the plotting helpers receive.
# NOTE(review): no random seed is visible in this notebook — confirm that gmm_clustering fixes one,
# otherwise cluster labels may change between runs and the manual relabelling below will not match.
data_with_gmm, cluster_color_map  = clustering_models.gmm_clustering(normalized_pca_components, df, n_clusters)
# Last expression of the cell: displays the clustered data.
data_with_gmm

In [None]:
# Relabel the GMM clusters on a copy, leaving `data_with_gmm` untouched.
# Net mapping old label -> new label; any label not listed keeps its value.
# Labels 5 and 6 are listed because they also end up as 0 — TODO confirm they never occur
# in the input when n_clusters = 5.
# NOTE(review): the resulting label set is {0, 1, 3, 4, 5} (2 is unused, 5 is new) — confirm that
# cluster_color_map has an entry for every one of these labels.
cluster_relabel_map = {0: 1, 1: 5, 2: 3, 3: 4, 4: 0, 5: 0, 6: 0}

data_with_gmm_new = data_with_gmm.copy()
# Snapshot the labels first so every mask below refers to the labels before relabelling,
# which makes the result independent of the order of the assignments.
labels_before = data_with_gmm_new['Cluster'].copy()
for old_label, new_label in cluster_relabel_map.items():
    data_with_gmm_new.loc[labels_before == old_label, 'Cluster'] = new_label

# Last expression of the cell: displays the relabelled data.
data_with_gmm_new

In [None]:
# Output location of the GMM clusters-over-time figure.
gmm_output_dir = OUTPUT_DIR / 'strain_distributions' / 'GMM'
save_path = gmm_output_dir / f'{beam_id}_GMM_clusters_over_time.pdf'

# Plot the relabelled GMM assignment; save=True presumably makes the helper write to save_path.
clustering_visualization.plot_clusters_over_time(
    data_with_gmm_new, cluster_color_map, 'GMM', beam_id, save_path, save=True
)

In [None]:
# Clusters to include in the plot: 'all' or a list of cluster indices.
# clusters_to_keep = ['all']
clusters_to_keep = [0, 3]
# The list is interpolated as-is, so the file name contains its repr, e.g. "[0, 3]".
mean_std_filename = f"{beam_id}_{clusters_to_keep}_mean_and_std_GMM.pdf"
save_path = OUTPUT_DIR / 'strain_distributions' / 'GMM' / mean_std_filename
clustering_visualization.plot_cluster_mean_and_std(
    data_with_gmm_new,
    clusters_to_keep,
    cluster_color_map,
    'GMM',
    support_coords,
    beam_id,
    save_path,
    save=True,
)

### DPGMM Clustering

In [None]:
# Directory that collects the DPGMM outputs for this beam.
# NOTE(review): the "7" in the folder name is hard-coded; it matches step_size=7 in the clustering
# call and has to be updated by hand if that value changes.
save_dir = OUTPUT_DIR / 'strain_distributions' / 'DPGMM' / f'{beam_id}_step_size_7'
# File name of the pickle that stores the clustering results.
name = f"{beam_id}_clustering_results.pkl"
# NOTE(review): this rebinds `path`, which previously pointed at the input CSV.
path = save_dir / name
print(path)

In [None]:
# Run the streaming DPGMM clustering on the PCA components and cache the results on disk.
# NOTE(review): this rebinds `cluster_color_map` and `normalized_pca_components`, so the GMM cells
# above should not be re-run after this cell without re-running the PCA/GMM cells first.
# The durations in the comments below assume one sample per day — TODO confirm.
data_with_dpgmm, cluster_color_map, cluster_dict, normalized_pca_components, all_labels = clustering_models.streaming_dpgmm_clustering(
    normalized_pca_components=normalized_pca_components,
    df=df,
    prior=0.1,                # How restrictive the model is when creating new clusters; the lower, the more restrictive
    n_points=1095,            # Number of points used for the initial clustering (~3 years)
    window_size=180,          # Size of the sliding window (~0.5 years)
    step_size=7,              # Step size of the sliding window (~1 week)
    max_components=100,       # Maximum number of components to use in the model
    merge_threshold=7,        # Threshold for merging clusters
)

# Bundle everything the later cells need so it can be restored without re-running the clustering.
clustering_results = {
    "data_with_dpgmm": data_with_dpgmm,
    "cluster_color_map": cluster_color_map,
    "cluster_dict": cluster_dict,
    "normalized_pca_components": normalized_pca_components,
    "all_labels": all_labels
}

# open(..., 'wb') does not create missing directories, so make sure the output folder exists;
# otherwise the (expensive) clustering result is lost to a FileNotFoundError on a fresh checkout.
path.parent.mkdir(parents=True, exist_ok=True)
with open(path, 'wb') as outf:
    pickle.dump(clustering_results, outf)

In [None]:
# Restore the DPGMM results from the pickle file at `path`.
# NOTE: pickle.load can execute arbitrary code; only load files produced by this notebook.
with open(path, 'rb') as f:
    clustering_results_reload = pickle.load(f)

# Unpack the stored entries back into the notebook-level names used by the following cells.
(
    data_with_dpgmm,
    cluster_color_map,
    cluster_dict,
    normalized_pca_components,
    all_labels,
) = (
    clustering_results_reload[key]
    for key in (
        "data_with_dpgmm",
        "cluster_color_map",
        "cluster_dict",
        "normalized_pca_components",
        "all_labels",
    )
)

### Visualize the Clusters

In [None]:
# Target file for the overview figure of all DPGMM clusters.
save_path = save_dir / f'{beam_id}_clusters_all.pdf'
# NOTE(review): the keyword is called `save_dir` but receives a file path (save_path) —
# confirm that plot_dpgmm_clusters expects a full file path here and not a directory.
clustering_visualization.plot_dpgmm_clusters(
    data_with_dpgmm,
    normalized_pca_components,
    all_labels,
    cluster_color_map, 
    num_components_to_plot=4,
    beam_id=beam_id,
    save_dir=save_path,
    save=True)

### Plot the Cluster Assignment over Time

In [None]:
# File name and full path of the DPGMM clusters-over-time figure (stored in the DPGMM output folder).
name = f"clusters_over_time_{beam_id}.pdf"
save_path = save_dir / name
# Same helper as used for the GMM result, here fed with the DPGMM data and its colour map.
clustering_visualization.plot_clusters_over_time(
    data_with_dpgmm, 
    cluster_color_map, 
    'DPGMM', 
    beam_id,
    save_path, 
    save=True)

### Cluster Visualization
Visualizes the representative strain distribution of each cluster in the form of mean strain (dark) and standard deviation of strain (light shade).

In [None]:
# Clusters to include in the plot: 'all' or a list of cluster indices.
clusters_to_keep = [2, 4]
# The list is interpolated as-is, so the file name contains its repr, e.g. "[2, 4]".
name = f"{beam_id}_{clusters_to_keep}_mean_and_std.pdf"
save_path = save_dir / name
clustering_visualization.plot_cluster_mean_and_std(
    data_with_dpgmm,
    clusters_to_keep,
    cluster_color_map,
    'DPGMM',
    support_coords,
    beam_id,
    save_path,
    save=True,
)

### Plot Sankey Diagram
Visualizes the transitions between clusters, and the merges of clusters, after each step of the sliding window in the clustering algorithm.

In [None]:
# Thin out the window steps: keep every 12th key of cluster_dict (in iteration order) and turn
# each kept key, parsed with format "%Y-%m-%d", into a datetime.date.
# To use every step instead, drop the [::12] slice.
sampled_keys = list(cluster_dict)[::12]
cluster_dict_converted = {
    pd.to_datetime(key, format="%Y-%m-%d").date(): cluster_dict[key]
    for key in sampled_keys
}

# Build the link list and derive the node/source/target/value inputs for the plot.
links = sankey_diagram.build_sankey_links_from_cluster_dict(cluster_dict_converted)
nodes, source, target, value = sankey_diagram.prepare_sankey_data(links)

save_path = save_dir / f'{beam_id}_sankey_diagram_12_week_intervals.pdf'
sankey_title = f"Cluster transitions over Time for Beam {beam_id} (12 week intervals)"

sankey_diagram.plot_sankey(
    nodes,
    source,
    target,
    value,
    title=sankey_title,
    save_path=save_path,
    save=True,
)