In [None]:
from pathlib import Path
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from datetime import datetime
import sys
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, davies_bouldin_score
from sklearn.mixture import GaussianMixture, BayesianGaussianMixture
from sklearn.cluster import DBSCAN
import plotly.graph_objects as go

# Make the project root importable so the `configs` and `src` packages used
# by the imports below can be resolved.
# Assumes the kernel's working directory is the 'notebooks/' folder, so the
# project root is its parent directory.
project_root = Path.cwd().parent
# Guard the append: re-running this cell must not pile up duplicate entries
# in sys.path.
if str(project_root) not in sys.path:
    sys.path.append(str(project_root))
from configs.path_config import EXTRACTED_DATA_DIR, OUTPUT_DIR

from src.clustering import clustering_preprocess #load_data, drop_columns_by_header_rules, remove_outliers, explain_variance, do_pca
from src.clustering import clustering_models # kmeans_clustering, gmm_clustering, kl_divergence, jeffreys_divergence, merge_clusters_by_divergence, streaming_dpgmm_clustering
from src.clustering import clustering_visualization #plot_clusters_over_time, plot_cluster_mean_and_std

### Data loading and preprocessing


In [None]:
# Locate the strain-distribution CSV to analyse and load it via the project loader.
# NOTE(review): the file name ends in '_04 1.csv' (with a space) — confirm this
# is the intended file and not an accidental duplicate copy.
# Alternative input kept for reference:
#   OUTPUT_DIR / 'strain_distributions' / 'N-F_Mid_Comp_20091129120000_20210611160000_strain_distribution.csv'
strain_dir = EXTRACTED_DATA_DIR / 'strain_distributions' / 'alvbrodel_04'
path = strain_dir / 'S-B_Close_Comp_20091129120000_20210611160000_strain_distribution_04 1.csv'
df = clustering_preprocess.load_data(path)

In [None]:
# Pass the loaded data through drop_columns_by_header_rules with threshold=0.
# The actual rules (and what threshold=0 means) are defined in
# src/clustering/clustering_preprocess — TODO confirm there.
# NOTE: `df` is rebound to the result, so re-running this cell alone feeds the
# previous output back into the function. Re-run the loading cell first to
# start from the raw data.
df = clustering_preprocess.drop_columns_by_header_rules(df, threshold=0)
# df  (uncomment to inspect the result)

In [None]:
# remove_outliers returns two objects, unpacked here as `df_strain` and `df`.
# `df_strain` is what the variance/PCA cells below consume; `df` is passed
# alongside it to do_pca and to the clustering calls.
# Presumably `df_strain` holds only the strain columns — TODO confirm.
# The meaning/units of threshold=7 and individual_threshold=7 are not visible
# from this notebook — TODO confirm against remove_outliers.
# NOTE: `df` is rebound, so re-running this cell alone applies the outlier
# removal to its own previous output.
df_strain, df = clustering_preprocess.remove_outliers(df, threshold=7, individual_threshold=7)
# df  (uncomment to inspect the result)

In [None]:
# Run explain_variance on `df_strain`; the return value is not captured.
# Presumably this reports explained variance to help choose n_components for
# the PCA cell below — TODO confirm what it prints/plots.
clustering_preprocess.explain_variance(df_strain)

In [None]:
# Number of components requested from do_pca (passed as its first argument).
n_components = 10
# do_pca returns two objects: `normalized_pca_components`, which is the input
# to both clustering sections below, and `df_pca`, which is not used again in
# the cells visible in this notebook.
normalized_pca_components, df_pca = clustering_preprocess.do_pca(n_components, df_strain, df)

### GMM Clustering

In [None]:
# Number of clusters passed to gmm_clustering.
n_clusters = 10
# Cluster the PCA components; the result is bound to `data_with_gmm` and is
# what the GMM plotting cells below receive.
# NOTE(review): no random seed is passed from here — if gmm_clustering does not
# fix one internally, cluster labels may differ between runs. Confirm.
data_with_gmm = clustering_models.gmm_clustering(normalized_pca_components, df, n_clusters)
# data_with_gmm  (uncomment to inspect the result)

In [None]:
# Plot the GMM clustering result over time. The second argument 'GMM' is a
# string tag for the method — presumably used for titles/labels; confirm.
clustering_visualization.plot_clusters_over_time(data_with_gmm, 'GMM')

In [None]:
# Select which GMM clusters to pass to plot_cluster_mean_and_std.
# NOTE(review): these indices are hard-coded; if cluster labels are not stable
# between runs, clusters 8 and 9 may not refer to the same groups — confirm.
clusters_to_keep = [8, 9] # 'all' or a list of cluster indices
# Presumably plots per-cluster mean and standard deviation (per the function
# name); 'GMM' is passed as the method tag.
clustering_visualization.plot_cluster_mean_and_std(data_with_gmm, clusters_to_keep, 'GMM')

### DPGMM Clustering

In [None]:
# Run the streaming DPGMM clustering on the PCA components.
# The function returns two objects; only the first (`data_with_dpgmm`) is used
# by the plotting cells below.
# The unused second value is bound to a named throwaway instead of `_`: in
# IPython/Jupyter `_` holds the last cell output, and assigning to it stops
# IPython from updating `_`, `__` and `___` for the rest of the session.
# Parameter meanings (n_points, window_size, step_size, max_components,
# merge_threshold) are defined in src/clustering/clustering_models —
# presumably a sliding window of 400 samples advanced by 100; TODO confirm.
# NOTE(review): no random seed is passed from here — confirm the function
# fixes one internally if reproducible labels are required.
data_with_dpgmm, _dpgmm_unused = clustering_models.streaming_dpgmm_clustering(
    normalized_pca_components=normalized_pca_components,
    df=df,
    n_points=1000,
    window_size=400,
    step_size=100,
    max_components=100,
    merge_threshold=60,
    merge_within_window=True,  # Toggle ON/OFF
)

In [None]:
# Plot the DPGMM clustering result over time. The second argument 'DPGMM' is a
# string tag for the method — presumably used for titles/labels; confirm.
clustering_visualization.plot_clusters_over_time(data_with_dpgmm, 'DPGMM')

In [None]:
# Select which DPGMM clusters to pass to plot_cluster_mean_and_std.
# NOTE(review): the inline comment describes the options as the string 'all'
# or a list of indices, but a one-element list ['all'] is passed here —
# confirm which form plot_cluster_mean_and_std actually accepts.
# This also rebinds `clusters_to_keep` from the GMM section above.
clusters_to_keep = ['all'] # 'all' or a list of cluster indices
# Presumably plots per-cluster mean and standard deviation (per the function
# name); 'DPGMM' is passed as the method tag.
clustering_visualization.plot_cluster_mean_and_std(data_with_dpgmm, clusters_to_keep, 'DPGMM')