# Interactive UMAP & HDBSCAN Clustering Pipeline

This notebook allows you to run the clustering pipeline step-by-step with adjustable parameters.

In [None]:
# Imports
import pandas as pd
import numpy as np
import umap
import hdbscan
from sklearn.experimental import enable_iterative_imputer  # noqa
from sklearn.impute import IterativeImputer
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns


## Load Data
Set your CSV file paths below.

In [None]:
# EDIT PATHS HERE
# Placeholder locations of the two input CSV files; replace both before running.
# They are passed to `load_and_combine_data` in the pipeline cell, which tags
# rows from the first file as 'Female' and rows from the second as 'Male'.
female_csv_path = 'path/to/females.csv'
male_csv_path = 'path/to/males.csv'


## Define Pipeline Functions
These are the same functions as in the standalone script; paste them into the cell below or import them from an external module.

In [None]:
# Paste the modular pipeline functions from the standalone script here, or import them from a module.
# The pipeline cell further down calls `load_and_combine_data`, `create_intervals`, `aggregate_behaviors`,
# `preprocess_and_impute`, `scale_features`, `compute_umap_embedding`, `perform_hdbscan_clustering`,
# `add_cluster_labels`, and `plot_umap_clusters`.
# Only `load_and_combine_data` is reproduced below for brevity; the others must be defined
# (in this cell or in separate cells) before the pipeline cell is run.

# ... example for loading and combining data function ...
def load_and_combine_data(female_path, male_path):
    """Load the female and male CSV files and stack them into one DataFrame.

    Each table gets a ``Sex`` column (``'Female'`` / ``'Male'``) before the two
    are concatenated with a fresh ``0..n-1`` index. The ``Time`` column is
    converted with ``pandas.to_timedelta`` unless it is already timedelta64.

    Parameters
    ----------
    female_path, male_path
        Anything ``pandas.read_csv`` accepts as its first argument.

    Returns
    -------
    pandas.DataFrame
        Rows of the female file followed by rows of the male file.

    Raises
    ------
    KeyError
        If the combined table has no ``Time`` column.
    """
    df_females = pd.read_csv(female_path)
    df_males = pd.read_csv(male_path)
    df_females['Sex'] = 'Female'
    df_males['Sex'] = 'Male'
    combined_df = pd.concat([df_females, df_males], ignore_index=True)
    # Use pandas' own dtype check: np.issubdtype raises TypeError when the
    # column has a pandas extension dtype (e.g. the string dtype), whereas
    # this helper simply returns False for anything that is not timedelta64.
    if not pd.api.types.is_timedelta64_dtype(combined_df['Time']):
        combined_df['Time'] = pd.to_timedelta(combined_df['Time'])
    return combined_df

# (Continue copying all other functions here similarly)


## Parameters
Adjust these parameters before running the pipeline.

In [None]:
# --- Interval / filtering settings (consumed by the pipeline cell) ---
interval_seconds = 2  # handed to `create_intervals`
exclude_ids = ['ID63', 'ID214']  # forwarded as `exclude_ids=`
exclude_geno = 'atg7OE'  # forwarded as `exclude_geno=`
# Forwarded as `last_label=`.
# NOTE(review): the label spans a 2-second window, so it presumably has to be
# updated together with `interval_seconds` -- confirm against `create_intervals`.
last_interval_label = "0 days 00:09:58 - 0 days 00:10:00"

# --- Settings handed to `compute_umap_embedding` ---
umap_params = dict(
    n_components=2,
    n_neighbors=25,
    min_dist=0.1,
    metric='euclidean',
    random_state=42,
    verbose=True,
)

# --- Settings handed to `perform_hdbscan_clustering` ---
hdbscan_params = dict(min_cluster_size=500, min_samples=90)


## Run the Clustering Pipeline

In [None]:
# Run pipeline step-by-step
# Requires the paths cell, the parameters cell, and every pipeline function to
# have been defined first; otherwise the calls below raise NameError.

# Read both CSVs, tag each row with its sex, and convert `Time` to timedelta.
combined_df = load_and_combine_data(female_csv_path, male_csv_path)
# NOTE(review): presumably bins rows into `interval_seconds`-wide time windows --
# confirm against the definition of `create_intervals`.
combined_df = create_intervals(combined_df, interval_seconds)
# Columns handed to `aggregate_behaviors`.
behavior_cols = [
    'B_W_nose2nose', 'B_W_sidebyside', 'B_W_sidereside', 'B_W_nose2tail', 'B_W_nose2body',
    'B_W_following', 'B_climb-arena', 'B_sniff-arena', 'B_immobility', 'B_stat-lookaround',
    'B_stat-active', 'B_stat-passive', 'B_moving', 'B_sniffing', 'B_speed'
]
agg_df = aggregate_behaviors(combined_df, behavior_cols)
# Returns the table to cluster plus the list of columns to use from here on
# (which may differ from `behavior_cols`).
filtered_df, behavior_cols_filtered = preprocess_and_impute(
    agg_df, exclude_ids=exclude_ids, exclude_geno=exclude_geno, last_label=last_interval_label
)
scaled_data, scaler = scale_features(filtered_df, behavior_cols_filtered)
embedding, umap_model = compute_umap_embedding(scaled_data, umap_params)
# Clustering runs on the UMAP embedding, not on `scaled_data`.
labels, hdbscan_model = perform_hdbscan_clustering(embedding, hdbscan_params)
# `filtered_df` is rebound to the result; the save cell writes this version.
filtered_df = add_cluster_labels(filtered_df, labels)

# Visualize
plot_umap_clusters(embedding, labels)


## Save Results (optional)

In [None]:
# Write the table produced by the pipeline cell (`filtered_df`) to a CSV file.
# The relative path resolves against the current working directory, and
# `index=False` leaves the DataFrame index out of the file.
output_csv_path = 'clustered_results.csv'
filtered_df.to_csv(output_csv_path, index=False)
print(f'Saved clustered results to {output_csv_path}')