In [None]:
import sys
import os


# Put the parent of the current working directory on the import path so the
# `src` package imported further down can be resolved.
PROJECT_ROOT = os.path.abspath(os.path.join(os.getcwd(), ".."))

# Guard the append: re-running this cell must not stack duplicate entries
# onto sys.path.
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

# Echo the resolved directory so a wrong working directory is spotted early.
print(PROJECT_ROOT)

In [None]:
# Directory layout, all derived from the parent of the working directory.
BASE_DIR = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
DATA_DIR = os.path.join(BASE_DIR, "data")

# The two expression-matrix directories differ only in the platform folder,
# so build both from the same template.
H5AD_V2_DIR, H5AD_V3_DIR = (
    os.path.join(
        DATA_DIR, "abc_atlas", "expression_matrices", platform_dir, "20230630"
    )
    for platform_dir in ("WMB-10Xv2", "WMB-10Xv3")
)

# Print every resolved location as a left-aligned "name  value" table.
print(
    "\n".join(
        f"{label:<15}{location}"
        for label, location in (
            ("BASE_DIR", BASE_DIR),
            ("DATA_DIR", DATA_DIR),
            ("H5AD_V2_DIR", H5AD_V2_DIR),
            ("H5AD_V3_DIR", H5AD_V3_DIR),
        )
    )
)

In [None]:
# Full paths to the HY log2 .h5ad files, one per 10X chemistry directory.
hy_v2_path = os.path.join(H5AD_V2_DIR, "WMB-10Xv2-HY-log2.h5ad")
hy_v3_path = os.path.join(H5AD_V3_DIR, "WMB-10Xv3-HY-log2.h5ad")

# Show both resolved paths, one per line, in the same aligned layout used
# for the directory listing.
print(
    f"{'hy_v2_path':<15}{hy_v2_path}",
    f"{'hy_v3_path':<15}{hy_v3_path}",
    sep="\n",
)

In [None]:
from src.utils import load_data
from src.clustering import ClusteringOptimization

import scanpy as sc
import matplotlib.pyplot as plt

In [None]:
# Open both HY expression files through the project loader. `backed="r"` is
# forwarded as-is; presumably it requests AnnData's read-only on-disk mode
# (the next cell calls `.to_memory()` on the result) — confirm in src.utils.
hy_v2 = load_data(hy_v2_path, backed="r")
# NOTE(review): hy_v3 is not used in any of the cells visible here.
hy_v3 = load_data(hy_v3_path, backed="r")

In [None]:
# Pull the backed v2 object into memory before handing it to the optimizer.
adata = hy_v2.to_memory()

# Set up the clustering search with all three tuning stages switched on.
optimizer = ClusteringOptimization(
    adata=adata,
    genes_to_clust=("top", 2_500),  # use fewer genes for speed
    n_pcs=15,  # specify PCs, or leave as None to auto-detect
    tune_dbscan=True,
    tune_leiden=True,
    tune_neighbors=True,
)

# Run 30 trials, then keep both the winning parameters and the object
# produced by applying them.
optimizer.run_optimization(n_trials=30)
best_params = optimizer.get_best_params()
final_adata = optimizer.apply_best_params()

print("\n--- Visualizing Final Results ---")
fig, axes = plt.subplots(1, 2, figsize=(14, 6))

# One entry per panel, left to right: obs column to colour by, panel title,
# and any panel-specific keyword arguments.
# NOTE(review): `na_color` is meant to colour the 'Filtered_Out_Noise' cells,
# but the filter below compares against that value as a string label —
# confirm those cells are actually treated as missing by the plot.
umap_panels = (
    ("dbscan_labels", "DBSCAN Noise Detection (All Cells)", {}),
    (
        "optimized_leiden",
        "Optimized Leiden (Noise Removed)",
        {"na_color": "lightgray"},
    ),
)
for panel_ax, (obs_column, panel_title, panel_kwargs) in zip(axes, umap_panels):
    sc.pl.umap(
        final_adata,
        color=obs_column,
        ax=panel_ax,
        show=False,  # defer rendering until both panels are drawn
        frameon=False,
        title=panel_title,
        **panel_kwargs,
    )

plt.tight_layout()
plt.show()


# Drop the cells labelled as noise and plot the remaining clusters alone.
not_noise = final_adata.obs["optimized_leiden"] != "Filtered_Out_Noise"
adata_clean = final_adata[not_noise].copy()
sc.pl.umap(
    adata_clean,
    color="optimized_leiden",
    title="Optimized Leiden (Filtered View)",
)
