# file import 

In [20]:
import numpy as np


# Low-dimensional projection with the noise points still included.
projected_vector_with_noise = np.load("../../d3-app/data/projection.npy")
print(f"length with noise: {projected_vector_with_noise.shape[0]}")

# HDBSCAN labels loaded from disk.
# NOTE(review): presumably one label per projection row — confirm against the export step.
hdbscan_label = np.load("../../d3-app/data/hdbscan_label.npy")

# Noise-free projection: keep only the rows whose label is not -1
# (points that HDBSCAN judged to be noise are dropped).
projected_vector_no_noise = projected_vector_with_noise[np.not_equal(hdbscan_label, -1)]
print(f"length without noise: {projected_vector_no_noise.shape[0]}")


length with noise: 115754
length without noise: 6367


# sample

In [None]:
import plotly.express as px
import plotly.io as pio

# Display figures through the "iframe" renderer.
pio.renderers.default = "iframe"

# Sample figure built from plotly's bundled iris data set; used to try out
# the title/font/template settings below.
# NOTE(review): the title text and output filename say "PCA", but the axes are
# the raw sepal_length / sepal_width columns — placeholder wording, confirm before reuse.
df = px.data.iris()
fig = px.scatter(df, x="sepal_length", y="sepal_width", color="species")

# Place the title at the horizontal centre of the figure.
fig.update_layout(
    title={
        'text': "PCA Plot of Biological Samples", # title text
        'y': 0.95,           # vertical position (0 to 1)
        'x': 0.5,            # horizontal position (0.5 = centre)
        'xanchor': 'center', # anchor the title at its centre
        'yanchor': 'top'     # anchor the title at its top edge
    },
    font=dict(
        family="Arial",      # font commonly used in papers
        size=16,
        color="black"
    ),
    template="plotly_white"
)

fig.show()
# Static export of the same figure.
pio.write_image(fig, "pca_plot.pdf", width=800, height=600)

# 散布図(ノイズあり、ノイズなし)

In [18]:
import plotly.graph_objects as go

def save_umap_plot(x_data, y_data, filename="umap_plot.pdf", point_size=1, opacity=1,
                   x_label="UMAP 1", y_label="UMAP 2"):
    """Write a black-marker scatter plot of (x_data, y_data) to an image file.

    Args:
        x_data: x coordinates, passed straight to ``go.Scatter``.
        y_data: y coordinates, passed straight to ``go.Scatter``.
        filename: output path handed to ``fig.write_image``.
        point_size: marker size.
        opacity: marker opacity.
        x_label: x-axis title (defaults to the previously hard-coded "UMAP 1").
        y_label: y-axis title (defaults to the previously hard-coded "UMAP 2").

    Returns:
        None. The only effect is the file written by ``fig.write_image``.
    """

    def _axis_style(axis_title):
        # Keep only the axis line: no tick labels, no grid, and no mirrored
        # line (leaving the plot unboxed is a popular style for UMAP figures).
        return dict(
            title=axis_title,
            showticklabels=False,
            showgrid=False,
            linecolor='black',
            linewidth=2,
            mirror=False
        )

    fig = go.Figure()

    fig.add_trace(go.Scatter(
        x=x_data, y=y_data,
        mode='markers',
        marker=dict(color='black', size=point_size, opacity=opacity),
        showlegend=False
    ))

    fig.update_layout(
        template="plotly_white",
        width=500, height=500,
        font=dict(family="Arial", size=14),
        xaxis=_axis_style(x_label),
        yaxis=_axis_style(y_label),
        margin=dict(l=50, r=20, t=50, b=50)
    )

    fig.write_image(filename, engine="kaleido")
    

In [21]:
# Export both scatter variants with identical marker settings:
# first the projection without noise points, then the full projection.
save_umap_plot(
    projected_vector_no_noise[:, 0],
    projected_vector_no_noise[:, 1],
    filename="umap_no_noise.pdf",
    point_size=1,
    opacity=1,
)
save_umap_plot(
    projected_vector_with_noise[:, 0],
    projected_vector_with_noise[:, 1],
    filename="umap_with_noise.pdf",
    point_size=1,
    opacity=1,
)

# Density Heatmap(ノイズ有り用)

In [56]:
def save_density_heatmap(data_x, data_y, nbins=30, x_label="UMAP 1", y_label="UMAP 2",
                         filename="density_heatmap.pdf", title="Density Analysis"):
    """Write a 2-D density heatmap of (data_x, data_y) to an image file.

    Args:
        data_x: x coordinates, passed to ``px.density_heatmap``.
        data_y: y coordinates, passed to ``px.density_heatmap``.
        nbins: bin count used for both ``nbinsx`` and ``nbinsy``.
        x_label: x-axis label.
        y_label: y-axis label.
        filename: output path handed to ``fig.write_image``.
        title: figure title (defaults to the previously hard-coded
            "Density Analysis").

    Returns:
        None. The only effect is the file written by ``fig.write_image``.
    """
    fig = px.density_heatmap(
        x=data_x,
        y=data_y,
        nbinsx=nbins,
        nbinsy=nbins,
        color_continuous_scale="Viridis",
        labels={'x': x_label, 'y': y_label},
        template="plotly_white"
    )

    fig.update_layout(
        width=500,  # square canvas, since the colour bar is hidden
        height=500,
        font=dict(family="Arial", size=14),
        title={'text': title, 'x': 0.5, 'xanchor': 'center'},
        # Hide the colour bar on the right-hand side (the legend equivalent).
        coloraxis_showscale=False
    )

    # Both axes share one style: boxed black frame, outside ticks,
    # and no numeric tick labels.
    axis_style = dict(
        showline=True, linewidth=2, linecolor='black', mirror=True,
        ticks="outside",
        showticklabels=False
    )
    fig.update_xaxes(**axis_style)
    fig.update_yaxes(**axis_style)

    fig.write_image(filename, engine="kaleido", scale=2)

In [46]:
import os

# Ensure the folder that receives the density-heatmap PDFs exists;
# nothing happens when it is already there.
os.makedirs(name="density_heatmap", exist_ok=True)

In [57]:

# Density heatmap of the full projection (noise points included), 500 bins per axis.
save_density_heatmap(
    data_x=projected_vector_with_noise[:, 0],
    data_y=projected_vector_with_noise[:, 1],
    filename="density_heatmap/_density_with_noise.pdf",
    nbins=500,
)

Hyperparameter: `nbins=500`

# DBSCAN

In [40]:
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
from sklearn.cluster import DBSCAN
import numpy as np
import pandas as pd

def save_dbscan_scatter(
    data: np.ndarray,
    eps: float = 0.5,
    min_samples: int = 5,
    x_label: str = "UMAP 1",
    y_label: str = "UMAP 2",
    title: str = "DBSCAN Clustering",
    filename: str = "dbscan_no_legend.pdf"
):
    """Cluster ``data`` with DBSCAN, print summary statistics, and save a scatter plot.

    Args:
        data: array passed to ``DBSCAN.fit_predict``; it is also loaded into a
            DataFrame with the two columns 'x' and 'y'.
        eps: DBSCAN ``eps`` parameter.
        min_samples: DBSCAN ``min_samples`` parameter.
        x_label: x-axis label.
        y_label: y-axis label.
        title: figure title.
        filename: output path handed to ``pio.write_image``.

    Returns:
        None. Prints the cluster count and the largest/smallest cluster size
        (label -1 excluded), then writes the figure to ``filename``.
    """
    # Run DBSCAN.
    labels = DBSCAN(eps=eps, min_samples=min_samples).fit_predict(data)

    # Count points per label once; label -1 (outliers) is left out of the statistics.
    label_values, label_counts = np.unique(labels, return_counts=True)
    cluster_counts = label_counts[label_values != -1]
    print(f"Number of clusters found: {len(cluster_counts)}")

    # Largest and smallest cluster size, falling back to 0 when no cluster exists.
    if len(cluster_counts) > 0:
        largest = cluster_counts.max()
        smallest = cluster_counts.min()
    else:
        largest = smallest = 0
    print(f"Max cluster size: {largest}, Min cluster size: {smallest}")

    # Labels are stored as strings so that 'cluster' is a discrete colour key.
    points = pd.DataFrame(data, columns=['x', 'y']).assign(cluster=labels.astype(str))

    # Build the scatter plot.
    fig = px.scatter(
        points, x='x', y='y', color='cluster',
        title=title, template="plotly_white",
        labels={'x': x_label, 'y': y_label}
    )

    # Use small markers.
    fig.update_traces(marker_size=1)

    # Draw the outlier trace (label -1) in gray.
    for trace in fig.data:
        if trace.name == '-1':
            trace.update(marker_color='lightgray')

    # Paper-style layout: legend hidden, centred title, boxed black axes.
    axis_frame = dict(showline=True, linewidth=2, linecolor='black', mirror=True, ticks='outside')
    fig.update_layout(
        showlegend=False,
        width=500, height=500,
        font=dict(family="Arial", size=14, color="black"),
        title={'x': 0.5, 'xanchor': 'center'},
        xaxis=dict(axis_frame),
        yaxis=dict(axis_frame),
        margin=dict(l=60, r=20, t=60, b=60)
    )

    # Save to disk.
    pio.write_image(fig, filename, engine="kaleido", scale=2)

# --- Usage example ---
# data = np.random.rand(200, 2)
# save_dbscan_scatter(data, eps=0.1, min_samples=5)

In [44]:
# DBSCAN hyperparameter search (prints the number of clusters for each setting)
def dbscan_hyperparameter_search(
    data: np.ndarray,
    eps_values: list,
    min_samples_values: list
):
    """Run DBSCAN for every (eps, min_samples) pair and print summary statistics.

    Args:
        data: array passed to ``DBSCAN.fit_predict``.
        eps_values: candidate ``eps`` values (outer loop).
        min_samples_values: candidate ``min_samples`` values (inner loop).

    Returns:
        None. For each pair, prints the cluster count, the number of points
        labelled -1, and the largest/smallest cluster size (label -1 excluded).
    """
    grid = [
        (eps, min_samples)
        for eps in eps_values
        for min_samples in min_samples_values
    ]
    for eps, min_samples in grid:
        # Run DBSCAN for this setting.
        labels = DBSCAN(eps=eps, min_samples=min_samples).fit_predict(data)

        # Points per label; everything except label -1 counts as a cluster.
        label_values, label_counts = np.unique(labels, return_counts=True)
        cluster_counts = label_counts[label_values != -1]
        print(f"eps: {eps}, min_samples: {min_samples} => Number of clusters: {len(cluster_counts)}")

        # Noise points are those labelled -1.
        noise_total = np.count_nonzero(labels == -1)
        print(f"Number of noise points: {noise_total}")

        # Largest and smallest cluster size, falling back to 0 when no cluster exists.
        if len(cluster_counts) > 0:
            largest = cluster_counts.max()
            smallest = cluster_counts.min()
        else:
            largest = smallest = 0
        print(f"Max cluster size: {largest}, Min cluster size: {smallest}")
# Sweep eps x min_samples over the full projection (noise points included).
dbscan_hyperparameter_search(
    data=projected_vector_with_noise,
    min_samples_values=[5, 10],
    eps_values=[0.02, 0.03, 0.04, 0.05, 0.1, 0.3, 0.5, 0.7],
)

eps: 0.02, min_samples: 5 => Number of clusters: 4131
Number of noise points: 66221
Max cluster size: 2690, Min cluster size: 1
eps: 0.02, min_samples: 10 => Number of clusters: 367
Number of noise points: 106444
Max cluster size: 1618, Min cluster size: 5
eps: 0.03, min_samples: 5 => Number of clusters: 2663
Number of noise points: 23350
Max cluster size: 13949, Min cluster size: 1
eps: 0.03, min_samples: 10 => Number of clusters: 1299
Number of noise points: 72095
Max cluster size: 3205, Min cluster size: 2
eps: 0.04, min_samples: 5 => Number of clusters: 874
Number of noise points: 8062
Max cluster size: 60134, Min cluster size: 2
eps: 0.04, min_samples: 10 => Number of clusters: 1015
Number of noise points: 33127
Max cluster size: 16901, Min cluster size: 1
eps: 0.05, min_samples: 5 => Number of clusters: 328
Number of noise points: 3451
Max cluster size: 93057, Min cluster size: 1
eps: 0.05, min_samples: 10 => Number of clusters: 430
Number of noise points: 13949
Max cluster size:

In [35]:
import os

# Ensure the folder that receives the DBSCAN scatter PDFs exists;
# nothing happens when it is already there.
os.makedirs(name="dbscan", exist_ok=True)

In [41]:

# Render the eps=0.05 / min_samples=5 setting
# (search output: eps: 0.05, min_samples: 5 => Number of clusters: 328).
save_dbscan_scatter(
    projected_vector_with_noise,
    eps=0.05,
    min_samples=5,
    title="DBSCAN Clustering on UMAP Projection",
    x_label="UMAP 1",
    y_label="UMAP 2",
    filename="dbscan/dbscan_umap_scatter_eps005_min5.pdf",
)

Number of clusters found: 328
Max cluster size: 93057, Min cluster size: 1






In [43]:
# Render the eps=0.02 / min_samples=10 setting
# (search output: eps: 0.02, min_samples: 10 => Number of clusters: 367).
save_dbscan_scatter(
    projected_vector_with_noise,
    eps=0.02,
    min_samples=10,
    title="DBSCAN Clustering on UMAP Projection",
    x_label="UMAP 1",
    y_label="UMAP 2",
    filename="dbscan/dbscan_umap_scatter_eps002_min10.pdf",
)

Number of clusters found: 367
Max cluster size: 1618, Min cluster size: 5






In [None]:
# Render one scatter PDF per (eps, min_samples) pair from the hyperparameter search.
# The filename tag is eps with the decimal point removed (e.g. 0.05 -> "005").
for eps, min_samples in [
    (eps_value, min_samples_value)
    for eps_value in [0.02, 0.03, 0.04, 0.05, 0.1, 0.3, 0.5, 0.7]
    for min_samples_value in [5, 10]
]:
    save_dbscan_scatter(
        projected_vector_with_noise,
        eps=eps,
        min_samples=min_samples,
        title=f"DBSCAN Clustering on UMAP Projection (eps={eps}, min_samples={min_samples})",
        filename=f"dbscan/dbscan_umap_scatter_eps{str(eps).replace('.', '')}_min{min_samples}.pdf",
        x_label="UMAP 1",
        y_label="UMAP 2",
    )

eps: 0.03, min_samples: 5 => Number of clusters: 2663
Number of noise points: 23350
Max cluster size: 13949, Min cluster size: 1
を採用

# シルエット係数

クラスタ分離ができているかどうかの評価: DBSCANのノイズを除外するとまあ分離はできていることになりそう