In [76]:
import pickle

with open("../19_tree/processed_data/cluster_similarities.pkl" , "rb") as f:
    similarity_dict = pickle.load(f)

print(similarity_dict.keys())

dict_keys(['kl_divergence', 'bhattacharyya_coefficient', 'mahalanobis_distance'])


ベクトル
空間への埋め込み
HSV

In [77]:
import numpy as np
import pandas as pd
from sklearn.manifold import MDS
import plotly.graph_objects as go
from matplotlib.colors import hsv_to_rgb
from scipy.stats import scoreatpercentile
import warnings

print("✅ Libraries imported successfully")

✅ Libraries imported successfully


In [78]:
# Load embedding data and HDBSCAN results
import pickle

data_file_path = "../18_rapids/result/20251203_053328/embedding.npz"
word_file_path = "../18_rapids/result/20251203_053328/data.npz"
hdbscan_condensed_tree_file_path = "../18_rapids/result/20251203_053328/condensed_tree_object.pkl"

data = np.load(data_file_path)
word_data = np.load(word_file_path)
embedding = data['embedding']
labels = word_data['words']

with open(hdbscan_condensed_tree_file_path, 'rb') as f:
    hdbscan_condensed_tree = pickle.load(f)

print(f"✅ Embedding shape: {embedding.shape}")
print(f"✅ Labels shape: {labels.shape}")
print(f"✅ Similarity dict keys: {list(similarity_dict.keys())}")

✅ Embedding shape: (115754, 2)
✅ Labels shape: (115754,)
✅ Similarity dict keys: ['kl_divergence', 'bhattacharyya_coefficient', 'mahalanobis_distance']


In [79]:
# Prepare point-to-cluster mapping from HDBSCAN
raw_tree = hdbscan_condensed_tree._raw_tree
leaf_rows = raw_tree[raw_tree['child_size'] == 1]
point_cluster_map = {int(row['child']): int(row['parent']) for row in leaf_rows}

parent_ids = [point_cluster_map[i] for i in range(len(labels))]

# Create DataFrame
df = pd.DataFrame({
    "x": embedding[:, 0],
    "y": embedding[:, 1],
    "label": labels,
    "cluster_id": parent_ids,
    "noise": word_data["labels"] == -1,
    "cluster_label": word_data["labels"]
})

# Filter noise points
df_denoised = df[~df['noise']].copy()

print(f"✅ Total points: {len(df)}")
print(f"✅ Denoised points: {len(df_denoised)}")
print(f"✅ Unique clusters: {df_denoised['cluster_id'].nunique()}")

✅ Total points: 115754
✅ Denoised points: 6367
✅ Unique clusters: 473


## 1. Similarity Matrix Creation

Convert dictionary-based similarity (cluster_id, cluster_id) -> value into a normalized N×N matrix

In [80]:
def create_similarity_matrix_from_dict(similarity_dict):
    """
    Convert (cluster_id, cluster_id) -> similarity dictionary into N×N matrix
    
    Args:
        similarity_dict: Dict with (id1, id2) as keys and similarity as values
    
    Returns:
        tuple: (similarity_matrix, id_to_index mapping)
    """
    cluster_ids = set()
    for id1, id2 in similarity_dict.keys():
        cluster_ids.add(id1)
        cluster_ids.add(id2)
    
    sorted_ids = sorted(list(cluster_ids))
    N = len(sorted_ids)
    
    id_to_index = {id: i for i, id in enumerate(sorted_ids)}
    similarity_matrix = np.zeros((N, N), dtype=float)
    
    for (id1, id2), similarity in similarity_dict.items():
        i = id_to_index.get(id1)
        j = id_to_index.get(id2)
        
        if i is None or j is None:
            continue
        
        similarity_matrix[i, j] = similarity
        if i != j:
            similarity_matrix[j, i] = similarity
    
    np.fill_diagonal(similarity_matrix, 1.0)
    
    return similarity_matrix, id_to_index

# Create similarity matrix from mahalanobis distance
similarity_matrix, id_to_index = create_similarity_matrix_from_dict(
    similarity_dict["mahalanobis_distance"]
)
print(f"✅ Similarity matrix shape: {similarity_matrix.shape}")

✅ Similarity matrix shape: (885, 885)


## 2. MDS Projection and Scaling Functions

In [81]:
def apply_mds_projection(similarity_matrix, n_components=3):
    """
    Convert similarity matrix to distance and apply MDS projection
    
    Args:
        similarity_matrix: N×N similarity matrix (0=dissimilar, 1=similar)
        n_components: Number of dimensions for projection
    
    Returns:
        tuple: (3D coordinates, MDS stress value)
    """
    distance_matrix = 1 - similarity_matrix
    mds = MDS(
        n_components=n_components,
        dissimilarity='precomputed',
        random_state=42,
        normalized_stress='auto'
    )
    
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        coords = mds.fit_transform(distance_matrix)
    
    print(f"✅ MDS Stress Value: {mds.stress_:.4f}")
    return coords, mds.stress_


def apply_scaling(coords, scaling_type='robust'):
    """
    Apply scaling to coordinates (normalize to [0, 1])
    
    Args:
        coords: 1D array of coordinates
        scaling_type: 'linear' or 'robust' (P5-P95 based)
    
    Returns:
        Scaled coordinates in [0, 1] range
    """
    if scaling_type == 'linear':
        min_val = coords.min()
        max_val = coords.max()
        return (coords - min_val) / (max_val - min_val)
    
    elif scaling_type == 'robust':
        p5 = scoreatpercentile(coords, 5)
        p95 = scoreatpercentile(coords, 95)
        scaled = (coords - p5) / (p95 - p5)
        return np.clip(scaled, 0, 1)
    
    else:
        raise ValueError("scaling_type must be 'linear' or 'robust'")


# Test MDS projection
coords_3d, stress = apply_mds_projection(similarity_matrix)
print(f"✅ 3D coordinates shape: {coords_3d.shape}")

✅ MDS Stress Value: 2718223934.2557
✅ 3D coordinates shape: (885, 3)


## 3. Color Space Mapping Functions

In [82]:
def map_coords_to_hsv(coords_3d, scaling_type='robust', hsv_mapping=('H', 'S', 'V'), value_range=(0, 1)):
    """
    Map 3D MDS coordinates to HSV color space
    
    Args:
        coords_3d: (N, 3) 3D coordinates from MDS
        scaling_type: 'linear' or 'robust' scaling
        hsv_mapping: Tuple specifying which MDS axis maps to (H, S, V)
        value_range: Tuple (min, max) specifying the range for V (明度)
                     例: (0.5, 1) → 暗い色を避ける [0.5, 1.0]の範囲
                         (0, 0.8) → 明るすぎる色を避ける [0, 0.8]の範囲
    
    Returns:
        tuple: (H, S, V) arrays in [0, 1] range (V は value_range で指定された範囲)
    """
    hsv_inputs = {}
    
    for i, hsv_comp in enumerate(hsv_mapping):
        mds_coord = coords_3d[:, i]
        scaled = apply_scaling(mds_coord, scaling_type)
        hsv_inputs[hsv_comp] = scaled
    
    H = hsv_inputs['H']
    S = hsv_inputs['S']
    V = hsv_inputs['V']
    
    # V (明度) の範囲を調整
    v_min, v_max = value_range
    V = v_min + V * (v_max - v_min)
    
    return H, S, V


def convert_hsv_to_rgb(H, S, V):
    """
    Convert HSV arrays to RGB hex color strings for Plotly
    
    Args:
        H, S, V: Arrays of HSV values in [0, 1]
    
    Returns:
        List of RGB color strings like 'rgb(255, 128, 0)'
    """
    hsv_array = np.stack([H, S, V], axis=1)
    rgb_array = hsv_to_rgb(hsv_array)
    
    hex_colors = [
        f'rgb({int(r*255)}, {int(g*255)}, {int(b*255)})'
        for r, g, b in rgb_array
    ]
    
    return hex_colors


# Test color mapping
H, S, V = map_coords_to_hsv(coords_3d, scaling_type='robust')
colors = convert_hsv_to_rgb(H, S, V)

print(f"✅ H range: [{H.min():.3f}, {H.max():.3f}]")
print(f"✅ S range: [{S.min():.3f}, {S.max():.3f}]")
print(f"✅ V range: [{V.min():.3f}, {V.max():.3f}]")
print(f"✅ Generated {len(colors)} colors")


✅ H range: [0.000, 1.000]
✅ S range: [0.000, 1.000]
✅ V range: [0.000, 1.000]
✅ Generated 885 colors


## 4. Cluster-to-Color Mapping

In [83]:
def get_cluster_colors_from_similarity(
    similarity_matrix,
    id_to_index,
    scaling_type='robust',
    hsv_mapping=('H', 'S', 'V')
):
    """
    Compute cluster ID -> color mapping using MDS + HSV embedding
    
    Args:
        similarity_matrix: N×N similarity matrix
        id_to_index: Dict mapping cluster IDs to matrix indices
        scaling_type: 'linear' or 'robust'
        hsv_mapping: Tuple of (H_axis, S_axis, V_axis) for MDS coordinates
    
    Returns:
        Dict: {cluster_id: 'rgb(r, g, b)', ...}
    """
    # MDS projection
    coords_3d, _ = apply_mds_projection(similarity_matrix)
    
    # Map to HSV
    H, S, V = map_coords_to_hsv(coords_3d, scaling_type, hsv_mapping)
    
    # Convert to RGB
    colors = convert_hsv_to_rgb(H, S, V)
    
    # Create cluster_id -> color mapping
    cluster_id_to_color = {}
    for cluster_id, index in id_to_index.items():
        if index < len(colors):
            cluster_id_to_color[cluster_id] = colors[index]
    
    return cluster_id_to_color


# Compute cluster colors
cluster_id_to_color = get_cluster_colors_from_similarity(
    similarity_matrix,
    id_to_index,
    scaling_type='robust'
)

print(f"✅ Generated colors for {len(cluster_id_to_color)} clusters")
print(f"   Sample: {list(cluster_id_to_color.items())[:3]}")

✅ MDS Stress Value: 2718223934.2557
✅ Generated colors for 885 clusters
   Sample: [(115754, 'rgb(63, 112, 123)'), (115755, 'rgb(63, 113, 124)'), (115756, 'rgb(63, 111, 124)')]


## 5. Visualization Functions

In [84]:
def visualize_mds_projection_3d(
    similarity_matrix,
    id_to_index,
    scaling_type='robust',
    hsv_mapping=('H', 'S', 'V')
):
    """
    Visualize MDS projection in 3D HSV space
    Shows how cluster similarity maps to the color space
    
    Args:
        similarity_matrix: N×N similarity matrix
        id_to_index: Cluster ID to matrix index mapping
        scaling_type: 'linear' or 'robust'
        hsv_mapping: Which MDS axis maps to (H, S, V)
    """
    coords_3d, _ = apply_mds_projection(similarity_matrix)
    H, S, V = map_coords_to_hsv(coords_3d, scaling_type, hsv_mapping)
    colors = convert_hsv_to_rgb(H, S, V)
    
    # Create cluster labels
    index_to_id = {v: k for k, v in id_to_index.items()}
    labels = [f"Cluster {index_to_id.get(i, i)}" for i in range(len(H))]
    
    # 3D scatter plot
    fig = go.Figure(data=[go.Scatter3d(
        x=H, y=S, z=V,
        mode='markers+text',
        marker=dict(size=8, color=colors, opacity=0.8),
        text=labels,
        textfont=dict(color='black', size=8),
        textposition='top center'
    )])
    
    fig.update_layout(
        title=f'MDS Projection in HSV Color Space ({scaling_type.capitalize()})',
        scene=dict(
            xaxis_title=f'{hsv_mapping[0]} (Hue)',
            yaxis_title=f'{hsv_mapping[1]} (Saturation)',
            zaxis_title=f'{hsv_mapping[2]} (Value)',
            xaxis=dict(range=[0, 1]),
            yaxis=dict(range=[0, 1]),
            zaxis=dict(range=[0, 1])
        ),
        height=700,
        width=900
    )
    
    fig.show()


def visualize_embedding_with_colors(
    df,
    cluster_id_to_color,
    x_col='x',
    y_col='y',
    cluster_col='cluster_id',
    label_col='label',
    max_cluster_size=1000
):
    """
    Visualize 2D embedding with similarity-based cluster colors
    
    Args:
        df: DataFrame with x, y, cluster_id, label
        cluster_id_to_color: Dict of cluster_id -> color
        max_cluster_size: Skip clusters larger than this for performance
    """
    fig = go.Figure()
    
    # Plot each cluster
    plotted_count = 0
    for cluster_id, color in cluster_id_to_color.items():
        if cluster_id == -1:
            continue  # Skip noise
        
        subset = df[df[cluster_col] == cluster_id]
        
        # Skip very large clusters for performance
        if len(subset) > max_cluster_size:
            continue
        
        if not subset.empty:
            fig.add_trace(go.Scatter(
                x=subset[x_col],
                y=subset[y_col],
                mode='markers',
                name=f'Cluster {cluster_id}',
                marker=dict(size=4, color=color, opacity=0.7),
                customdata=subset[[label_col]].values,
                hovertemplate='<b>%{customdata[0]}</b><br>Cluster: ' + str(cluster_id) + '<extra></extra>'
            ))
            plotted_count += 1
    
    fig.update_layout(
        title=f'2D Embedding with Similarity-Based Colors ({plotted_count} clusters)',
        xaxis_title='UMAP X',
        yaxis_title='UMAP Y',
        hovermode='closest',
        height=700,
        width=900,
        template='plotly_white'
    )
    
    fig.show()
    
    return fig


print("✅ Visualization functions defined")

✅ Visualization functions defined


## 6. Execute Full Pipeline

Run the complete color space embedding pipeline

In [85]:
# Step 1: Visualize 3D MDS projection in HSV space
print("=" * 60)
print("STEP 1: 3D MDS Projection Visualization")
print("=" * 60)

visualize_mds_projection_3d(
    similarity_matrix,
    id_to_index,
    scaling_type='robust',
    hsv_mapping=('H', 'S', 'V')
)

STEP 1: 3D MDS Projection Visualization
✅ MDS Stress Value: 2718223934.2557


In [86]:
# Step 2: Visualize 2D embedding with similarity-based colors
print("\n" + "=" * 60)
print("STEP 2: 2D Embedding Visualization with Similarity Colors")
print("=" * 60)

fig_embedding = visualize_embedding_with_colors(
    df_denoised,
    cluster_id_to_color,
    x_col='x',
    y_col='y',
    cluster_col='cluster_id',
    label_col='label',
    max_cluster_size=1000
)


STEP 2: 2D Embedding Visualization with Similarity Colors


In [87]:
# Step 3: Summary and statistics
print("\n" + "=" * 60)
print("PIPELINE SUMMARY")
print("=" * 60)
print(f"✅ Total data points: {len(df)}")
print(f"✅ After noise removal: {len(df_denoised)}")
print(f"✅ Unique clusters: {len(cluster_id_to_color)}")
print(f"✅ Similarity metric: Mahalanobis distance")
print(f"✅ DR method: MDS with n_components=3")
print(f"✅ Scaling: Robust (P5-P95)")
print(f"✅ HSV mapping: H-S-V (MDS X1-X2-X3)")
print("=" * 60)


PIPELINE SUMMARY
✅ Total data points: 115754
✅ After noise removal: 6367
✅ Unique clusters: 885
✅ Similarity metric: Mahalanobis distance
✅ DR method: MDS with n_components=3
✅ Scaling: Robust (P5-P95)
✅ HSV mapping: H-S-V (MDS X1-X2-X3)


## 7. Backend統合用メイン関数

システムに組み込む際に使用する統合関数

In [88]:
def compute_cluster_colors_from_similarity(
    similarity_dict,
    cluster_ids=None,
    scaling_type='robust',
    hsv_mapping=('H', 'S', 'V'),
    value_range=(0, 1)
):
    """
    [BACKEND統合用] メイン関数
    
    類似度辞書からクラスタID→RGB色の対応を計算
    指定されたクラスタID集合内でのみ埋め込みを実行（汎用版）
    
    Args:
        similarity_dict (dict): 
            形式: {(cluster_id1, cluster_id2): similarity_value}
            例: {(0, 1): 0.85, (1, 2): 0.92, ...}
        
        cluster_ids (list, set, or None):
            色を計算対象にするクラスタIDの集合
            None の場合は全クラスタを対象
            指定時: 指定されたIDのみで部分空間での埋め込みを実行
        
        scaling_type (str): 
            'linear' - 標準的な正規化
            'robust' - P5-P95パーセンタイル基準（外れ値に強い）
        
        hsv_mapping (tuple):
            MDS座標をどのHSV要素に割り当てるか
            例: ('H', 'S', 'V') = MDS X1→色相, X2→彩度, X3→明度
        
        value_range (tuple):
            V（明度）の出力範囲を指定
            デフォルト: (0, 1) → 全範囲 [0.0, 1.0]
            例: (0.5, 1) → 暗い色を避ける [0.5, 1.0]
            例: (0, 0.8) → 明るすぎる色を避ける [0.0, 0.8]
    
    Returns:
        dict: {cluster_id: 'rgb(r, g, b)', ...}
              クラスタIDから'rgb(255, 128, 0)'形式の色文字列への対応
    
    使用例1（全クラスタ対象、デフォルト明度）:
    ```python
    cluster_colors = compute_cluster_colors_from_similarity(
        similarity_dict['mahalanobis_distance'],
        scaling_type='robust'
    )
    ```
    
    使用例2（部分集合のみ対象、明度0.5-1.0）:
    ```python
    target_clusters = [0, 1, 2, 5, 10]
    cluster_colors = compute_cluster_colors_from_similarity(
        similarity_dict['mahalanobis_distance'],
        cluster_ids=target_clusters,
        scaling_type='robust',
        value_range=(0.5, 1)  # 暗い色を避ける
    )
    ```
    """
    
    # Step 0: cluster_ids が指定されている場合、類似度辞書をフィルタリング
    if cluster_ids is not None:
        cluster_ids_set = set(cluster_ids)
        print(f"  Filtering to {len(cluster_ids_set)} clusters...")
        
        filtered_dict = {
            (c1, c2): sim 
            for (c1, c2), sim in similarity_dict.items()
            if c1 in cluster_ids_set and c2 in cluster_ids_set
        }
        
        if len(filtered_dict) == 0:
            print("  ⚠️  Warning: No similarity pairs found in the specified cluster set")
            return {cid: 'rgb(200, 200, 200)' for cid in cluster_ids_set}
        
        print(f"  Using {len(filtered_dict)} similarity pairs")
        working_dict = filtered_dict
    else:
        print(f"  Using all {len(similarity_dict)} similarity pairs")
        working_dict = similarity_dict
    
    # Step 1: 類似度行列を作成
    similarity_matrix, id_to_index = create_similarity_matrix_from_dict(
        working_dict
    )
    
    # Step 2: MDS投影
    coords_3d, stress = apply_mds_projection(similarity_matrix, n_components=3)
    print(f"  MDS Stress: {stress:.4f}")
    
    # Step 3: HSV空間にマッピング（value_range を指定）
    H, S, V = map_coords_to_hsv(coords_3d, scaling_type, hsv_mapping, value_range)
    
    # Step 4: RGB色に変換
    colors = convert_hsv_to_rgb(H, S, V)
    
    # Step 5: クラスタID → 色の対応を作成
    cluster_id_to_color = {}
    for cluster_id, index in id_to_index.items():
        if index < len(colors):
            cluster_id_to_color[cluster_id] = colors[index]
    
    return cluster_id_to_color


# テスト実行
print("\n" + "=" * 60)
print("Backend Integration Test")
print("=" * 60)

result = compute_cluster_colors_from_similarity(
    similarity_dict['mahalanobis_distance'],
    scaling_type='robust'
)

print(f"✅ Input: similarity_dict['mahalanobis_distance']")
print(f"   Type: dict with {len(similarity_dict['mahalanobis_distance'])} pairs")
print(f"\n✅ Output: cluster_id_to_color")
print(f"   Type: dict with {len(result)} cluster IDs")
print(f"   Sample:")
for i, (cid, color) in enumerate(list(result.items())[:5]):
    print(f"     Cluster {cid}: {color}")
print("=" * 60)



Backend Integration Test
  Using all 391170 similarity pairs
✅ MDS Stress Value: 2718223934.2557
  MDS Stress: 2718223934.2557
✅ Input: similarity_dict['mahalanobis_distance']
   Type: dict with 391170 pairs

✅ Output: cluster_id_to_color
   Type: dict with 885 cluster IDs
   Sample:
     Cluster 115754: rgb(63, 112, 123)
     Cluster 115755: rgb(63, 113, 124)
     Cluster 115756: rgb(63, 111, 124)
     Cluster 115757: rgb(255, 255, 255)
     Cluster 115758: rgb(63, 114, 124)


## 8. System Integration: Apply Colors to Points

In [89]:
# Example: How to apply computed colors to embedding points
# This is the pattern used in d3_data_manager.py

def apply_similarity_colors_to_points(df, cluster_id_to_color):
    """
    Apply computed cluster colors to DataFrame points
    
    Args:
        df: DataFrame with 'cluster_id' column
        cluster_id_to_color: Output from compute_cluster_colors_from_similarity()
    
    Returns:
        DataFrame with new 'color' column
    """
    df_colored = df.copy()
    
    # Map cluster colors to each point
    df_colored['color'] = df_colored['cluster_id'].map(cluster_id_to_color)
    
    # Fill missing colors (for noise clusters not in the color map)
    df_colored['color'] = df_colored['color'].fillna('#CCCCCC')
    
    return df_colored


# Test integration
print("\n" + "=" * 60)
print("Integration Example: Adding Colors to Points")
print("=" * 60)

df_with_colors = apply_similarity_colors_to_points(df_denoised, result)

print(f"✅ Input DataFrame shape: {df_denoised.shape}")
print(f"✅ Output DataFrame shape: {df_with_colors.shape}")
print(f"✅ New columns: {list(df_with_colors.columns)}")
print(f"\n✅ Sample colored points:")
print(df_with_colors[['cluster_id', 'color']].head(10))


Integration Example: Adding Colors to Points
✅ Input DataFrame shape: (6367, 6)
✅ Output DataFrame shape: (6367, 7)
✅ New columns: ['x', 'y', 'label', 'cluster_id', 'noise', 'cluster_label', 'color']

✅ Sample colored points:
     cluster_id               color
37       116385  rgb(247, 103, 103)
39       115953        rgb(0, 0, 0)
56       116385  rgb(247, 103, 103)
58       116460    rgb(61, 149, 91)
77       116580        rgb(0, 0, 0)
82       115933      rgb(7, 12, 10)
94       116385  rgb(247, 103, 103)
97       116561       rgb(98, 0, 0)
115      116385  rgb(247, 103, 103)
118      116460    rgb(61, 149, 91)


## 9. Function Dependency Diagram

In [90]:
print("""
╔════════════════════════════════════════════════════════════╗
║          BACKEND INTEGRATION STRUCTURE                     ║
╚════════════════════════════════════════════════════════════╝

📌 【メイン統合関数】
   compute_cluster_colors_from_similarity()
   
   入力: similarity_dict[metric_name]
        {(cluster_id1, cluster_id2): similarity_value, ...}
   
   出力: {cluster_id: 'rgb(r, g, b)', ...}

───────────────────────────────────────────────────────────

📦 【内部使用関数】(実装方法の詳細)

   1️⃣  create_similarity_matrix_from_dict()
       入力: similarity_dict
       出力: (N×N行列, id_to_index)
       目的: 辞書形式を行列形式に統一

   2️⃣  apply_mds_projection()
       入力: similarity_matrix
       出力: (3D座標, stress値)
       目的: 3次元への次元削減

   3️⃣  apply_scaling()
       入力: coords, scaling_type
       出力: [0,1]正規化座標
       目的: 外れ値対応のスケーリング

   4️⃣  map_coords_to_hsv()
       入力: coords_3d, scaling_type, hsv_mapping
       出力: (H配列, S配列, V配列)
       目的: 3D座標をHSV色空間に変換

   5️⃣  convert_hsv_to_rgb()
       入力: H, S, V配列
       出力: ['rgb(r,g,b)', ...]
       目的: RGB形式に変換

───────────────────────────────────────────────────────────

🔄 【統合パターン】

   d3_data_manager.py内:
   
   def get_initial_data(..., color_mode='cluster'):
       if color_mode == 'distance':
           cluster_colors = compute_cluster_colors_from_similarity(
               self._similarity_dict['mahalanobis_distance'],
               scaling_type='robust'
           )
           # points に色を付与
           for point in points:
               point['color'] = cluster_colors.get(point['c'])

───────────────────────────────────────────────────────────

✅ 移行コスト: 低い
   - 外部依存: NumPy, SciPy, scikit-learn (既存)
   - 追加依存: matplotlib.colors (色変換用, 軽量)

──────────────────────────────────────────────────────────
""")


╔════════════════════════════════════════════════════════════╗
║          BACKEND INTEGRATION STRUCTURE                     ║
╚════════════════════════════════════════════════════════════╝

📌 【メイン統合関数】
   compute_cluster_colors_from_similarity()
   
   入力: similarity_dict[metric_name]
        {(cluster_id1, cluster_id2): similarity_value, ...}
   
   出力: {cluster_id: 'rgb(r, g, b)', ...}

───────────────────────────────────────────────────────────

📦 【内部使用関数】(実装方法の詳細)

   1️⃣  create_similarity_matrix_from_dict()
       入力: similarity_dict
       出力: (N×N行列, id_to_index)
       目的: 辞書形式を行列形式に統一

   2️⃣  apply_mds_projection()
       入力: similarity_matrix
       出力: (3D座標, stress値)
       目的: 3次元への次元削減

   3️⃣  apply_scaling()
       入力: coords, scaling_type
       出力: [0,1]正規化座標
       目的: 外れ値対応のスケーリング

   4️⃣  map_coords_to_hsv()
       入力: coords_3d, scaling_type, hsv_mapping
       出力: (H配列, S配列, V配列)
       目的: 3D座標をHSV色空間に変換

   5️⃣  convert_hsv_to_rgb()
       入力: H, S, V配列
    

## 10. 部分集合での埋め込みテスト

指定したクラスタID集合のみで色を計算する例


In [91]:
print("\n" + "=" * 60)
print("Test 1: 全クラスタでの埋め込み")
print("=" * 60)

result_all = compute_cluster_colors_from_similarity(
    similarity_dict['mahalanobis_distance'],
    cluster_ids=None,  # 全クラスタ対象
    scaling_type='robust'
)

print(f"\n✅ 全クラスタ数: {len(result_all)}")
print(f"   Sample colors:")
for cid, color in list(result_all.items())[:3]:
    print(f"     Cluster {cid}: {color}")



Test 1: 全クラスタでの埋め込み
  Using all 391170 similarity pairs
✅ MDS Stress Value: 2718223934.2557
  MDS Stress: 2718223934.2557

✅ 全クラスタ数: 885
   Sample colors:
     Cluster 115754: rgb(63, 112, 123)
     Cluster 115755: rgb(63, 113, 124)
     Cluster 115756: rgb(63, 111, 124)


In [98]:
print("\n" + "=" * 60)
print("Test 2: 部分集合での埋め込み（最初の50クラスタ）")
print("=" * 60)

# 最初の50クラスタを選択(similarity_dict['mahalanobis_distance']のkeyから乱択する)
np.random.seed(42)
selected_clusters = np.random.choice(
    list({cid for pair in similarity_dict['mahalanobis_distance'].keys() for cid in pair}),
    size=50,
    replace=False
).tolist()
print(f"  Selected clusters: {selected_clusters[:10]}") 

result_subset = compute_cluster_colors_from_similarity(
    similarity_dict['mahalanobis_distance'],
    cluster_ids=selected_clusters,
    scaling_type='robust'
)

print(f"\n✅ 対象クラスタ数: {len(selected_clusters)}")
print(f"   計算結果: {len(result_subset)} clusters に色を割り当て")
print(f"   Sample colors:")
for cid in selected_clusters[:5]:
    if cid in result_subset:
        print(f"     Cluster {cid}: {result_subset[cid]}")



Test 2: 部分集合での埋め込み（最初の50クラスタ）
  Selected clusters: [115798, 116142, 116457, 116397, 116634, 116403, 116087, 116054, 115874, 116636]
  Filtering to 50 clusters...
  Using 1225 similarity pairs
✅ MDS Stress Value: 2022484.8184
  MDS Stress: 2022484.8184

✅ 対象クラスタ数: 50
   計算結果: 50 clusters に色を割り当て
   Sample colors:
     Cluster 115798: rgb(58, 136, 129)
     Cluster 116142: rgb(207, 118, 255)
     Cluster 116457: rgb(60, 136, 127)
     Cluster 116397: rgb(60, 138, 130)
     Cluster 116634: rgb(248, 94, 94)


In [99]:
print("\n" + "=" * 60)
print("Test 3: ランダム選択での埋め込み")
print("=" * 60)

# ランダムに30個のクラスタを選択
import random
random_clusters = sorted(random.sample(range(len(result_all)), min(30, len(result_all))))

result_random = compute_cluster_colors_from_similarity(
    similarity_dict['mahalanobis_distance'],
    cluster_ids=random_clusters,
    scaling_type='robust'
)

print(f"\n✅ ランダム選択クラスタ: {random_clusters}")
print(f"   計算結果: {len(result_random)} clusters に色を割り当て")
print(f"   Sample colors:")
for cid in random_clusters[:5]:
    if cid in result_random:
        print(f"     Cluster {cid}: {result_random[cid]}")

# 全体との色の違いを確認
print(f"\n📊 色割り当ての比較:")
print(f"   全体埋め込み: Cluster 0 → {result_all.get(0, 'N/A')}")
print(f"   部分埋め込み: Cluster 0 → {result_subset.get(0, 'N/A') if 0 in selected_clusters else 'N/A'}")
print(f"   ⚠️  注: 部分集合での埋め込みは全体埋め込みと異なる色を生成します")
print(f"      （部分空間での相対的な位置が変わるため）")



Test 3: ランダム選択での埋め込み
  Filtering to 30 clusters...

✅ ランダム選択クラスタ: [140, 153, 196, 199, 240, 244, 246, 252, 260, 301, 427, 432, 433, 442, 444, 472, 483, 516, 550, 587, 594, 637, 699, 709, 718, 726, 739, 741, 763, 878]
   計算結果: 30 clusters に色を割り当て
   Sample colors:
     Cluster 140: rgb(200, 200, 200)
     Cluster 153: rgb(200, 200, 200)
     Cluster 196: rgb(200, 200, 200)
     Cluster 199: rgb(200, 200, 200)
     Cluster 240: rgb(200, 200, 200)

📊 色割り当ての比較:
   全体埋め込み: Cluster 0 → N/A
   部分埋め込み: Cluster 0 → N/A
   ⚠️  注: 部分集合での埋め込みは全体埋め込みと異なる色を生成します
      （部分空間での相対的な位置が変わるため）


## 11. V（明度）の範囲制御テスト

Vを0.5-1に限定して、暗い色を避ける例


In [101]:
print("\n" + "=" * 60)
print("Test: V の範囲制御と可視化")
print("=" * 60)

# テスト対象クラスタ（より多くのクラスタを対象）
test_clusters = np.random.choice(
    list({cid for pair in similarity_dict['mahalanobis_distance'].keys() for cid in pair}),
    size=100,
    replace=False
).tolist()
print(f"  Test clusters: {test_clusters[:10]} ...")

# テスト対象クラスタ(all)
test_clusters = None  # 全クラスタ対象

# パターン1: デフォルト（V: 0.0-1.0）
print("\n1️⃣  V range: (0, 1) - 全範囲")
colors_full_v = compute_cluster_colors_from_similarity(
    similarity_dict['mahalanobis_distance'],
    cluster_ids=test_clusters,
    value_range=(0, 1)
)

# パターン2: V を 0.5-1.0 に制限（暗い色を避ける）
print("\n2️⃣  V range: (0.5, 1) - 暗い色を避ける")
colors_bright_v = compute_cluster_colors_from_similarity(
    similarity_dict['mahalanobis_distance'],
    cluster_ids=test_clusters,
    value_range=(0.5, 1)
)

# パターン3: V を 0.0-0.8 に制限（明るすぎる色を避ける）
print("\n3️⃣  V range: (0, 0.8) - 明るすぎる色を避ける")
colors_dark_v = compute_cluster_colors_from_similarity(
    similarity_dict['mahalanobis_distance'],
    cluster_ids=test_clusters,
    value_range=(0, 0.8)
)

# 数値比較
print("\n📊 同じクラスタの色を比較:")
for cid in [0, 5, 10, 15, 20]:
    print(f"\nCluster {cid}:")
    print(f"  V(0, 1):     {colors_full_v.get(cid)}")
    print(f"  V(0.5, 1):   {colors_bright_v.get(cid)} ← より明るい傾向")
    print(f"  V(0, 0.8):   {colors_dark_v.get(cid)} ← より暗い傾向")

# 可視化: visualize_embedding_with_colors を使用して3パターンを表示
print("\n" + "=" * 60)
print("2D 埋め込み空間での可視化比較")
print("=" * 60)

# パターン1: V(0, 1) - 全範囲
print("\n▶️  パターン1: V range (0, 1) - 全範囲")
visualize_embedding_with_colors(
    df_denoised,
    colors_full_v,
    x_col='x',
    y_col='y',
    cluster_col='cluster_id',
    label_col='label',
    max_cluster_size=2000
)

# パターン2: V(0.5, 1) - 暗い色を避ける
print("\n▶️  パターン2: V range (0.5, 1) - 暗い色を避ける（明るい色のみ）")
visualize_embedding_with_colors(
    df_denoised,
    colors_bright_v,
    x_col='x',
    y_col='y',
    cluster_col='cluster_id',
    label_col='label',
    max_cluster_size=2000
)

# パターン3: V(0, 0.8) - 明るすぎる色を避ける
print("\n▶️  パターン3: V range (0, 0.8) - 明るすぎる色を避ける（濃い色を含む）")
visualize_embedding_with_colors(
    df_denoised,
    colors_dark_v,
    x_col='x',
    y_col='y',
    cluster_col='cluster_id',
    label_col='label',
    max_cluster_size=2000
)

print("\n✅ V の範囲制御により、色の明度分布が変わります！")
print("=" * 60)



Test: V の範囲制御と可視化
  Test clusters: [116313, 115929, 116530, 115921, 115876, 116465, 116335, 116278, 116437, 116201] ...

1️⃣  V range: (0, 1) - 全範囲
  Using all 391170 similarity pairs
✅ MDS Stress Value: 2718223934.2557
  MDS Stress: 2718223934.2557

2️⃣  V range: (0.5, 1) - 暗い色を避ける
  Using all 391170 similarity pairs
✅ MDS Stress Value: 2718223934.2557
  MDS Stress: 2718223934.2557

3️⃣  V range: (0, 0.8) - 明るすぎる色を避ける
  Using all 391170 similarity pairs
✅ MDS Stress Value: 2718223934.2557
  MDS Stress: 2718223934.2557

📊 同じクラスタの色を比較:

Cluster 0:
  V(0, 1):     None
  V(0.5, 1):   None ← より明るい傾向
  V(0, 0.8):   None ← より暗い傾向

Cluster 5:
  V(0, 1):     None
  V(0.5, 1):   None ← より明るい傾向
  V(0, 0.8):   None ← より暗い傾向

Cluster 10:
  V(0, 1):     None
  V(0.5, 1):   None ← より明るい傾向
  V(0, 0.8):   None ← より暗い傾向

Cluster 15:
  V(0, 1):     None
  V(0.5, 1):   None ← より明るい傾向
  V(0, 0.8):   None ← より暗い傾向

Cluster 20:
  V(0, 1):     None
  V(0.5, 1):   None ← より明るい傾向
  V(0, 0.8):   None ← より暗い傾向



▶️  パターン2: V range (0.5, 1) - 暗い色を避ける（明るい色のみ）



▶️  パターン3: V range (0, 0.8) - 明るすぎる色を避ける（濃い色を含む）



✅ V の範囲制御により、色の明度分布が変わります！


In [None]:
colors_full_v