In [1]:
# Import required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import sys
import os
from pathlib import Path

# Add project root to path
project_root = os.path.abspath(os.path.join(os.getcwd(), ".."))
sys.path.append(project_root)

# Local imports
from api.utils.duckdb_utils import DuckDBHelper
from src.helpers import initialize_logging
from src.get_prototypes import run_clustering_pipeline, Reducer

# Configure plotting
plt.style.use('default')
sns.set_palette("husl")
%matplotlib inline

# Initialize logging
logger = initialize_logging("training_input_analysis")

print("✅ Libraries imported successfully")
print(f"📁 Project root: {project_root}")


2025-08-19 11:41:58,493 - src.helpers - INFO - Logging initialized for /Users/taishajoseph/Documents/Projects/MDC-Challenge-2025/logs/duckdb_utils.log
2025-08-19 11:41:58,493 - src.helpers - INFO - Logging initialized for /Users/taishajoseph/Documents/Projects/MDC-Challenge-2025/logs/duckdb_utils.log
2025-08-19 11:41:59,643 - src.helpers - INFO - Logging initialized for /Users/taishajoseph/Documents/Projects/MDC-Challenge-2025/logs/get_prototypes.log
2025-08-19 11:41:59,643 - src.helpers - INFO - Logging initialized for /Users/taishajoseph/Documents/Projects/MDC-Challenge-2025/logs/get_prototypes.log
2025-08-19 11:41:59,643 - src.helpers - INFO - Logging initialized for /Users/taishajoseph/Documents/Projects/MDC-Challenge-2025/logs/get_prototypes.log
2025-08-19 11:41:59,833 - src.helpers - INFO - Logging initialized for /Users/taishajoseph/Documents/Projects/MDC-Challenge-2025/logs/training_input_analysis.log
2025-08-19 11:41:59,833 - src.helpers - INFO - Logging initialized for /Users

✅ Libraries imported successfully
📁 Project root: /Users/taishajoseph/Documents/Projects/MDC-Challenge-2025


In [2]:
# Default paths and configurations
DEFAULT_DUCKDB_PATH = "../artifacts/mdc_challenge.db"
DEFAULT_CHROMA_CONFIG = "configs/chunking.yaml"
DEFAULT_COLLECTION_NAME = "dataset-aggregates-train"
DEFAULT_FEATURE_CLUSTERS_PATH = "reports/clustering/dataset_clusters.json"

# UMAP parameters
DEFAULT_N_NEIGHBORS = 15
DEFAULT_MIN_DIST = 0.1
DEFAULT_N_COMPONENTS = 2
DEFAULT_RANDOM_SEED = 42

# Clustering parameters
DEFAULT_K_NEIGHBORS = 10
DEFAULT_SIMILARITY_THRESHOLD = None
DEFAULT_THRESHOLD_METHOD = "degree_target"
DEFAULT_RESOLUTION = 1
DEFAULT_MIN_CLUSTER_SIZE = 3
DEFAULT_MAX_CLUSTER_SIZE = 9999
DEFAULT_SPLIT_FACTOR = 1.3
DEFAULT_RANDOM_SEED = 42
DEFAULT_TARGET_N = 80
DEFAULT_TOL = 2

In [3]:
# Step 1: Load embeddings from ChromaDB
print("🔄 Loading embeddings from ChromaDB...")

# Initialize DuckDB helper
from chromadb import PersistentClient
client = PersistentClient(path=os.path.join(project_root, "local_chroma"))
print(client.list_collections())


🔄 Loading embeddings from ChromaDB...


2025-08-19 11:42:01,057 - chromadb.telemetry.product.posthog - INFO - Anonymized telemetry enabled. See                     https://docs.trychroma.com/telemetry for more information.
2025-08-19 11:42:01,057 - chromadb.telemetry.product.posthog - INFO - Anonymized telemetry enabled. See                     https://docs.trychroma.com/telemetry for more information.
2025-08-19 11:42:01,057 - chromadb.telemetry.product.posthog - INFO - Anonymized telemetry enabled. See                     https://docs.trychroma.com/telemetry for more information.
2025-08-19 11:42:01,057 - chromadb.telemetry.product.posthog - INFO - Anonymized telemetry enabled. See                     https://docs.trychroma.com/telemetry for more information.


[Collection(name=mdc_training_data), Collection(name=dataset-aggregates-train)]


In [4]:
collection = client.get_collection(DEFAULT_COLLECTION_NAME)
dat = collection.get(include=["embeddings"])
dat

{'ids': ['https://doi.org/10.5061/dryad.4dj6042',
  'https://doi.org/10.5061/dryad.r7sqv9s8n',
  'https://doi.org/10.11583/dtu.20555586',
  'https://doi.org/10.11583/dtu.20555586.v3',
  'https://doi.org/10.5061/dryad.8153g',
  'https://doi.org/10.5281/zenodo.8014149',
  'https://doi.org/10.5281/zenodo.8014150',
  'ENSOARG00000003950',
  'ENSOARG00000012128',
  'ENSOARG00000012835',
  'ENSOARG00000013782',
  'ENSOARG00000013966',
  'ENSOARG00000014129',
  'IPR000264',
  'IPR002172',
  'IPR014760',
  'IPR020857',
  'IPR020858',
  'IPR021177',
  'IPR023415',
  'ENSBTAG00000011038',
  'ENSBTAG00000013718',
  'ENSBTAG00000017121',
  'ENSBTAG00000017131',
  'ENSBTAG00000021275',
  'ENSBTAG00000047833',
  'NM_001078656',
  'https://doi.org/10.15468/dl.354f8k',
  'https://doi.org/10.15468/dl.nbku3v',
  'https://doi.org/10.15468/dl.pdjqte',
  'https://doi.org/10.15468/dl.uejpg6',
  'https://doi.org/10.17862/cranfield.rd.19146182',
  'https://doi.org/10.17862/cranfield.rd.19146182.v1',
  '3.10.1

In [5]:
# Convert to DataFrame
df = pd.DataFrame(dat["embeddings"])
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,374,375,376,377,378,379,380,381,382,383
0,-0.017331,-0.003289,0.007024,-0.003306,-0.015948,0.029291,0.036715,0.035468,-0.001300,-0.046639,...,0.001059,0.043030,-0.053122,-0.065123,-0.001324,0.056817,-0.013290,-0.043261,0.097732,0.100580
1,-0.043640,-0.000003,0.034032,0.023671,0.092774,-0.053444,-0.056502,0.008318,-0.013348,0.008250,...,0.004956,-0.022488,-0.000035,-0.050601,-0.009106,0.036639,-0.024997,-0.035046,0.015330,-0.036408
2,-0.008052,0.095518,0.013060,-0.051372,0.059878,0.036095,-0.028284,0.019578,-0.041390,0.037182,...,-0.031841,0.058538,-0.017772,0.042047,0.015369,-0.025719,0.020821,-0.032335,0.059899,0.068338
3,-0.008052,0.095518,0.013060,-0.051372,0.059878,0.036095,-0.028284,0.019578,-0.041390,0.037182,...,-0.031841,0.058538,-0.017772,0.042047,0.015369,-0.025719,0.020821,-0.032335,0.059899,0.068338
4,-0.048955,0.002556,-0.031699,0.001135,0.054505,0.035323,-0.017979,0.003956,-0.019208,0.023924,...,0.003554,0.009065,-0.043255,-0.020902,-0.032191,0.020013,-0.020206,-0.039565,0.081096,0.006233
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
687,-0.065858,-0.054596,0.001030,0.017521,0.025454,0.020686,0.022257,0.005512,0.024903,-0.014037,...,0.025834,-0.005543,-0.040120,0.014040,-0.015782,-0.025307,0.003425,0.014606,0.090468,-0.025789
688,-0.087525,-0.028583,-0.024390,-0.053067,0.009462,-0.001504,0.024789,0.024796,-0.033924,-0.023509,...,0.003505,0.004181,0.009770,-0.014121,-0.047272,-0.000210,0.055508,-0.008958,0.104666,-0.014266
689,-0.087525,-0.028583,-0.024390,-0.053067,0.009462,-0.001504,0.024789,0.024796,-0.033924,-0.023509,...,0.003505,0.004181,0.009770,-0.014121,-0.047272,-0.000210,0.055508,-0.008958,0.104666,-0.014266
690,-0.087525,-0.028583,-0.024390,-0.053067,0.009462,-0.001504,0.024789,0.024796,-0.033924,-0.023509,...,0.003505,0.004181,0.009770,-0.014121,-0.047272,-0.000210,0.055508,-0.008958,0.104666,-0.014266


In [6]:
df.to_numpy

<bound method DataFrame.to_numpy of           0         1         2         3         4         5         6    \
0   -0.017331 -0.003289  0.007024 -0.003306 -0.015948  0.029291  0.036715   
1   -0.043640 -0.000003  0.034032  0.023671  0.092774 -0.053444 -0.056502   
2   -0.008052  0.095518  0.013060 -0.051372  0.059878  0.036095 -0.028284   
3   -0.008052  0.095518  0.013060 -0.051372  0.059878  0.036095 -0.028284   
4   -0.048955  0.002556 -0.031699  0.001135  0.054505  0.035323 -0.017979   
..        ...       ...       ...       ...       ...       ...       ...   
687 -0.065858 -0.054596  0.001030  0.017521  0.025454  0.020686  0.022257   
688 -0.087525 -0.028583 -0.024390 -0.053067  0.009462 -0.001504  0.024789   
689 -0.087525 -0.028583 -0.024390 -0.053067  0.009462 -0.001504  0.024789   
690 -0.087525 -0.028583 -0.024390 -0.053067  0.009462 -0.001504  0.024789   
691  0.025712  0.039187 -0.002140  0.048915  0.027948  0.040256  0.031754   

          7         8         9    ... 

In [7]:
dataset_cluster_map = run_clustering_pipeline(
    dataset_embeddings=df.to_numpy(),
    feature_names=dat["ids"],
    k_neighbors=DEFAULT_K_NEIGHBORS,
    similarity_threshold=DEFAULT_SIMILARITY_THRESHOLD,
    threshold_method=DEFAULT_THRESHOLD_METHOD,
    # resolution=DEFAULT_RESOLUTION,
    min_cluster_size=DEFAULT_MIN_CLUSTER_SIZE,
    max_cluster_size=DEFAULT_MAX_CLUSTER_SIZE,
    split_factor=DEFAULT_SPLIT_FACTOR,
    random_seed=DEFAULT_RANDOM_SEED,
    target_n=DEFAULT_TARGET_N,
    tol=DEFAULT_TOL,
    output_dir="reports/clustering"
)
dataset_cluster_map


2025-08-19 11:42:01,740 - src.helpers - INFO - Running feature clustering pipeline on (692, 384) matrix
2025-08-19 11:42:01,740 - src.helpers - INFO - Running feature clustering pipeline on (692, 384) matrix
2025-08-19 11:42:01,740 - src.helpers - INFO - Running feature clustering pipeline on (692, 384) matrix
2025-08-19 11:42:01,740 - src.helpers - INFO - Running feature clustering pipeline on (692, 384) matrix
2025-08-19 11:42:01,751 - src.helpers - INFO - Building k-NN graph for 692 features
2025-08-19 11:42:01,751 - src.helpers - INFO - Building k-NN graph for 692 features
2025-08-19 11:42:01,751 - src.helpers - INFO - Building k-NN graph for 692 features
2025-08-19 11:42:01,751 - src.helpers - INFO - Building k-NN graph for 692 features
2025-08-19 11:42:01,756 - src.helpers - INFO - Computing 10-NN with sklearn
2025-08-19 11:42:01,756 - src.helpers - INFO - Computing 10-NN with sklearn
2025-08-19 11:42:01,756 - src.helpers - INFO - Computing 10-NN with sklearn
2025-08-19 11:42:01,

Executing run_clustering_pipeline...
Executing build_knn_similarity_graph...
Executing determine_similarity_threshold...
Function determine_similarity_threshold took 0.0047 seconds to complete.
Function build_knn_similarity_graph took 0.0833 seconds to complete.
Executing find_resolution_for_target...
Executing run_leiden_clustering...
Function run_leiden_clustering took 0.0383 seconds to complete.
Executing run_leiden_clustering...
Function run_leiden_clustering took 0.0332 seconds to complete.
Executing run_leiden_clustering...


2025-08-19 11:42:01,943 - src.helpers - INFO - Final: 42 clusters (min=4 │ median=16 │ max=31)
2025-08-19 11:42:01,943 - src.helpers - INFO - Final: 42 clusters (min=4 │ median=16 │ max=31)
2025-08-19 11:42:01,943 - src.helpers - INFO - Final: 42 clusters (min=4 │ median=16 │ max=31)
2025-08-19 11:42:01,943 - src.helpers - INFO - Final: 42 clusters (min=4 │ median=16 │ max=31)
2025-08-19 11:42:01,948 - src.helpers - INFO - [res search] step 02  γ=7.062 → 42 clusters
2025-08-19 11:42:01,948 - src.helpers - INFO - [res search] step 02  γ=7.062 → 42 clusters
2025-08-19 11:42:01,948 - src.helpers - INFO - [res search] step 02  γ=7.062 → 42 clusters
2025-08-19 11:42:01,948 - src.helpers - INFO - [res search] step 02  γ=7.062 → 42 clusters
2025-08-19 11:42:01,968 - src.helpers - INFO - Initial Leiden: 43 clusters; modularity=0.8017
2025-08-19 11:42:01,968 - src.helpers - INFO - Initial Leiden: 43 clusters; modularity=0.8017
2025-08-19 11:42:01,968 - src.helpers - INFO - Initial Leiden: 43 cl

Function run_leiden_clustering took 0.0338 seconds to complete.
Executing run_leiden_clustering...
Function run_leiden_clustering took 0.0357 seconds to complete.
Executing run_leiden_clustering...
Function run_leiden_clustering took 0.0316 seconds to complete.
Executing run_leiden_clustering...
Function run_leiden_clustering took 0.0246 seconds to complete.
Executing run_leiden_clustering...
Function run_leiden_clustering took 0.0237 seconds to complete.
Executing run_leiden_clustering...
Function run_leiden_clustering took 0.0235 seconds to complete.
Executing run_leiden_clustering...
Function run_leiden_clustering took 0.0232 seconds to complete.
Executing run_leiden_clustering...


2025-08-19 11:42:02,150 - src.helpers - INFO - Cycle detected; stopping at iteration 2
2025-08-19 11:42:02,150 - src.helpers - INFO - Cycle detected; stopping at iteration 2
2025-08-19 11:42:02,150 - src.helpers - INFO - Cycle detected; stopping at iteration 2
2025-08-19 11:42:02,150 - src.helpers - INFO - Cycle detected; stopping at iteration 2
2025-08-19 11:42:02,155 - src.helpers - INFO - Final: 44 clusters (min=5 │ median=15 │ max=31)
2025-08-19 11:42:02,155 - src.helpers - INFO - Final: 44 clusters (min=5 │ median=15 │ max=31)
2025-08-19 11:42:02,155 - src.helpers - INFO - Final: 44 clusters (min=5 │ median=15 │ max=31)
2025-08-19 11:42:02,155 - src.helpers - INFO - Final: 44 clusters (min=5 │ median=15 │ max=31)
2025-08-19 11:42:02,159 - src.helpers - INFO - [res search] step 09  γ=7.993 → 44 clusters
2025-08-19 11:42:02,159 - src.helpers - INFO - [res search] step 09  γ=7.993 → 44 clusters
2025-08-19 11:42:02,159 - src.helpers - INFO - [res search] step 09  γ=7.993 → 44 clusters

Function run_leiden_clustering took 0.0248 seconds to complete.
Function find_resolution_for_target took 0.3310 seconds to complete.
Executing export_feature_clusters...
Function export_feature_clusters took 0.0057 seconds to complete.
Executing export_clustering_report...
Function export_clustering_report took 0.0054 seconds to complete.
Function run_clustering_pipeline took 0.4431 seconds to complete.


{'https://doi.org/10.5061/dryad.4dj6042': 'cluster_3',
 'https://doi.org/10.5061/dryad.r7sqv9s8n': 'cluster_32',
 'https://doi.org/10.11583/dtu.20555586': 'cluster_7',
 'https://doi.org/10.11583/dtu.20555586.v3': 'cluster_7',
 'https://doi.org/10.5061/dryad.8153g': 'cluster_16',
 'https://doi.org/10.5281/zenodo.8014149': 'cluster_29',
 'https://doi.org/10.5281/zenodo.8014150': 'cluster_7',
 'ENSOARG00000003950': 'cluster_29',
 'ENSOARG00000012128': 'cluster_22',
 'ENSOARG00000012835': 'cluster_29',
 'ENSOARG00000013782': 'cluster_26',
 'ENSOARG00000013966': 'cluster_26',
 'ENSOARG00000014129': 'cluster_26',
 'IPR000264': 'cluster_26',
 'IPR002172': 'cluster_26',
 'IPR014760': 'cluster_26',
 'IPR020857': 'cluster_10',
 'IPR020858': 'cluster_26',
 'IPR021177': 'cluster_10',
 'IPR023415': 'cluster_10',
 'ENSBTAG00000011038': 'cluster_26',
 'ENSBTAG00000013718': 'cluster_26',
 'ENSBTAG00000017121': 'cluster_26',
 'ENSBTAG00000017131': 'cluster_26',
 'ENSBTAG00000021275': 'cluster_26',
 'EN

In [8]:
# initialize reducer
pca = Reducer(
        collection_name=DEFAULT_COLLECTION_NAME,
        cfg_path=DEFAULT_CHROMA_CONFIG,
        db_path=DEFAULT_DUCKDB_PATH,
        n_neighbors=DEFAULT_N_NEIGHBORS,
        min_dist=DEFAULT_MIN_DIST,
        n_components=DEFAULT_N_COMPONENTS,
        random_seed=DEFAULT_RANDOM_SEED
)

2025-08-19 11:43:11,648 - api.database.duckdb_schema - INFO - Starting DuckDB schema creation...
2025-08-19 11:43:11,648 - api.database.duckdb_schema - INFO - Starting DuckDB schema creation...
2025-08-19 11:43:11,648 - api.database.duckdb_schema - INFO - Starting DuckDB schema creation...
2025-08-19 11:43:11,648 - api.database.duckdb_schema - INFO - Starting DuckDB schema creation...
2025-08-19 11:43:11,673 - api.database.duckdb_schema - INFO - Creating documents table...
2025-08-19 11:43:11,673 - api.database.duckdb_schema - INFO - Creating documents table...
2025-08-19 11:43:11,673 - api.database.duckdb_schema - INFO - Creating documents table...
2025-08-19 11:43:11,673 - api.database.duckdb_schema - INFO - Creating documents table...
2025-08-19 11:43:11,682 - api.database.duckdb_schema - INFO - Documents table created successfully
2025-08-19 11:43:11,682 - api.database.duckdb_schema - INFO - Documents table created successfully
2025-08-19 11:43:11,682 - api.database.duckdb_schema -

In [9]:
# dat["ids"]

In [9]:
results = pca.run_per_cluster_pca(df.to_numpy(), dataset_cluster_map, dat["ids"])
results

2025-08-19 11:43:15,842 - src.helpers - INFO - 🔄 Running feature-cluster PCA...
2025-08-19 11:43:15,842 - src.helpers - INFO - 🔄 Running feature-cluster PCA...
2025-08-19 11:43:15,842 - src.helpers - INFO - 🔄 Running feature-cluster PCA...
2025-08-19 11:43:15,842 - src.helpers - INFO - 🔄 Running feature-cluster PCA...
2025-08-19 11:43:15,846 - src.helpers - INFO - Loading datasets from DuckDB...
2025-08-19 11:43:15,846 - src.helpers - INFO - Loading datasets from DuckDB...
2025-08-19 11:43:15,846 - src.helpers - INFO - Loading datasets from DuckDB...
2025-08-19 11:43:15,846 - src.helpers - INFO - Loading datasets from DuckDB...
2025-08-19 11:43:15,921 - src.helpers - INFO - Retrieved 692 datasets from database
2025-08-19 11:43:15,921 - src.helpers - INFO - Retrieved 692 datasets from database
2025-08-19 11:43:15,921 - src.helpers - INFO - Retrieved 692 datasets from database
2025-08-19 11:43:15,921 - src.helpers - INFO - Retrieved 692 datasets from database
2025-08-19 11:43:15,924 - sr

Executing run_per_cluster_pca...


2025-08-19 11:43:16,032 - src.helpers - INFO - cluster_42: PC1 var ratio = 0.8298
2025-08-19 11:43:16,024 - src.helpers - INFO - cluster_18: PC1 var ratio = 0.9697
2025-08-19 11:43:16,044 - src.helpers - INFO - Dataset cluster cluster_1 shape: (384, 30)
2025-08-19 11:43:16,028 - src.helpers - INFO - cluster_36: PC1 var ratio = 0.7600
2025-08-19 11:43:16,035 - src.helpers - INFO - cluster_17: PC1 var ratio = 0.7798
2025-08-19 11:43:16,047 - src.helpers - INFO - Dataset cluster cluster_20 shape: (384, 15)
2025-08-19 11:43:16,041 - src.helpers - INFO - Dataset cluster cluster_27 shape: (384, 13)
2025-08-19 11:43:16,038 - src.helpers - INFO - cluster_43: PC1 var ratio = 0.8405
2025-08-19 11:43:16,032 - src.helpers - INFO - cluster_42: PC1 var ratio = 0.8298
2025-08-19 11:43:16,049 - src.helpers - INFO - Dataset cluster cluster_6 shape: (384, 22)
2025-08-19 11:43:16,044 - src.helpers - INFO - Dataset cluster cluster_1 shape: (384, 30)
2025-08-19 11:43:16,035 - src.helpers - INFO - cluster_1

Function run_per_cluster_pca took 0.4980 seconds to complete.


True

In [11]:
import pickle

with open("../artifacts/models_20250814_1652/prototypes.pkl", "rb") as f:
    prototypes = pickle.load(f)

prototypes

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,374,375,376,377,378,379,380,381,382,383
cluster_0,-0.188969,-0.230388,-0.303129,-0.017496,0.228384,0.150168,0.424777,0.101323,-0.029496,-0.034306,...,-0.086104,0.268471,0.054047,-0.197768,0.088208,0.526416,0.059695,-0.187691,0.227003,0.212082
cluster_1,0.01123,-0.129095,0.085674,-0.105383,0.603767,0.056677,-0.201922,0.054175,0.228505,0.109229,...,0.018395,0.323959,-0.009011,-0.255022,-0.241,0.195959,-0.208248,-0.160917,0.116162,0.110501
cluster_10,-0.107464,-0.097143,-0.249735,-0.086382,0.351574,-0.166939,0.122699,0.053063,-0.074944,-0.027911,...,-0.179388,0.078103,-0.012632,-0.212519,-0.129944,0.058793,-0.050016,-0.258612,0.436373,0.15932
cluster_11,-0.192988,-0.276198,0.069527,0.128474,0.115469,-0.003328,0.035286,0.157896,0.043427,0.25965,...,0.104788,0.067442,0.05912,0.038632,0.039791,0.242626,-0.227856,-0.147825,0.329119,0.157013
cluster_12,-0.365793,0.033681,-0.00216,0.074327,0.129595,-0.121579,0.074367,0.032111,0.10972,0.053377,...,-0.048313,-0.084456,0.005611,0.03541,-0.134508,0.204331,0.152509,-0.203388,0.156423,-0.158978
cluster_13,-0.05645,-0.115724,-0.187576,0.019799,0.107343,0.02353,-0.050435,0.053908,0.114705,0.025742,...,-0.033506,0.051133,0.001051,-0.299273,-0.150737,-0.05209,0.074055,-0.097351,0.463273,0.087192
cluster_14,-0.303337,-0.147053,-0.095047,-0.079371,0.051389,0.029275,0.075545,0.152682,-0.216419,-0.041724,...,0.085686,0.080099,0.040685,-0.06944,0.072493,-0.115427,0.107379,-0.13264,0.267979,0.048933
cluster_15,-0.148886,-0.13995,0.073067,-0.000676,0.256097,0.048449,-0.16115,0.00739,-0.208818,0.087088,...,0.173015,-0.045242,-0.112396,-0.104374,-0.145741,0.051302,-0.130323,-0.166539,-0.043798,0.215198
cluster_16,0.020862,-0.127486,-0.233582,-0.039845,0.34193,-0.15359,0.000456,0.020034,0.056644,-0.020557,...,-0.092016,0.037663,0.044628,-0.151735,-0.275284,0.020878,-0.089468,-0.296608,0.387565,0.143467
cluster_17,-0.161272,0.060291,0.144368,0.072211,0.135709,-0.030593,-0.184745,-0.015701,-0.204572,-0.061017,...,0.129509,-0.062131,-0.056154,-0.082769,-0.038512,0.064704,-0.121168,-0.171951,0.008989,0.057311
