In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.cluster import KMeans
import umap.umap_ as umap
from sklearn import preprocessing
from sklearn.preprocessing import MinMaxScaler

In [None]:
def load_data(filename, index_col=None, decimal=","):
    """Read a CSV file into a DataFrame and optionally promote a column to the index.

    Parameters
    ----------
    filename : str, path object or file-like
        Forwarded unchanged to ``pandas.read_csv``.
    index_col : label or list of labels, optional
        Column(s) to use as the index. Any falsy value (``None``, ``""``,
        ``[]``) leaves the default index produced by ``read_csv``.
    decimal : str, default ","
        Character treated as the decimal point while parsing numbers. The
        default keeps the comma convention this notebook was written for;
        pass ``"."`` for files that use a dot.

    Returns
    -------
    pandas.DataFrame
        The parsed table, indexed by ``index_col`` when one was given.

    Raises
    ------
    KeyError
        Raised by ``set_index`` if ``index_col`` is not a column of the file.
    """
    data = pd.read_csv(filename, decimal=decimal)
    if index_col:
        # Rebind instead of mutating in place: same result, no hidden state.
        data = data.set_index(index_col)
    return data

In [None]:
def apply_kmeans(data, n_clusters, random_state=None):
    """Cluster ``data`` with KMeans and return one label per row.

    Parameters
    ----------
    data : array-like or DataFrame
        Samples to cluster; handed directly to ``KMeans.fit_predict``.
    n_clusters : int
        Number of clusters to form.
    random_state : int, RandomState instance or None, default None
        Seed for centroid initialisation. ``None`` keeps the previous
        behaviour (scikit-learn falls back to NumPy's global RNG); pass an
        integer to get the same labels on every run, mirroring the
        ``random_state`` argument of ``apply_umap``.

    Returns
    -------
    Cluster labels as returned by ``KMeans.fit_predict``.
    """
    kmeans = KMeans(n_clusters=n_clusters, random_state=random_state)
    clusters_pred = kmeans.fit_predict(data)
    return clusters_pred

In [None]:
def apply_umap(data, n_neighbors=15, min_dist=0.1, random_state=None):
    """Fit a UMAP reducer on ``data`` and return the transformed coordinates.

    ``n_neighbors``, ``min_dist`` and ``random_state`` are forwarded as
    keyword arguments to ``umap.UMAP``; every other UMAP setting is left at
    the library default. The return value is whatever ``fit_transform``
    produces for ``data``.
    """
    reducer_options = {
        "n_neighbors": n_neighbors,
        "min_dist": min_dist,
        "random_state": random_state,
    }
    reducer = umap.UMAP(**reducer_options)
    return reducer.fit_transform(data)

In [None]:
def visualize_clusters(embedding, cluster_labels, cmap='Dark2'):
    """Draw a scatter plot of a 2-D embedding, coloured by cluster label.

    ``embedding`` must support ``[:, 0]`` / ``[:, 1]`` indexing; its first
    two columns become the x and y coordinates. ``cluster_labels`` is passed
    as the per-point colour values and mapped through ``cmap``. The figure
    is 15x10 inches with marker size 5 and is shown immediately; nothing is
    returned.
    """
    plt.figure(figsize=(15, 10))
    x_coords = embedding[:, 0]
    y_coords = embedding[:, 1]
    marker_style = {"c": cluster_labels, "s": 5, "cmap": cmap}
    plt.scatter(x_coords, y_coords, **marker_style)
    plt.show()

In [None]:
def save_clustered_data(data, cluster_labels, output_filename):
    """Write ``data`` plus a ``clusters_pred`` column to a CSV file.

    The labels are attached to a copy, so the caller's frame is left
    untouched. The CSV is written with pandas' default ``to_csv`` settings.
    Returns ``None``.
    """
    labelled = data.assign(**{'clusters_pred': cluster_labels})
    labelled.to_csv(output_filename)

In [None]:
# Define parameters
input_filename = 'path_to_input_file.csv'
output_filename = 'path_to_output_file.csv'
n_clusters = 4  # Number of clusters for KMeans
index_col = 'SUBJID'  # Column to set as index
random_seed = 42  # Single seed shared by clustering and embedding

# Load data
data = load_data(input_filename, index_col)

# Apply KMeans
# apply_kmeans is called without an explicit random_state, so scikit-learn
# draws from NumPy's global RNG. Seeding it here makes the cluster labels
# (and therefore the saved CSV and plot colours) repeatable across runs.
np.random.seed(random_seed)
cluster_labels = apply_kmeans(data, n_clusters)

# Apply UMAP
embedding = apply_umap(data, random_state=random_seed)

# Visualize clusters
visualize_clusters(embedding, cluster_labels)

# Save clustered data
save_clustered_data(data, cluster_labels, output_filename)