# Cluster Protein Copy Numbers - REDS Recall

In [None]:
import re
from collections import defaultdict

import gurobipy as gp
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn
from rbc_gem_utils import (
    GEM_NAME,
    get_dirpath,
    handle_msg,
    read_cobra_model,
    show_versions,
)
from sklearn.metrics import (
    calinski_harabasz_score,
    davies_bouldin_score,
    pairwise,
    r2_score,
    silhouette_score,
)

# Set Gurobi's "OutputFlag" and "LogToConsole" parameters to 0 so the solver
# does not write its log output into the notebook
gp.setParam("OutputFlag", 0)
gp.setParam("LogToConsole", 0)

# Show versions of notebook
show_versions()

### Define organism, model, and dataset

In [None]:
# Identifiers used to build the input/output directory paths in later cells
organism = "Human"
model_id = "RBC_GEM"
dataset_name = "REDSRecall"
# Metadata column used below to split samples into allele-count subgroups
genotype = "ATP11C_V972M"
# Unit of analysis; later cells average rows per donor when this ends in "Donor"
grouped_data_key = "Sample"
grouped_data_key

### Set variables for column keys and sample identification

In [None]:
# Name of the index column that identifies a sample in every input table
sample_key = "SAMPLE ID"
# Alternation of the capitalized summary-operation prefixes: "Mean|Median"
operations = "|".join(map(str.capitalize, ("mean", "median")))
# "<Operation>_<group>" identifiers, e.g. rows holding a per-group mean
operation_re = re.compile(rf"(?P<op>{operations})\_(?P<group>\w+)")
# Donor identifiers: the letter "S" followed by exactly three digits
donor_re = re.compile(r"(?P<donor>S(?P<num>\d\d\d))")
# Donor pattern preceded by a negative lookahead on the operation prefixes
sample_id_re = re.compile(rf"(?!{operations}){donor_re.pattern}")

### Set figure options

In [None]:
# Figure export options read by the fig.savefig(...) calls in later cells:
# whether to save at all, whether the background is transparent, and the
# image format (also used as the file extension)
save_figures = True
transparent = False
imagetype = "svg"

### Set paths

In [None]:
# Directory holding the processed input tables for this organism and dataset
processed_data_dirpath = get_dirpath(use_temp="processed").joinpath(
    organism, dataset_name
)
# Output directory for clustering results:
# <processed>/<model_id>/Clustering/<organism>/<dataset_name>
processed_clusters_dirpath = get_dirpath(use_temp="processed").joinpath(
    model_id, "Clustering", organism, dataset_name
)
# Create the output directory together with any missing parents
processed_clusters_dirpath.mkdir(parents=True, exist_ok=True)

## Load RBC-GEM model

In [None]:
# Read the model file "<GEM_NAME>.xml" from the model directory; the trailing
# bare name displays the loaded model as the cell output
model_dirpath = get_dirpath("model")
model = read_cobra_model(filename=model_dirpath / f"{GEM_NAME}.xml")
model

## Load copy numbers

In [None]:
# Read the protein table used for clustering, indexed by sample identifier
protein_dtype = "ProteinIntensities"
df_proteins = pd.read_csv(
    processed_data_dirpath.joinpath(f"{protein_dtype}.csv"),
    index_col=sample_key,
)

# Partition the unique row IDs: IDs that start with an operation prefix
# (e.g. "Mean_...") are summary rows, everything else is a measured sample
all_ids = df_proteins.index.unique().tolist()
operation_ids = list(filter(operation_re.match, all_ids))
sample_ids = [sample for sample in all_ids if sample not in operation_ids]

print(f"Number of measured samples: {len(sample_ids)}")
print(f"Number of operation samples: {len(operation_ids)}")
print(f"Number of models to generate: {len(all_ids)}")

### Remove models based on data operations

In [None]:
# Keep only rows whose ID contains no "<Operation>_<group>" pattern, i.e.
# discard the Mean/Median summary rows and work on an independent copy
df_data_samples = df_proteins.loc[
    [operation_re.search(row_id) is None for row_id in df_proteins.index]
].copy()
df_data_samples

In [None]:
# Quantile used on both tails when clipping values before standardization
winsorize_limit = 0.05  # Set as 0 or None to prevent winsorization
# When True, the cells below write their tables to CSV files
export_data = True
dict_of_normalized_dfs = {}

### Normalize protein copy numbers across all samples

In [None]:
# Start from an independent copy of the measured samples
df = df_data_samples.copy()

# Average replicate samples per donor. This uses the same
# ``endswith("Donor")`` test as the metadata and copy-number cells so that
# every table is grouped on the same key. The donor ID is the part of the
# sample ID before the first underscore.
if grouped_data_key.endswith("Donor"):
    df.index = [x.split("_")[0] for x in df.index]
    df = df.groupby(level=0).mean()
# The filetag records each processing step and names the exported files
filetag = [grouped_data_key]

if winsorize_limit:
    # Winsorize: clip every column to its own
    # [winsorize_limit, 1 - winsorize_limit] quantile range
    df = df.clip(
        lower=df.quantile(winsorize_limit, axis=0),
        upper=df.quantile(1 - winsorize_limit, axis=0),
        axis=1,
    )
    filetag += [f"WinPCT{100 * winsorize_limit:.0f}"]

# Z-score every column using the sample standard deviation (ddof=1).
# A column with zero variance becomes NaN here (0 / 0).
df_norm = df.sub(df.mean(axis=0), axis=1).div(df.std(ddof=1, axis=0), axis=1)
filetag += ["Z-score"]
filetag = "_".join(filetag)
print(f"Filetag: {filetag}")
df_norm

#### All proteins in dataset

In [None]:
# Optionally persist the normalized table, named after the filetag
if export_data:
    filepath = processed_clusters_dirpath.joinpath(f"{filetag}.csv")
    df_norm.to_csv(filepath, index=True)

# Report the smallest and largest standardized value over the whole table
print(
    f"Standardized Range: ({df_norm.min().min():.4f}, {df_norm.max().max():.4f})"
)

df_norm

## Split data into subgroups using metadata
### Load metadata

In [None]:
# Load the metadata table, indexed by the same sample key as the protein data
df_metadata = pd.read_csv(
    processed_data_dirpath / "Metadata.csv",
    index_col=sample_key,
)

# For donor-level analysis, collapse the rows to donor IDs (the text before
# the first underscore of each sample ID)
if grouped_data_key.endswith("Donor"):
    df_metadata.index = [x.split("_")[0] for x in df_metadata.index]
    # Collect each column's unique values per donor as a list ...
    df_metadata = df_metadata.groupby(level=0).agg(lambda x: list(x.unique()))
    # ... then expand the lists back into rows.
    # NOTE(review): exploding several columns at once requires the lists in a
    # row to have matching lengths; confirm the metadata satisfies this.
    df_metadata = df_metadata.explode(list(df_metadata.columns))

# List the available metadata columns
for c in df_metadata.columns:
    print(c)
df_metadata.index.name = sample_key
df_metadata

### Split by genotype

In [None]:
# Split the raw and normalized tables into one subgroup per allele count of
# the chosen genotype, plus an "All" subgroup holding every genotyped sample.
verbose = True
dict_of_genotype_dfs = {}
if genotype not in df_metadata.columns:
    handle_msg(f"No genotype data for Genotype {genotype}\n", print_msg=verbose)
    # Fail fast, before any directory or file is created. Indexing the missing
    # column further down would raise KeyError as well, only later.
    raise KeyError(f"No genotype data for Genotype {genotype}")
handle_msg(f"Splitting data for Genotype {genotype}", print_msg=verbose)
# Make directory
genotype_dir = processed_clusters_dirpath / genotype
genotype_dir.mkdir(exist_ok=True, parents=True)
# Drop NA values and save Genotype MetaData
df_genotype = df_metadata[genotype].dropna()
if export_data:
    df_genotype.to_csv(genotype_dir / f"{grouped_data_key}_Genotypes.csv", index=True)
# Split data by allele counts for proteins
handle_msg("Applying to normalized data for proteins", print_msg=verbose)
# Group samples by allele count for samples with allele count determined:
# one row per allele count whose single cell is the list of sample IDs
df_samples_grouped = (
    df_genotype.reset_index(drop=False)
    .convert_dtypes()
    .groupby(df_genotype.name)
    .agg(list)
)
df_samples_grouped.loc["All"] = [list(df_genotype.index)]
handle_msg(
    f"Creating subgroups for allele counts: {list(df_samples_grouped.index)}",
    print_msg=verbose,
)
# The loop variable is deliberately not called ``sample_ids`` so the list of
# measured samples built when the data was loaded is not overwritten.
for allele_count, grouped_row in df_samples_grouped.iterrows():
    allele_key = f"Alleles{allele_count}"
    # Unwrap the row's single cell once: the sample IDs of this subgroup
    allele_sample_ids = grouped_row.item()
    df_allele_samples = df_data_samples.loc[
        df_data_samples.index.isin(allele_sample_ids)
    ]
    df_allele_samples_norm = df_norm.loc[df_norm.index.isin(allele_sample_ids)]
    # Drop proteins with no value at all in this subgroup, then fill the
    # remaining gaps with the protein's mean over the subgroup
    df_allele_samples_norm = df_allele_samples_norm.dropna(how="all", axis=1).fillna(
        df_allele_samples_norm.mean(axis=0)
    )
    # Save data
    if export_data:
        df_allele_samples.to_csv(
            genotype_dir / "{}.csv".format("_".join([protein_dtype, allele_key])),
            index=True,
        )
        df_allele_samples_norm.to_csv(
            genotype_dir
            / "{}.csv".format("_".join([protein_dtype, allele_key, filetag])),
            index=True,
        )

    dict_of_genotype_dfs[(genotype, allele_count)] = df_allele_samples_norm.copy()
    print(df_allele_samples_norm.shape)
print()

## Load data for clustering
### Cluster samples pre-grouped by allele count for representative models

In [None]:
def calculate_dunn_index(data, labels):
    """Return the Dunn index of a clustering.

    The Dunn index is the smallest Euclidean distance between two points that
    belong to different clusters divided by the largest Euclidean distance
    between two points that belong to the same cluster. Larger values mean
    more compact, better separated clusters.

    Parameters
    ----------
    data : array-like of shape (n_points, n_dimensions)
        Coordinates of the points, one row per point.
    labels : array-like of shape (n_points,)
        Cluster label of each row of ``data``.

    Returns
    -------
    float
        The Dunn index. ``inf`` is returned when no cluster has a positive
        diameter (only single-point clusters, or all points of every cluster
        coincide) and also when there is only one cluster.
    """
    # Imported locally so the function carries its own dependency.
    from scipy.spatial.distance import cdist

    points = np.asarray(data, dtype=float)
    labels = np.asarray(labels)

    # Distances are computed directly from coordinate differences, so
    # coincident points are exactly 0 apart. The zero-diameter check below
    # depends on that exactness.
    distances = cdist(points, points, metric="euclidean")

    # Row positions of every cluster, computed once and reused by both passes
    members = [np.flatnonzero(labels == label) for label in np.unique(labels)]

    # Largest intra-cluster distance (cluster diameter)
    max_intracluster = 0
    for rows in members:
        if len(rows) > 1:
            max_intracluster = max(
                max_intracluster, distances[np.ix_(rows, rows)].max()
            )

    # Smallest distance between points of two different clusters
    min_intercluster = np.inf
    for position, rows_i in enumerate(members):
        for rows_j in members[position + 1 :]:
            min_intercluster = min(
                min_intercluster, distances[np.ix_(rows_i, rows_j)].min()
            )

    # Handle cases of single-point clusters or perfect overlap
    if max_intracluster == 0:
        return np.inf
    return min_intercluster / max_intracluster

#### Allele count: 0

In [None]:
# Sample ID -> cluster number; filled in by the per-allele-count cells below
# and written to CSV when the final set is assembled
cluster_mapping = {}

In [None]:
allele_count = 0
verbose = True
# Independent copy of the normalized table for this allele count, with the
# row axis named after the sample key and the column axis named "Proteins"
data_original = (
    dict_of_genotype_dfs[(genotype, allele_count)]
    .copy()
    .rename_axis(index=sample_key, columns="Proteins")
)
data_original

#### K-means clustering
##### Determine optimal number of clusters for representative models

In [None]:
# Run K-means for every K in 2..kmax and record evaluation metrics, the
# per-sample cluster distances and the label assignment for each K.
evaulation_metric_scores = defaultdict(dict)
dataframes_of_clusters = {}
dataframes_of_label_maps = {}
# At most 50 clusters, and always fewer clusters than samples
kmax = min(50, len(data_original) - 1)
verbose = False
for n_clusters in range(2, kmax + 1):

    kmeans = sklearn.cluster.KMeans(
        n_clusters=n_clusters,
        init="k-means++",
        n_init=50,
        max_iter=int(1e9),
        tol=1e-12,
        verbose=0,
        random_state=44,
        algorithm="lloyd",
    )
    # The rows of ``data_original`` (samples) are what gets clustered, so the
    # input is (n_samples, n_features). ``fit_transform`` returns every
    # sample's distance to each cluster centre: (n_samples, n_clusters).
    data_clustered = kmeans.fit_transform(data_original.values)
    # Inertia (Within-cluster sum-of-squares)
    inertia = kmeans.inertia_
    evaulation_metric_scores["Inertia"][n_clusters] = inertia
    # NOTE(review): the scores below are evaluated on ``data_clustered``
    # (cluster-distance space), not on the original feature matrix - confirm
    # this is intended.
    # Silhouette Score
    evaulation_metric_scores["Silhouette Score"][n_clusters] = silhouette_score(
        data_clustered,  # (n_samples, n_clusters)
        kmeans.labels_,
        metric="euclidean",
        sample_size=None,
        random_state=None,
    )
    # Davies-Bouldin Index
    evaulation_metric_scores["Davies-Bouldin Index"][n_clusters] = davies_bouldin_score(
        data_clustered, kmeans.labels_
    )
    # Calinski-Harabasz Index
    evaulation_metric_scores["Calinski-Harabasz Index"][n_clusters] = (
        calinski_harabasz_score(data_clustered, kmeans.labels_)
    )
    # Dunn Index
    evaulation_metric_scores["Dunn Index"][n_clusters] = calculate_dunn_index(
        data_clustered, kmeans.labels_
    )
    if verbose:
        header = f"For K-means clustering: K = {n_clusters}"

        print(f"{header}\n{'='*len(header)}")
        for metric_key in list(evaulation_metric_scores):
            score = evaulation_metric_scores[metric_key][n_clusters]
            print(f"{metric_key}: {score:.4f}")
        print()

    # Save as DataFrame. The rows are samples, so the row axis carries the
    # sample key. ``rename_axis`` returns a new object, so the index of
    # ``data_original`` is not renamed as a side effect.
    data_clustered = pd.DataFrame(
        data_clustered, index=data_original.index
    ).rename_axis(index=sample_key, columns="Cluster")
    dataframes_of_clusters[n_clusters] = data_clustered
    dataframes_of_label_maps[n_clusters] = kmeans.labels_

#### Plot evaluation metric scores
##### Inertia
Measures the compactness of clusters by calculating the sum of squared distances between each data point and its centroid. Lower inertia generally indicates tighter, more cohesive clusters.
##### Silhouette Score
Measures how well each data point fits within its assigned cluster compared to other clusters. A higher score (closer to 1) indicates better clustering. 
##### Davies-Bouldin Index
Measures the average similarity between each cluster and its most similar cluster. A lower score indicates better separation between clusters. 
##### Calinski-Harabasz Index
Measures the ratio of between-cluster dispersion to within-cluster dispersion. A higher score indicates better clustering. 
##### Dunn Index
Measures the ratio of the shortest distance between clusters to the largest distance within a cluster. A higher score indicates better separation and compactness. 

In [None]:
# One stacked panel per evaluation metric, all sharing the cluster-count x-axis
fig, axes = plt.subplots(
    len(evaulation_metric_scores),
    1,
    figsize=(10, 2 * len(evaulation_metric_scores)),
    sharex=True,
)
major_tick_interval = 4
minor_tick_interval = 2
# Hand-picked number of clusters; highlighted in red on every panel
chosen_n_clusters = 18
print(f"Number of samples to cluster: {len(data_original.index)}")
# Draw each metric's score against the number of clusters
for idx, (ax, metric_key) in enumerate(
    zip(axes.flatten(), list(evaulation_metric_scores))
):
    n_clusters = list(evaulation_metric_scores[metric_key].keys())
    scores = list(evaulation_metric_scores[metric_key].values())
    ax.plot(
        n_clusters,
        scores,
        marker="o",
        markerfacecolor="black",
        markeredgecolor="black",
        markersize=2,
    )
    ax.xaxis.set_major_locator(mpl.ticker.MultipleLocator(major_tick_interval))
    ax.xaxis.set_minor_locator(mpl.ticker.MultipleLocator(minor_tick_interval))
    ax.set_title(metric_key)
    # Only the bottom panel carries the x-axis label
    if idx == len(evaulation_metric_scores) - 1:
        ax.set_xlabel("Number of clusters", fontsize="xx-large")
    ax.spines["top"].set_visible(False)
    ax.spines["right"].set_visible(False)
    # Mark the chosen K with dashed red cross-hairs and a red dot.
    # NOTE(review): the score lookups below raise KeyError unless
    # chosen_n_clusters is one of the K values that were scanned.
    if chosen_n_clusters < len(data_original.index):
        ax.vlines(
            chosen_n_clusters, min(scores), max(scores), linestyle="--", color="red"
        )
        ax.hlines(
            evaulation_metric_scores[metric_key][chosen_n_clusters],
            xmin=0,
            xmax=n_clusters[-1],
            linestyle="--",
            color="red",
        )
        ax.plot(
            chosen_n_clusters,
            evaulation_metric_scores[metric_key][chosen_n_clusters],
            marker="o",
            markerfacecolor="red",
            markeredgecolor="red",
            markersize=5,
        )
    # Limit the view to the scanned K range and the observed score range
    ax.set_xlim(0, n_clusters[-1])
    ax.set_ylim(min(scores), max(scores))
fig.tight_layout()

##### Use clusters to form representative samples

In [None]:
# Cluster numbers start at 1 and keep increasing across allele counts
cluster_start_num = 1
# Load protein copy numbers
df_copy_numbers = pd.read_csv(
    processed_data_dirpath / "ProteinCopyNumbers.csv", index_col=sample_key
)
# Load MCH for transforming data
df_MCH = pd.read_csv(processed_data_dirpath / "MCH.csv", index_col=sample_key)

# Restrict both tables to the measured samples (no operation rows)
df_copy_numbers = df_copy_numbers.loc[list(df_data_samples.index)].copy()
df_MCH = df_MCH.loc[list(df_data_samples.index)].copy()
# For donor-level analysis, average rows per donor ID (text before the first
# underscore of the sample ID)
if grouped_data_key.endswith("Donor"):
    df_copy_numbers.index = [x.split("_")[0] for x in df_copy_numbers.index]
    df_copy_numbers = df_copy_numbers.groupby(level=0).mean()

    df_MCH.index = [x.split("_")[0] for x in df_MCH.index]
    df_MCH = df_MCH.groupby(level=0).mean()


(processed_clusters_dirpath / genotype).mkdir(exist_ok=True, parents=True)
if chosen_n_clusters < len(data_original.index):
    # Shift the 0-based K-means labels so they continue from cluster_start_num,
    # and record the sample -> cluster assignment
    cluster_labels = dataframes_of_label_maps[chosen_n_clusters] + cluster_start_num
    cluster_mapping.update(dict(zip(list(data_original.index), cluster_labels)))

    # Relabel every sample row with its cluster number and average per cluster
    df_copy_numbers_clustered = df_copy_numbers.loc[list(data_original.index)].copy()
    df_copy_numbers_clustered = df_copy_numbers_clustered.rename(
        cluster_mapping, axis=0
    )
    df_copy_numbers_clustered = df_copy_numbers_clustered.groupby(level=0).mean()
    df_copy_numbers_clustered.index = [
        f"Allele{allele_count}_C{int(x)}" for x in df_copy_numbers_clustered.index
    ]
    df_copy_numbers_clustered.to_csv(
        processed_clusters_dirpath
        / genotype
        / f"{grouped_data_key}_Alleles{allele_count}_ClusterCopyNumbers.csv",
        index=True,
    )

    # Same relabel-and-average step for the MCH table
    df_MCH_clustered = df_MCH.loc[list(data_original.index)].copy()
    df_MCH_clustered = df_MCH_clustered.rename(cluster_mapping, axis=0)
    df_MCH_clustered = df_MCH_clustered.groupby(level=0).mean()
    df_MCH_clustered.index = [
        f"Allele{allele_count}_C{int(x)}" for x in df_MCH_clustered.index
    ]
    df_MCH_clustered.to_csv(
        processed_clusters_dirpath
        / genotype
        / f"{grouped_data_key}_Alleles{allele_count}_ClusterMCH.csv",
        index=True,
    )
    # ``fig`` is the metrics figure created by the preceding plotting cell
    if save_figures:
        fig.savefig(
            processed_clusters_dirpath
            / genotype
            / f"{grouped_data_key}_Alleles{allele_count}_ClusterMetrics.{imagetype}",
            transparent=transparent,
            format=imagetype,
        )
    # The next allele count continues numbering after this one's clusters
    cluster_start_num += chosen_n_clusters
else:
    # No clustering applied and no file written.
    # NOTE(review): this assigns the normalized table, not copy numbers, to
    # the displayed variable - confirm this is intended.
    df_copy_numbers_clustered = data_original.copy()
df_copy_numbers_clustered

#### Allele count: 1

In [None]:
allele_count = 1
verbose = True
# Independent copy of the normalized table for this allele count, with the
# row axis named after the sample key and the column axis named "Proteins"
data_original = (
    dict_of_genotype_dfs[(genotype, allele_count)]
    .copy()
    .rename_axis(index=sample_key, columns="Proteins")
)
data_original

#### K-means clustering
##### Determine optimal number of clusters for representative models

In [None]:
# Run K-means for every K in 2..kmax and record evaluation metrics, the
# per-sample cluster distances and the label assignment for each K.
evaulation_metric_scores = defaultdict(dict)
dataframes_of_clusters = {}
dataframes_of_label_maps = {}
# At most 50 clusters, and always fewer clusters than samples
kmax = min(50, len(data_original) - 1)
verbose = False
for n_clusters in range(2, kmax + 1):

    kmeans = sklearn.cluster.KMeans(
        n_clusters=n_clusters,
        init="k-means++",
        n_init=50,
        max_iter=int(1e9),
        tol=1e-12,
        verbose=0,
        random_state=44,
        algorithm="lloyd",
    )
    # The rows of ``data_original`` (samples) are what gets clustered, so the
    # input is (n_samples, n_features). ``fit_transform`` returns every
    # sample's distance to each cluster centre: (n_samples, n_clusters).
    data_clustered = kmeans.fit_transform(data_original.values)
    # Inertia (Within-cluster sum-of-squares)
    inertia = kmeans.inertia_
    evaulation_metric_scores["Inertia"][n_clusters] = inertia
    # NOTE(review): the scores below are evaluated on ``data_clustered``
    # (cluster-distance space), not on the original feature matrix - confirm
    # this is intended.
    # Silhouette Score
    evaulation_metric_scores["Silhouette Score"][n_clusters] = silhouette_score(
        data_clustered,  # (n_samples, n_clusters)
        kmeans.labels_,
        metric="euclidean",
        sample_size=None,
        random_state=None,
    )
    # Davies-Bouldin Index
    evaulation_metric_scores["Davies-Bouldin Index"][n_clusters] = davies_bouldin_score(
        data_clustered, kmeans.labels_
    )
    # Calinski-Harabasz Index
    evaulation_metric_scores["Calinski-Harabasz Index"][n_clusters] = (
        calinski_harabasz_score(data_clustered, kmeans.labels_)
    )
    # Dunn Index
    evaulation_metric_scores["Dunn Index"][n_clusters] = calculate_dunn_index(
        data_clustered, kmeans.labels_
    )
    if verbose:
        header = f"For K-means clustering: K = {n_clusters}"

        print(f"{header}\n{'='*len(header)}")
        for metric_key in list(evaulation_metric_scores):
            score = evaulation_metric_scores[metric_key][n_clusters]
            print(f"{metric_key}: {score:.4f}")
        print()

    # Save as DataFrame. The rows are samples, so the row axis carries the
    # sample key. ``rename_axis`` returns a new object, so the index of
    # ``data_original`` is not renamed as a side effect.
    data_clustered = pd.DataFrame(
        data_clustered, index=data_original.index
    ).rename_axis(index=sample_key, columns="Cluster")
    dataframes_of_clusters[n_clusters] = data_clustered
    dataframes_of_label_maps[n_clusters] = kmeans.labels_

#### Plot evaluation metric scores
##### Inertia
Measures the compactness of clusters by calculating the sum of squared distances between each data point and its centroid. Lower inertia generally indicates tighter, more cohesive clusters.
##### Silhouette Score
Measures how well each data point fits within its assigned cluster compared to other clusters. A higher score (closer to 1) indicates better clustering. 
##### Davies-Bouldin Index
Measures the average similarity between each cluster and its most similar cluster. A lower score indicates better separation between clusters. 
##### Calinski-Harabasz Index
Measures the ratio of between-cluster dispersion to within-cluster dispersion. A higher score indicates better clustering. 
##### Dunn Index
Measures the ratio of the shortest distance between clusters to the largest distance within a cluster. A higher score indicates better separation and compactness. 

In [None]:
# One stacked panel per evaluation metric, all sharing the cluster-count x-axis
fig, axes = plt.subplots(
    len(evaulation_metric_scores),
    1,
    figsize=(10, 2 * len(evaulation_metric_scores)),
    sharex=True,
)
major_tick_interval = 2
minor_tick_interval = 1
# Hand-picked number of clusters; highlighted in red on every panel
chosen_n_clusters = 8
print(f"Number of samples to cluster: {len(data_original.index)}")
# Draw each metric's score against the number of clusters

for idx, (ax, metric_key) in enumerate(
    zip(axes.flatten(), list(evaulation_metric_scores))
):
    n_clusters = list(evaulation_metric_scores[metric_key].keys())
    scores = list(evaulation_metric_scores[metric_key].values())
    ax.plot(
        n_clusters,
        scores,
        marker="o",
        markerfacecolor="black",
        markeredgecolor="black",
        markersize=2,
    )
    ax.xaxis.set_major_locator(mpl.ticker.MultipleLocator(major_tick_interval))
    ax.xaxis.set_minor_locator(mpl.ticker.MultipleLocator(minor_tick_interval))
    ax.set_title(metric_key)
    # Only the bottom panel carries the x-axis label
    if idx == len(evaulation_metric_scores) - 1:
        ax.set_xlabel("Number of clusters", fontsize="xx-large")
    ax.spines["top"].set_visible(False)
    ax.spines["right"].set_visible(False)
    # Mark the chosen K with dashed red cross-hairs and a red dot.
    # NOTE(review): the score lookups below raise KeyError unless
    # chosen_n_clusters is one of the K values that were scanned.
    if chosen_n_clusters < len(data_original.index):
        ax.vlines(
            chosen_n_clusters, min(scores), max(scores), linestyle="--", color="red"
        )
        ax.hlines(
            evaulation_metric_scores[metric_key][chosen_n_clusters],
            xmin=0,
            xmax=n_clusters[-1],
            linestyle="--",
            color="red",
        )
        ax.plot(
            chosen_n_clusters,
            evaulation_metric_scores[metric_key][chosen_n_clusters],
            marker="o",
            markerfacecolor="red",
            markeredgecolor="red",
            markersize=5,
        )
    # Limit the view to the scanned K range and the observed score range
    ax.set_xlim(0, n_clusters[-1])
    ax.set_ylim(min(scores), max(scores))
fig.tight_layout()

##### Use clusters to form representative samples

In [None]:
# Load protein copy numbers
df_copy_numbers = pd.read_csv(
    processed_data_dirpath / "ProteinCopyNumbers.csv", index_col=sample_key
)

# Load MCH for transforming data
df_MCH = pd.read_csv(processed_data_dirpath / "MCH.csv", index_col=sample_key)

# Restrict both tables to the measured samples (no operation rows)
df_copy_numbers = df_copy_numbers.loc[list(df_data_samples.index)].copy()
df_MCH = df_MCH.loc[list(df_data_samples.index)].copy()
# For donor-level analysis, average rows per donor ID (text before the first
# underscore of the sample ID)
if grouped_data_key.endswith("Donor"):
    df_copy_numbers.index = [x.split("_")[0] for x in df_copy_numbers.index]
    df_copy_numbers = df_copy_numbers.groupby(level=0).mean()

    df_MCH.index = [x.split("_")[0] for x in df_MCH.index]
    df_MCH = df_MCH.groupby(level=0).mean()


(processed_clusters_dirpath / genotype).mkdir(exist_ok=True, parents=True)
if chosen_n_clusters < len(data_original.index):
    # ``cluster_start_num`` is not set in this cell: it carries over from the
    # allele-count-0 cell so that cluster numbers continue where it stopped
    cluster_labels = dataframes_of_label_maps[chosen_n_clusters] + cluster_start_num
    cluster_mapping.update(dict(zip(list(data_original.index), cluster_labels)))

    # Relabel every sample row with its cluster number and average per cluster
    df_copy_numbers_clustered = df_copy_numbers.loc[list(data_original.index)].copy()
    df_copy_numbers_clustered = df_copy_numbers_clustered.rename(
        cluster_mapping, axis=0
    )
    df_copy_numbers_clustered = df_copy_numbers_clustered.groupby(level=0).mean()
    df_copy_numbers_clustered.index = [
        f"Allele{allele_count}_C{int(x)}" for x in df_copy_numbers_clustered.index
    ]
    df_copy_numbers_clustered.to_csv(
        processed_clusters_dirpath
        / genotype
        / f"{grouped_data_key}_Alleles{allele_count}_ClusterCopyNumbers.csv",
        index=True,
    )

    # Same relabel-and-average step for the MCH table
    df_MCH_clustered = df_MCH.loc[list(data_original.index)].copy()
    df_MCH_clustered = df_MCH_clustered.rename(cluster_mapping, axis=0)
    df_MCH_clustered = df_MCH_clustered.groupby(level=0).mean()
    df_MCH_clustered.index = [
        f"Allele{allele_count}_C{int(x)}" for x in df_MCH_clustered.index
    ]
    df_MCH_clustered.to_csv(
        processed_clusters_dirpath
        / genotype
        / f"{grouped_data_key}_Alleles{allele_count}_ClusterMCH.csv",
        index=True,
    )
    # ``fig`` is the metrics figure created by the preceding plotting cell
    if save_figures:
        fig.savefig(
            processed_clusters_dirpath
            / genotype
            / f"{grouped_data_key}_Alleles{allele_count}_ClusterMetrics.{imagetype}",
            transparent=transparent,
            format=imagetype,
        )
    # Any following allele count continues numbering after these clusters
    cluster_start_num += chosen_n_clusters
else:
    # No clustering applied and no file written.
    # NOTE(review): this assigns the normalized table, not copy numbers, to
    # the displayed variable - confirm this is intended.
    df_copy_numbers_clustered = data_original.copy()
df_copy_numbers_clustered

### Concat data to create final set

In [None]:
# Assemble the final representative set: the clustered tables of each allele
# count (or, where none were written, the individual samples of that allele
# count), followed by per-allele-count Mean/Median summary rows.
cluster_count = 0
allele_counts = [0, 1, 2]
df_genotypes = pd.read_csv(
    processed_clusters_dirpath / genotype / f"{grouped_data_key}_Genotypes.csv",
    index_col=0,
)


def _append_group_summaries(df_representative, summary_operations):
    """Return ``df_representative`` with per-group summary rows appended.

    Rows are grouped by the part of their index label before the first
    underscore. For every name in ``summary_operations`` the groupby method of
    that name (lower-cased) is applied, and the resulting rows are labelled
    "<Operation>_<group>".
    """
    summaries = []
    for op in summary_operations:
        grouped = df_representative.copy()
        grouped.index = [x.split("_")[0] for x in grouped.index]
        grouped = getattr(grouped.groupby(level=0), op.lower())()
        grouped.index = ["_".join((op.capitalize(), x)) for x in grouped.index]
        summaries.append(grouped)
    return pd.concat((df_representative, *summaries), axis=0)


copy_number_dfs_to_concat = []
MCH_dfs_to_concat = []
for allele_count in allele_counts:
    try:
        # Only the two reads can raise FileNotFoundError
        df_copy_numbers_allele = pd.read_csv(
            processed_clusters_dirpath
            / genotype
            / f"{grouped_data_key}_Alleles{allele_count}_ClusterCopyNumbers.csv",
            index_col=0,
        )
        df_MCH_allele = pd.read_csv(
            processed_clusters_dirpath
            / genotype
            / f"{grouped_data_key}_Alleles{allele_count}_ClusterMCH.csv",
            index_col=0,
        )
    except FileNotFoundError:
        # No clustered tables for this allele count: every sample becomes its
        # own representative and is numbered after the clusters counted so far.
        # ``df_copy_numbers`` and ``df_MCH`` come from the preceding cell.
        allele_sample_ids = list(
            df_genotypes[(df_genotypes == allele_count).values].index
        )
        df_copy_numbers_allele = df_copy_numbers.loc[allele_sample_ids]
        cluster_mapping.update(
            {
                sample_id: idx + cluster_count
                for idx, sample_id in enumerate(df_copy_numbers_allele.index, start=1)
            }
        )
        df_copy_numbers_allele.index = [
            f"Allele{allele_count}_{x}" for x in df_copy_numbers_allele.index
        ]

        df_MCH_allele = df_MCH.loc[allele_sample_ids]
        df_MCH_allele.index = [f"Allele{allele_count}_{x}" for x in df_MCH_allele.index]

    # Count the representatives of every allele count, including the fallback
    # case, so a later fallback never reuses cluster numbers already assigned.
    cluster_count += len(df_copy_numbers_allele.index)

    copy_number_dfs_to_concat += [df_copy_numbers_allele]
    MCH_dfs_to_concat += [df_MCH_allele]


operations = ["Mean", "Median"]
df_copy_numbers_representative = _append_group_summaries(
    pd.concat(copy_number_dfs_to_concat), operations
)
df_MCH_representative = _append_group_summaries(
    pd.concat(MCH_dfs_to_concat), operations
)

# Persist the sample -> cluster mapping and both representative tables
df_cluster_mapping = pd.DataFrame.from_dict(
    cluster_mapping, orient="index", columns=["CLUSTER"]
)
df_cluster_mapping.index.name = sample_key
df_cluster_mapping.to_csv(
    processed_clusters_dirpath / genotype / f"{grouped_data_key}_ClusterMapping.csv",
    index=True,
)
df_copy_numbers_representative.to_csv(
    processed_clusters_dirpath
    / genotype
    / f"{genotype}_{grouped_data_key}_ClusterCopyNumbers.csv",
    index=True,
)
df_MCH_representative.to_csv(
    processed_clusters_dirpath
    / genotype
    / f"{genotype}_{grouped_data_key}_ClusterMCH.csv",
    index=True,
)
df_copy_numbers_representative

In [None]:
# NOTE(review): bare undefined name - evaluating it raises NameError.
# Presumably a deliberate stop so that running all cells halts before the
# exploratory cells below; confirm before removing.
asdasdasdasdasdasd

In [None]:
# Exploratory 2-D embedding of the samples with locally linear embedding
# (method "ltsa"), coloured by the genotype column.
# NOTE(review): the estimator is bound to the name ``umap`` although it is
# not a UMAP model.
umap = sklearn.manifold.LocallyLinearEmbedding(
    n_components=2,
    n_neighbors=10,
    method="ltsa",
    eigen_solver="dense",
    neighbors_algorithm="auto",
    reg=1e-9,
    hessian_tol=1e-4,
    modified_tol=1e-12,
    max_iter=int(1e9),
    n_jobs=10,
)
transformed = umap.fit_transform(data_original.values)
# Label the embedded rows with the index of the data/metadata join, then
# attach the genotype column used for colouring
df = pd.DataFrame(
    transformed,
    index=data_original.merge(df_metadata, left_index=True, right_index=True)[
        genotype
    ].index,
).merge(df_metadata[genotype], left_index=True, right_index=True)
df.plot.scatter(x=0, y=1, c=df[genotype])

In [None]:
# Exploratory 2-D embedding of the samples with metric multidimensional
# scaling on Euclidean dissimilarities, coloured by the genotype column.
# NOTE(review): the estimator is bound to the name ``umap`` although it is
# not a UMAP model.
umap = sklearn.manifold.MDS(
    n_components=2,
    dissimilarity="euclidean",
    metric=True,
    n_init=10,
    max_iter=int(1e9),
    eps=1e-6,
    random_state=4,
    n_jobs=10,
    verbose=0,
)
transformed = umap.fit_transform(data_original.values)
# Label the embedded rows with the index of the data/metadata join, then
# attach the genotype column used for colouring
df = pd.DataFrame(
    transformed,
    index=data_original.merge(df_metadata, left_index=True, right_index=True)[
        genotype
    ].index,
).merge(df_metadata[genotype], left_index=True, right_index=True)
df.plot.scatter(x=0, y=1, c=df[genotype])

In [None]:
# Exploratory 2-D t-SNE embedding of the samples, coloured by the genotype
# column. No random_state is passed to the estimator.
# NOTE(review): the estimator is bound to the name ``umap`` although it is
# not a UMAP model.
umap = sklearn.manifold.TSNE(n_components=2, perplexity=50, n_jobs=10)
transformed = umap.fit_transform(data_original.values)
# Label the embedded rows with the index of the data/metadata join, then
# attach the genotype column used for colouring
df = pd.DataFrame(
    transformed,
    index=data_original.merge(df_metadata, left_index=True, right_index=True)[
        genotype
    ].index,
).merge(df_metadata[genotype], left_index=True, right_index=True)
df.plot.scatter(x=0, y=1, c=df[genotype])

In [None]:
silhouette_avg_dict = {}
wcss_dict = {}
for n_clusters in range(2, 50):

    kmeans = sklearn.cluster.KMeans(
        n_clusters=n_clusters,
        init="k-means++",
        n_init=10,
        max_iter=int(1e6),
        tol=1e-9,
        verbose=0,
        random_state=4,
        algorithm="lloyd",
    )
    # Shape is (n_samples, n_features), goal is to reduce sample number to transpose data
    data_original = data.T
    data_clustered = kmeans.fit_transform(data_original.values)
    data_clustered = pd.DataFrame(data_clustered.T, columns=data.columns)

    wcss_dict[n_clusters] = kmeans.inertia_
    # The silhouette_score gives the average value for all the samples.
    silhouette_avg = sklearn.metrics.silhouette_score(
        data_original.values, kmeans.labels_
    )
    silhouette_avg_dict[n_clusters] = silhouette_avg
    # fig, ax1 = plt.subplots(1, 1)
    # fig.set_size_inches(4, 4)

    # # The 1st subplot is the silhouette plot
    # # The silhouette coefficient can range from -1, 1 but in this example all
    # # lie within [-0.1, 1]
    # ax1.set_xlim([-0.1, 1])
    # # The (n_clusters+1)*10 is for inserting blank space between silhouette
    # # plots of individual clusters, to demarcate them clearly.
    # ax1.set_ylim([0, len(X) + (n_clusters + 1) * 10])
    # y_lower = 10
    # for i in range(n_clusters):
    #     # Aggregate the silhouette scores for samples belonging to
    #     # cluster i, and sort them
    #     ith_cluster_silhouette_values = sample_silhouette_values[kmeans.labels_ == i]

    #     ith_cluster_silhouette_values.sort()

    #     size_cluster_i = ith_cluster_silhouette_values.shape[0]
    #     y_upper = y_lower + size_cluster_i

    #     color = cm.nipy_spectral(float(i) / n_clusters)
    #     ax1.fill_betweenx(
    #         np.arange(y_lower, y_upper),
    #         0,
    #         ith_cluster_silhouette_values,
    #         facecolor=color,
    #         edgecolor=color,
    #         alpha=0.7,
    #     )

    #     # Label the silhouette plots with their cluster numbers at the middle
    #     ax1.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))

    #     # Compute the new y_lower for next plot
    #     y_lower = y_upper + 10  # 10 for the 0 samples

    # ax1.set_title("The silhouette plot for the various clusters.")
    # ax1.set_xlabel("The silhouette coefficient values")
    # ax1.set_ylabel("Cluster label")

    # # The vertical line for average silhouette score of all the values
    # ax1.axvline(x=silhouette_avg, color="red", linestyle="--")

    # ax1.set_yticks([])  # Clear the yaxis labels / ticks
    # ax1.set_xticks([-0.1, 0, 0.2, 0.4, 0.6, 0.8, 1])