## Formatting Results

In [None]:
# Let's start by importing the necessary libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import os
import re

# Defining a function to extract parameters from the filename
def extract_params(filename):
    """Pair up the underscore-separated tokens of a result filename.

    The extension is dropped, the remaining name is split on "_", and
    consecutive tokens are paired positionally: token 0 becomes a key whose
    value is token 1, token 2 a key whose value is token 3, and so on.
    Tokens are NOT split on "=", so "P=30_K=3_cluster_labels.csv" yields
    {"P=30": "K=3", "cluster": "labels"}.

    Args:
        filename: File name such as "a_b_c_d.csv".

    Returns:
        dict mapping every even-positioned token to the token that follows
        it, in filename order. A repeated key keeps its last value.

    Raises:
        ValueError: If the name has an odd number of tokens, so the last
            token has no partner (previously a bare IndexError).
    """
    name_without_extension = os.path.splitext(filename)[0]
    tokens = name_without_extension.split("_")
    if len(tokens) % 2:
        raise ValueError(
            f"Cannot pair the tokens of {filename!r}: expected an even number "
            f"of '_'-separated tokens, got {len(tokens)}"
        )
    # Even positions are keys, odd positions are their values
    return dict(zip(tokens[0::2], tokens[1::2]))

# Path to the directory with the files
path = 'results/'

# Lists to store dataframes along with their parameters
labels_data = []
scores_data = []
weights_data = []
distances_data = []
feature_order_data = []

# Loop through all files in the directory.
# os.listdir() returns names in arbitrary order, so sort them: the order of
# these lists becomes the column order of the combined tables, and later code
# reads the first feature-order column by position.
for filename in sorted(os.listdir(path)):
    if filename.endswith('.csv'):
        # Read the data
        df = pd.read_csv(os.path.join(path, filename))

        # Extract parameters from the filename
        params = extract_params(filename)

        # Depending on the type of data, append it to the appropriate list.
        # These are substring checks evaluated in order; the first hit wins
        # and a file matching none of them is read but not stored.
        if 'labels' in filename:
            labels_data.append((df, params))
        elif 'scores' in filename:
            scores_data.append((df, params))
        elif 'weights' in filename:
            weights_data.append((df, params))
        elif 'distance_matrix' in filename:
            distances_data.append((df, params))
        elif "feature_orders" in filename:
            feature_order_data.append((df, params))
            
# Now we have five lists - each containing tuples of dataframes and their parameters
# For each type of data (labels, scores, weights, distances, feature orders), we have a list of these tuples
# Each tuple has a dataframe and a dictionary with parameters extracted from the filename

# Let's check the first few entries of each dataframe to ensure that the data has been loaded correctly
# labels_data[0][0].head(), scores_data[0][0].head(), weights_data[0][0].head(), distances_data[0][0].head()

In [None]:
# First, let's create a function to combine files of the same type into a single file

def combine_files(file_data, output_file):
    """Combine per-run dataframes side by side and write them to one CSV.

    For every (dataframe, params) pair the first column of the dataframe is
    renamed to "k1=v1_k2=v2_..." built from ``params``. The renamed frames are
    then concatenated column-wise (aligned on their index) and written to
    ``output_file`` without the index.

    Args:
        file_data: Iterable of (DataFrame, dict) tuples.
        output_file: Destination CSV path. Missing parent directories are
            created.

    Returns:
        The combined DataFrame (an empty DataFrame if ``file_data`` is empty).
    """
    renamed_frames = []
    for df, params in file_data:
        # Rename the column with the data to reflect the parameters
        column_name = '_'.join([f"{k}={v}" for k, v in params.items()])
        # rename() returns a new frame, so the caller's dataframe is untouched
        renamed_frames.append(df.rename(columns={df.columns[0]: column_name}))

    # Concatenate once instead of growing a frame inside the loop. This also
    # avoids the old `.empty` check, which discarded the accumulated columns
    # whenever the frame collected so far happened to have zero rows.
    if renamed_frames:
        combined_df = pd.concat(renamed_frames, axis=1)
    else:
        combined_df = pd.DataFrame()

    # to_csv() does not create directories, so make sure the target exists
    output_dir = os.path.dirname(output_file)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    # Write the combined dataframe to a CSV file
    combined_df.to_csv(output_file, index=False)
    return combined_df

# Now, let's use this function to combine all the cluster score files into a single file
combined_scores = combine_files(scores_data, 'combined_results/combined_scores.csv')

# And all the cluster weight files into a single file
combined_weights = combine_files(weights_data, 'combined_results/combined_weights.csv')

# And all the cluster label files into a single file
combined_labels = combine_files(labels_data, 'combined_results/combined_labels.csv')

# And all the feature orders into a single file
# (distances_data is loaded above but is not combined here)
combined_orders = combine_files(feature_order_data, 'combined_results/feature_orders.csv')

In [None]:
# I originally got the column names from the data, but I'll just hard-code them here so
# I don't need to upload the data
# tabular_data = pd.read_csv("work/91/0e2c6a8e9ff6d453846d2dfbf9d928/maf_files/binary_numerical_merged.csv")

In [None]:
# Instead of getting them from the data, let's hard-code the original feature names and order
#feature_cols = list(tabular_data.columns.values)
feature_cols = ['Unnamed: 0', 'age_at_diagnosis', 'year_of_diagnosis', 
                'Frame_Shift_Del', 'Frame_Shift_Ins', 'In_Frame_Del', 
                'In_Frame_Ins', 'Missense_Mutation', 'Nonsense_Mutation', 
                'Nonstop_Mutation', 'Splice_Site', 'Translation_Start_Site', 
                'MedianAbsoluteDeviation', 'MATH', 'prior_malignancy', 
                'prior_malignancy.1', 'gender', 'tissue_or_organ_of_origin', 
                'primary_diagnosis', 'ajcc_staging_system_edition', 'ajcc_pathologic_t', 
                'morphology', 'ajcc_pathologic_n', 'ajcc_pathologic_m', 'icd_10_code', 
                'site_of_resection_or_biopsy', 'race', 'ethnicity', 
                'treatments_radiation_treatment_or_therapy', 'vital_status']

# Column 0 of the combined feature-order table, read as integers. Entry p is
# treated as the position in feature_cols of the feature stored at row p.
# NOTE(review): only the first column is used, which presumably assumes every
# run shares the same feature order - confirm.
feature_indexes = list(combined_orders.iloc[:,0].apply(int))

# Fail early, and with a readable message, if the order table does not
# reference each hard-coded feature exactly once.
if sorted(feature_indexes) != list(range(len(feature_cols))):
    raise ValueError(
        f"Feature order must be a permutation of 0..{len(feature_cols) - 1}, "
        f"got {feature_indexes}"
    )

# For a permutation this equals the previous
# sorted(feature_cols, key=lambda x: feature_indexes.index(feature_cols.index(x)))
# without the nested linear searches.
sorted_cols = [feature_cols[i] for i in feature_indexes]

## Feature Weight Distribution

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
# `weights` is the same object as combined_weights (no copy is made), so
# assigning the index below relabels combined_weights as well.
weights = combined_weights
weights.index = sorted_cols

# Melt the DataFrame to a long format: one row per (feature, run) pair.
# reset_index() exposes the feature names as a column called 'index'.
melted_weights = weights.reset_index().melt(id_vars='index', var_name='parameters', value_name='weight')

# Extract K and distance metric from parameters.
# The indexing expects labels shaped like "P=<p>=K=<k>_dist=<name>=S=<s>_...":
# the first "_" chunk holds K as its 4th "=" field, the second chunk holds the
# metric name as its 2nd "=" field. Both are kept as strings.
melted_weights['K'] = melted_weights['parameters'].apply(lambda x: x.split('_')[0].split('=')[3])
melted_weights['distance_metric'] = melted_weights['parameters'].apply(lambda x: x.split('_')[1].split('=')[1])

# Create boxplot: one box per feature, pooling every run. No hue is used, so
# the title says so instead of claiming a grouping by distance metric.
plt.figure(figsize=(15, 10))
sns.boxplot(data=melted_weights, x='index', y='weight')
plt.xticks(rotation=90)
plt.xlabel('Feature')
plt.ylabel('Weight')
plt.title('Boxplot of weights for each feature across all parameter settings')
plt.show()

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

distance_metrics = melted_weights['distance_metric'].unique()

# Draw one figure per distance metric; inside each figure the boxes of a
# feature are split by K.
for metric in distance_metrics:
    # Keep only the rows produced with the current metric
    is_current_metric = melted_weights['distance_metric'] == metric
    subset = melted_weights[is_current_metric]

    # Fresh figure so the metrics do not overwrite each other
    plt.figure(figsize=(20, 10))
    sns.boxplot(data=subset, x='index', y='weight', hue='K')

    # Labelling
    plt.title(f'Boxplot for {metric} distance metric')
    plt.xlabel('Feature')
    plt.ylabel('Weight')
    plt.xticks(rotation=90)

    # Render this metric's figure before moving on to the next one
    plt.show()

The huang distance metric zeroes out the weight of every feature except the patient's age at diagnosis, which is why changing the number of clusters or the regularization makes almost no difference.

## Silhouette Scores

In [None]:
# Let's convert the data to long format: one row per (run, score) pair
data = combined_scores
data_long = data.melt(var_name='parameters', value_name='scores')

# Split the parameters into separate columns.
# S is matched as digits with an optional fractional part of any length; the
# earlier single-digit pattern (\d\.\d) silently cut "0.25" down to "0.2".
param_pattern = r'P=(\d+)=K=(\d+)_dist=(\w+)=S=(\d+(?:\.\d+)?)'
data_long[['P', 'K', 'dist', 'S']] = data_long['parameters'].str.extract(param_pattern, expand=True)

# str.extract() yields NaN for labels that do not match; report those by name
# instead of failing later inside astype().
unparsed = data_long.loc[data_long['P'].isna(), 'parameters'].unique()
if len(unparsed):
    raise ValueError(f"Could not parse parameters from column(s): {list(unparsed)}")

# Convert P and K to integers and S to float for proper sorting in visualizations
data_long['P'] = data_long['P'].astype(int)
data_long['K'] = data_long['K'].astype(int)
data_long['S'] = data_long['S'].astype(float)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Use seaborn's white-grid look for the figures that follow
sns.set(style="whitegrid")

# Two stacked panels that differ only in the x variable and the title
fig, axs = plt.subplots(2, 1, figsize=(10, 15))
panels = [
    ('K', 'Scores by Number of Clusters (K)'),
    ('S', 'Scores by Scaling Factor (S)'),
]

# One line per distance metric in each panel
for ax, (x_var, panel_title) in zip(axs, panels):
    sns.lineplot(data=data_long, x=x_var, y='scores', hue='dist', ax=ax)
    ax.set_title(panel_title)

plt.tight_layout()
plt.show()

## Understand the cluster assignments

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
import re

# Read the combined label table back from disk (the path combine_files() was
# given for the labels above). This rebinds `data`, which previously referred
# to the combined scores.
data = pd.read_csv('combined_results/combined_labels.csv')

# Extracting K, dist, and S from column names
def extract_params(col):
    """Parse K, the distance metric and S out of a combined-labels column name.

    Expects names that start with
    "P=<digits>=K=<digits>_dist=<word>=S=<number>_cluster=labels".
    Any P value is accepted (it used to be hard-coded to 30, so every other
    run came back as None). Matching is anchored at the start of the string
    only, so trailing text after "labels" is tolerated.

    Note: this definition replaces the filename-based extract_params defined
    earlier in the notebook once this cell has run.

    Args:
        col: Column name to parse.

    Returns:
        Tuple (K as int, dist as str, S as float), or None when the name does
        not follow the expected pattern.
    """
    match = re.match(r"P=\d+=K=(\d+)_dist=([\w]+)=S=([\d.]+)_cluster=labels", col)
    if match is None:
        return None
    K = int(match.group(1))
    dist = match.group(2)
    S = float(match.group(3))
    return K, dist, S

# Add K, dist, and S to the dataframe
parsed_params = [extract_params(col) for col in data.columns]

# extract_params() returns None for names it cannot parse; passing None rows
# to the DataFrame constructor fails obscurely, so report the columns instead.
unparsed_cols = [col for col, parsed in zip(data.columns, parsed_params) if parsed is None]
if unparsed_cols:
    raise ValueError(f"Could not parse K/dist/S from column(s): {unparsed_cols}")

# One row per label column: its parsed parameters plus the column name itself
params = pd.DataFrame(parsed_params, columns=['K', 'dist', 'S'])
params['label'] = data.columns

# Long format: one row per (sample row, label column) with the assigned cluster
data_melted = pd.melt(data.reset_index(), id_vars='index', value_vars=data.columns)
data_melted.columns = ['index', 'label', 'cluster']

# Attach K, dist and S to every assignment via the shared column name
data_melted = data_melted.merge(params, on='label')

In [None]:
# Sort numerically so the panels run from the smallest K to the largest
# instead of following the order in which the files happened to be read.
unique_K = np.sort(data_melted['K'].unique())
num_K = len(unique_K)
# Creating the plots with bars side-by-side

# squeeze=False always returns a 2-D array of axes; flattening it keeps
# axs[i] valid even when there is a single K (a bare Axes is not indexable).
fig, axs = plt.subplots(num_K, figsize=(15, 6*num_K), squeeze=False)
axs = axs.ravel()

# One panel per K; within a panel the bars of each cluster are dodged by
# distance metric.
for i, K in enumerate(unique_K):
    data_subset = data_melted[data_melted['K'] == K]
    sns.histplot(data=data_subset, x="cluster", hue="dist", multiple="dodge", shrink=.8, ax=axs[i])
    axs[i].set_title(f'Number of Points in Each Cluster for K={K}')
    axs[i].set_xlabel('Cluster')
    axs[i].set_ylabel('Count')

plt.tight_layout()
plt.show()

In [None]:
# Creating the plots again, this time coloring by S

# squeeze=False always returns a 2-D array of axes; flattening it keeps
# axs[i] valid even when there is a single K (a bare Axes is not indexable).
fig, axs = plt.subplots(num_K, figsize=(15, 6*num_K), squeeze=False)
axs = axs.ravel()

# One panel per K; within a panel the bars of each cluster are dodged by S.
for i, K in enumerate(unique_K):
    data_subset = data_melted[data_melted['K'] == K]
    sns.histplot(data=data_subset, x="cluster", hue="S", multiple="dodge", shrink=.8, palette="viridis", ax=axs[i])
    axs[i].set_title(f'Number of Points in Each Cluster for K={K}')
    axs[i].set_xlabel('Cluster')
    axs[i].set_ylabel('Count')

plt.tight_layout()
plt.show()

## Visualize the Clusters

In [None]:
# Let's load the data and understand its structure.
# The combined weights are re-read from disk, rebinding `weights`. The file was
# written without its index, so this frame has a default integer row index and
# the "indexes" collected below are row positions rather than feature names.
weights = pd.read_csv('combined_results/combined_weights.csv')
# Let's create a new dataframe where for every column we list the indexes of the most important three features

def top_three_features(col, n=3):
    """Return the index labels of the ``n`` largest values in a column.

    Args:
        col: pandas Series of weights.
        n: How many top entries to return. Defaults to 3, the original
            hard-coded value.

    Returns:
        Series of index labels ordered from the largest value to the
        smallest. It carries a fresh 0..n-1 index so that DataFrame.apply()
        lines the results up by rank rather than by the original labels.
    """
    top_n = col.nlargest(n)
    return pd.Series(top_n.index)

# Rank the rows of every run (column) and keep the three heaviest per run
top_features = weights.apply(top_three_features)
top_features.columns = weights.columns

# Show one row per run with its top three row positions side by side
top_features.T