# Unsupervised

In [None]:
# Import necessary libraries
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from scipy.stats import kruskal
from scipy.signal import find_peaks
import seaborn as sns
from itertools import combinations
from statannot import add_stat_annotation

In [None]:
# Load the data from the CSV file into a DataFrame
data = pd.read_csv('data_train_toy.csv')
# Bare expression: displays the DataFrame as the cell output
data

In [None]:
# Define the peak-picking function
def peak_picking(ms_data, min_sn=10):
    """Keep, for every spectrum, only the local maxima above a noise threshold.

    Each row of ``ms_data`` is treated as one spectrum. The noise level of a
    spectrum is estimated as the standard deviation of all its intensities,
    and a local maximum is kept when its height is at least
    ``min_sn * noise``.

    Parameters
    ----------
    ms_data : pandas.DataFrame
        One row per spectrum, one column per m/z value, plus a 'Class'
        column holding the label of each spectrum.
    min_sn : float, default 10
        Multiplier applied to the noise estimate to get the height threshold.

    Returns
    -------
    pandas.DataFrame
        One row per input spectrum (indexed by row position), one column per
        m/z value that is a peak in at least one spectrum, and the 'Class'
        column. Positions where a spectrum has no peak are filled with 0.

    Raises
    ------
    KeyError
        If ``ms_data`` has no 'Class' column.
    """
    # Work on the argument (not on a notebook-level global) and align the
    # labels on row position, which is what 'spectrum_index' holds below.
    labels = ms_data['Class'].reset_index(drop=True)
    features = ms_data.drop(columns=['Class'])  # drop the unnecessary column
    mz_labels = features.columns

    peak_frames = []
    for i, spectrum in enumerate(features.to_numpy()):
        threshold = np.std(spectrum) * min_sn
        peaks, _ = find_peaks(spectrum, height=threshold)
        if peaks.size == 0:
            # Nothing to record; the row is restored (as zeros) by the
            # outer join with the labels below.
            continue
        peak_frames.append(pd.DataFrame({
            'spectrum_index': i,
            'm/z': mz_labels[peaks],
            'intensity': spectrum[peaks],
        }))

    if not peak_frames:
        # No peak in any spectrum (or no spectra at all): only labels remain.
        return labels.to_frame()

    peaks_df = pd.concat(peak_frames, ignore_index=True)
    peaks_df = peaks_df.dropna(subset=['m/z'])
    # Long -> wide: one row per spectrum, one column per picked m/z.
    peaks_df = peaks_df.pivot_table(index='spectrum_index', columns='m/z', values='intensity')
    data_peak_picked = pd.concat([peaks_df, labels], axis=1)
    return data_peak_picked.fillna(0)

In [None]:
# Peak picking with min_sn=10; the picked peaks are returned as a DataFrame
data_peak_picked = peak_picking(data, min_sn=10)
# Bare expression: displays the peak-picked DataFrame as the cell output
data_peak_picked

In [None]:
# Define the unsupervised heatmap function
def create_heatmap(data, cmap='RdYlGn_r', distance_metric='cosine', z_score=0):
    """Show a clustered heatmap of the per-class mean of every feature.

    The rows of ``data`` are averaged per 'Class' value and the result is
    transposed, so the heatmap has one row per feature and one column per
    class. Only the rows are clustered.

    Parameters
    ----------
    data : pandas.DataFrame
        Feature columns plus a 'Class' column.
    cmap : str
        Colormap name forwarded to ``sns.clustermap``.
    distance_metric : str
        Forwarded to ``sns.clustermap`` as ``metric``.
    z_score : int
        Forwarded to ``sns.clustermap`` as ``z_score``.
    """
    class_means = data.groupby('Class').mean()
    features_by_class = class_means.T
    sns.clustermap(
        features_by_class,
        metric=distance_metric,
        z_score=z_score,
        cmap=cmap,
        center=0,
        row_cluster=True,
        col_cluster=False,
        xticklabels=True,
        yticklabels=False,
        cbar=True,
        cbar_kws={'label': ''},
    )
    plt.show()

In [None]:
# Plot the clustered heatmap of the peak-picked data
create_heatmap(data_peak_picked,cmap='RdYlGn_r',distance_metric='cosine', z_score=0)

In [None]:
# Define the function to identify significant features in a dataset using the Kruskal-Wallis test based on the peak picked data
def significant_features(data, alpha=0.05):
    """Return the feature columns whose values differ between classes.

    Every column other than 'Class' is tested with a Kruskal-Wallis H-test
    across the groups defined by the 'Class' column. The significance level
    is Bonferroni-corrected: ``alpha`` is divided by the number of feature
    columns.

    Parameters
    ----------
    data : pandas.DataFrame
        One 'Class' column plus one column per feature.
    alpha : float, default 0.05
        Significance level before correction.

    Returns
    -------
    list or None
        Names of the columns with ``p <= alpha / n_features``, in column
        order (empty when there is no feature column). ``None``, after
        printing a message, when the 'Class' column is missing.
    """
    x = 'Class'
    if x not in data.columns:
        print("'Class' column not found in the data.")
        return None

    y_columns = [col for col in data.columns if col != x]
    if not y_columns:
        # Nothing to test; also avoids dividing alpha by zero below.
        return []

    corrected_alpha = alpha / len(y_columns)
    classes = data[x]
    # The row masks depend only on the class labels: build them once.
    group_masks = [classes == group for group in classes.unique()]

    significant_columns = []
    for col in y_columns:
        values = data[col]
        samples = [values[mask] for mask in group_masks]
        try:
            _, p_value = kruskal(*samples)
        except ValueError:
            # kruskal refuses inputs it cannot rank meaningfully (e.g. a
            # feature with the same value everywhere, or fewer than two
            # classes). Such a feature cannot be significant: skip it
            # instead of aborting the whole scan.
            continue
        # A NaN p-value compares False and is therefore not selected.
        if p_value <= corrected_alpha:
            significant_columns.append(col)
    return significant_columns

In [None]:
# Call the significant_features function with data_peak_picked and store the result in significant_mz_values
significant_mz_values = significant_features(data_peak_picked)

# Print the number of significant m/z (mass/charge) values found
print(len(significant_mz_values))

# Display the list of significant m/z values as the cell output.
significant_mz_values

In [None]:
# Save the significant m/z values to a one-column CSV file for further analysis
import csv
nom_fichier = 'significant_mz_values.csv'
with open(nom_fichier, 'w', newline='') as csvfile:
    writer = csv.writer(csvfile)
    # Header row first, then one row per significant m/z value
    writer.writerow(['Column_name'])
    writer.writerows([element] for element in significant_mz_values)

In [None]:
# Integrate results from both unsupervised and supervised approaches
# Load the CSV file containing all the robust biomarkers as 'robust_biomarkers'.
# dtype=str keeps every m/z label exactly as written in the file: letting
# pandas infer a numeric type could change its text form (e.g. '700.10'
# would come back as 700.1) and it would then no longer match the column
# names of the data.
robust_biomarkers = pd.read_csv('significant_mz_values.csv', dtype={'Column_name': str})
robust_biomarkers

In [None]:
# Collect the 'Column_name' entries, converted to strings, in the list 'liste_mz_robust'.
liste_mz_robust = [str(value) for value in robust_biomarkers["Column_name"]]
liste_mz_robust

In [None]:
# Define the function to display all the boxplots of robust biomarkers in one shot
def boxplot_significant_features(data, mz_values, class_colors=None, test='Kruskal', loc='inside'):
    """Draw a grid of per-class boxplots, one panel per m/z value.

    Every panel shows the distribution of one column of ``data`` for each
    'Class' value and is annotated with the result of ``test`` for every
    pair of classes.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a 'Class' column and one column per entry of
        ``mz_values``.
    mz_values : sequence
        Column names to plot.
    class_colors : dict, optional
        Maps a class label to a colour. Classes missing from the mapping
        (all of them when it is omitted) are drawn in 'blue'.
    test : str
        Statistical test name forwarded to ``add_stat_annotation``.
    loc : str
        Annotation placement forwarded to ``add_stat_annotation``.
    """
    label = 'Class'
    order = sorted(data[label].unique())
    box_pairs = list(combinations(order, 2))
    print("Class labels in dataset:", order)

    mz_values = list(mz_values)
    num_mz_values = len(mz_values)
    if num_mz_values == 0:
        # An empty grid cannot be laid out (it would have zero columns).
        print("No m/z values to plot.")
        return

    class_colors = class_colors or {}
    custom_palette = {class_label: class_colors.get(class_label, 'blue') for class_label in order}

    # Roughly square grid: floor(sqrt(n)) columns, as many rows as needed.
    num_cols = int(num_mz_values ** 0.5)
    num_rows = (num_mz_values + num_cols - 1) // num_cols
    # squeeze=False always yields a 2-D array of axes, even when the grid
    # has a single row or a single column.
    fig, axes = plt.subplots(num_rows, num_cols, figsize=(16, 5 * num_rows),
                             dpi=100, squeeze=False)
    flat_axes = axes.flatten()

    for ax, mz in zip(flat_axes, mz_values):
        sns.boxplot(data=data, x=label, y=mz, order=order, ax=ax, palette=custom_palette)
        ax.set_xticklabels(ax.get_xticklabels(), rotation=45)
        add_stat_annotation(ax, data=data, x=label, y=mz, order=order, box_pairs=box_pairs,
                            test=test, text_format='star', loc=loc, verbose=0)
        ax.tick_params(axis='y', labelsize=8)

    # Remove the unused panels at the end of the grid.
    for ax in flat_axes[num_mz_values:]:
        fig.delaxes(ax)
    plt.tight_layout()
    plt.show()

In [None]:
# Define the function to display the robust biomarkers one by one
def one_box_plot(data, mz, test='Kruskal', class_colors=None):
    """Draw the per-class boxplot of a single m/z value with pairwise annotations.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a 'Class' column and the column named ``mz``.
    mz : str
        Name of the column to plot.
    test : str
        Statistical test name forwarded to ``add_stat_annotation``.
    class_colors : dict, optional
        Maps a class label to a colour. Classes missing from the mapping
        (all of them when it is omitted) are drawn in 'blue'.
    """
    label = 'Class'
    order = sorted(data[label].unique())
    box_pairs = list(combinations(order, 2))
    print("Class labels in dataset:", order)

    # Accept the default None instead of failing on None.get(...).
    class_colors = class_colors or {}
    custom_palette = {class_label: class_colors.get(class_label, 'blue') for class_label in order}

    _, ax = plt.subplots(figsize=(8, 6))
    sns.boxplot(data=data, x=label, y=mz, order=order, ax=ax, palette=custom_palette)
    ax.set_xticklabels(ax.get_xticklabels(), rotation=30)
    add_stat_annotation(ax, data=data, x=label, y=mz, order=order, box_pairs=box_pairs,
                        test=test, text_format='star', loc='outside', verbose=2)
    plt.show()

In [None]:
# Both names are already imported in the first cell; re-imported here
from itertools import combinations
from statannot import add_stat_annotation
# Colour used for each class label in the boxplots
custom_colors = {'Tumor':'red','Necrosis':'black','Benign':'green'}
# Draw one boxplot panel per m/z value in liste_mz_robust
boxplot_significant_features(data, liste_mz_robust, class_colors=custom_colors)

In [None]:
mz = '701.55' # specify the m/z (ion) to plot; it is used as a column name of the data
# Draw the boxplot of that single m/z value
one_box_plot(data, mz, class_colors = custom_colors)