In [None]:
# IMPORTS
import pandas as pd
import numpy as np
from scipy.spatial.distance import pdist
from scipy.cluster.hierarchy import linkage, dendrogram, fcluster, cut_tree
import matplotlib.pyplot as plt
import networkx as nx
import os
import sys

from sklearn.decomposition import PCA
from lifelines import KaplanMeierFitter
from scipy.stats import spearmanr



# Make the project's `src/` folder importable.
# The notebook normally runs from `<project>/notebooks`, so step up one level in that case.
# `str.strip('notebooks')` is deliberately avoided: it removes any of the characters
# n/o/t/e/b/k/s from BOTH ends of the path instead of removing the folder name, which
# corrupts paths that merely end in one of those letters (e.g. '/home/me/project').
_notebook_cwd = os.getcwd()
_project_root = (os.path.dirname(_notebook_cwd)
                 if os.path.basename(_notebook_cwd) == 'notebooks'
                 else _notebook_cwd)
# The trailing '' keeps the trailing separator the original path had ('.../src/').
sys.path.append(os.path.join(_project_root, 'src', ''))
import WGCNA_functions as wgcnax


import importlib
importlib.reload(wgcnax)


import dill


In [None]:
### LOADING REAL UNPUBLISHED DATA    -     NO PUSHING FOR THE RESULTS

# Resolve the project root: step out of the `notebooks` folder when the kernel runs there.
# `str.strip('notebooks')` is not used because it strips a *set of characters* from both
# ends of the path (not the folder name) and silently mangles other working directories.
working_dir = os.getcwd()
if os.path.basename(working_dir) == 'notebooks':
    working_dir = os.path.dirname(working_dir)
# Later cells build paths by string concatenation, so keep a trailing separator.
working_dir = os.path.join(working_dir, '')
data_dir = working_dir + 'data/PROTECTED_DATA/BGI_Expression_Data/'


## Load the datasets (the first CSV column is used as the row index in all three files)
# Transcriptomics data
transcriptomics_TumorOnly_dir = data_dir + 'CRC.SW.mRNA.symbol.TPM_TumorOnly.csv'
transcriptomics_dataset = pd.read_csv(transcriptomics_TumorOnly_dir, index_col=0)

# Sample info and clinical traits data
CT_for_TumorSamples_dir = data_dir + 'ClinicalTraits_for_TumorSamples.csv'
trait_dataset = pd.read_csv(CT_for_TumorSamples_dir, index_col=0)

# Survival data, using only tumor samples
SurvivalData_for_TumorSamples_dir = data_dir + 'Survival_Data_for_TumorSamples.csv'
survival_dataset = pd.read_csv(SurvivalData_for_TumorSamples_dir, index_col=0)


# Output directory for the figures
figures_dir = working_dir + 'results/HC_simple/'

# Create the directory when missing; `exist_ok=True` avoids a crash if it appears
# between the check and the creation.
if not os.path.exists(figures_dir):
    os.makedirs(figures_dir, exist_ok=True)
    print(f"Creating directory to save results and figures...")

In [None]:
### BASIC DENDROGRAM USING CORRELATION AS THE DISTANCE
dpi_general = 100
module_member_threshold = 0.25   ## Distance below which dendrogram branches are coloured as one group.


## Remove genes that cannot be used with a correlation distance:
##   * zero-variance genes (correlation is undefined for a constant profile)
##   * genes containing NaN (the linkage step requires finite distances)
variances = transcriptomics_dataset.var()  # Variance of each column (gene)
zero_var_columns = variances[variances == 0].index
nan_columns = transcriptomics_dataset.columns[transcriptomics_dataset.isna().any(axis=0)]
transcriptomics_dataset_filtered = (
    transcriptomics_dataset
    .drop(columns=zero_var_columns)
    # errors='ignore': a NaN column may already have been removed as zero-variance
    .drop(columns=nan_columns, errors='ignore')
)
print("Columns with zero variance being removed:", zero_var_columns.tolist())
if len(nan_columns) > 0:
    print("Columns containing NaN being removed:", nan_columns.tolist())


## Pairwise correlation distance between genes (rows after transposing), then average linkage
distances = pdist(transcriptomics_dataset_filtered.T, metric='correlation')
linkage_matrix = linkage(distances, method="average")


## Plot the dendrogram
plt.figure(figsize=(15, 7))
dendrogram(linkage_matrix, truncate_mode=None, color_threshold=module_member_threshold,
           labels=transcriptomics_dataset_filtered.T.index, leaf_rotation=90)
title_general_dendogram = 'General Hierarchical Clustering Dendrogram'
plt.title(title_general_dendogram)
plt.xlabel('Genes')
plt.ylabel('Distance (as a measure of correlation)')
plt.tight_layout()
# Use the resolution configured at the top of the cell instead of a second hard-coded value
plt.savefig(figures_dir + title_general_dendogram, dpi=dpi_general)
plt.show()


In [None]:
### Flat clusters

# Size limits a flat cluster must satisfy to be kept as a module
min_num_members = 5
max_num_members = 150

# NOTE: despite its name, this value is passed to fcluster as the distance cutoff
# (criterion='distance'); it is not a percentile.
percentile = 0.9
dpi_modules = 100


## Identify clusters (modules) by cutting the dendrogram at the distance cutoff
cluster_assignation = fcluster(linkage_matrix, t=percentile, criterion='distance')
unique_clusters = np.unique(cluster_assignation)
# 0 means "not assigned to any module"; kept clusters are renumbered 1, 2, 3, ...
corrected_cluster_assignation = np.zeros(cluster_assignation.shape, dtype=int)


# Assign new consecutive cluster IDs, keeping only clusters within the size limits
new_cluster_id = 1
for cluster_id in unique_clusters:
    members = np.where(cluster_assignation == cluster_id)[0]
    if min_num_members <= len(members) <= max_num_members:
        corrected_cluster_assignation[members] = new_cluster_id
        new_cluster_id += 1

# Number of modules actually kept. Counting the unique labels would also count the
# "unassigned" label 0 and over-report by one whenever any gene is left unassigned.
n_modules_identified = new_cluster_id - 1
print(f"{n_modules_identified} modules where identified, fulfilling the restrictions")

# One row per gene with its module id (0 = unassigned), in the column order of the filtered dataset
module_assignment = pd.DataFrame({
    'Gene Name': transcriptomics_dataset_filtered.columns,
    'Module': corrected_cluster_assignation
})

In [None]:
## Plot Cluster representation
# Layout: one bar chart spanning the top row, two pie charts sharing the bottom row.

fig = plt.figure(figsize=(16, 12))
ax1 = fig.add_subplot(2, 1, 1)
ax2 = fig.add_subplot(2, 2, 3)
ax3 = fig.add_subplot(2, 2, 4)

# Top: number of genes per module, leaving out the unassigned genes (Module 0)
is_unassigned = module_assignment['Module'] == 0
genes_in_modules = module_assignment.loc[~is_unassigned, 'Module'].value_counts().sort_index()
ax1.bar(genes_in_modules.index, genes_in_modules.values)
ax1.set_title('Number of Genes in Each Module')
ax1.set_xlabel('Module')
ax1.set_ylabel('Number of Genes')

# Bottom left: share of genes in every module label (Module 0 included)
module_sizes = module_assignment['Module'].value_counts()
ax2.pie(module_sizes, labels=module_sizes.index, autopct='%1.1f%%', startangle=140)
ax2.set_title('Distribution of Genes Across Modules')

# Bottom right: genes that ended up in some module versus genes left unassigned
total_genes = len(module_assignment)
genes_not_assigned_count = int(is_unassigned.sum())
genes_assigned_count = total_genes - genes_not_assigned_count
sizes = [genes_assigned_count, genes_not_assigned_count]
labels = ['Assigned to Modules', 'Not Assigned (Module 0)']
ax3.pie(sizes, labels=labels, autopct='%1.1f%%', startangle=140)
ax3.set_title('Percentage of Genes Assigned to Clusters vs Not Assigned')

plt.tight_layout()
plt.savefig(figures_dir + 'ModuleDistribution', dpi=100)

plt.show()

In [None]:
### Build a full expression profile for all the genes, keeping the module assignation

# Genes become rows; the former column labels are moved into a 'Gene Name' column
transcriptomics_data_transposed = (
    transcriptomics_dataset_filtered.T
    .reset_index()
    .rename(columns={'index': 'Gene Name'})
)

# Resulting columns: 'Gene Name', 'Module', then one column per sample
expression_profiles = module_assignment.merge(
    transcriptomics_data_transposed, on='Gene Name', how='left'
)


In [None]:

print(f"Calculating EigenGenes...")

# Columns 0 and 1 of expression_profiles are 'Gene Name' and 'Module'; the rest are samples
sample_columns = expression_profiles.columns[2:]

eigengene_frames = []

## Iterate through each module to calculate its eigengene
for module in expression_profiles['Module'].unique():

    # Skip module 0 as it represents unassigned genes
    if module == 0:
        continue

    # Expression profile (genes x samples) for all genes in this module
    module_expression_profile = expression_profiles[expression_profiles['Module'] == module].iloc[:, 2:]

    # First principal component of the module. A fixed random_state keeps the result
    # reproducible when scikit-learn selects its randomized SVD solver for large inputs.
    # NOTE(review): with genes as rows, PCA centres each sample column across genes;
    # confirm this matches the intended eigengene definition.
    pca = PCA(n_components=1, random_state=0)
    pca.fit(module_expression_profile)

    # components_[0] holds one value per sample: this is used as the eigengene
    eigengene_df = pd.DataFrame(pca.components_[:1], columns=sample_columns)
    eigengene_df.insert(0, 'Module', module)
    eigengene_frames.append(eigengene_df)

# pd.concat raises an unhelpful "No objects to concatenate" on an empty list
if not eigengene_frames:
    raise ValueError("No modules available to compute eigengenes: every gene is unassigned (Module 0).")

# One row per module: 'Module' id followed by the eigengene value for every sample
eigengenes = pd.concat(eigengene_frames, ignore_index=True)


## Plot the expression profile of the eigengenes across patients
print(f"Plotting and Saving the Eigengene Expression Profile Across Samples...")
title_figure = 'Eigengene Expression Profile Across Samples'

sample_labels = eigengenes.columns[1:]

plt.figure(figsize=(15, 10))
# Iterate over the module ids and the numeric block separately so the id keeps its
# integer type (a mixed row would upcast it and label the line 'Module 1.0')
for module_id, eigengene_values in zip(eigengenes['Module'], eigengenes.iloc[:, 1:].to_numpy()):
    plt.plot(sample_labels, eigengene_values, label=f'Module {module_id}')

plt.title(title_figure, fontsize=20)
plt.xlabel('Samples (pacients)', fontsize=10)
plt.ylabel('Eigengene Expression Level', fontsize=10)
# Sample names are hidden on purpose; rotating the tick labels first had no effect
plt.xticks([])
#plt.legend()
plt.tight_layout()
plt.savefig(figures_dir + title_figure, dpi=100)
plt.show()
print(f"Done")


In [None]:
### Step 8: Module-Trait Relationship
# Correlate every module eigengene with every clinical trait. The encoding of the traits
# and the computation of correlations and p-values are delegated to the project helper.

# Every column of the trait table except the first one is treated as a trait
trait_columns = trait_dataset.columns[1:].tolist()
module_trait_stats = wgcnax.eigen_trait_correlations_DC(eigengenes, trait_dataset, trait_columns)
correlations, p_values = module_trait_stats

In [None]:
# Heatmap of the module-trait correlations and p-values for ALL modules.
# NOTE(review): `figures_dir` is handed to the helper, presumably as the save location —
# confirm in WGCNA_functions.
wgcnax.correlation_pvalue_heatmap(correlations, p_values, figures_dir)

In [None]:
# Discard the modules (rows) whose absolute correlation is below the threshold for every trait
threshold_of_interest = 0.4

# True for a module when none of its trait correlations reaches the threshold
not_significant_modules = correlations.abs().lt(threshold_of_interest).all(axis=1)
modules_to_drop = correlations.index[not_significant_modules]
print(f"The following Modules are droped: {list(modules_to_drop)}")

# Remove the same modules from both tables so they stay aligned
correlations_filtered, p_values_filtered = (
    table.drop(index=modules_to_drop) for table in (correlations, p_values)
)


In [None]:
## Plot the heatmap for the module-trait relationship

#### Filtered heatmap, displaying only the relevant clusters (modules kept by the threshold filter)
# NOTE(review): same helper and same `figures_dir` as the unfiltered heatmap; if the helper
# saves under a fixed file name, this call may overwrite the earlier figure — confirm.

wgcnax.correlation_pvalue_heatmap(correlations_filtered, p_values_filtered, figures_dir)

In [None]:
# Collect, trait by trait, the modules whose absolute correlation exceeds the threshold.
# A module is appended once per trait it correlates with, so the list may hold repeats.
significant_modules = []
for trait in correlations.columns:
    exceeds_threshold = correlations[trait].abs() > threshold_of_interest
    for module in exceeds_threshold.index[exceeds_threshold.to_numpy()].tolist():
        significant_modules.append(module)
        print(f'Trait: {trait}  correlates with Module: {module}, showing a correlation of {correlations.at[module, trait]:.3f}')

print('\n')
# Report the size of every distinct significant module
for module in set(significant_modules):
    belongs_to_module = expression_profiles['Module'] == module
    module_expression_profile = expression_profiles[belongs_to_module]
    module_Genes = list(module_expression_profile['Gene Name'])
    print(f'The module with id:{module} clusters {len(module_Genes)} genes.')


In [None]:
### Step 9: Survival plot

# The helper receives the full (unfiltered) correlation table together with the threshold,
# the per-gene expression profiles, the survival data and the output directory.
# NOTE(review): selection of the highly correlated modules and the plotting itself
# presumably happen inside the helper — confirm in WGCNA_functions.
wgcnax.survival_probability(correlations, threshold_of_interest, expression_profiles, survival_dataset, figures_dir)

In [None]:
## Plot a Network Representation of the Relevant Clusters

# Edge colour per band of absolute correlation. Keys are 'lower - upper' strings that
# get_edge_color parses; bands are tried in this order and both bounds are inclusive.
# NOTE(review): no band covers 0.5 - 0.6, so those correlations fall through to the
# 'grey' default and have no legend entry — confirm this gap is intended.
interval_colors = {
    '0.9 - 1.0': 'red',
    '0.8 - 0.9': 'orange',
    '0.7 - 0.8': 'yellow',
    '0.6 - 0.7': 'blue',
    '0.0 - 0.5': 'green',
}


def get_edge_color(weight, interval_colors):
    """Return the colour of the first interval that contains ``abs(weight)``.

    Parameters
    ----------
    weight : number
        Correlation value; only its magnitude is used.
    interval_colors : dict
        Maps ``'lower - upper'`` strings to colour names. Bounds are inclusive and
        the entries are checked in the dict's iteration order.

    Returns
    -------
    The colour of the first matching interval, or ``'grey'`` when none matches
    (this includes NaN weights, which never satisfy the comparison).
    """
    def _covers(label):
        # Unpacking enforces exactly two numeric bounds per label
        low, high = (float(bound) for bound in label.split(' - '))
        return low <= abs(weight) <= high

    matching_colors = (interval_colors[label] for label in interval_colors if _covers(label))
    return next(matching_colors, 'grey')


# Distinct significant modules in first-seen order, so subplot positions are reproducible
modules_to_plot = list(dict.fromkeys(significant_modules))
num_modules = len(modules_to_plot)

# Dedicated title / file name for this figure. Reusing `title_figure` from the eigengene
# cell would overwrite the saved eigengene plot with the network plot.
network_figure_title = 'Network Representation of all Relevant Modules'

if num_modules == 0:
    # A 0x0 grid cannot be drawn; nothing to plot when no module passes the threshold
    print('No module exceeds the correlation threshold; skipping the network plot.')
else:
    # Smallest square grid that fits one subplot per module
    grid_size = int(np.ceil(np.sqrt(num_modules)))
    plt.figure(figsize=(grid_size * 5, grid_size * 5))

    for i, cluster in enumerate(modules_to_plot, start=1):
        # Expression profile of the module: rows = genes, columns = samples
        module_profile = (
            expression_profiles[expression_profiles['Module'] == cluster]
            .set_index('Gene Name')
            .drop(columns='Module')
        )

        # Append the module eigengene as an extra row labelled 'EigenGene'
        module_eigengene = (
            eigengenes[eigengenes['Module'] == cluster]
            .set_index('Module')
            .rename(index={cluster: 'EigenGene'})
        )
        module_profile = pd.concat([module_profile, module_eigengene])

        # Spearman correlation between every pair of rows (genes + eigengene)
        corr, _ = spearmanr(module_profile.to_numpy(), axis=1)
        corr_matrix = pd.DataFrame(corr, index=module_profile.index, columns=module_profile.index)

        # One node per gene plus the eigengene
        G = nx.Graph()
        node_names = list(module_profile.index)
        G.add_nodes_from(node_names)

        # One edge per unordered pair: the graph is undirected and the matrix symmetric,
        # so visiting both (a, b) and (b, a) only repeated the same work
        for position, gene1 in enumerate(node_names):
            for gene2 in node_names[position + 1:]:
                if gene1 == gene2:
                    continue
                weight = corr_matrix.loc[gene1, gene2]
                color = get_edge_color(weight, interval_colors)
                G.add_edge(gene1, gene2, weight=weight, color=color)

        # Draw the network in its own subplot; a fixed seed makes the layout reproducible
        plt.subplot(grid_size, grid_size, i)
        pos = nx.spring_layout(G, k=0.5, iterations=20, seed=42)
        edges = G.edges(data=True)
        nx.draw(G, pos, with_labels=True,
                node_color=['red' if name == 'EigenGene' else 'skyblue' for name in G.nodes],
                node_size=500,
                edge_color=[data['color'] for _, _, data in edges], linewidths=1, font_size=10,
                # Width follows the correlation magnitude (as the colour does); a negative
                # correlation would otherwise yield a negative line width
                width=[abs(data['weight']) * 2 for _, _, data in edges])

        plt.title(f'Module {cluster}')
        plt.axis('off')

    # Single legend for the whole figure, one entry per correlation band
    legend_labels = {label: plt.Line2D([0], [0], color=color, lw=4) for label, color in interval_colors.items()}
    plt.figlegend(legend_labels.values(), legend_labels.keys(), loc='upper right')
    plt.suptitle(network_figure_title, fontsize=20)
    # Leave room at the top for the suptitle
    plt.tight_layout(rect=[0, 0.03, 1, 0.95])

    plt.savefig(figures_dir + network_figure_title, dpi=200)
    plt.show()

In [None]:
# Save session
# The dumped session contains every variable of the notebook, including the protected
# datasets loaded above, so the file must stay out of version control.
session_file = working_dir + 'data/Sessions/Hierarchical_Clustering_session.pkl'
# dump_session does not create missing folders
os.makedirs(os.path.dirname(session_file), exist_ok=True)
dill.dump_session(session_file)

In [None]:
# Move out of the notebook folder to access datasets.
# `str.strip('notebooks')` is not used: it strips a set of characters from both ends of
# the path rather than the folder name, and mangles other working directories.
working_dir = os.getcwd()
if os.path.basename(working_dir) == 'notebooks':
    working_dir = os.path.dirname(working_dir)
working_dir = os.path.join(working_dir, '')   # keep the trailing separator

# Load session
# The machine-specific location is still tried first; when it does not exist, fall back
# to the path the save cell writes to, so the notebook also works on other machines.
session_file = '/media/bernat/Work/Hierarchical_Clustering_session.pkl'
if not os.path.exists(session_file):
    session_file = working_dir + 'data/Sessions/Hierarchical_Clustering_session.pkl'
# SECURITY: load_session unpickles the file and can execute arbitrary code;
# only load session files that you created yourself.
dill.load_session(session_file)