In [None]:
# IMPORTS
import pandas as pd
import numpy as np
from scipy.spatial.distance import pdist
from scipy.cluster.hierarchy import linkage, dendrogram, fcluster, cut_tree
import matplotlib.pyplot as plt
import networkx as nx
import os
import sys

from sklearn.decomposition import PCA
from lifelines import KaplanMeierFitter
from scipy.stats import spearmanr


# Make the project's `src` folder importable.
# `str.strip('notebooks')` removes any of the characters n/o/t/e/b/k/s from both
# ends of the path (it is not a suffix removal), so it corrupts the path whenever
# the kernel is not started from a folder named exactly `notebooks`.
_cwd = os.getcwd()
_project_root = os.path.dirname(_cwd) if os.path.basename(_cwd) == 'notebooks' else _cwd
_src_dir = os.path.join(_project_root, 'src')
if _src_dir not in sys.path:   # avoid stacking duplicates when the cell is re-run
    sys.path.append(_src_dir)

import WGCNA_functions as wgcnax
import importlib
# Reload once so edits to WGCNA_functions are picked up without restarting the kernel.
importlib.reload(wgcnax)


import dill


In [None]:
### LOADING REAL UNPUBLISHED DATA    -     NO PUSHING FOR THE RESULTS

# Resolve the project root: step out of the `notebooks` folder when the kernel
# runs from there, otherwise use the current directory unchanged.
# (`str.strip('notebooks')` strips a *set of characters* from both ends, not the
# folder name, so it mangles any other working directory.)
working_dir = os.getcwd()
if os.path.basename(working_dir) == 'notebooks':
    working_dir = os.path.dirname(working_dir)
# Keep a trailing separator: every path below is built by string concatenation.
working_dir = os.path.join(working_dir, '')
data_dir = working_dir + 'data/PROTECTED_DATA/BGI_Expression_Data/'


## Load the dataset
# Transcriptomics Data (first CSV column becomes the row index)
transcriptomics_All_dir = data_dir + 'CRC.SW.mRNA.symbol.TPM.csv'
transcriptomics_TumorOnly_dir = data_dir + 'CRC.SW.mRNA.symbol.TPM_TumorOnly.csv'
transcriptomics_NormalOnly_dir = data_dir + 'CRC.SW.mRNA.symbol.TPM_NormalOnly.csv'

transcriptomics_All = pd.read_csv(transcriptomics_All_dir, index_col=0)
transcriptomics_TumorOnly = pd.read_csv(transcriptomics_TumorOnly_dir, index_col=0)
transcriptomics_NormalOnly = pd.read_csv(transcriptomics_NormalOnly_dir, index_col=0)


# Clinical Traits Data
CT_for_All_dir = data_dir + 'ClinicalTraits_for_AllSamples.csv'
CT_for_TumorSamples_dir = data_dir + 'ClinicalTraits_for_TumorSamples.csv'
CT_for_NormalSamples_dir = data_dir + 'ClinicalTraits_for_NormalSamples.csv'

CT_for_All = pd.read_csv(CT_for_All_dir, index_col=0)
CT_for_TumorSamples = pd.read_csv(CT_for_TumorSamples_dir, index_col=0)
CT_for_NormalSamples = pd.read_csv(CT_for_NormalSamples_dir, index_col=0)


# Survival Data
SurvivalData_for_TumorSamples_dir = data_dir + 'Survival_Data_for_TumorSamples.csv' # Using only tumor samples
SurvivalData_for_TumorSamples = pd.read_csv(SurvivalData_for_TumorSamples_dir, index_col=0)


# Figures Saving output dir (trailing slash required: file names are appended to it)
figures_dir = working_dir + 'results/HC_simple/'

# Create the output directory on first use; exist_ok guards against it appearing
# between the check and the creation.
if not os.path.isdir(figures_dir):
    os.makedirs(figures_dir, exist_ok=True)
    print("Creating directory to save results and figures...")


# Dataset the rest of the notebook works on.
#transcriptomics_dataset = transcriptomics_All
transcriptomics_dataset = transcriptomics_TumorOnly


## Optional: keep only the first N columns to save RAM.
## None (default) keeps every column; set e.g. 2000 for a quick run.
subset_dataset_size = None
if subset_dataset_size is not None:
    transcriptomics_dataset = transcriptomics_dataset.iloc[:, :subset_dataset_size]

### Pre-processing

Consistent with WGCNA (`preprocess_TPM`).

In [None]:
# Expression threshold handed to the shared preprocessing helper.
expression_th = 1.5

# WGCNA preprocessing step for unification of methods
transcriptomics_dataset_filtered = wgcnax.preprocess_TPM(transcriptomics_dataset, expression_th)


# Print the number of genes removed (difference in column count before/after).
# The message names the helper that is actually called above.
num_genes_removed = transcriptomics_dataset.shape[1] - transcriptomics_dataset_filtered.shape[1]
print(f"preprocess_TPM removed {num_genes_removed} genes")


### Dendrogram and Flat Clusters

In [None]:
### BASIC DENDROGRAM USING CORRELATION AS THE DISTANCE
dpi_general = 100                # resolution of the saved dendrogram figure
module_member_threshold = 0.25   ## Threshold value to consider members of a module (used as the colouring height below).


## Compute the metrics for the dendrogram based on clustering genes:
## pairwise correlation distance between the columns of the filtered table,
## followed by average-linkage hierarchical clustering.
distances = pdist(transcriptomics_dataset_filtered.T, metric='correlation')
linkage_matrix = linkage(distances, method="average")

## Plot the dendrogram
plt.figure(figsize=(15, 7))
dendrogram(linkage_matrix, truncate_mode=None, color_threshold=module_member_threshold,
           labels=transcriptomics_dataset_filtered.T.index, leaf_rotation=90)
title_general_dendogram = 'General Hierarchical Clustering Dendrogram'
plt.title(title_general_dendogram)
plt.xlabel('Genes')
plt.ylabel('Distance (as a measure of correlation)')
# Hide the per-gene tick labels BEFORE computing the layout, so tight_layout does
# not reserve space for (or measure) labels that are never shown.
plt.xticks([])
plt.tight_layout()
# Use the configured resolution instead of a second hard-coded value.
plt.savefig(figures_dir + title_general_dendogram, dpi=dpi_general)
plt.show()


In [None]:
### Flat clusters

# Only clusters whose size lies in [min_num_members, max_num_members] become modules.
min_num_members = 15
max_num_members = 250

# Height at which the dendrogram is cut (passed as `t` with criterion='distance').
# NOTE(review): despite its name this is a distance threshold, not a percentile.
percentile = 0.80
dpi_modules = 100


## Identify clusters (modules) in the dendrogram
cluster_assignation = fcluster(linkage_matrix, t=percentile, criterion='distance')
unique_clusters, cluster_sizes = np.unique(cluster_assignation, return_counts=True)
corrected_cluster_assignation = np.zeros(cluster_assignation.shape, dtype=int)


# Assign new consecutive cluster IDs (1, 2, ...) to the clusters that satisfy the
# size criteria. Genes of rejected clusters keep label 0, i.e. "no module".
new_cluster_id = 1
for cluster_id, cluster_size in zip(unique_clusters, cluster_sizes):
    if min_num_members <= cluster_size <= max_num_members:
        corrected_cluster_assignation[cluster_assignation == cluster_id] = new_cluster_id
        new_cluster_id += 1

# Count only real modules: np.unique over the corrected labels would also count
# the 0 label of unassigned genes and over-report by one.
num_modules = new_cluster_id - 1
print(f"{num_modules} modules where identified, fulfilling the restrictions")

# One row per gene: its name, a 1-based running number and its module label (0 = unassigned).
module_assignment = pd.DataFrame({
    'Gene Name': transcriptomics_dataset_filtered.columns,
    'Gene' :  range(1, len(corrected_cluster_assignation) + 1),
    'Module': corrected_cluster_assignation
})

In [None]:
## Plot Cluster representation

# Hand the gene-to-module table to the shared plotting helper.
# NOTE(review): presumably shows how many genes fall in each module — confirm in WGCNA_functions.
wgcnax.plot_module_distribution(module_assignment)

### Build Expression Profiles separating Tumor Samples from Normal Samples

Coherent with WGCNA

#### New modification

The steps are kept separate so that the results obtained with all samples can be compared against those obtained with tumor samples only.

In [None]:
### Build 3 datasets containing: only Tumor samples, only Normal samples, and both.

# Rows of the filtered table whose index label also appears in the tumor-only table.
is_tumor_sample = transcriptomics_dataset_filtered.index.isin(transcriptomics_TumorOnly.index)
Transc_TumorOnly = transcriptomics_dataset_filtered.loc[is_tumor_sample]


In [None]:
# Rows of the filtered table whose index label also appears in the normal-only table.
# NOTE(review): if the dataset selected upstream is the tumor-only table, this
# selection is presumably empty — confirm before relying on the Normal results.
is_normal_sample = transcriptomics_dataset_filtered.index.isin(transcriptomics_NormalOnly.index)
Transc_NormalOnly = transcriptomics_dataset_filtered.loc[is_normal_sample]
# No row filtering for the combined view: this is the filtered table itself.
Transc_All = transcriptomics_dataset_filtered

In [None]:
### Build a full expression profile for all the genes, keeping the module assignation, separating sample types
# We use WGCNA method for consistency

# Forwarded to calculate_eigen_genes together with figures_dir.
# NOTE(review): presumably toggles whether the helper draws/saves figures — confirm in WGCNA_functions.
want_plots = True

## Tumor only Eigengenes
# Step 1: combine the gene-to-module table with the tumor-only expression table.
TumorOnly_expression_profiles = wgcnax.expression_profile_for_cluster(module_assignment, Transc_TumorOnly)
# Step 2: derive the eigengenes from those profiles.
TumorOnly_eigen_genes = wgcnax.calculate_eigen_genes(TumorOnly_expression_profiles, want_plots, figures_dir)


In [None]:

## Normal only Eigengenes
NormalOnly_expression_profiles = wgcnax.expression_profile_for_cluster(module_assignment, Transc_NormalOnly)
NormalOnly_eigen_genes = wgcnax.calculate_eigen_genes(NormalOnly_expression_profiles, want_plots, figures_dir)

## All Normal and Tumor Samples Eigengenes
All_expression_profiles = wgcnax.expression_profile_for_cluster(module_assignment, Transc_All)
All_eigen_genes = wgcnax.calculate_eigen_genes(All_expression_profiles, want_plots, figures_dir)


## Sanity check: compare each table's column count with the value expected for this dataset.
# NOTE(review): the third entry inspects All_expression_profiles while the first two
# inspect eigengene tables — confirm this mix is intended.
expected_column_counts = (
    (TumorOnly_eigen_genes, 1064),
    (NormalOnly_eigen_genes, 121),
    (All_expression_profiles, 1185),
)
if any(table.shape[1] != expected for table, expected in expected_column_counts):
    print(f"Something is wrong, the dimensions are not matching the expected size.")


In [None]:
###  Module-Trait Relationship
# Plot 3 Heatmaps, to separate between effect in Tumor vs. Normal

# The new heatmap required a p-value threshold
# (forwarded to every correlation_heatmap call in this notebook)
p_value_th = 0.05

# TUMOR SAMPLES ONLY
# Every clinical column except the first one is treated as a trait.
# NOTE(review): the reason the first column is skipped is not visible here — confirm against the CSV.
trait_columns = list(CT_for_TumorSamples.columns[1:] )
# Correlate the tumor eigengenes with the selected traits; the helper returns a
# correlation table and a matching p-value table.
correlations_T, p_values_T = wgcnax.eigen_trait_correlations_DC(TumorOnly_eigen_genes, CT_for_TumorSamples, trait_columns)
## Plot the Heatmap for the module-trait relationship
wgcnax.correlation_heatmap(correlations_T, p_values_T, figures_dir, title = 'Tumor Samples Only', p_value_th=p_value_th)



In [None]:

# Normal SAMPLES ONLY
# Every clinical column except the first one is treated as a trait.
trait_columns = list(CT_for_NormalSamples.columns[1:] )
# Correlate the normal-sample eigengenes with those traits (correlations + p-values).
correlations_N, p_values_N = wgcnax.eigen_trait_correlations_DC(NormalOnly_eigen_genes, CT_for_NormalSamples, trait_columns)
## Plot the Heatmap for the module-trait relationship
wgcnax.correlation_heatmap(correlations_N, p_values_N, figures_dir, title = 'Normal Samples Only', p_value_th=p_value_th)


# All SAMPLES
# Same procedure on the combined tumor + normal tables; trait_columns is overwritten here.
trait_columns = list(CT_for_All.columns[1:] )
correlations_TN, p_values_TN = wgcnax.eigen_trait_correlations_DC(All_eigen_genes, CT_for_All, trait_columns)
## Plot the Heatmap for the module-trait relationship
wgcnax.correlation_heatmap(correlations_TN, p_values_TN, figures_dir, title = 'All Samples', p_value_th=p_value_th)

In [None]:
### More Special Heatmaps -- Not relevant


#### All SAMPLES correlated with Tumor or Normal Tissue

## Create Dataframe with all samples, annotated as Tumor or Normal Sample
# reset_index() returns a NEW frame, so CT_for_All keeps its index and this cell
# can be re-run safely (an in-place reset changes CT_for_All for every other cell).
All_Samples_Labeled = CT_for_All.reset_index()[['Sample_ID']].copy()

# Label each sample from its ID: 'Tumor' if the ID contains 'T', otherwise
# 'Normal' if it contains 'N', otherwise None.
# NOTE(review): this is a plain substring test — an ID containing both letters is
# labelled 'Tumor'; confirm the sample-ID naming scheme.
All_Samples_Labeled['SampleType'] = All_Samples_Labeled['Sample_ID'].apply(
    lambda sample_id: 'Tumor' if 'T' in sample_id else ('Normal' if 'N' in sample_id else None))

# Plot Heatmap (every column after 'Sample_ID', i.e. 'SampleType', is the trait)
trait_columns = list(All_Samples_Labeled.columns[1:] )
correlations_TNL, p_values_TNL = wgcnax.eigen_trait_correlations_DC(All_eigen_genes, All_Samples_Labeled, trait_columns)
## Plot the Heatmap for the module-trait relationship
wgcnax.correlation_heatmap(correlations_TNL, p_values_TNL, figures_dir, title = 'All Samples, Labeled', p_value_th=p_value_th)




#### Tumor only HeatMap, split by Male or Female

# Tumor samples that have both an eigengene column and a clinical record,
# kept in eigengene column order.
tumor_sample_ids = [sample_id for sample_id in TumorOnly_eigen_genes.columns.drop('Module')
                    if sample_id in CT_for_TumorSamples.index]
sample_gender = CT_for_TumorSamples.loc[tumor_sample_ids, 'Gender']

# Select all columns of each gender in a single step (no column-by-column
# insertion, hence no DataFrame fragmentation). Samples whose gender is neither
# 'Male' nor 'Female' are left out of both tables.
male_sample_ids = list(sample_gender.index[sample_gender == 'Male'])
female_sample_ids = list(sample_gender.index[sample_gender == 'Female'])
TumorOnly_eigen_genes_Male = TumorOnly_eigen_genes[['Module'] + male_sample_ids]
TumorOnly_eigen_genes_Female = TumorOnly_eigen_genes[['Module'] + female_sample_ids]

# Split the ClinicalTrait dataframe also into Female / Male; 'Gender' is constant
# within each subset, so it is dropped as a trait.
CT_for_TumorSamples_Female = CT_for_TumorSamples[CT_for_TumorSamples['Gender'] == 'Female'].drop('Gender', axis=1)
CT_for_TumorSamples_Male = CT_for_TumorSamples[CT_for_TumorSamples['Gender'] == 'Male'].drop('Gender', axis=1)


# Plot Heatmap for Males
# NOTE(review): columns[1:] skips the first column that remains AFTER 'Gender' was
# dropped — confirm this still skips the intended column.
trait_columns = list(CT_for_TumorSamples_Male.columns[1:] )
correlations_TM, p_values_TM = wgcnax.eigen_trait_correlations_DC(TumorOnly_eigen_genes_Male, CT_for_TumorSamples_Male, trait_columns)
## Plot the Heatmap for the module-trait relationship
wgcnax.correlation_heatmap(correlations_TM, p_values_TM, figures_dir, title = 'Tumor and Male Only', p_value_th=p_value_th)


# Plot Heatmap for Females
# Stored under their own names (..._TF) so the male results above are not overwritten.
trait_columns = list(CT_for_TumorSamples_Female.columns[1:] )
correlations_TF, p_values_TF = wgcnax.eigen_trait_correlations_DC(TumorOnly_eigen_genes_Female, CT_for_TumorSamples_Female, trait_columns)
## Plot the Heatmap for the module-trait relationship
wgcnax.correlation_heatmap(correlations_TF, p_values_TF, figures_dir, title = 'Tumor and Female Only', p_value_th=p_value_th)

### Improve visualization and select results from the heatmap of interest

In this case, only the Tumor Samples heatmap is used.

In [None]:
# Choose which module-trait result set the following cells work on
correlations, p_values = correlations_T, p_values_T
expression_profiles, eigengenes = TumorOnly_expression_profiles, TumorOnly_eigen_genes


# A module is dropped when its absolute correlation is below this value for every trait
threshold_of_interest = 0.4

# Flag the modules in which no trait reaches the threshold, then remove them
# from both the correlation table and the p-value table
not_significant_modules = correlations.abs().lt(threshold_of_interest).all(axis=1)
modules_to_drop = not_significant_modules.index[not_significant_modules]
print(f"The following Modules are droped: {list(modules_to_drop)}")
correlations_filtered, p_values_filtered = (
    table.drop(index=modules_to_drop) for table in (correlations, p_values)
)


In [None]:
#### Filtered Heatmap, displaying only relevant clusters

# Same heatmap helper as before, fed with the tables that had the low-correlation modules removed.
# NOTE(review): the title hard-codes "0,4" while the cut-off lives in threshold_of_interest;
# keep them in sync by hand if the threshold changes.
wgcnax.correlation_heatmap(correlations_filtered, p_values_filtered, figures_dir,  title = 'Selected Modules with cor>0,4', p_value_th=p_value_th)

In [None]:
# Report every (trait, module) pair whose absolute correlation exceeds the threshold

# A module is appended once per trait it correlates with, so it may appear several times
significant_modules = []
for trait_name in correlations.columns:
    trait_correlations = correlations[trait_name]
    exceeds_threshold = trait_correlations.abs() > threshold_of_interest
    for module_id in trait_correlations.index[exceeds_threshold].tolist():
        significant_modules.append(module_id)
        print(f'Trait: {trait_name}  correlates with Module: {module_id}, showing a correlation of {correlations.at[module_id, trait_name]:.3f}')

print('\n')
# One summary line per distinct module: how many genes it groups
for module_id in set(significant_modules):
    in_module = expression_profiles['Module'] == module_id
    gene_names = expression_profiles.loc[in_module, 'Gene Name'].tolist()
    print(f'The module with id:{module_id} clusters {len(gene_names)} genes.')


In [None]:
### Step 9: Survival plot

# Get the modules that show high correlations
# Inputs handed to the helper: the module-trait correlation table, the correlation
# cut-off, the per-gene expression profiles, the tumor-sample survival table and
# the output folder for figures.
# NOTE(review): how the helper groups samples for the survival curves is not visible here — see WGCNA_functions.
wgcnax.survival_probability(correlations, threshold_of_interest, expression_profiles, SurvivalData_for_TumorSamples, figures_dir)

In [None]:
## Plot a Network Representation of the Relevant Clusters

# Color mapping for different intervals of correlation strengths
# Keys are 'lower - upper' bounds on the absolute correlation; get_edge_color
# parses them and returns the first matching color in the order listed here.
interval_colors = {
    '0.9 - 1.0': 'red',
    '0.8 - 0.9': 'orange',
    '0.7 - 0.8': 'yellow',
    '0.6 - 0.7': 'blue',
    
    # NOTE(review): no entry covers magnitudes strictly between 0.5 and 0.6; such
    # edges fall through to get_edge_color's 'grey' default, which the legend does
    # not show — confirm this gap is intended.
    '0.0 - 0.5': 'green',
    }

# Function to determine color based on weight
def get_edge_color(weight, interval_colors):
    """Return the color assigned to the magnitude of an edge weight.

    Parameters
    ----------
    weight : number
        Edge weight; only its absolute value is compared.
    interval_colors : dict
        Maps 'lower - upper' strings to color names. Both bounds are inclusive
        and entries are tried in dictionary order, so the first match wins.

    Returns
    -------
    The color of the first interval containing ``abs(weight)``, or ``'grey'``
    when no interval matches.
    """
    def _covers(interval_label):
        # Parse the two numeric bounds out of the 'lower - upper' label.
        low, high = (float(bound) for bound in interval_label.split(' - '))
        return low <= abs(weight) <= high

    matching_colors = (
        chosen for interval_label, chosen in interval_colors.items() if _covers(interval_label)
    )
    return next(matching_colors, 'grey')


# One subplot per distinct significant module, in a fixed (sorted) order so that
# the panel arrangement is the same on every run.
modules_to_plot = sorted(set(significant_modules))
num_modules = len(modules_to_plot)
# At least a 1x1 grid: with no significant module the grid size would be 0 and
# the figure would have zero width and height.
grid_size = max(1, int(np.ceil(np.sqrt(num_modules))))
layout_seed = 0   # fixed seed so the spring layout is reproducible between runs

plt.figure(figsize=(grid_size * 5, grid_size * 5))
for i, cluster in enumerate(modules_to_plot, start = 1):
    # Build Expression Profile for the Module: rows are genes, 'Module' column removed
    module_profile = (
        expression_profiles[expression_profiles['Module'] == cluster]
        .set_index('Gene Name')
        .drop(columns='Module')
    )

    # Add Eigengene as an extra row labelled 'EigenGene'
    module_eigengene = (
        eigengenes[eigengenes['Module'] == cluster]
        .set_index('Module')
        .rename(index={cluster: 'EigenGene'})
    )
    module_profile = pd.concat([module_profile, module_eigengene])

    # Spearman correlation between rows (genes + eigengene) -> Membership function in a way
    corr, _ = spearmanr(module_profile.to_numpy(), axis=1)
    corr_matrix = pd.DataFrame(corr, index=module_profile.index, columns=module_profile.index)

    # Network creation and node addition
    G = nx.Graph()
    node_names = list(module_profile.index)
    G.add_nodes_from(node_names)

    # Edge addition based on correlation. The graph is undirected and the
    # correlation matrix symmetric, so each unordered pair is inserted once.
    for position, gene1 in enumerate(node_names):
        for gene2 in node_names[position + 1:]:
            if gene1 != gene2:
                weight = corr_matrix.loc[gene1, gene2]
                color = get_edge_color(weight, interval_colors)
                G.add_edge(gene1, gene2, weight=weight, color=color)

    # Plotting the network in a subplot
    plt.subplot(grid_size, grid_size, i)
    pos = nx.spring_layout(G, k=0.5, iterations=20, seed=layout_seed)
    edges = G.edges(data=True)
    nx.draw(G, pos, with_labels=True,
            node_color=['red' if name == 'EigenGene' else 'skyblue' for name in G.nodes],
            node_size=500,
            edge_color=[data['color'] for _, _, data in edges], linewidths=1, font_size=10,
            # Width follows the correlation magnitude, matching the abs() used for the
            # edge color; a signed value would yield negative line widths.
            width=[abs(data['weight']) * 2 for _, _, data in edges])

    plt.title(f'Module {cluster}')
    plt.axis('off')


# Add a legend for the whole figure: one colored line per correlation interval
title_figure = 'Network Representation of all Relevant Modules'
legend_labels = {label: plt.Line2D([0], [0], color=color, lw=4) for label, color in interval_colors.items()}
plt.figlegend(list(legend_labels.values()), list(legend_labels.keys()), loc='upper right')
plt.suptitle(title_figure, fontsize=20)
# Single layout pass that leaves room at the top for the figure title
plt.tight_layout(rect=[0, 0.03, 1, 0.95])

plt.savefig(figures_dir + title_figure, dpi=200)
plt.show()