In [None]:
### GENERAL CONFIGURATION FOR THE ANALYSIS:
# Imports
import os
import pandas as pd
import sys
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import numpy as np


# Make the project's src/ folder importable from the notebook.
# The project root is the parent of the current directory when the notebook is
# run from a folder literally named 'notebooks'; otherwise the current directory
# itself is used. (str.strip('notebooks') is not suitable for this: it removes
# any of the characters n/o/t/e/b/k/s from both ends of the path, not the
# folder name.)
_notebook_cwd = os.path.normpath(os.getcwd())
if os.path.basename(_notebook_cwd) == 'notebooks':
    _project_root = os.path.dirname(_notebook_cwd)
else:
    _project_root = _notebook_cwd
sys.path.append(os.path.join(_project_root, 'src'))
import WGCNA_functions as wgcnax



# ANSI escape sequences used to style the messages printed by the notebook
ENDC, BOLD, UNDERLINE = "\033[0m", "\033[1m", "\033[4m"
OKBLUE, OKGREEN, WARNING, FAIL = "\033[94m", "\033[92m", "\033[93m", "\033[91m"

# Pandas display settings: no limit on printed rows/columns, 1000-character width
for _display_option, _display_value in (
    ('display.max_rows', None),
    ('display.max_columns', None),
    ('display.width', 1000),
):
    pd.set_option(_display_option, _display_value)



### LOADING REAL UNPUBLISHED DATA    -     NO PUSHING FOR THE RESULTS

# Move out of the notebook folder to access datasets.
# The parent directory is used only when the current folder is literally named
# 'notebooks'; str.strip('notebooks') would instead strip any of the characters
# n/o/t/e/b/k/s from both ends of the path and corrupt other folder names.
working_dir = os.path.normpath(os.getcwd())
if os.path.basename(working_dir) == 'notebooks':
    working_dir = os.path.dirname(working_dir)
# Paths below are built by plain string concatenation, so keep a trailing separator.
working_dir = os.path.join(working_dir, '')
data_dir = working_dir + 'data/PROTECTED_DATA/BGI_Expression_Data/'


## Load the datasets
# Every CSV is read with its first column as the row index.
_csv_read_options = {'index_col': 0}

# Transcriptomics data
transcriptomics_TPM_dataset_dir = data_dir + 'CRC.SW.mRNA.symbol.TPM_NOnormal.csv'
transcriptomics_dataset = pd.read_csv(transcriptomics_TPM_dataset_dir, **_csv_read_options)

# Sample info and clinical traits data (using only tumor samples)
sample_info_traits_dir = data_dir + 'All_Traits_Without_Normal.csv'
trait_dataset = pd.read_csv(sample_info_traits_dir, **_csv_read_options)

# Survival data (using only tumor samples); the path variable above is reused here
sample_info_traits_dir = data_dir + 'Survival_Without_Normal.csv'
survival_dataset = pd.read_csv(sample_info_traits_dir, **_csv_read_options)



# Figures Saving output dir
figures_dir = working_dir + 'results/FullOptimization/'

# Check if the directory exists, and if not, create it
if not os.path.exists(figures_dir):
    os.makedirs(figures_dir)
    print(f"{BOLD}{OKBLUE}Creating directory to save results and figures...{ENDC}")


## Optional: work on a subset of the dataset to save RAM.
# Leave as None to analyse the full dataset (the default); set to an integer
# (e.g. 200) to keep only the first N columns of the transcriptomics table.
# This replaces a disabled code block that was stored in a bare string literal,
# which Jupyter echoed as the cell output.
subset_dataset_size = None

if subset_dataset_size is not None:
    transcriptomics_dataset = transcriptomics_dataset.iloc[:, :subset_dataset_size]

    # RAM usage estimation in GB: N x N entries at 8 bytes each
    RAM_estimate = (subset_dataset_size * subset_dataset_size * 8) / (1024**3)
    print(f"The approximate RAM needed to analyse a dataset of this size is: {RAM_estimate} GB")


AUTOMATIC SEARCH FOR THE BEST PARAMETERS









In [None]:
### PARAMETERS for automatic search

# Plotting switch handed to the wgcnax calls during the parameter sweeps
want_plots = False

# --- Step 1: candidate preprocessing expression thresholds ---
expression_th_vec = [0.5, 1, 1.5, 2, 5, 10]
# TODO: as a last step, check the effect of the different preprocessing functions

# --- Step 3: candidate values for the soft-threshold search ---
RsquaredCut_vec, MeanCut_vec = [0.75, 0.85, 0.95], [50, 100]
block_size_scalefit_vec = [5, 10, 20, 30, 50]

# Values used for Step 3 while another step is being swept
RsquaredCut, MeanCut, block_size_scalefit = 0.9, 100, 10
adjacency_type = "unsigned"

# --- Step 4 ---
TOMDenom = "mean"

# --- Step 6: candidate values for the tree cutting ---
min_memb_cluster_vect, height_percentile_vect = [5, 10, 15, 20], [70, 75, 80, 85, 90]

# Values used for Step 6 while another step is being swept
min_memb_cluster, height_percentile = 10, 85

Optimization of Step 1


In [None]:
# Running a partial search for best parameters
# Using parameter segmentation (one step swept at a time) to reduce the number of iterations

# Column layout of the results table
optimization_result_columns = ['Parameter_config', 'Num Clusters', 'Max Correlation', 'Mean Correlation',
                               'Num Correlations over 7', 'Mean Correlation over 7', 'Num correlations over 8']

# One record per run; the DataFrame is built once after the loop instead of
# growing it with pd.concat on every iteration.
step1_records = []

# Optimization of the preprocessing threshold: one full execution per candidate value.
# NOTE: expression_th keeps the last swept value after this loop.
for expression_th in expression_th_vec:
    correlations, p_values = wgcnax.run_full_WGCNA(transcriptomics_dataset, expression_th, trait_dataset, want_plots, figures_dir,
                                                   RsquaredCut, MeanCut, block_size_scalefit, adjacency_type, TOMDenom,
                                                   height_percentile, min_memb_cluster)

    ## Storing results for optimization analysis
    # Flatten the matrix to a 1D array and drop NaN values, if any.
    correlation_values = correlations.values.flatten()
    correlation_values = correlation_values[~np.isnan(correlation_values)]

    # np.max raises on an empty array, so fall back to NaN when nothing is left.
    has_values = correlation_values.size > 0
    correlation_values_over_7 = correlation_values[correlation_values > 0.7]

    # NOTE(review): 'Max Correlation' uses absolute values while the counts and
    # the 'over 7' mean only consider positive correlations - confirm intended.
    step1_records.append({
        'Parameter_config': "expression_th=" + str(expression_th),
        'Num Clusters': correlations.shape[0],
        'Max Correlation': np.max(np.abs(correlation_values)) if has_values else np.nan,
        'Mean Correlation': np.mean(correlation_values) if has_values else np.nan,
        'Num Correlations over 7': np.sum(correlation_values > 0.7),
        'Mean Correlation over 7': np.mean(correlation_values_over_7) if correlation_values_over_7.size > 0 else np.nan,
        'Num correlations over 8': np.sum(correlation_values > 0.8)
    })

optimization_results = pd.DataFrame(step1_records, columns=optimization_result_columns)




In [None]:
print(optimization_results)


# Three stacked panels sharing the x axis (one tick per parameter configuration)
fig, (ax1, ax2, ax3) = plt.subplots(3, 1, sharex=True, figsize=(15, 10))

config_labels = optimization_results['Parameter_config']

# (axes, y-axis label, result columns drawn on that axes); each line is labelled
# with its column name
panel_layout = [
    (ax1, 'Counts', ['Num Clusters', 'Num Correlations over 7', 'Num correlations over 8']),
    (ax2, 'Correlation Values', ['Max Correlation', 'Mean Correlation over 7']),
    (ax3, 'Correlation Values', ['Mean Correlation']),
]

for panel_axis, panel_ylabel, panel_columns in panel_layout:
    for column_name in panel_columns:
        panel_axis.plot(config_labels, optimization_results[column_name], label=column_name, marker='o')
    panel_axis.set_ylabel(panel_ylabel)
    panel_axis.legend()
    panel_axis.grid(True)

# Fixed y-axis range for the second panel
ax2.set_ylim(0, 1)

# Only the bottom panel carries the x-axis label; ticks are rotated for readability
ax3.set_xlabel('Parameter_config')
plt.xticks(rotation=90)

# Improve layout to prevent overlap, then display
plt.tight_layout()
plt.show()

Optimization of Step 3 
pickSoftThreshold 

In [None]:

# Preprocessing threshold used for this sweep
expression_th = 1.5

# One record per run; the results table is extended once after the loops
# instead of calling pd.concat on every iteration.
step3_records = []

# Optimization of the soft threshold algorithm: one full execution per parameter combination.
# NOTE: RsquaredCut, MeanCut and block_size_scalefit keep their last swept values afterwards.
for RsquaredCut in RsquaredCut_vec:
    for MeanCut in MeanCut_vec:
        for block_size_scalefit in block_size_scalefit_vec:

            correlations, p_values = wgcnax.run_full_WGCNA(transcriptomics_dataset, expression_th, trait_dataset, want_plots, figures_dir,
                                                           RsquaredCut, MeanCut, block_size_scalefit, adjacency_type, TOMDenom,
                                                           height_percentile, min_memb_cluster)

            ## Storing results for optimization analysis
            # Flatten the matrix to a 1D array and drop NaN values, if any.
            correlation_values = correlations.values.flatten()
            correlation_values = correlation_values[~np.isnan(correlation_values)]

            # np.max raises on an empty array, so fall back to NaN when nothing is left.
            has_values = correlation_values.size > 0
            correlation_values_over_7 = correlation_values[correlation_values > 0.7]

            # NOTE(review): 'Max Correlation' uses absolute values while the counts and
            # the 'over 7' mean only consider positive correlations - confirm intended.
            step3_records.append({
                'Parameter_config': "RsquaredCut=" + str(RsquaredCut) + " MeanCut=" + str(MeanCut) + " block=" + str(block_size_scalefit),
                'Num Clusters': correlations.shape[0],
                'Max Correlation': np.max(np.abs(correlation_values)) if has_values else np.nan,
                'Mean Correlation': np.mean(correlation_values) if has_values else np.nan,
                'Num Correlations over 7': np.sum(correlation_values > 0.7),
                'Mean Correlation over 7': np.mean(correlation_values_over_7) if correlation_values_over_7.size > 0 else np.nan,
                'Num correlations over 8': np.sum(correlation_values > 0.8)
            })

# Append the new rows to the existing results table. Rows written by an earlier
# execution of this cell (their id starts with "RsquaredCut=") are replaced, so
# re-running the cell does not duplicate them; all other rows are kept.
step3_results = pd.DataFrame(step3_records, columns=optimization_results.columns)
is_step3_row = np.array([str(config_id).startswith("RsquaredCut=")
                         for config_id in optimization_results['Parameter_config']], dtype=bool)
optimization_results = pd.concat([optimization_results.loc[~is_step3_row], step3_results], ignore_index=True)


In [None]:
print(optimization_results)


# Three stacked panels sharing the x axis (one tick per parameter configuration)
fig, (ax1, ax2, ax3) = plt.subplots(3, 1, sharex=True, figsize=(15, 10))

config_labels = optimization_results['Parameter_config']

# (axes, y-axis label, result columns drawn on that axes); each line is labelled
# with its column name
panel_layout = [
    (ax1, 'Counts', ['Num Clusters', 'Num Correlations over 7', 'Num correlations over 8']),
    (ax2, 'Correlation Values', ['Max Correlation', 'Mean Correlation over 7']),
    (ax3, 'Correlation Values', ['Mean Correlation']),
]

for panel_axis, panel_ylabel, panel_columns in panel_layout:
    for column_name in panel_columns:
        panel_axis.plot(config_labels, optimization_results[column_name], label=column_name, marker='o')
    panel_axis.set_ylabel(panel_ylabel)
    panel_axis.legend()
    panel_axis.grid(True)

# Fixed y-axis range for the second panel
ax2.set_ylim(0, 1)

# Only the bottom panel carries the x-axis label; ticks are rotated for readability
ax3.set_xlabel('Parameter_config')
plt.xticks(rotation=90)

# Improve layout to prevent overlap, then display
plt.tight_layout()
plt.show()

Optimization of Step 6
Tree cutting algorithm 

In [None]:
# Column layout of the results table for this step
optimization_result_two_columns = ['Parameter_config', 'Num Clusters', 'Max Correlation', 'Mean Correlation',
                                   'Num Correlations over 5', 'Num Correlations over 6', 'Num correlations over 7']


# Step 3 parameters used for this sweep
RsquaredCut = 0.9
MeanCut = 100
block_size_scalefit = 5


# Run the first part of the pipeline once; its outputs are reused by every
# tree-cutting configuration below.
# NOTE: expression_th is not set in this cell - it is whatever an earlier cell left behind.
linkage_matrix, transcriptomics_dataset_filtered, trait_dataset_filtered = wgcnax.run_partialA_WGCNA(
    transcriptomics_dataset, expression_th, trait_dataset, want_plots, figures_dir,
    RsquaredCut, MeanCut, block_size_scalefit, adjacency_type, TOMDenom)

# One record per run; the DataFrame is built once after the loops instead of
# growing it with pd.concat on every iteration.
step6_records = []

# Optimization of the tree cutting algorithm: only the second part of the pipeline
# is re-run for each parameter combination.
# NOTE: min_memb_cluster and height_percentile keep their last swept values afterwards.
for min_memb_cluster in min_memb_cluster_vect:
    for height_percentile in height_percentile_vect:

        module_assignment, correlations, p_values = wgcnax.run_partialB_WGCNA(linkage_matrix, transcriptomics_dataset_filtered,
                                                                              trait_dataset_filtered, want_plots, figures_dir,
                                                                              height_percentile, min_memb_cluster)

        ## Storing results for optimization analysis
        # Flatten the matrix to a 1D array and drop NaN values, if any.
        correlation_values = correlations.values.flatten()
        correlation_values = correlation_values[~np.isnan(correlation_values)]

        # np.max raises on an empty array, so fall back to NaN when nothing is left.
        has_values = correlation_values.size > 0
        absolute_correlations = np.abs(correlation_values)

        step6_records.append({
            'Parameter_config': "min_memb_cluster=" + str(min_memb_cluster) + " height_percentile=" + str(height_percentile),
            'Num Clusters': correlations.shape[0],
            'Max Correlation': np.max(absolute_correlations) if has_values else np.nan,
            'Mean Correlation': np.mean(correlation_values) if has_values else np.nan,
            'Num Correlations over 5': np.sum(absolute_correlations > 0.5),
            'Num Correlations over 6': np.sum(absolute_correlations > 0.6),
            'Num correlations over 7': np.sum(absolute_correlations > 0.7)
        })

optimization_results_two = pd.DataFrame(step6_records, columns=optimization_result_two_columns)


In [None]:
print(optimization_results_two)


# Three stacked panels sharing the x axis (one tick per parameter configuration)
fig, (ax1, ax2, ax3) = plt.subplots(3, 1, sharex=True, figsize=(15, 10))

config_labels = optimization_results_two['Parameter_config']

# (axes, y-axis label, [(result column, legend label), ...])
panel_layout = [
    (ax1, 'Counts', [('Num Clusters', 'Num Clusters'),
                     ('Num Correlations over 5', 'Num Correlations over 5'),
                     ('Num Correlations over 6', 'Num correlations over 6')]),
    (ax2, 'Correlation Values', [('Max Correlation', 'Max Correlation')]),
    (ax3, 'Correlation Values', [('Mean Correlation', 'Mean Correlation')]),
]

for panel_axis, panel_ylabel, panel_series in panel_layout:
    for column_name, legend_label in panel_series:
        panel_axis.plot(config_labels, optimization_results_two[column_name], label=legend_label, marker='o')
    panel_axis.set_ylabel(panel_ylabel)
    panel_axis.legend()
    panel_axis.grid(True)

# Fixed y-axis range for the second panel
ax2.set_ylim(-1, 1)

# Only the bottom panel carries the x-axis label; ticks are rotated for readability
ax3.set_xlabel('Parameter_config')
plt.xticks(rotation=90)

# Improve layout to prevent overlap, then display
plt.tight_layout()
plt.show()

.

.

.

.

.

.

.

.

.

.

FULL EXECUTION DISPLAYING ALL STEPS AND ALL PLOTS

SINGLE CONFIGURATION OF PARAMETERS

In [None]:
# Output folder for the figures of the single-configuration run
figures_dir = working_dir + 'results/FullOptimization/BestConfig[1.5_0.9_100_5_15_80]/'

# Create the folder (including missing parents) the first time it is needed
output_dir_missing = not os.path.exists(figures_dir)
if output_dir_missing:
    os.makedirs(figures_dir)
    print(f"{BOLD}{OKBLUE}Creating directory to save results and figures...{ENDC}")


In [None]:
### PARAMETERS

# Plotting switch handed to the wgcnax calls
want_plots = True

# Step 1: preprocessing expression threshold
expression_th = 1.5

# Step 3
RsquaredCut, MeanCut, block_size_scalefit = 0.9, 100, 5
adjacency_type = "unsigned"

# Step 4
TOMDenom = "mean"

# Step 6
min_memb_cluster, height_percentile = 15, 80
# TODO: using height_percentile = 90 seems to break the survival plot for an
# unknown reason - investigate.

In [None]:
# Full WGCNA run for a single parameter configuration, executed step by step so
# that every intermediate result stays available to the cells below
# (module_assignment, expression_profiles, correlations and p_values are used later).

### Step 1: Data Preprocessing (Normalization)
print(f"{BOLD}{OKBLUE}Step 1{ENDC}")
# Returns filtered versions of both the expression table and the trait table.
transcriptomics_dataset_filtered, trait_dataset_filtered = wgcnax.preprocess_TPM_outlier_deletion(transcriptomics_dataset, expression_th, trait_dataset)


### Step 2: Constructing a Co-expression Similarity Matrix (Correlation Matrix)
print(f"{BOLD}{OKBLUE}\n\nStep 2{ENDC}")
correlation_matrix_np = wgcnax.correlation_matrix(transcriptomics_dataset_filtered, want_plots, figures_dir)
# NOTE(review): the numeric arguments are presumably the expected diagonal / min / max
# values of the matrix - confirm against matrix_np_check.
wgcnax.matrix_np_check(correlation_matrix_np, 1, -1, 1)


### Step 3: Transforming into an adjacency matrix using a soft threshold power
print(f"{BOLD}{OKBLUE}\n\nStep 3{ENDC}")
# NOTE(review): a literal True is passed here where the other calls pass want_plots -
# presumably the plotting flag is forced on for this step; confirm.
optimal_power = wgcnax.pickSoftThreshold(correlation_matrix_np, transcriptomics_dataset_filtered, RsquaredCut, MeanCut, True, figures_dir, block_size_scalefit)

adjacency_matrix_np = wgcnax.adjacencyM_from_correlationM(correlation_matrix_np, optimal_power, adjacency_type, want_plots, figures_dir)
wgcnax.matrix_np_check(adjacency_matrix_np, 1, 0, 1)


### Step 4: Converting adjacency matrix into a topological overlap matrix (TOM)
print(f"{BOLD}{OKBLUE}\n\nStep 4{ENDC}")
simTOM_np = wgcnax.calculate_tom(adjacency_matrix_np, TOMDenom, adjacency_type, want_plots, figures_dir)
# Dissimilarity is the complement of the TOM similarity; it is the input of the clustering.
dissTOM_np = 1 - simTOM_np
wgcnax.matrix_np_check(simTOM_np, 1, 0, 1)


### Step 5: Hierarchical clustering
print(f"{BOLD}{OKBLUE}\n\nStep 5{ENDC}")
# NOTE(review): a literal False is passed here where the other calls pass want_plots -
# presumably the plotting flag is forced off for this step; confirm.
linkage_matrix = wgcnax.hierarchical_clustering(dissTOM_np, False, figures_dir)


### Step 6: Module identification
print(f"{BOLD}{OKBLUE}\n\nStep 6{ENDC}")
# cut_height is returned but not used in the cells visible here.
module_assignment, cut_height = wgcnax.identify_modules_simple_version(linkage_matrix, height_percentile, min_memb_cluster)
# Attach the gene names (the column labels of the filtered expression table) as the first column.
module_assignment.insert(0, 'Gene Name', list(transcriptomics_dataset_filtered))


### Step 7: Calculate EigenGenes for all identified Modules
print(f"{BOLD}{OKBLUE}\n\nStep 7{ENDC}")
expression_profiles = wgcnax.expression_profile_for_cluster(module_assignment, transcriptomics_dataset_filtered)

eigen_genes = wgcnax.calculate_eigen_genes(expression_profiles, want_plots, figures_dir)


# Step 8.1 - Using new encoding for special variables
print(f"{BOLD}{OKBLUE}\n\nStep 8{ENDC}")
# Every trait column except the first is correlated against the eigengenes.
# NOTE(review): the first column is presumably an identifier rather than a trait - confirm.
trait_columns = list(trait_dataset_filtered.columns[1:] )
correlations, p_values = wgcnax.eigen_trait_correlations_DC(eigen_genes, trait_dataset_filtered, trait_columns)


print(f"{BOLD}{OKBLUE}Done\n\n{ENDC}")

In [None]:
### Visualizations
# Plot the module distribution from the module assignment table
wgcnax.plot_module_distribution(module_assignment)


## Heatmap of the module eigengene to clinical trait correlations
print(f"{BOLD}{OKBLUE}Plotting and Saving the Module EigenGene to Clinical Trait Correlation...{ENDC}")
title_figure = 'Module Eigengene to Clinical Trait Correlation'

# Cell text: correlation rounded to 3 decimals, with the p-value rounded to
# 5 decimals in brackets on the line below
correlation_text = correlations.round(3).astype(str)
p_value_text = p_values.round(5).astype(str)
annotations = correlation_text + '\n(' + p_value_text + ')'

# Draw on an explicit figure/axes pair; the colour scale is fixed to [-1, 1] and centred on 0
fig, ax = plt.subplots(figsize=(40, 40))
sns.heatmap(correlations, annot=annotations.values, fmt='', cmap='coolwarm', center=0, vmin=-1, vmax=1, ax=ax)
ax.set_title(title_figure, fontsize=20)
ax.set_xlabel('Selected Clincal Traits', fontsize=10)
ax.set_ylabel('Identified Modules, represented by their EigenGene', fontsize=10)

# The figure title doubles as the output file name inside figures_dir
fig.savefig(figures_dir + title_figure, dpi=150)
plt.show()
print(f"{BOLD}{OKBLUE}Done{ENDC}")

In [None]:
threshold_of_interest = 0.5


# Report every (trait, module) pair whose correlation is above the threshold
# (only values greater than the threshold are selected, not their absolute value)
# and remember the modules involved.
significant_modules = []
for trait in correlations.columns:
    exceeds_threshold = correlations[trait] > threshold_of_interest
    for module in exceeds_threshold[exceeds_threshold].index.tolist():
        significant_modules.append(module)
        print(f'Trait: {trait}  correlates with Module: {module}, showing a correlation of {correlations.at[module, trait]:.3f}')

# Report the size of each selected module once, even if it matched several traits
print('\n')
for module in set(significant_modules):
    genes_in_module = expression_profiles.loc[expression_profiles['Module'] == module, 'Gene Name'].tolist()
    print(f'The module with id:{module} clusters {len(genes_in_module)} genes.')


### Step 9: Survival plot

# Pass the correlations and the threshold so the highly correlated modules are used
wgcnax.survival_probability(correlations, threshold_of_interest, expression_profiles, survival_dataset, figures_dir)