Notebook that runs the WGCNA pipeline by calling the functions defined in the accompanying Python file (`WGCNA_functions`). To be implemented once the method is functioning correctly.

In [None]:
### GENERAL CONFIGURATION FOR THE ANALYSIS:
# Imports
import os
import pandas as pd
import numpy as np
import sys
import matplotlib.pyplot as plt
import seaborn as sns
import networkx as nx
from scipy.stats import spearmanr

# Make the project's `src/` folder importable.
# The notebook normally runs from `<project>/notebooks`, so step up one level in that case.
# (str.strip('notebooks') was not used here: it removes any of those *characters* from both
# ends of the path instead of the folder name, e.g. '/home/me/project' -> '/home/me/projec'.)
_project_root = os.getcwd()
if os.path.basename(_project_root) == 'notebooks':
    _project_root = os.path.dirname(_project_root)
sys.path.append(os.path.join(_project_root, 'src', ''))
import WGCNA_functions as wgcnax
import importlib
importlib.reload(wgcnax)  # pick up edits made to WGCNA_functions without restarting the kernel


 # Save work session for easy execution on plots
import dill



# ANSI escape sequences used to style the text printed by the notebook
ENDC, BOLD, UNDERLINE = "\033[0m", "\033[1m", "\033[4m"  # reset / bold / underline

# Foreground colours (bright blue, green, yellow and red)
OKBLUE, OKGREEN, WARNING, FAIL = "\033[94m", "\033[92m", "\033[93m", "\033[91m"

# Limits applied when pandas renders a DataFrame
pd.options.display.max_columns = 10
pd.options.display.max_rows = 20
pd.options.display.width = 1000

In [None]:
# SETTINGS FOR PLOTTING FIGURES
# `want_plots` is handed, together with `figures_dir`, to most `wgcnax` pipeline steps in the cells below.
# NOTE(review): presumably those functions only draw/save their figures when it is True -- confirm in WGCNA_functions.
want_plots = False

In [None]:
### LOADING REAL UNPUBLISHED DATA    -     NO PUSHING FOR THE RESULTS

# Move out of the notebook folder to access datasets.
# str.strip('notebooks') is avoided on purpose: it strips any of those *characters* from both
# ends of the path (not the folder name), which corrupts the path whenever the notebook is
# started from a directory other than `notebooks` (e.g. '.../project' -> '.../projec').
working_dir = os.getcwd()
if os.path.basename(working_dir) == 'notebooks':
    working_dir = os.path.dirname(working_dir)
working_dir = os.path.join(working_dir, '')  # trailing separator: paths below are built by concatenation
data_dir = working_dir + 'data/PROTECTED_DATA/BGI_Expression_Data/'



## Load the dataset
# Transcriptomics Data
transcriptomics_All_dir = data_dir + 'CRC.SW.mRNA.symbol.TPM.csv'
transcriptomics_TumorOnly_dir = data_dir + 'CRC.SW.mRNA.symbol.TPM_TumorOnly.csv'
transcriptomics_NormalOnly_dir = data_dir + 'CRC.SW.mRNA.symbol.TPM_NormalOnly.csv'

# The first CSV column is used as the row index of every table
transcriptomics_All = pd.read_csv(transcriptomics_All_dir, index_col=0)
transcriptomics_TumorOnly = pd.read_csv(transcriptomics_TumorOnly_dir, index_col=0)
transcriptomics_NormalOnly = pd.read_csv(transcriptomics_NormalOnly_dir, index_col=0)



# Clinical Traits Data
CT_for_All_dir = data_dir + 'ClinicalTraits_for_AllSamples.csv'
CT_for_TumorSamples_dir = data_dir + 'ClinicalTraits_for_TumorSamples.csv'
CT_for_NormalSamples_dir = data_dir + 'ClinicalTraits_for_NormalSamples.csv'

CT_for_All = pd.read_csv(CT_for_All_dir, index_col=0)
CT_for_TumorSamples = pd.read_csv(CT_for_TumorSamples_dir, index_col=0)
CT_for_NormalSamples = pd.read_csv(CT_for_NormalSamples_dir, index_col=0)



# Survival Data
SurvivalData_for_TumorSamples_dir = data_dir + 'Survival_Data_for_TumorSamples.csv' # Using only tumor samples
SurvivalData_for_TumorSamples = pd.read_csv(SurvivalData_for_TumorSamples_dir, index_col=0)



# Figures Saving output dir
figures_dir = working_dir + 'results/NormalandTumor_usingOnlyTumor_bestConfig/'

# Check if the directory exists, and if not, create it
if not os.path.exists(figures_dir):
    os.makedirs(figures_dir, exist_ok=True)
    print(f"{BOLD}{OKBLUE}Creating directory to save results and figures...{ENDC}")



## WE WANT TO USE ALL DATA FOR CLUSTERING, AND THEN SEPARATE TYPES OF SAMPLES IN THE EIGENGENE STEP
transcriptomics_dataset = transcriptomics_All



# Optional (disabled): work on a subset of columns to save RAM.
# Kept as comments rather than a bare string literal so the cell does not echo it as output.
#
# subset_dataset_size = 200
# transcriptomics_dataset = transcriptomics_dataset.iloc[:, :subset_dataset_size]
#
# # RAM usage estimation in GB (one float64 square matrix of that size)
# RAM_estimate = (subset_dataset_size * subset_dataset_size * 8) / (1024**3)
# print(f"The aproximated RAM to analyse this size of dataset is: {RAM_estimate} GB")

In [None]:
### Step 1: Data Preprocessing (Normalization)

# Expression threshold passed to the TPM-based preprocessing functions
expression_th = 1.5

# Visualize original and preprocessed data with PCA on a 2x2 subplot grid
fig, axs = plt.subplots(2, 2, figsize=(14, 12))

wgcnax.plot_pca(transcriptomics_dataset, title='PCA of Original Data', ax=axs[0, 0])
wgcnax.plot_pca(wgcnax.simple_preprocess(transcriptomics_dataset), title='PCA of Preprocessed with simple_preprocess', ax=axs[0, 1])
wgcnax.plot_pca(wgcnax.preprocess_TPM(transcriptomics_dataset, expression_th), title='PCA of Preprocessed with preprocess_TPM', ax=axs[1, 0])


# Basic Preprocessing, Expected to perform the worst
# transcriptomics_dataset_filtered = wgcnax.preprocess_TPM(transcriptomics_dataset, expression_th)


# Zscore, expected to perform a bit better than with Basic TPM, but still not great
#transcriptomics_dataset_filtered = wgcnax.preprocess_TPM_Zscore(transcriptomics_dataset, expression_th)


# PCA Analysis, which should be the best one by a lot.
# Computed once and reused both for the fourth PCA panel and for the rest of the pipeline
# (it was previously run twice on identical inputs).
# NOTE(review): this assumes plot_pca does not modify the frame it receives -- confirm in WGCNA_functions.
transcriptomics_dataset_filtered, CT_for_All_filtered = wgcnax.preprocess_TPM_outlier_deletion(transcriptomics_dataset, expression_th, CT_for_All)

wgcnax.plot_pca(transcriptomics_dataset_filtered, title='PCA of Preprocessed with preprocess_TPM_outlier_deletion', ax=axs[1, 1])
plt.tight_layout()
plt.show()


# Super aggressive Preprocessing from the paper on GCNN - Performs a lot worse
#transcriptomics_dataset_filtered = wgcnax.preprocess_agresive(transcriptomics_dataset)
#wgcnax.plot_pca(transcriptomics_dataset_filtered, title='PCA of Agressive Preproces')




print(f"{BOLD}{OKBLUE}Done...{ENDC}")

In [None]:
### Step 2: Constructing a Co-expression Similarity Matrix (Correlation Matrix)

# Built from the preprocessed expression data of Step 1.
correlation_matrix_np = wgcnax.correlation_matrix(transcriptomics_dataset_filtered, want_plots, figures_dir)

# Sanity check of the resulting matrix.
# NOTE(review): the arguments (1, -1, 1) are presumably the expected maximum, minimum and diagonal value -- confirm in WGCNA_functions.
wgcnax.matrix_np_check(correlation_matrix_np, 1, -1, 1)

In [None]:
### Step 3: Transforming into an adjacency matrix using a soft threshold power

## Parameters for execution.
# All four values are only forwarded to the wgcnax calls below; their exact meaning is defined there.
RsquaredCut = 0.9
MeanCut = 100
block_size_scalefit = 5
adjacency_type = "unsigned"  # also reused in Step 4 (calculate_tom)


# Get the optimal power for the adjacency matrix (fitting to a power-law distribution)
optimal_power = wgcnax.pickSoftThreshold(correlation_matrix_np, transcriptomics_dataset_filtered, RsquaredCut, MeanCut, want_plots, figures_dir, block_size_scalefit)

# Get the adjacency matrix
adjacency_matrix_np = wgcnax.adjacencyM_from_correlationM(correlation_matrix_np, optimal_power, adjacency_type, want_plots, figures_dir)
# NOTE(review): (1, 0, 1) presumably are the expected maximum, minimum and diagonal value -- confirm in WGCNA_functions.
wgcnax.matrix_np_check(adjacency_matrix_np, 1, 0, 1)

In [None]:
### Step 4: Converting adjacency matrix into a topological overlap matrix (TOM)

# TOMDenom must be either 'min' or 'mean'. More explanation in the function itself
TOMDenom = "mean"

# Get the TOM matrix
simTOM_np = wgcnax.calculate_tom(adjacency_matrix_np, TOMDenom, adjacency_type, want_plots, figures_dir)
# Dissimilarity form of the TOM; this is what the hierarchical clustering of Step 5 receives
dissTOM_np = 1 - simTOM_np
# NOTE(review): (1, 0, 1) presumably are the expected maximum, minimum and diagonal value -- confirm in WGCNA_functions.
wgcnax.matrix_np_check(simTOM_np, 1, 0, 1)

In [None]:
### Step 5: Hierarchical clustering
# Clusters on the TOM dissimilarity (1 - simTOM) from Step 4; the result feeds the module identification of Step 6.
linkage_matrix = wgcnax.hierarchical_clustering(dissTOM_np, want_plots, figures_dir)

In [None]:
### Step 6: Module identification

#Parameters
min_memb_cluster = 10    # NOTE(review): an earlier note here says "15 seems to be optimal", yet 10 is used -- confirm which is intended
height_percentile = 80  # forwarded to identify_modules_simple_version together with the linkage matrix

# Returns the per-gene module table and the cut height that was applied
module_assignment, cut_height = wgcnax.identify_modules_simple_version(linkage_matrix, height_percentile, min_memb_cluster)

# Add the Gene name to the clustering table
# (list(DataFrame) yields the column labels; this assumes the rows of module_assignment follow that same column order)
module_assignment.insert(0, 'Gene Name', list(transcriptomics_dataset_filtered))

# Plot visualization of clusters
wgcnax.plot_module_distribution(module_assignment)


In [None]:
# Disabled alternative for Step 6: the whole cell is a single string literal, so none of the
# code inside it runs (the notebook only echoes the string as the cell's output).
"""
### Step 6: Module identification - SECOND METHOD

module_assignment_sec, cut_height = wgcnax.identify_modules_auto_deep_split(linkage_matrix, dissTOM_np, min_memb_cluster)

# Add the Gene name to the clustering table
module_assignment_sec.insert(0, 'Gene Name', list(transcriptomics_dataset_filtered))

# Plot visualization of clusters
wgcnax.plot_module_distribution(module_assignment_sec)
"""

# Separation into 3 segmented analyses

Up to this point, the flat clusters (modules) were built using all samples (Tumor and Normal). From here on, the eigengenes are computed separately for three sample configurations:
  - Eigengenes for Normal samples only
  - Eigengenes for Tumor samples only
  - Eigengenes for all samples

The configurations are kept separate to study how using all samples, or only tumor samples, affects the final results.

In [None]:
### Build 3 datasets containing: only Tumor samples, only Normal samples, and both.
# Rows of the preprocessed data are matched against the row index of each per-tissue table.
_in_tumor_table = transcriptomics_dataset_filtered.index.isin(transcriptomics_TumorOnly.index)
_in_normal_table = transcriptomics_dataset_filtered.index.isin(transcriptomics_NormalOnly.index)

Transc_TumorOnly = transcriptomics_dataset_filtered.loc[_in_tumor_table]
Transc_NormalOnly = transcriptomics_dataset_filtered.loc[_in_normal_table]
Transc_All = transcriptomics_dataset_filtered

In [None]:
# It is interesting to observe that PCA outlier detection eliminates 11 samples, all from the Normal Samples.

# Shapes of the three expression subsets
for _expression_subset in (Transc_TumorOnly, Transc_NormalOnly, Transc_All):
    print(_expression_subset.shape)

print("\n")

# Shapes of the clinical-trait tables as loaded (before the outlier filtering is applied to them)
for _trait_table in (CT_for_TumorSamples, CT_for_NormalSamples, CT_for_All):
    print(_trait_table.shape)

In [None]:
# If the PCA-based outlier deletion was not run in Step 1, use the unfiltered clinical traits instead:
# CT_for_All_filtered = CT_for_All

In [None]:
### Apply the PCA sample-outlier removal to the per-tissue clinical-trait tables as well
# Only rows whose index is still present in the filtered all-samples table are kept.
_kept_samples = CT_for_All_filtered.index

CT_for_TumorSamples_filtered = CT_for_TumorSamples.loc[CT_for_TumorSamples.index.isin(_kept_samples)]
CT_for_NormalSamples_filtered = CT_for_NormalSamples.loc[CT_for_NormalSamples.index.isin(_kept_samples)]


In [None]:
### Step 7: Calculate EigenGenes for all identified Modules, for each configuration of Samples
# The module assignment (built from all samples) is shared; only the expression subset changes.

## Tumor only Eigengenes
TumorOnly_expression_profiles = wgcnax.expression_profile_for_cluster(module_assignment, Transc_TumorOnly)
TumorOnly_eigen_genes = wgcnax.calculate_eigen_genes(TumorOnly_expression_profiles, want_plots, figures_dir)

## Normal only Eigengenes
NormalOnly_expression_profiles = wgcnax.expression_profile_for_cluster(module_assignment, Transc_NormalOnly)
NormalOnly_eigen_genes = wgcnax.calculate_eigen_genes(NormalOnly_expression_profiles, want_plots, figures_dir)

## All Normal and Tumor Samples Eigengenes
All_expression_profiles = wgcnax.expression_profile_for_cluster(module_assignment, Transc_All)
All_eigen_genes = wgcnax.calculate_eigen_genes(All_expression_profiles, want_plots, figures_dir)


# Sanity check on the number of columns of the resulting tables.
# The expected sizes are hard-coded for this specific dataset and preprocessing configuration,
# so the message will also appear whenever the data or the outlier filtering changes.
# NOTE(review): the first two conditions test the eigengene tables while the third tests
# All_expression_profiles (not All_eigen_genes) -- confirm this mix is intentional.
if TumorOnly_eigen_genes.shape[1] != 1064 or NormalOnly_eigen_genes.shape[1] != 121 or All_expression_profiles.shape[1] != 1185:
    print(f"Something is wrong, the dimensions are not matching the expected size.")

In [None]:
### Step 8: Module-Trait Relationship
# Three heatmaps are drawn so the effect in Tumor vs. Normal samples can be told apart.

p_value_th = 0.05


# --- Tumor samples only ---
trait_columns = list(CT_for_TumorSamples.columns[1:])
correlations_T, p_values_T = wgcnax.eigen_trait_correlations_DC(
    TumorOnly_eigen_genes,
    CT_for_TumorSamples_filtered,
    trait_columns,
)
# Heatmap of the module-trait relationship
wgcnax.correlation_heatmap(
    correlations_T,
    p_values_T,
    figures_dir,
    title='Tumor Samples Only',
    p_value_th=p_value_th,
)


# --- Normal samples only ---
trait_columns = list(CT_for_NormalSamples.columns[1:])
correlations_N, p_values_N = wgcnax.eigen_trait_correlations_DC(
    NormalOnly_eigen_genes,
    CT_for_NormalSamples_filtered,
    trait_columns,
)
# Heatmap of the module-trait relationship
wgcnax.correlation_heatmap(
    correlations_N,
    p_values_N,
    figures_dir,
    title='Normal Samples Only',
    p_value_th=p_value_th,
)


# --- All samples ---
trait_columns = list(CT_for_All.columns[1:])
correlations_TN, p_values_TN = wgcnax.eigen_trait_correlations_DC(
    All_eigen_genes,
    CT_for_All_filtered,
    trait_columns,
)
# Heatmap of the module-trait relationship
wgcnax.correlation_heatmap(
    correlations_TN,
    p_values_TN,
    figures_dir,
    title='All Samples',
    p_value_th=p_value_th,
)

In [None]:
### More Special Heatmaps -- Not relevant


#### All SAMPLES correlated with Tumor or Normal Tissue

## Create Dataframe with all samples, annotated as Tumor or Normal Sample
CT_for_All_filtered.reset_index(inplace=True)
All_Samples_Labeled = CT_for_All_filtered[['Sample_ID']].copy()
All_Samples_Labeled['SampleType'] = 0

# If it contains 'T', mark 'Tumor' as 1, if it contains 'N', mark as 0
All_Samples_Labeled['SampleType'] = All_Samples_Labeled['Sample_ID'].apply(
    lambda sample_id: 'Tumor' if 'T' in sample_id else ('Normal' if 'N' in sample_id else None))

# Plot Heatmap
trait_columns = list(All_Samples_Labeled.columns[1:] )
correlations_TNL, p_values_TNL = wgcnax.eigen_trait_correlations_DC(All_eigen_genes, All_Samples_Labeled, trait_columns)
## Plot the Heatmap for the module-trait relationship
wgcnax.correlation_heatmap(correlations_TNL, p_values_TNL, figures_dir, title = 'All Samples, Labeled', p_value_th=p_value_th)





#### Tumor only HeatMap, splitted by Male or Female

# Initialize dictionaries to store columns for each gender, and avoid high fragmentation
columns_male = {'Module': TumorOnly_eigen_genes['Module']}
columns_female = {'Module': TumorOnly_eigen_genes['Module']}

for sample_id in TumorOnly_eigen_genes.columns.drop('Module'):
    if sample_id in CT_for_TumorSamples_filtered.index:
        gender = CT_for_TumorSamples_filtered.loc[sample_id, 'Gender']

        # Based on the gender, add the column to the corresponding dictionary
        if gender == 'Male':
            columns_male[sample_id] = TumorOnly_eigen_genes[sample_id]
        elif gender == 'Female':
            columns_female[sample_id] = TumorOnly_eigen_genes[sample_id]

# Concatenate all the collected columns at once for males and females
TumorOnly_eigen_genes_Male = pd.DataFrame(columns_male)
TumorOnly_eigen_genes_Female = pd.DataFrame(columns_female)

# Split the ClinicalTrait dataframe also into Female Male
CT_for_TumorSamples_Female = CT_for_TumorSamples_filtered[CT_for_TumorSamples_filtered['Gender'] == 'Female'].drop('Gender', axis=1) 
CT_for_TumorSamples_Male = CT_for_TumorSamples_filtered[CT_for_TumorSamples_filtered['Gender'] == 'Male'].drop('Gender', axis=1)



# Plot Heatmap for Males
trait_columns = list(CT_for_TumorSamples_Male.columns[1:] )
correlations_TM, p_values_TM = wgcnax.eigen_trait_correlations_DC(TumorOnly_eigen_genes_Male, CT_for_TumorSamples_Male, trait_columns)
## Plot the Heatmap for the module-trait relationship
wgcnax.correlation_heatmap(correlations_TM, p_values_TM, figures_dir, title = 'Tumor and Male Only', p_value_th=p_value_th)


# Plot Heatmap for Females
trait_columns = list(CT_for_TumorSamples_Female.columns[1:] )
correlations_TM, p_values_TM = wgcnax.eigen_trait_correlations_DC(TumorOnly_eigen_genes_Female, CT_for_TumorSamples_Female, trait_columns)
## Plot the Heatmap for the module-trait relationship
wgcnax.correlation_heatmap(correlations_TM, p_values_TM, figures_dir, title = 'Tumor and Female Only', p_value_th=p_value_th)



After checking all of the heatmaps above, it seems more relevant to keep the sample types separated (normal and tumor analysed independently) and to keep both genders together. From here on, we study the effect of this configuration on the remaining steps.

### Improve the visualization and select results from the heatmap of interest

In this case, only the Tumor samples are used.

In [None]:
# Choose which module-trait results feed the rest of the notebook (here: tumor samples only)
correlations, p_values = correlations_T, p_values_T
expression_profiles, eigengenes = TumorOnly_expression_profiles, TumorOnly_eigen_genes


# A module is dropped when the absolute correlation stays below this value for every trait
threshold_of_interest = 0.5

# Flag the non-relevant modules and remove them from both tables
not_significant_modules = correlations.abs().lt(threshold_of_interest).all(axis=1)
modules_to_drop = correlations.index[not_significant_modules]
print(f"The following Modules are droped: {list(modules_to_drop)}")
correlations_filtered, p_values_filtered = (
    table.drop(index=modules_to_drop) for table in (correlations, p_values)
)


In [None]:
#### Filtered Heatmap, displaying only relevant clusters

# Build the title from the threshold that was actually used for the filtering, so the figure
# cannot disagree with `threshold_of_interest` (the title used to be hard-coded to 0,4).
# The decimal point is written as a comma, as in the original title.
# NOTE(review): presumably because the title is reused as the output file name -- confirm in correlation_heatmap.
threshold_label = str(threshold_of_interest).replace('.', ',')
wgcnax.correlation_heatmap(correlations_filtered, p_values_filtered, figures_dir,  title = f'Selected Modules with cor>{threshold_label}', p_value_th=p_value_th)

In [None]:
# Print some data about the selected modules

# Every (trait, module) pair whose absolute correlation exceeds the threshold is reported;
# a module is appended once per trait it correlates with.
significant_modules = []
for trait in correlations.columns:
    above_threshold = correlations[trait].abs() > threshold_of_interest
    for module in above_threshold.index[above_threshold].tolist():
        significant_modules.append(module)
        print(f'Trait: {trait}  correlates with Module: {module}, showing a correlation of {correlations.at[module, trait]:.3f}')

print('\n')
# Size (number of genes) of each distinct selected module
for module in set(significant_modules):
    belongs_to_module = expression_profiles['Module'] == module
    module_Genes = expression_profiles.loc[belongs_to_module, 'Gene Name'].tolist()
    print(f'The module with id:{module} clusters {len(module_Genes)} genes.')


In [None]:
### Step 9: Survival plot

# Uses the currently selected results (`correlations`, `expression_profiles`: tumor samples only)
# together with the tumor-only survival table; `threshold_of_interest` is forwarded as well.
# NOTE(review): presumably it is applied inside the function to pick the highly correlated modules -- confirm.
wgcnax.survival_probability(correlations, threshold_of_interest, expression_profiles, SurvivalData_for_TumorSamples, figures_dir)

In [None]:
## Plot a Network Representation of the Relevant Clusters FOR TUMOR SAMPLES

# Color mapping for different intervals of correlation strengths.
# Keys have the form '<lower> - <upper>'; get_edge_color parses them by splitting on ' - '.
# NOTE(review): there is no entry covering 0.5 - 0.6, so edges in that range get the
# fallback colour 'grey', which does not appear in the figure legend -- confirm this is intended.
interval_colors = {
    '0.9 - 1.0': 'red',
    '0.8 - 0.9': 'orange',
    '0.7 - 0.8': 'yellow',
    '0.6 - 0.7': 'blue',
    
    '0.0 - 0.5': 'green',
    }

# Map a correlation weight to the colour of the interval it falls into
def get_edge_color(weight, interval_colors):
    """Return the colour associated with the absolute value of ``weight``.

    Parameters
    ----------
    weight : number
        Correlation value; only its magnitude is considered.
    interval_colors : dict
        Maps labels of the form ``'<lower> - <upper>'`` to colour names.
        Both bounds are inclusive and the first matching entry wins.

    Returns
    -------
    str
        The colour of the first interval containing ``abs(weight)``, or
        ``'grey'`` when no interval matches.
    """
    def _covers(label):
        low, high = (float(bound) for bound in label.split(' - '))
        return low <= abs(weight) <= high

    matching_colors = (color for label, color in interval_colors.items() if _covers(label))
    return next(matching_colors, 'grey')


# Distinct selected modules, in order of first appearance. Built once so the grid size and the
# subplot loop use the same, reproducible ordering.
relevant_modules = list(dict.fromkeys(significant_modules))
num_modules = len(relevant_modules)
# Smallest square grid that fits all modules; at least 1 so the figure never has zero size
grid_size = max(1, int(np.ceil(np.sqrt(num_modules))))
plt.figure(figsize=(grid_size * 5, grid_size * 5))
for i, cluster in enumerate(relevant_modules, start = 1):
    # Build Expression Profile for the Module: one row per gene, indexed by gene name
    module_profile = (
        expression_profiles[expression_profiles['Module'] == cluster]
        .set_index('Gene Name')
        .drop('Module', axis=1)
    )

    # Add the module eigengene as an extra row labelled 'EigenGene'
    module_eigengene = (
        eigengenes[eigengenes['Module'] == cluster]
        .set_index('Module')
        .rename(index={cluster: 'EigenGene'})
    )
    module_profile = pd.concat([module_profile, module_eigengene])

    # Convert to numpy and calculate Spearman correlation between rows -> Membership function in a way
    module_profile_matrix = module_profile.to_numpy()
    corr, _ = spearmanr(module_profile_matrix, axis=1)
    corr_matrix = pd.DataFrame(corr, index=module_profile.index, columns=module_profile.index)

    # Network creation and node addition
    node_names = list(module_profile.index)
    G = nx.Graph()
    G.add_nodes_from(node_names)

    # Edge addition based on correlation. The graph is undirected, so each pair is visited once.
    for position, gene1 in enumerate(node_names):
        for gene2 in node_names[position + 1:]:
            weight = corr_matrix.loc[gene1, gene2]
            color = get_edge_color(weight, interval_colors)
            G.add_edge(gene1, gene2, weight=weight, color=color)

    # Plotting the network in a subplot
    plt.subplot(grid_size, grid_size, i)
    # Fixed seed so the layout is identical between runs
    pos = nx.spring_layout(G, k=0.5, iterations=20, seed=42)
    edges = G.edges(data=True)
    nx.draw(G, pos, with_labels=True, node_color=['red' if name=='EigenGene' else 'skyblue' for name in G.nodes],
            node_size=500,
            edge_color=[data['color'] for _, _, data in edges], linewidths=1, font_size=10,
            # Width follows the correlation strength; abs() keeps it positive for negative
            # correlations, consistent with the colour, which is also based on abs(weight)
            width=[abs(data['weight'])*2 for _, _, data in edges])

    plt.title(f'Module {cluster}')
    plt.axis('off')



# Add a legend for the whole figure
title_figure = 'Network Representation of all Relevant Modules'
legend_labels = {label: plt.Line2D([0], [0], color=color, lw=4) for label, color in interval_colors.items()}
plt.figlegend(legend_labels.values(), legend_labels.keys(), loc='upper right')
plt.suptitle(title_figure, fontsize=20)
# Leave room at the top for the figure title
plt.tight_layout(rect=[0, 0.03, 1, 0.95])

plt.savefig(figures_dir + title_figure, dpi=200)
plt.show()