Notebook that runs the WGCNA pipeline by calling the functions defined in the project's Python module (`src/WGCNA_functions.py`). To be implemented once the method is functioning correctly.

In [None]:
### GENERAL CONFIGURATION FOR THE ANALYSIS:
# Imports
import os
import pandas as pd
import numpy as np
import sys
import matplotlib.pyplot as plt
import seaborn as sns


def project_root_dir(cwd=None):
    """Return the project root for a working directory, with a trailing separator.

    If ``cwd`` is a folder named ``notebooks`` its parent directory is returned;
    otherwise ``cwd`` itself is returned.

    ``str.strip('notebooks')`` must not be used for this: it removes any of the
    characters n, o, t, e, b, k, s from BOTH ends of the string instead of the
    literal suffix, so e.g. '/home/user/projects' would become '/home/user/projec'.

    Args:
        cwd: Directory to resolve. Defaults to the current working directory.

    Returns:
        The project root as a string ending with the OS path separator, so that
        relative sub-paths can be appended by plain string concatenation.
    """
    cwd = os.path.normpath(os.getcwd() if cwd is None else cwd)
    parent_dir, leaf = os.path.split(cwd)
    root = parent_dir if leaf == 'notebooks' else cwd
    # Joining with '' appends exactly one trailing separator
    return os.path.join(root, '')


# Make the project's src/ folder importable from the notebook
sys.path.append(project_root_dir() + 'src/')
import WGCNA_functions as wgcnax



# ANSI escape sequences used to style the terminal / cell outputs
ENDC, BOLD, UNDERLINE = "\033[0m", "\033[1m", "\033[4m"
OKBLUE, OKGREEN, WARNING, FAIL = "\033[94m", "\033[92m", "\033[93m", "\033[91m"

# How pandas renders dataframes when they are printed
for _option_name, _option_value in (
    ('display.max_columns', 10),
    ('display.max_rows', 20),
    ('display.width', 1000),
):
    pd.set_option(_option_name, _option_value)

# Flag handed to the wgcnax functions that accept a plotting switch
want_plots = True


In [None]:
### LOADING REAL UNPUBLISHED DATA    -     NO PUSHING FOR THE RESULTS

# Move out of the notebook folder to access datasets.
# NOTE: str.strip('notebooks') is NOT used here because it strips any of the
# characters n, o, t, e, b, k, s from both ends of the path rather than the
# literal 'notebooks' suffix, which corrupts paths such as '.../projects'.
_cwd = os.path.normpath(os.getcwd())
_parent_dir, _leaf = os.path.split(_cwd)
# Project root with a trailing separator, so sub-paths can be concatenated below
working_dir = os.path.join(_parent_dir if _leaf == 'notebooks' else _cwd, '')
data_dir = working_dir + 'data/PROTECTED_DATA/BGI_Expression_Data/'


## Load the dataset
# Transcriptomics Data (first CSV column is used as the index)
transcriptomics_TPM_dataset_dir = data_dir + 'CRC.SW.mRNA.symbol.TPM_NOnormal.csv'
transcriptomics_dataset = pd.read_csv(transcriptomics_TPM_dataset_dir, index_col=0)

# Sample info and Clinical Traits Data
sample_info_traits_dir = data_dir + 'All_Traits_Without_Normal.csv' # Using only tumor samples
trait_dataset = pd.read_csv(sample_info_traits_dir, index_col=0)

# Survival data; note that this reuses (overwrites) sample_info_traits_dir
sample_info_traits_dir = data_dir + 'Survival_Without_Normal.csv' # Using only tumor samples
survival_dataset = pd.read_csv(sample_info_traits_dir, index_col=0)


# Figures Saving output dir
figures_dir = working_dir + 'results/testing/'

# Check if the directory exists, and if not, create it
if not os.path.exists(figures_dir):
    os.makedirs(figures_dir)
    print(f"{BOLD}{OKBLUE}Creating directory to save results and figures...{ENDC}")


## Make a subset to save RAM: keep only the first `subset_dataset_size` columns
subset_dataset_size = 200
transcriptomics_dataset = transcriptomics_dataset.iloc[:, :subset_dataset_size]

# RAM usage estimation in GB, based on the number of columns actually kept
# (the slice above yields fewer than `subset_dataset_size` columns when the
# dataset is smaller than the requested subset).
# NOTE(review): the n * n * 8 formula presumably models one dense n x n matrix
# of 8-byte floats - confirm against the matrices built by wgcnax.
n_columns_kept = transcriptomics_dataset.shape[1]
RAM_estimate = (n_columns_kept * n_columns_kept * 8) / (1024**3)
print(f"The aproximated RAM to analyse this size of dataset is: {RAM_estimate} GB")


In [None]:
### Step 1: Data Preprocessing (Normalization)

# Threshold value handed to the TPM-based preprocessing functions
expression_th = 1

# Visualize original and preprocessed data with PCA on a 2x2 subplot grid
fig, axs = plt.subplots(2, 2, figsize=(14, 12))

wgcnax.plot_pca(transcriptomics_dataset, title='PCA of Original Data', ax=axs[0, 0])
wgcnax.plot_pca(
    wgcnax.simple_preprocess(transcriptomics_dataset),
    title='PCA of Preprocessed with simple_preprocess',
    ax=axs[0, 1],
)
wgcnax.plot_pca(
    wgcnax.preprocess_TPM(transcriptomics_dataset, expression_th),
    title='PCA of Preprocessed with preprocess_TPM',
    ax=axs[1, 0],
)

# Preprocessing actually used by the rest of the pipeline. It is computed ONCE
# and the same result is both plotted and kept, instead of running the identical
# call a second time into throwaway variables.
# NOTE(review): assumes plot_pca does not modify the dataframe it receives - confirm.
transcriptomics_dataset_filtered, trait_dataset_filtered = wgcnax.preprocess_TPM_outlier_deletion(
    transcriptomics_dataset, expression_th, trait_dataset
)
wgcnax.plot_pca(
    transcriptomics_dataset_filtered,
    title='PCA of Preprocessed with preprocess_TPM_outlier_deletion',
    ax=axs[1, 1],
)

plt.tight_layout()
plt.show()


print(f"{BOLD}{OKBLUE}Done...{ENDC}")

In [None]:
### Step 2: Constructing a Co-expression Similarity Matrix (Correlation Matrix)

# Correlation matrix computed from the filtered expression data
correlation_matrix_np = wgcnax.correlation_matrix(
    transcriptomics_dataset_filtered,
    want_plots,
    figures_dir,
)

# Check the resulting matrix.
# NOTE(review): the meaning of the positional arguments (1, -1, 1) is defined in
# wgcnax.matrix_np_check - confirm there.
wgcnax.matrix_np_check(correlation_matrix_np, 1, -1, 1)

In [None]:
### Step 3: Transforming into an adjacency matrix using a soft threshold power

## Parameters for execution.
RsquaredCut, MeanCut = 0.8, 100
block_size_scalefit = 10
adjacency_type = "unsigned"


# Pick the soft-threshold power for the adjacency matrix (fit to a power-law distribution)
optimal_power = wgcnax.pickSoftThreshold(
    correlation_matrix_np,
    transcriptomics_dataset_filtered,
    RsquaredCut,
    MeanCut,
    want_plots,
    figures_dir,
    block_size_scalefit,
)

# Build the adjacency matrix from the correlation matrix and the chosen power, then check it
adjacency_matrix_np = wgcnax.adjacencyM_from_correlationM(
    correlation_matrix_np,
    optimal_power,
    adjacency_type,
    want_plots,
    figures_dir,
)
wgcnax.matrix_np_check(adjacency_matrix_np, 1, 0, 1)

In [None]:
### Step 4: Converting adjacency matrix into a topological overlap matrix (TOM)

# Allowed values for TOMDenom are 'min' and 'mean' (details are in the function itself)
TOMDenom = "mean"

# TOM similarity matrix
simTOM_np = wgcnax.calculate_tom(
    adjacency_matrix_np,
    TOMDenom,
    adjacency_type,
    want_plots,
    figures_dir,
)

# TOM dissimilarity is the complement of the similarity
dissTOM_np = 1 - simTOM_np

# Check the similarity matrix
wgcnax.matrix_np_check(simTOM_np, 1, 0, 1)

In [None]:
### Step 5: Hierarchical clustering

# Cluster on the TOM dissimilarity; the returned linkage matrix feeds module identification
linkage_matrix = wgcnax.hierarchical_clustering(
    dissTOM_np,
    want_plots,
    figures_dir,
)

In [None]:
### Step 6: Module identification

# Parameters
min_memb_cluster = 10
height_percentile = 80  # percentile of heights used to set sensitivity

module_assignment, cut_height = wgcnax.identify_modules_simple_version(
    linkage_matrix,
    height_percentile,
    min_memb_cluster,
)

# Prepend a 'Gene Name' column holding the labels of the filtered expression dataset
module_assignment.insert(
    loc=0,
    column='Gene Name',
    value=list(transcriptomics_dataset_filtered),
)

# Visualize how the genes are distributed across the clusters
wgcnax.plot_module_distribution(module_assignment)


In [None]:
### Step 6: Module identification - SECOND METHOD

# The cut height of this method is stored under its own name (`cut_height_sec`),
# mirroring `module_assignment_sec`. Re-binding `cut_height` here would silently
# overwrite the value that belongs to `module_assignment` from the first method,
# which is the assignment the following steps keep using.
module_assignment_sec, cut_height_sec = wgcnax.identify_modules_auto_deep_split(
    linkage_matrix,
    dissTOM_np,
    min_memb_cluster,
)

# Add the Gene name to the clustering table
module_assignment_sec.insert(0, 'Gene Name', list(transcriptomics_dataset_filtered))

# Plot visualization of clusters
wgcnax.plot_module_distribution(module_assignment_sec)


In [None]:
### Step 7: Calculate EigenGenes for all identified Modules

## Combine the module assignment (first method) with the filtered expression data
expression_profiles = wgcnax.expression_profile_for_cluster(
    module_assignment,
    transcriptomics_dataset_filtered,
)

## Compute the eigengenes from those expression profiles
eigen_genes = wgcnax.calculate_eigen_genes(
    expression_profiles,
    want_plots,
    figures_dir,
)

In [None]:
### Step 8: Module-Trait Relationship
# Encode the variables as categorical and compute correlation and p-value
# (uses the new encoding and correlation functions)

print(f"{BOLD}{OKBLUE}\n\nStep 8{ENDC}")

# Every trait column except the first one is analysed.
# NOTE(review): the first column is skipped on purpose here - confirm it is not a trait.
trait_columns = list(trait_dataset_filtered.columns[1:])

correlations, p_values = wgcnax.eigen_trait_correlations_DC(
    eigen_genes,
    trait_dataset_filtered,
    trait_columns,
)

## Heatmap of the module-trait relationship
wgcnax.correlation_pvalue_heatmap(correlations, p_values, figures_dir)

In [None]:
### Step 9: Survival plot

# Correlation threshold used to select the modules that show high correlations
corr_th = 0.4

wgcnax.survival_probability(
    correlations,
    corr_th,
    expression_profiles,
    survival_dataset,
    figures_dir,
)