In [1]:
### GENERAL CONFIGURATION FOR THE ANALYSIS:
# Imports
import os
import pandas as pd
import sys
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from fpdf import FPDF


sys.path.append(os.getcwd().strip('notebooks') + 'src/')
import WGCNA_functions as wgcnax



# Colors for the terminal outputs
ENDC = "\033[0m"
BOLD = "\033[1m"
UNDERLINE = "\033[4m"

OKBLUE = "\033[94m"
OKGREEN = "\033[92m"
WARNING = "\033[93m"
FAIL = "\033[91m"

# Settings for printing dataframes
pd.set_option('display.max_columns', 10)
pd.set_option('display.max_rows', 20)
pd.set_option('display.width', 1000)


### LOADING REAL UNPUBLISHED DATA    -     NO PUSHING FOR THE RESULTS

# Move out of the notebook folder to access datasets
working_dir = os.getcwd()
working_dir = working_dir.strip('notebooks')
data_dir = working_dir + 'data/PROTECTED_DATA/BGI_Expression_Data/'


## Load the dataset
# Transcriptomics Data 
transcriptomics_TPM_dataset_dir = data_dir + 'CRC.SW.mRNA.symbol.TPM_NOnormal.csv'  
transcriptomics_dataset = pd.read_csv(transcriptomics_TPM_dataset_dir, index_col=0)

# Sample info and Clinical Traits Data
sample_info_traits_dir = data_dir + 'All_Traits_Without_Normal.csv' # Using only tumor samples
trait_dataset = pd.read_csv(sample_info_traits_dir, index_col=0)
trait_dataset


# Figures Saving output dir
figures_dir = working_dir + 'results/optimization_partialAB/'

# Check if the directory exists, and if not, create it
if not os.path.exists(figures_dir):
    os.makedirs(figures_dir)
    print(f"{BOLD}{OKBLUE}Creating directory to save results and figures...{ENDC}")


[1m[94mCreating directory to save results and figures...[0m


AUTOMATIC SEARCH FOR THE BEST PARAMETERS









In [2]:
### PARAMETERS for automatic search


# SETTINGS FOR PLOTTING FIGURES
want_plots = False

# Step 1.
#expression_th = [0.5, 1, 2, 5, 10]
expression_th = 1  

# Step 3.
#RsquaredCut = [0.75, 0.85, 0.95]
#MeanCut = [50, 100]
#block_size_scalefit = [5, 10, 50, 100]
RsquaredCut = 0.9
MeanCut = 100
block_size_scalefit = 10
adjacency_type = "unsigned"

# Step 4.
TOMDenom = "mean"

# Step 6.
min_memb_cluster_vect = [5, 10, 15, 20, 30]
height_percentile_vect = [60, 70, 80, 90, 95]

In [4]:
# Running a partial search for best parameters

# To store all the results in a pdf: 
class PDF(FPDF):
    def footer(self):
        self.set_y(-15)
        self.set_font('Arial', 'I', 8)
        self.cell(0, 10, f'Page {self.page_no()}', 0, 0, 'C')
# Initialize PDF
pdf = PDF()
pdf.add_page()
pdf.set_font("Arial", size=12)
pdf.set_auto_page_break(auto=True, margin=15)




linkage_matrix, transcriptomics_dataset_filtered, \
    trait_dataset_filtered = wgcnax.run_partialA_WGCNA(transcriptomics_dataset, expression_th, \
                                                       trait_dataset, want_plots, figures_dir, \
                                                        RsquaredCut, MeanCut, block_size_scalefit, adjacency_type, TOMDenom)

for min_memb_cluster in min_memb_cluster_vect:
    for height_percentile in height_percentile_vect:

        module_assignment, correlations, p_values = wgcnax.run_partialB_WGCNA(linkage_matrix, transcriptomics_dataset_filtered, \
                                                            trait_dataset_filtered, want_plots, figures_dir, \
                                                            height_percentile, min_memb_cluster)
        


        # String of parameters
        parameters_text = "The configuration of parameters is min_memb_cluster: " + str(min_memb_cluster) \
                    + " and height_percentile: " + str(height_percentile)
        pdf.cell(0, 10, txt=parameters_text, ln=True)



        # Plot visualization of clustersfrom scipy import stats
        wgcnax.plot_module_distribution(module_assignment)
        # Save and append plot_module_distribution
        fig_path = figures_dir + f"module_distribution_{min_memb_cluster}_{height_percentile}.png"
        wgcnax.plot_module_distribution(module_assignment)
        plt.savefig(fig_path)
        plt.close()
        pdf.image(fig_path, x=10, w=180)  # Adjust x, w as needed



        # Save and append heatmap
        heatmap_path = figures_dir + f"heatmap_{min_memb_cluster}_{height_percentile}.png"
        plt.figure(figsize=(40, 40))
        annotations = correlations.round(3).astype(str) + '\n(' + p_values.round(5).astype(str) + ')'
        sns.heatmap(correlations, annot=annotations.values, fmt='', cmap='coolwarm', center=0, vmin=-1, vmax=1)
        plt.title('Module Eigengene to Clinical Trait Correlation', fontsize=20)
        plt.xlabel('Selected Clinical Traits', fontsize=10)
        plt.ylabel('Identified Modules, represented by their EigenGene', fontsize=10)
        plt.savefig(heatmap_path, dpi=150)
        plt.close()
        pdf.image(heatmap_path, x=10, w=180)  # Adjust x, w as needed


# Save the PDF
pdf.output(figures_dir + "results.pdf")

  pdf.set_font("Arial", size=12)


[1m[94mStep 1[0m
[1m[94mPre-processing...[0m


In [None]:

## Plot the HEatmap
print(f"{BOLD}{OKBLUE}Plotting and Saving the Module EigenGene to Clinical Trait Correlation...{ENDC}")
title_figure = 'Module Eigengene to Clinical Trait Correlation'

annotations = correlations.round(3).astype(str) + '\n(' + p_values.round(5).astype(str) + ')'

plt.figure(figsize=(40, 40)) 
sns.heatmap(correlations, annot=annotations.values, fmt='', cmap='coolwarm', center=0, vmin=-1, vmax=1)
plt.title(title_figure, fontsize=20)
plt.xlabel('Selected Clincal Traits', fontsize=10)
plt.ylabel('Identified Modules, represented by their EigenGene', fontsize=10)
plt.savefig(figures_dir + title_figure, dpi=150)
plt.show()
print(f"{BOLD}{OKBLUE}Done{ENDC}")






















FULL EXECUTION DISPLAYING ALL STEPS AND ALL PLOTS

SINGLE CONFIGURATION OF PARAMETERS

In [None]:
### PARAMETERS

# SETTINGS FOR PLOTTING FIGURES
want_plots = True

# Step 1.
expression_th = 1           # Preprocessing expression threshold

# Step 3.
RsquaredCut = 0.9
MeanCut = 100
block_size_scalefit = 10
adjacency_type = "unsigned"

# Step 4.
TOMDenom = "mean"

# Step 6.
min_memb_cluster = 15
height_percentile = 60  # Use the percentile of heights to set sensitivity

In [None]:
### Step 1: Data Preprocessing (Normalization)
print(f"{BOLD}{OKBLUE}Step 1{ENDC}")
transcriptomics_dataset_filtered, trait_dataset_filtered = wgcnax.preprocess_TPM_outlier_deletion(transcriptomics_dataset, expression_th, trait_dataset)


### Step 2: Constructing a Co-expression Similarity Matrix (Correlation Matrix)
print(f"{BOLD}{OKBLUE}\n\nStep 2{ENDC}")
correlation_matrix_np = wgcnax.correlation_matrix(transcriptomics_dataset_filtered, want_plots, figures_dir)
wgcnax.matrix_np_check(correlation_matrix_np, 1, -1, 1)


### Step 3: Transforming into an adjacency matrix using a soft threshold power
print(f"{BOLD}{OKBLUE}\n\nStep 3{ENDC}")
optimal_power = wgcnax.pickSoftThreshold(correlation_matrix_np, transcriptomics_dataset_filtered, RsquaredCut, MeanCut, True, figures_dir, block_size_scalefit)

adjacency_matrix_np = wgcnax.adjacencyM_from_correlationM(correlation_matrix_np, optimal_power, adjacency_type, want_plots, figures_dir)
wgcnax.matrix_np_check(adjacency_matrix_np, 1, 0, 1)


### Step 4: Converting adjacency matrix into a topological overlap matrix (TOM)
print(f"{BOLD}{OKBLUE}\n\nStep 4{ENDC}")
simTOM_np = wgcnax.calculate_tom(adjacency_matrix_np, TOMDenom, adjacency_type, want_plots, figures_dir)
dissTOM_np = 1 - simTOM_np
wgcnax.matrix_np_check(simTOM_np, 1, 0, 1)


### Step 5: Hierarchical clustering
print(f"{BOLD}{OKBLUE}\n\nStep 5{ENDC}")
linkage_matrix = wgcnax.hierarchical_clustering(dissTOM_np, False, figures_dir)


### Step 6: Module identification
print(f"{BOLD}{OKBLUE}\n\nStep 6{ENDC}")
module_assignment, cut_height = wgcnax.identify_modules_simple_version(linkage_matrix, height_percentile, min_memb_cluster)
module_assignment.insert(0, 'Gene Name', list(transcriptomics_dataset_filtered))


### Step 7: Calculate EigenGenes for all identified Modules
print(f"{BOLD}{OKBLUE}\n\nStep 7{ENDC}")
expression_profiles = wgcnax.expression_profile_for_cluster(module_assignment, transcriptomics_dataset_filtered)

eigen_genes = wgcnax.calculate_eigen_genes(expression_profiles, want_plots, figures_dir)


# Step 8.1 - Using new encoding for special variables
print(f"{BOLD}{OKBLUE}\n\nStep 8{ENDC}")
trait_columns = list(trait_dataset_filtered.columns[1:] )
correlations, p_values = wgcnax.eigen_trait_correlations_DC(eigen_genes, trait_dataset_filtered, trait_columns)


print(f"{BOLD}{OKBLUE}Done\n\n{ENDC}")

In [None]:
### Visualizations
# Plot visualization of clustersfrom scipy import stats

wgcnax.plot_module_distribution(module_assignment)




## Plot the HEatmap
print(f"{BOLD}{OKBLUE}Plotting and Saving the Module EigenGene to Clinical Trait Correlation...{ENDC}")
title_figure = 'Module Eigengene to Clinical Trait Correlation'

annotations = correlations.round(3).astype(str) + '\n(' + p_values.round(5).astype(str) + ')'

plt.figure(figsize=(40, 40)) 
sns.heatmap(correlations, annot=annotations.values, fmt='', cmap='coolwarm', center=0, vmin=-1, vmax=1)
plt.title(title_figure, fontsize=20)
plt.xlabel('Selected Clincal Traits', fontsize=10)
plt.ylabel('Identified Modules, represented by their EigenGene', fontsize=10)
plt.savefig(figures_dir + title_figure, dpi=150)
plt.show()
print(f"{BOLD}{OKBLUE}Done{ENDC}")