In [None]:
### GENERAL CONFIGURATION FOR THE ANALYSIS:
# Imports
import os
import pandas as pd
import numpy as np
import sys
import matplotlib.pyplot as plt
import seaborn as sns


# Make the project's `src/` directory importable.
# The notebook is expected to run from `<project>/notebooks`, in which case the
# project root is the parent directory; otherwise the current directory is used
# as the root. `str.strip('notebooks')` is NOT used for this: it removes any of
# the characters n/o/t/e/b/k/s from both ends of the path rather than the
# literal suffix, which corrupts paths that do not end in `notebooks`.
_cwd = os.getcwd()
_project_root = os.path.dirname(_cwd) if os.path.basename(_cwd) == 'notebooks' else _cwd
_src_dir = os.path.join(_project_root, 'src')
# Re-running the cell must not keep stacking the same entry on sys.path
if _src_dir not in sys.path:
    sys.path.append(_src_dir)
import WGCNA_functions as wgcnax



# ANSI escape sequences used to style text printed to the terminal
# (reset / emphasis first, then the foreground colours)
ENDC, BOLD, UNDERLINE = "\033[0m", "\033[1m", "\033[4m"
OKBLUE, OKGREEN, WARNING, FAIL = "\033[94m", "\033[92m", "\033[93m", "\033[91m"

# Toggle for producing figures
want_plots = True

# How pandas renders dataframes: limit the rows and columns shown and allow wide lines
pd.set_option("display.width", 1000)
pd.set_option("display.max_rows", 20)
pd.set_option("display.max_columns", 10)


In [None]:
### LOADING REAL UNPUBLISHED DATA - DO NOT PUSH THE RESULTS

# Move out of the notebook folder to access datasets.
# When the notebook runs from `<project>/notebooks`, the project root is the
# parent directory; otherwise the current directory is taken as the root.
# `str.strip('notebooks')` is avoided on purpose: it strips any of the characters
# n/o/t/e/b/k/s from both ends of the path instead of removing the folder name.
working_dir = os.getcwd()
if os.path.basename(working_dir) == 'notebooks':
    working_dir = os.path.dirname(working_dir)
# Paths below are built by plain string concatenation, so keep a trailing separator
working_dir = os.path.join(working_dir, '')
data_dir = working_dir + 'data/PROTECTED_DATA/BGI_Expression_Data/'


## Load the transcriptomics expression tables
# Two CSV files are read from `data_dir`, each with its first column used as the
# row index: the "NOnormal" file and the complete file.
transcriptomics_OnlyTumor = f"{data_dir}CRC.SW.mRNA.symbol.TPM_NOnormal.csv"
transcriptomics_OnlyTumor_dataset = pd.read_csv(transcriptomics_OnlyTumor, index_col=0)

transcriptomics_All = f"{data_dir}CRC.SW.mRNA.symbol.TPM.csv"
transcriptomics_All_dataset = pd.read_csv(transcriptomics_All, index_col=0)



# Sample info and clinical traits (tumor samples only), indexed by the first CSV column
sample_info_traits_dir = f"{data_dir}All_Traits_Without_Normal.csv"
trait_dataset = pd.read_csv(sample_info_traits_dir, index_col=0)

# Survival data (tumor samples only). The same path variable is reused, so after
# this point `sample_info_traits_dir` refers to the survival file.
sample_info_traits_dir = f"{data_dir}Survival_Without_Normal.csv"
survival_dataset = pd.read_csv(sample_info_traits_dir, index_col=0)


# Output directory where the figures of this notebook are saved
figures_dir = working_dir + 'results/DataDistributionGender/'

# Create the directory (and any missing parents) on first use.
# `isdir` is checked rather than `exists` so that a stray *file* at this path is
# reported right here by `makedirs` instead of surfacing later when saving a figure;
# `exist_ok=True` keeps the call safe if the directory appears between the check
# and the creation.
if not os.path.isdir(figures_dir):
    os.makedirs(figures_dir, exist_ok=True)
    print(f"{BOLD}{OKBLUE}Creating directory to save results and figures...{ENDC}")


In [None]:
# Overview figure: one panel with the overall gender counts, followed by one
# stacked-bar panel per remaining trait column showing its values split by gender.
# The figure is saved to `figures_dir` and then displayed.

# Convert Gender to categorical type for efficient counting
trait_dataset['Gender'] = trait_dataset['Gender'].astype('category')

# Adjusting font size globally
plt.rcParams.update({'font.size': 7})

# Every trait column except the excluded ones gets its own panel.
# The dataframe's column order is kept: building this list from a set made the
# panel order change between kernel restarts, because string hashing is randomized.
excluded_columns = {'MSI Sensor2 Score', 'Gender'}
columns_to_explore = [c for c in trait_dataset.columns if c not in excluded_columns]

# Calculate the number of rows/columns for the subplot grid
# (+1 for the gender histogram that occupies the first panel)
n = len(columns_to_explore) + 1
ncols = 3
nrows = n // ncols + (n % ncols > 0)  # ceiling division

# squeeze=False guarantees a 2-D axes array even when there is a single row
fig, axes = plt.subplots(nrows, ncols, figsize=(ncols*5, nrows*6), squeeze=False)
fig.subplots_adjust(left=0.05, right=1.2, wspace=0.4, hspace=0.4)

# Plot the gender distribution histogram as the first plot
gender_counts = trait_dataset['Gender'].value_counts()
axes[0, 0].bar(gender_counts.index, gender_counts.values)
axes[0, 0].set_title('Gender Distribution')
axes[0, 0].set_xlabel('Gender')
axes[0, 0].set_ylabel('Count')
axes[0, 0].tick_params(axis='x', rotation=45)
axes[0, 0].grid(axis='y', linestyle='--')

# Start plotting the other categories from the second plot position
for i, column in enumerate(columns_to_explore, start=1):
    # Panels are filled row by row
    row, col = divmod(i, ncols)
    ax = axes[row, col]

    # For each category, count the occurrences by Gender
    # (rows: gender, columns: values of the trait, cells: number of samples)
    category_counts = trait_dataset.groupby(['Gender', column], observed=True).size().unstack(fill_value=0)

    # Plot the distribution for each category by Gender
    category_counts.plot(kind='bar', stacked=True, ax=ax)
    ax.set_title(f'Distribution of {column} by Gender')
    ax.set_xlabel('Gender')
    ax.set_ylabel('Count')
    ax.tick_params(axis='x', rotation=45)
    ax.grid(axis='y', linestyle='--')
    # Legend is placed outside the axes, to the right of the panel
    ax.legend(title=column, bbox_to_anchor=(1.05, 1), loc='upper left')

# Hide any unused subplots.
# The first unused panel is index `n`; relying on the loop variable `i` here
# raised a NameError when there were no columns to explore.
for j in range(n, nrows*ncols):
    fig.delaxes(axes.flat[j])

# `right=1.2` above and the outside legends extend past the figure canvas, so the
# saved image must use a tight bounding box or its right-hand side is cut off.
fig.savefig(figures_dir + 'Distribution of classes across genders', dpi=150, bbox_inches='tight')
plt.show()