Setup

In [None]:
import os
import pandas as pd
import numpy as np
import networkx as nx
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

import re
from scipy import stats
from scipy import sparse
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split


## Load and Prepare the Datasets

In [None]:
# Move out of the notebook folder to access datasets.
# str.strip('notebooks') would remove any of the characters n/o/t/e/b/k/s from
# BOTH ends of the path (it takes a character set, not a suffix), which corrupts
# paths such as '/home/user/projects'. Only drop the last path component when it
# really is the 'notebooks' folder.
working_dir = os.getcwd()
if os.path.basename(working_dir) == 'notebooks':
    working_dir = os.path.dirname(working_dir)
# Joining with '' guarantees a trailing separator, because paths below are built with '+'
working_dir = os.path.join(working_dir, '')
data_dir = working_dir + 'data/PROTECTED_DATA/BGI_Expression_Data/'


## Load Transcriptomics Data
# The first CSV column is used as the row index, so every remaining column holds data
transcriptomics_TPM_dataset_dir = data_dir + 'CRC.SW.mRNA.symbol.TPM_NOnormal.csv'
transcriptomics_dataset = pd.read_csv(transcriptomics_TPM_dataset_dir, index_col=0)

# Classification Tags
labels_classification_dir = data_dir + 'ClassTags_PrimarySiteDisease.csv' # Using only tumor samples
labels = pd.read_csv(labels_classification_dir, index_col=0)


# Figures Saving output dir


# Convert the file name into the name of the label column:
# 'ClassTags_PrimarySiteDisease.csv' -> 'PrimarySiteDisease' -> 'Primary Site Disease'
trait_used_as_label = labels_classification_dir.replace(data_dir, '').replace('ClassTags_', '').replace('.csv', '')
trait_used_as_label = re.sub(r'(?<=\w)([A-Z])', r' \1', trait_used_as_label) # Add spaces before the capital letters for formatting

# Convert labels to categorical values (integer codes, one per distinct class)
class_values = labels[trait_used_as_label].astype('category').cat.codes
labels['label'] = class_values






## Make a subset to save RAM
# Keep only the first `subset_dataset_size` columns of the dataset
subset_dataset_size = 2000
transcriptomics_dataset = transcriptomics_dataset.iloc[:, :subset_dataset_size]

# RAM usage estimation in GB
# iloc silently keeps fewer columns when the dataset is narrower than the requested
# subset, so base the estimate on the number of columns actually kept.
# Presumably this models one dense columns x columns matrix at 8 bytes per entry.
num_columns_kept = transcriptomics_dataset.shape[1]
RAM_estimate = (num_columns_kept * num_columns_kept * 8) / (1024**3)
print(f"The aproximated RAM to analyse this size of dataset is: {RAM_estimate} GB")


## Prepare Data for the Model

### Preprocessing

Preprocess the data using the same method as in the WGCNA approach

In [None]:
def preprocess_TPM_outlier_deletion(raw_data, expression_th):

    """
    Cleans raw data by filtering out low expression genes, applying log transformation,
    removing outlier samples based on PCA analysis and standardizing every gene.

    The input is treated as samples in rows and genes in columns, and every column
    must be numeric (identifiers belong in the index, not in a column).

    Parameters:
    - raw_data (DataFrame): The raw data as a pandas DataFrame (rows = samples, columns = genes).
    - expression_th (int): The value of expression under which genes are eliminated.

    Returns:
    - DataFrame: The dataset after preprocessing and outlier removal.
    """
    # Keep only genes (columns) whose expression exceeds the threshold in at least one sample
    expressed_genes = (raw_data > expression_th).any(axis=0)
    cleaned_dataset = raw_data.loc[:, expressed_genes]

    # Apply log2(x + 1) to EVERY column. The threshold comparison above already requires
    # all columns to be numeric, so there is no identifier column to skip; skipping the
    # first column would leave one gene untransformed and out of the PCA.
    cleaned_dataset = np.log2(cleaned_dataset + 1)

    # Outlier detection and removal based on PCA (samples are rows, so no transpose)
    pca = PCA(n_components=2)
    pca_result = pca.fit_transform(cleaned_dataset)
    z_scores = np.abs(stats.zscore(pca_result, axis=0))
    good_samples = (z_scores < 3).all(axis=1)                      # Keeping samples within 3 standard deviations
    # Explicit copy so the result is an independent frame, not a view of the filtered data
    cleaned_dataset = cleaned_dataset.loc[good_samples].copy()

    # Data Standardization (Z-score normalization), computed per gene on the kept samples
    cleaned_dataset = cleaned_dataset.apply(stats.zscore, axis=0)

    # Print the number of genes removed
    num_genes_removed = raw_data.shape[1] - cleaned_dataset.shape[1]
    print(f"preprocess_TPM_outlier_deletion function removed {num_genes_removed} genes")

    # Print the number of patients (samples) removed
    num_pacients_removed = raw_data.shape[0] - cleaned_dataset.shape[0]
    print(f"preprocess_TPM_outlier_deletion function removed {num_pacients_removed} pacients")

    return cleaned_dataset

def plot_pca(dataframe, title, ax = None):
    """
    Scatter-plots the first two principal components of the provided dataframe.

    The first column is skipped and the remaining columns are transposed before the
    PCA is fitted, so each plotted point corresponds to one column of `dataframe`.
    NOTE(review): preprocess_TPM_outlier_deletion runs its PCA without transposing;
    confirm that the orientation used here is the intended one.

    Parameters:
    - dataframe (DataFrame): The dataframe to perform PCA on.
    - title (str): The title of the plot.
    - ax (matplotlib Axes, optional): Axes to draw on. A new figure is created when None.

    Returns:
    - None, it generates the plot
    """
    # Project the transposed data onto its first two principal components
    points = dataframe.iloc[:, 1:].T
    components = PCA(n_components=2).fit_transform(points)

    # Fall back to a fresh figure when the caller did not supply an Axes
    target_ax = ax
    if target_ax is None:
        _, target_ax = plt.subplots()

    # Draw PC1 against PC2 and label the panel
    target_ax.scatter(components[:, 0], components[:, 1], alpha=0.5)
    target_ax.set(title=title, xlabel='PC1', ylabel='PC2')


# Run the preprocessing pipeline on the dataset
transcriptomics_clean = preprocess_TPM_outlier_deletion(raw_data=transcriptomics_dataset, expression_th=1)

# Side-by-side PCA panels: data before (left) and after (right) preprocessing
fig, axs = plt.subplots(nrows=1, ncols=2, figsize=(10, 5))
pca_panels = (
    (transcriptomics_dataset, 'PCA of Original Data'),
    (transcriptomics_clean, 'PCA of Preprocessed with preprocess_TPM_outlier_deletion'),
)
for panel_ax, (panel_data, panel_title) in zip(axs, pca_panels):
    plot_pca(panel_data, title=panel_title, ax=panel_ax)

### Graph representation of the dataset

We transform the dataset into a network representation. This can be done in many ways; we opt for a self-similarity matrix that uses correlation as the similarity metric.

As per the paper:
To create the co-expression graph, Spearman correlation was calculated to generate a correlation matrix between each gene in the dataset.
Spearman Correlation is a widely adopted method to assess monotonic linear or non-linear relationships in sequencing data. 
If the correlation between two genes is >0.6 with a p < 0.05, a weight of 1 is placed in an adjacency matrix, otherwise 0. If there is no correlation >0.6 with a given gene, then that gene is removed from the gene list, leading to the total of genes in the co-expression graph.

In [None]:
## Step 0: Turn dataframe into NumPy matrix for efficiency
transcriptomics_np = transcriptomics_clean.to_numpy()

# Step 1: Calculate Spearman Correlation and p-values
# With a 2-D input, spearmanr treats each column as one variable and returns
# column x column matrices of correlations and p-values
correlations, pvalues = stats.spearmanr(transcriptomics_np)

# Step 2: Construct the Adjacency Matrix
# An edge (weight 1) links two genes when their correlation is > 0.6 with p < 0.05
adjacency_matrix_np = ((correlations > 0.6) & (pvalues < 0.05)).astype(int)
# Every gene correlates perfectly with itself, so the diagonal is always 1. Left in
# place, it makes every row sum >= 1 and no gene could ever be flagged as isolated.
np.fill_diagonal(adjacency_matrix_np, 0)
adjacency_matrix = pd.DataFrame(adjacency_matrix_np, index=transcriptomics_clean.columns, columns=transcriptomics_clean.columns)

# Step 3: Remove Isolated Genes - does not correlate >0.6 with any other gene
is_not_isolated = adjacency_matrix.sum(axis=1) > 0
filtered_adjacency_matrix = adjacency_matrix.loc[is_not_isolated, is_not_isolated]
print(f'{transcriptomics_clean.shape[1]-filtered_adjacency_matrix.shape[0]} genes were removed as Isolated Genes')

In [None]:
# Step 0: Given 'transcriptomics_clean' DataFrame, ensure 'transcriptomics_np' is updated after filtering
transcriptomics_np_filtered  = transcriptomics_clean.loc[:, is_not_isolated].to_numpy()

# Step 1: Convert the filtered adjacency matrix to a sparse format
# Self-loops are added explicitly further down, so drop any diagonal entries first;
# otherwise they would be counted twice in the degree computation.
adjacency_without_diagonal = filtered_adjacency_matrix.to_numpy(copy=True)
np.fill_diagonal(adjacency_without_diagonal, 0)
sparse_adjacency_matrix = sparse.csr_matrix(adjacency_without_diagonal)


# Convert sparse adjacency matrix to TensorFlow sparse tensor
# COO format keeps row/col/data aligned; stacking gives an (nnz, 2) int64 index array
# (shape (0, 2) when there are no edges), which is what SparseTensor requires.
adjacency_coo = sparse_adjacency_matrix.tocoo()
adjacency_matrix_tensor = tf.sparse.SparseTensor(indices=np.stack([adjacency_coo.row, adjacency_coo.col], axis=1).astype(np.int64),
                                                 values=adjacency_coo.data.astype(np.float32),
                                                 dense_shape=sparse_adjacency_matrix.shape)

# Normalize adjacency matrix with added self-loops for GCN: D^-1/2 (A + I) D^-1/2
num_nodes = adjacency_matrix_tensor.dense_shape[0]
node_ids = tf.range(num_nodes)
# (N, 2) index pairs [i, i]; two (N, 1) columns cannot be concatenated with (nnz, 2) along axis 0
self_loop_indices = tf.stack([node_ids, node_ids], axis=1)
indices = tf.concat([adjacency_matrix_tensor.indices, self_loop_indices], axis=0)
values = tf.concat([adjacency_matrix_tensor.values, tf.ones_like(node_ids, dtype=tf.float32)], axis=0)
dense_shape = adjacency_matrix_tensor.dense_shape
adjacency_matrix_tensor_with_self_loops = tf.sparse.reorder(tf.SparseTensor(indices=indices, values=values, dense_shape=dense_shape))

# Node degrees (row sums of A + I) and their inverse square roots
degree_matrix = tf.sparse.reduce_sum(adjacency_matrix_tensor_with_self_loops, axis=-1)
degree_matrix_inv_sqrt = tf.pow(degree_matrix, -0.5)
degree_matrix_inv_sqrt = tf.where(tf.math.is_inf(degree_matrix_inv_sqrt), 0., degree_matrix_inv_sqrt)
# Diagonal matrix D^-1/2 kept as a sparse tensor (tf.Tensor has no .repeat method)
D_inv_sqrt = tf.sparse.SparseTensor(indices=self_loop_indices,
                                    values=degree_matrix_inv_sqrt,
                                    dense_shape=dense_shape)

# tf.sparse.sparse_dense_matmul cannot multiply two sparse tensors. Because D^-1/2 is
# diagonal, the product scales each entry a_ij by d_i^-1/2 * d_j^-1/2, which can be
# done directly on the stored values.
entry_indices = adjacency_matrix_tensor_with_self_loops.indices
row_scale = tf.gather(degree_matrix_inv_sqrt, entry_indices[:, 0])
col_scale = tf.gather(degree_matrix_inv_sqrt, entry_indices[:, 1])
normalized_adjacency_sparse = tf.sparse.SparseTensor(indices=entry_indices,
                                                     values=adjacency_matrix_tensor_with_self_loops.values * row_scale * col_scale,
                                                     dense_shape=dense_shape)
# Dense result, matching the dense output type that sparse_dense_matmul produces
normalized_adjacency_matrix = tf.sparse.to_dense(normalized_adjacency_sparse)


# Testing Stuff