# CNN for Multi-omics Analysis

In [5]:
# Import Libraries
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import math

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error

import logging
import sys

In [6]:
# Setup Logging
# Send INFO-and-above records to stdout, stamped with time and level.
stdout_handler = logging.StreamHandler(sys.stdout)
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[stdout_handler],
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

## 1. Load and Prepare Data (Simulated)

In [26]:
logger.info("Simulating Data")

# Dimensions of the simulated cohort and of each omics layer.
num_samples = 50
num_genes_total = 1000
num_proteins_total = 800
num_matched_features = 500 # Number of features common to both omics

# Seed both RNGs from a single constant so the simulated data is reproducible.
SEED = 42
np.random.seed(SEED)
# torch.manual_seed returns a Generator object; the trailing ';' keeps its
# repr from being displayed as the cell's output.
torch.manual_seed(SEED);

2025-05-21 21:37:08,906 - INFO - Simulating Data


<torch._C.Generator at 0x1cd6934de30>

In [None]:
# Row labels: one name per simulated sample.
sample_names = [f'Sample_{i+1}' for i in range(num_samples)]

# Complete ID universes for each omics layer.
all_genes = [f'Gene_{i+1}' for i in range(num_genes_total)]
all_proteins = [f'Prot_{i+1}' for i in range(num_proteins_total)]

# IDs shared by both layers; the matched proteins reuse the gene IDs.
matched_ids = [f'Gene_{i+1}' for i in range(num_matched_features)]

# Column lists: the matched IDs first, then the layer-specific remainder
# (everything past the first `num_matched_features` entries of each universe).
transcript_ids = matched_ids + all_genes[num_matched_features:]      # full transcriptome list
protein_col_ids = matched_ids + all_proteins[num_matched_features:]  # full proteome list

Simulate Transcriptomics Data (Samples x Genes)

In [27]:
# Uniform random numbers scaled to [0, 100) stand in for expression levels.
n_transcripts = len(transcript_ids)
t_data = 100 * np.random.rand(num_samples, n_transcripts)
transcriptomics_df = pd.DataFrame(
    data=t_data,
    columns=transcript_ids,
    index=sample_names,
)

Simulate Proteomics Data (Samples x Proteins)

In [28]:
# Uniform random numbers scaled to [0, 100000) serve as the simulated
# proteomics measurements (one row per sample, one column per protein ID).
n_protein_cols = len(protein_col_ids)
p_data = 100000 * np.random.rand(num_samples, n_protein_cols)
proteomics_df = pd.DataFrame(
    data=p_data,
    columns=protein_col_ids,
    index=sample_names,
)

Simulate outcome labels (Disease vs Control)

In [32]:
# One binary outcome (0 or 1) per sample; `high=2` is exclusive.
labels_np = np.random.randint(low=0, high=2, size=num_samples)
labels_df = pd.DataFrame(
    data=labels_np,
    columns=['Label'],
    index=sample_names,
)

In [33]:
# Report the size of the raw simulated tables and the intended overlap.
for summary_line in (
    f"Simulated {num_samples} samples.",
    f"Raw Transcriptomics shape: {transcriptomics_df.shape} (Samples x Total Genes)",
    f"Raw Proteomics shape: {proteomics_df.shape} (Samples x Total Proteins)",
    f"Number of features intended to be matched: {len(matched_ids)}",
):
    logger.info(summary_line)

2025-05-21 21:40:48,914 - INFO - Simulated 50 samples.
2025-05-21 21:40:48,915 - INFO - Raw Transcriptomics shape: (50, 1000) (Samples x Total Genes)
2025-05-21 21:40:48,915 - INFO - Raw Proteomics shape: (50, 800) (Samples x Total Proteins)
2025-05-21 21:40:48,916 - INFO - Number of features intended to be matched: 500


## 2. Match Features

In [34]:
# Restrict both omics tables to the shared feature set. The simulation gave the
# matched proteins the same column labels as their genes, so the single list
# `matched_ids` selects the corresponding columns from either table.
transcriptomics_matched, proteomics_matched = (
    omics_table[matched_ids]
    for omics_table in (transcriptomics_df, proteomics_df)
)

In [35]:
# Report the shapes of the two tables after restricting to the matched features.
for shape_line in (
    f"Matched transcriptomics shape: {transcriptomics_matched.shape} (Samples x Matched Genes)",
    f"Matched proteomics shape: {proteomics_matched.shape} (Samples x Matched Proteins)",
):
    logger.info(shape_line)

2025-05-21 21:42:56,831 - INFO - Matched transcriptomics shape: (50, 500) (Samples x Matched Genes)
2025-05-21 21:42:56,832 - INFO - Matched proteomics shape: (50, 500) (Samples x Matched Proteins)


## 3. Get Pathway Data (Simulated)

In [40]:
# Pathways are sets of genes that work together. They are simulated here by
# cutting the matched gene IDs into consecutive, fixed-size chunks; in a real
# analysis the gene lists would come from curated pathway sets (e.g. via gseapy).
genes_per_pathway = 25  # arbitrary chunk size for the simulation

# Number of chunks needed so that every matched gene falls into a pathway.
num_pathways_sim = math.ceil(len(matched_ids) / genes_per_pathway)

# pathway_name -> [gene IDs]; chunk i holds matched_ids[i*size : (i+1)*size],
# so the last pathway may be shorter than the others.
pathways = {
    f'Pathway_{i+1}': matched_ids[i * genes_per_pathway:(i + 1) * genes_per_pathway]
    for i in range(num_pathways_sim)
}
logger.info(f"Simulated {len(pathways)} pathways covering the matched genes.")

2025-05-21 21:48:56,439 - INFO - Simulated 20 pathways covering the matched genes.


## 4. Create Pathway Tiling Master Map

In [None]:
# The master map places each gene/protein pair on a square 2D grid. The side
# length is the square root of the matched-feature count, rounded up, so the
# grid has at least one pixel per matched feature.
grid_h = grid_w = math.ceil(math.sqrt(len(matched_ids)))  # height == width
grid_total_pixels = grid_h * grid_w
logger.info(f"Target grid dimensions: {grid_h} (H) x {grid_w} (W) = {grid_total_pixels} pixels")

In [39]:
# Show the number of simulated pathways computed in section 3.
# NOTE(review): this cell's execution count (39) precedes the pathway cell's (40),
# so it looks like a leftover debug check — consider removing it or moving it
# next to the pathway simulation.
print(num_pathways_sim)

20
