# This notebook reads the processed scRNA-seq dataset and produces a tab-delimited txt reference expression matrix as input to the CIBERSORTx signature matrix pipeline

In [1]:
import pathlib
import yaml
import subprocess

import pandas as pd
import numpy as np
import scanpy as sc

## Preprocessing Parameters

## Load config
The config file specifies the paths to the data and to the software repo (they are kept in a config file because the software is currently in active development).

In [2]:
# Locate the root directory of the analysis repository through git, so every
# path below is independent of the directory the notebook is launched from.
# check=True makes a failing `git rev-parse` (e.g. the notebook is not inside
# a git checkout) raise CalledProcessError here; without it stdout is empty,
# the path collapses to the current working directory, and the notebook would
# silently look for config.yml there.
REPO_ROOT = pathlib.Path(
    subprocess.run(
        ["git", "rev-parse", "--show-toplevel"],
        capture_output=True,
        text=True,
        check=True,
    ).stdout.strip()
)

CONFIG_FILE = REPO_ROOT / 'config.yml'
assert CONFIG_FILE.exists(), f"Config file not found at {CONFIG_FILE}"

# Parse the YAML config into a plain dict used by the following cells
with open(CONFIG_FILE, 'r') as file:
    config_dict = yaml.safe_load(file)

## Retrieve Path to Processed Single-Cell RNA-seq Data and relevant Metadata

In [3]:
# Dataset accession and base directory of the single-cell data, both taken from the config
ACCESSION = config_dict['data_accession']
SC_DATA_PATH = pathlib.Path(config_dict['data_path']['sc_data_path'])

# Locations derived from the accession: processed AnnData folder/file and metadata folder
SC_ADATA_PATH = SC_DATA_PATH.joinpath(f'{ACCESSION}_processed')
SC_ADATA_FILE = SC_ADATA_PATH.joinpath('HGSCsubtype_processed.h5ad')
SC_METADATA_PATH = SC_DATA_PATH.joinpath(f'{ACCESSION}_metadata')

# Fail early, with the offending path in the message, if any expected input is missing
assert SC_ADATA_PATH.exists(), f"Processed Single-cell Data path {SC_ADATA_PATH} does not exist"
assert SC_ADATA_FILE.exists(), f"Processed Single-cell Data file {SC_ADATA_FILE} does not exist"
assert SC_METADATA_PATH.exists(), f"Single-cell Metadata path {SC_METADATA_PATH} does not exist"

## Define Path to write Pre-Processing Outputs

In [4]:
# Root folder for all preprocessing outputs; it is expected to exist already
PREPROCESSING_OUTPUT_PATH = REPO_ROOT.joinpath('processed_data')
assert PREPROCESSING_OUTPUT_PATH.exists(), f"Preprocessing output path {PREPROCESSING_OUTPUT_PATH} does not exist"

# Folder receiving the files written as input to CIBERSORTx; created if absent
CIBERSORTX_INPUT_PATH = PREPROCESSING_OUTPUT_PATH.joinpath('cibersortx_input')
CIBERSORTX_INPUT_PATH.mkdir(parents=True, exist_ok=True)

In [5]:
# Name of the adata.var column that holds the gene identifier used at export time
GENE_ID_COL = 'gene_ids'

# Read the processed AnnData object and make its variable names unique
adata = sc.read_h5ad(SC_ADATA_FILE)
adata.var_names_make_unique()

# Mirror the (now unique) var index into an explicit column.
# NOTE(review): this overwrites any column of the same name already present in adata.var.
adata.var[GENE_ID_COL] = adata.var.index.to_list()

In [None]:
# Preview the first rows of the per-cell annotations.
# NOTE(review): the saved output below also lists trailing rows, so it looks like it
# predates this .head() call — re-run the cell to refresh it.
adata.obs.head(n=5)

Unnamed: 0_level_0,GSM,Barcode,cellType,samp_id,gse_id,IMR_consensus,DIF_consensus,PRO_consensus,MES_consensus,subtype,max_consensusOV,celltype_granular,scpred_CellType,sample_id,stim,n_genes,n_genes_by_counts,total_counts,total_counts_mt,pct_counts_mt
Barcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
AAACCCAAGATTGACA-1,GSM6720925,AAACCCAAGATTGACA-1,Plasma cells,2251,GSM6720925,0.374,0.326,0.154,0.146,IMR,0.374,immune,Plasma cells,Samp_2251,CTRL,2170,2170,14447.0,250.0,1.730463
AAACCCAAGCATGGGT-1,GSM6720925,AAACCCAAGCATGGGT-1,B cells,2251,GSM6720925,0.270,0.438,0.198,0.094,DIF,0.438,immune,B cells,Samp_2251,CTRL,2328,2328,6913.0,419.0,6.061044
AAACCCACACTTGTCC-1,GSM6720925,AAACCCACACTTGTCC-1,T cells,2251,GSM6720925,0.562,0.282,0.110,0.046,IMR,0.562,immune,T cells,Samp_2251,CTRL,3957,3957,13979.0,315.0,2.253380
AAACCCATCTACGCGG-1,GSM6720925,AAACCCATCTACGCGG-1,Endothelial cells,2251,GSM6720925,0.246,0.224,0.278,0.252,PRO,0.278,stromal,Endothelial cells,Samp_2251,CTRL,4473,4473,15987.0,871.0,5.448177
AAACGAAAGAACCCGA-1,GSM6720925,AAACGAAAGAACCCGA-1,Monocytes,2251,GSM6720925,0.358,0.492,0.088,0.062,DIF,0.492,immune,Monocytes,Samp_2251,CTRL,2217,2217,7268.0,274.0,3.769951
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
TTTGTTGCACAAGGTG-1,GSM6720932,TTTGTTGCACAAGGTG-1,T cells,2497,GSM6720932,0.266,0.666,0.048,0.020,DIF,0.666,immune,T cells,Samp_2497,CTRL,1144,1144,3554.0,241.0,6.781092
TTTGTTGCAGAACTAA-1,GSM6720932,TTTGTTGCAGAACTAA-1,Fibroblasts,2497,GSM6720932,0.174,0.082,0.060,0.684,MES,0.684,stromal,Fibroblasts,Samp_2497,CTRL,3590,3590,18203.0,912.0,5.010163
TTTGTTGCATCTTTCA-1,GSM6720932,TTTGTTGCATCTTTCA-1,Fibroblasts,2497,GSM6720932,0.118,0.298,0.308,0.276,PRO,0.308,stromal,Fibroblasts,Samp_2497,CTRL,5451,5451,30411.0,2912.0,9.575482
TTTGTTGGTGTAACGG-1,GSM6720932,TTTGTTGGTGTAACGG-1,B cells,2497,GSM6720932,0.258,0.608,0.090,0.044,DIF,0.608,immune,B cells,Samp_2497,CTRL,1167,1167,3128.0,149.0,4.763427


## Examine number of classes in each cell label

In [7]:
# Count the distinct labels in each candidate cell-label column of adata.obs
label_columns = ('cellType', 'scpred_CellType', 'celltype_granular')
unique_classes = {column: adata.obs[column].nunique() for column in label_columns}

unique_classes

{'cellType': 11, 'scpred_CellType': 11, 'celltype_granular': 3}

## Select Cell Type for cibersortx and downstream analysis

In [8]:
# adata.obs column whose labels define the cell types for CIBERSORTx and downstream analysis
CELL_TYPE_COL = 'cellType'
# Membership test on a DataFrame checks its column labels
assert CELL_TYPE_COL in adata.obs, f"Column {CELL_TYPE_COL} not found in adata.obs"

## Produce scRNA reference matrix as input to cibersortx signature matrix generation
### Select a single sample from the scRNA-seq dataset as input to CIBERSORTx due to file size restrictions

In [9]:
# CIBERSORTx restricts memory and storage usage, so only one sample is used for the analysis.
SUB_SAMPLE = True  # when True, the next cell further sub-samples cells within this sample

# Take the first sample ID in order of appearance in adata.obs
sample = adata.obs['samp_id'].unique()[0]
print(f"Selected sample ID: {sample}")

## Single Sample Subset
in_selected_sample = adata.obs['samp_id'] == sample
adata_single_sample = adata[in_selected_sample].copy()  # independent copy, not a view of adata
# Normalise in place with a target sum of 1e4
sc.pp.normalize_total(adata_single_sample, target_sum=1e4)

Selected sample ID: 2251


### Further sub-sample within single sample to reduce reference matrix size

In [10]:
MIN_N = 200          # cell types with more than MIN_N cells are down-sampled
SUB_SAMP_PROP = 0.6  # fraction of cells retained for a down-sampled cell type
SUB_SAMP_SEED = 0    # fixed seed so the same cells are selected on every fresh run

# NOTE: this cell rebinds `adata_single_sample`; re-running it without re-running
# the previous cell sub-samples the already sub-sampled data again.
if SUB_SAMPLE:

    print(adata_single_sample.shape)
    # Dedicated, seeded generator: keeps the selection reproducible without
    # touching numpy's global random state.
    rng = np.random.default_rng(SUB_SAMP_SEED)
    # Observation labels (barcodes) of all cells that are kept
    selected_indices = []

    # Iterate over the obs labels belonging to each cell type.
    # observed=True restricts grouping to labels actually present and avoids the
    # pandas warning about the changing default for categorical groupers.
    for cell_type, indices in adata_single_sample.obs.groupby(CELL_TYPE_COL, observed=True).groups.items():
        num_cells = len(indices)

        # Cell types with more than MIN_N cells are reduced to SUB_SAMP_PROP of their cells
        if num_cells > MIN_N:
            subsample_size = int(SUB_SAMP_PROP * num_cells)  # rounded down
            sampled_indices = rng.choice(indices, size=subsample_size, replace=False)  # draw without replacement
        else:
            sampled_indices = indices  # keep every cell when there are at most MIN_N

        selected_indices.extend(sampled_indices)

    # Subset the AnnData object to retain only selected cells
    adata_single_sample = adata_single_sample[selected_indices, :]

print(adata_single_sample.shape)

(6819, 31178)
(4294, 31178)


  for cell_type, indices in adata_single_sample.obs.groupby(CELL_TYPE_COL).groups.items():


### Export as tab delimited txt

In [11]:
# Order the cells by cell type so that columns of the same type sit next to each other
adata_single_sample = adata_single_sample[adata_single_sample.obs.sort_values(by=CELL_TYPE_COL).index]

## Transpose as (n_genes, n_cells)
expression = adata_single_sample.X
# X may be a scipy sparse matrix or already a dense array. `toarray()` yields a
# plain ndarray (unlike `todense()`, which returns the legacy np.matrix type),
# and the fallback keeps the cell working when X is dense.
if hasattr(expression, "toarray"):
    expression = expression.toarray()
dense_matrix = np.asarray(expression).T

## Cell type (with duplicates) as column
dense_df = pd.DataFrame(dense_matrix, columns=adata_single_sample.obs[CELL_TYPE_COL].to_list())
## Gene id as row index
dense_df.index = adata_single_sample.var[GENE_ID_COL].to_list()

## Save to file (tab-delimited; gene ids form the first column, cell-type labels the header row)
dense_df.to_csv(CIBERSORTX_INPUT_PATH / f'{sample}_cibersortx_sc_reference_input.txt', sep='\t')

In [12]:
# Show the first rows of the exported genes-by-cells matrix as a sanity check
dense_df.head(n=5)

Unnamed: 0,B cells,B cells.1,B cells.2,B cells.3,B cells.4,B cells.5,B cells.6,B cells.7,B cells.8,B cells.9,...,T cells,T cells.1,T cells.2,T cells.3,T cells.4,T cells.5,T cells.6,T cells.7,T cells.8,T cells.9
AL627309.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AL627309.3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AL627309.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AP006222.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AL732372.1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
