# DEGs and GO analysis preprocessing

In order to deal with the data in R, we need to preprocess the original data. 

In [6]:
import os
import pandas as pd
import scipy.sparse as sp
import scanpy as sc
from scipy.io import mmwrite

In [7]:
subclasses = ["L2/3 IT", "L4 IT", "L5 IT", "Vip", "Oligodendrocyte"]

In [8]:
DONOR_ID_COLUMN = 'Donor ID' 
COVARIATES = ['Age at Death', 'Sex', 'PMI'] 
REQUIRED_COLS = ['Cognitive Status', DONOR_ID_COLUMN] + COVARIATES

In [9]:
def process_data(name):
    safe_name = name.replace('/', '_')
    # Please change the path to your own!
    input_dir = '/Users/a1234/Desktop/Omics data (scRNA-seq)'
    output_dir = f'/Users/a1234/Desktop/Processed_{safe_name}'
    os.makedirs(output_dir, exist_ok=True)

    all_obs = []
    all_X = []
    gene_names = None
    file_counter = 0

    sampling_params = {
        'max_cells_per_file': 500,
        'fraction_threshold': 0.5,
        'min_cells_after_sampling': 10,
        'random_seed': 42
    }

    unique_gene_df = None

    for root, dirs, files in os.walk(input_dir):
        for file in files:
            if not file.endswith('.h5ad'):
                continue

            file_path = os.path.join(root, file)
            file_counter += 1
            print(f"Processing file #{file_counter}: {file}")

            try:
                adata = sc.read_h5ad(file_path)
                name_mask = adata.obs["Subclass"] == f"{name}"
                name_data = adata[name_mask].copy()

                if name_data.n_obs == 0:
                    print(f"File {file} has 0 {name} cells, skip it.")
                    continue

                original_cell_count = name_data.n_obs

                # Sampling Process
                if original_cell_count > sampling_params['max_cells_per_file']:
                    sc.pp.subsample(
                        name_data,
                        n_obs=sampling_params['max_cells_per_file'],
                        random_state=sampling_params['random_seed']
                    )
                elif original_cell_count > 1 / sampling_params['fraction_threshold']:
                    sc.pp.subsample(
                        name_data,
                        fraction=sampling_params['fraction_threshold'],
                        random_state=sampling_params['random_seed']
                    )
                else:
                    print(f"({original_cell_count} cells) remain the original format.")

                if name_data.n_obs < sampling_params['min_cells_after_sampling']:
                    print(f"{name_data.n_obs} cells are not enough after sampling, skip it.")
                    continue

                # Cell ID Enhancement
                if DONOR_ID_COLUMN not in name_data.obs.columns:
                    raise KeyError(f"Lack the core column: {DONOR_ID_COLUMN}")

                donor_ids = name_data.obs[DONOR_ID_COLUMN].tolist()

                new_index = [
                    f"{donor_id}_{idx}_s{original_cell_count}a{name_data.n_obs}"
                    for donor_id, idx in zip(donor_ids, name_data.obs.index)
                ]

                name_data.obs.index = new_index
                name_data.obs.index.name = "cell_id"

                if not sp.issparse(name_data.X):
                    name_data.X = sp.csr_matrix(name_data.X)
                else:
                    name_data.X = name_data.X.tocsr()

                # Gene deduplication
                if gene_names is None:
                    if 'gene_id' in name_data.var.columns and 'gene_name' in name_data.var.columns:
                        gene_ids = name_data.var['gene_id'].tolist()
                        gene_symbols = name_data.var['gene_name'].tolist()
                    elif name_data.var.index.name == 'gene_id' and 'gene_name' in name_data.var.columns:
                        gene_ids = name_data.var.index.tolist()
                        gene_symbols = name_data.var['gene_name'].tolist()
                    else:
                        gene_ids = name_data.var.index.tolist()
                        gene_symbols = name_data.var.index.tolist()
                        print("Warning: use var.index as gene_id and gene_name")

                    gene_df = pd.DataFrame({'gene_id': gene_ids, 'gene_name': gene_symbols})
                    unique_gene_df = gene_df.drop_duplicates(subset=['gene_id'], keep='first')

                    original_gene_count = len(gene_df)
                    final_gene_count = len(unique_gene_df)
                    print(
                        f"Gene deduplication: Origin {original_gene_count} genes; After deduplication {final_gene_count} genes")

                    gene_names = list(zip(unique_gene_df['gene_id'], unique_gene_df['gene_name']))

                name_data = name_data[:, unique_gene_df.index.tolist()]

                # Collect all related metadata 
                required_metadata = name_data.obs[REQUIRED_COLS].copy()

                all_obs.append(required_metadata)
                all_X.append(name_data.X)

                print(f"Successfully deal with {file}: {original_cell_count} -> {name_data.n_obs} for {name} cells")

            except Exception as e:
                print(f"When dealing with {file}, we face an ERROR: {str(e)}")
                continue

        if all_obs and all_X:
            print("\nStart merging...")

            try:
                combined_obs = pd.concat(all_obs, axis=0, verify_integrity=True)
            except ValueError as e:
                print("\n[CRITICAL ERROR] Data merge failed due to overlapping Cell IDs.")
                print("Please check whether the 'Donor ID' has been correctly incorporated into the 'Cell ID' generation logic.")
                raise e  

            combined_X = sp.vstack(all_X, format='csr').T

            print("Saving the results...")

            combined_obs[REQUIRED_COLS].to_csv(
                os.path.join(output_dir, 'metadata.csv'),
                index=True, index_label="cell_id"
            )
            
            # Save the Data

            mmwrite(os.path.join(output_dir, 'matrix.mtx'), combined_X)

            pd.Series(combined_obs.index).to_csv(
                os.path.join(output_dir, 'barcodes.tsv'),
                index=False, header=False
            )

            pd.DataFrame(gene_names, columns=['gene_id', 'gene_name']).to_csv(
                os.path.join(output_dir, 'genes.tsv'),
                sep='\t',
                index=False,
                header=False
            )

            print(f"ALL DONE! Final dimensions of data: {combined_X.shape} (genes x cells)")


In [10]:
if __name__ == '__main__':
    for name in subclasses:
        process_data(name)

Processing file #1: H21.33.028_SEAAD_MTG_RNAseq_final-nuclei.2024-02-13.h5ad
Gene deduplication: Origin 36601 genes; After deduplication 36601 genes
Successfully deal with H21.33.028_SEAAD_MTG_RNAseq_final-nuclei.2024-02-13.h5ad: 3657 -> 500 for L2/3 IT cells
Processing file #2: H21.33.007_SEAAD_MTG_RNAseq_final-nuclei.2024-02-13.h5ad
Successfully deal with H21.33.007_SEAAD_MTG_RNAseq_final-nuclei.2024-02-13.h5ad: 3089 -> 500 for L2/3 IT cells
Processing file #3: H20.33.001_SEAAD_MTG_RNAseq_final-nuclei.2024-02-13.h5ad
Successfully deal with H20.33.001_SEAAD_MTG_RNAseq_final-nuclei.2024-02-13.h5ad: 5394 -> 500 for L2/3 IT cells
Processing file #4: H20.33.035_SEAAD_MTG_RNAseq_final-nuclei.2024-02-13.h5ad
Successfully deal with H20.33.035_SEAAD_MTG_RNAseq_final-nuclei.2024-02-13.h5ad: 2483 -> 500 for L2/3 IT cells
Processing file #5: H21.33.041_SEAAD_MTG_RNAseq_final-nuclei.2024-02-13.h5ad
Successfully deal with H21.33.041_SEAAD_MTG_RNAseq_final-nuclei.2024-02-13.h5ad: 3301 -> 500 for L2