In [None]:
import argparse
import ast
import random

import mygene
import numpy as np
import pandas as pd
import scanpy as sc
from scipy import sparse


def generate_cnv_region(chromosomes, min_size, max_size):
    """Draw one random genomic interval to act as a simulated CNV.

    Parameters
    ----------
    chromosomes : sequence
        Candidate chromosome labels; one is picked uniformly at random.
    min_size, max_size : int
        Inclusive bounds on the interval length.

    Returns
    -------
    tuple
        ``(chromosome, start, end)`` where ``start`` is drawn from
        1..100000000 (inclusive) and ``end`` is ``start`` plus the drawn length.
    """
    # The three draws happen in this exact order so that a seeded `random`
    # module yields the same region as before.
    picked_chromosome = random.choice(chromosomes)
    region_start = random.randint(1, 100000000)
    region_length = random.randint(min_size, max_size)

    return picked_chromosome, region_start, region_start + region_length


def assign_cnvs_unique(cell_types, num_cnvs, chromosomes, min_size, max_size, cnv_prob=0.1, adata=None):
    """Simulate CNVs and give each cell type its own CNV on a random subset of cells.

    ``num_cnvs`` random CNVs are generated first. Cell types are then visited
    in the order given; each one samples ``round(n_cells * cnv_prob)`` of its
    cells and is paired with a CNV that no earlier cell type received. Once
    the CNV pool is exhausted, the remaining cell types are left out of the
    result.

    Parameters
    ----------
    cell_types : iterable
        Values of ``obs['cell_type']`` to process, in priority order.
    num_cnvs : int
        Number of CNVs to generate (also the maximum number of cell types that
        can receive one).
    chromosomes, min_size, max_size
        Forwarded to ``generate_cnv_region``.
    cnv_prob : float, default 0.1
        Fraction of each cell type's cells that carry the CNV; must be in [0, 1].
    adata : object with an ``obs`` DataFrame containing a 'cell_type' column, optional
        Dataset to sample cells from. When omitted, the notebook-level
        ``adata2`` is used, which keeps existing calls working.

    Returns
    -------
    dict
        ``{cell_type: {cell_id: {'cnv': cnv_dict, 'type': 'gain' | 'loss'}}}``
        where ``cnv_dict`` has keys 'chromosome', 'start', 'end' and 'type'.

    Raises
    ------
    ValueError
        If ``cnv_prob`` lies outside [0, 1].
    """
    if not 0 <= cnv_prob <= 1:
        raise ValueError(f"cnv_prob must be within [0, 1], got {cnv_prob!r}")

    if adata is None:
        # Backward-compatible fallback to the notebook global.
        adata = adata2
    obs = adata.obs

    cnvs = []
    for _ in range(num_cnvs):
        chrom, start, end = generate_cnv_region(chromosomes, min_size, max_size)
        cnv_type = random.choice(['gain', 'loss'])
        cnvs.append({'chromosome': chrom, 'start': start, 'end': end, 'type': cnv_type})

    # Track unassigned CNVs by position rather than by dict equality, so two
    # CNVs that happen to share identical values still count as distinct.
    unassigned = list(range(len(cnvs)))
    cnv_assignments = {}

    for cell_type in cell_types:
        cell_ids = obs[obs['cell_type'] == cell_type].index.tolist()
        num_cnv_cells = int(np.round(len(cell_ids) * cnv_prob))

        # Sampling is done before the availability check so the sequence of
        # random draws is the same as in earlier seeded runs.
        cnv_cells = random.sample(cell_ids, num_cnv_cells)

        if not unassigned:
            break
        picked = random.choice(unassigned)
        unassigned.remove(picked)
        chosen_cnv = cnvs[picked]

        cnv_assignments[cell_type] = {
            cell: {'cnv': chosen_cnv, 'type': chosen_cnv['type']} for cell in cnv_cells
        }
    return cnv_assignments


def _scale_counts_block(counts, row_idx, col_idx, factor):
    """Return ``counts`` with the ``row_idx`` x ``col_idx`` sub-block multiplied by ``factor``."""
    if not np.issubdtype(counts.dtype, np.floating):
        # Fractional factors cannot be stored in an integer matrix.
        counts = counts.astype(np.float64)

    if sparse.issparse(counts):
        original_format = counts.format
        csr = counts.tocsr()
        n_rows, n_cols = csr.shape
        row_flag = np.zeros(n_rows, dtype=bool)
        row_flag[row_idx] = True
        col_flag = np.zeros(n_cols, dtype=bool)
        col_flag[col_idx] = True
        # Row number of every stored entry, so the stored values can be
        # filtered without densifying or changing the sparsity structure.
        entry_rows = np.repeat(np.arange(n_rows), np.diff(csr.indptr))
        hit = row_flag[entry_rows] & col_flag[csr.indices]
        csr.data[hit] *= factor
        return csr.asformat(original_format)

    counts[np.ix_(row_idx, col_idx)] *= factor
    return counts


def simulate_cnv_impact(adata, cnv_assignments, gain_factor=3.0, loss_factor=0.5):
    """Scale ``adata.layers['counts']`` for the cells that carry a simulated CNV.

    For every assigned cell, the genes lying fully inside its CNV (same
    chromosome, ``start >= cnv start`` and ``end <= cnv end`` in ``adata.var``)
    are multiplied by ``gain_factor`` or ``loss_factor``. Only the rows of the
    assigned cells are touched, and each distinct CNV is applied once to its
    cells. (The previous implementation rescaled the whole gene column once per
    assigned cell, and multiplied by 2 for a loss.)

    Parameters
    ----------
    adata : AnnData-like
        Must expose ``var`` (with 'chromosome', 'start', 'end' columns),
        ``obs_names`` and ``layers['counts']`` (NumPy array or SciPy sparse matrix).
    cnv_assignments : dict
        ``{cell_type: {cell_id: {'cnv': {...}, 'type': 'gain' | 'loss'}}}``.
        Entries whose type is neither 'gain' nor 'loss' are ignored.
    gain_factor : float, default 3.0
        Multiplier applied for a 'gain'.
    loss_factor : float, default 0.5
        Multiplier applied for a 'loss'.

    Returns
    -------
    The same ``adata`` object, with its 'counts' layer updated. A non-float
    counts layer is converted to float64 when a CNV is applied.

    Raises
    ------
    KeyError
        If an assigned cell id is not present in ``adata.obs_names``.
    """
    factors = {'gain': gain_factor, 'loss': loss_factor}

    # Group cells by the exact CNV they carry so the gene lookup runs once per CNV.
    cells_by_cnv = {}
    for per_cell in cnv_assignments.values():
        for cell, cnv_info in per_cell.items():
            cnv = cnv_info['cnv']
            key = (cnv['chromosome'], cnv['start'], cnv['end'], cnv_info['type'])
            cells_by_cnv.setdefault(key, []).append(cell)

    var = adata.var
    for (chrom, start, end, cnv_type), cells in cells_by_cnv.items():
        factor = factors.get(cnv_type)
        if factor is None:
            continue

        inside = (
            (var['chromosome'] == chrom)
            & (var['start'] >= start)
            & (var['end'] <= end)
        )
        gene_idx = np.flatnonzero(inside.to_numpy(dtype=bool))

        row_idx = np.asarray(adata.obs_names.get_indexer(cells))
        if (row_idx < 0).any():
            missing = [c for c, r in zip(cells, row_idx) if r < 0]
            raise KeyError(f"cells not found in adata.obs_names: {missing[:5]}")

        if gene_idx.size == 0 or row_idx.size == 0:
            continue

        adata.layers['counts'] = _scale_counts_block(
            adata.layers['counts'], row_idx, gene_idx, factor
        )

    return adata

In [2]:

# Seed both RNGs so the simulated CNVs are reproducible.
SEED = 5
random.seed(SEED)
np.random.seed(SEED)

adata2 = sc.read_h5ad("files/adPBMC_ref_040623.h5ad")
# Keep a copy of X under 'counts'; simulate_cnv_impact edits this layer.
adata2.layers['counts'] = adata2.X.copy()

# Look up genomic coordinates for every Ensembl gene id through MyGene.
mg = mygene.MyGeneInfo()

gene_info = mg.querymany(
    adata2.var['gene_ids'].tolist(),
    scopes='ensembl.gene',
    fields='genomic_pos',
    species='human'
)

query_df = pd.DataFrame(gene_info)
# Strings are parsed with ast.literal_eval rather than eval: the values come
# from a remote service and must never be executed as code.
query_df['genomic_pos'] = query_df['genomic_pos'].apply(
    lambda x: ast.literal_eval(x) if isinstance(x, str) else x
)

# Extract chromosome, start, end and strand from the genomic_pos column.
# Anything that is not a single dict yields None and is dropped below.
for column, key in [('chromosome', 'chr'), ('start', 'start'), ('end', 'end'), ('strand', 'strand')]:
    query_df[column] = query_df['genomic_pos'].apply(
        lambda pos, key=key: pos.get(key) if isinstance(pos, dict) else None
    )

# Remove rows without a usable position, then keep one hit per queried gene id.
clean_query_df = query_df.dropna(subset=['chromosome', 'start', 'end'])
clean_query_df = clean_query_df.drop_duplicates(subset=['query'])

annotated_var = adata2.var.merge(
    clean_query_df[['query', 'chromosome', 'start', 'end', 'strand']],
    left_on='gene_ids', right_on='query',
    how='left',
    validate='many_to_one',  # guarantees one output row per gene
)
# merge() returns a fresh RangeIndex; restore the original index so the gene
# names are not replaced by row numbers in adata2.var (and in the saved file).
annotated_var.index = adata2.var.index
adata2.var = annotated_var

adata2_CNV = adata2.copy()

# Define parameters for CNV simulation
chromosomes = [str(i) for i in range(1, 23)] + ['X', 'Y']
min_size = 100000
max_size = 500000
num_cnvs = 1  # Number of CNVs to simulate

cell_types = adata2_CNV.obs['cell_type'].unique()

cnv_prob = 0.1  # 10% of cells per cell type will receive CNVs
cnv_assignments = assign_cnvs_unique(cell_types, num_cnvs, chromosomes, min_size, max_size, cnv_prob)

# Simulate CNV impact on expression
adata2_CNV = simulate_cnv_impact(adata2_CNV, cnv_assignments)

# Label every cell with the CNV it carries ('' for unaffected cells). The
# labels are collected first and written as one column instead of one .loc
# assignment per cell.
cnv_labels = {}
for cell_cnv_mapping in cnv_assignments.values():
    for cell, cnv_info in cell_cnv_mapping.items():
        cnv = cnv_info['cnv']
        cnv_labels[cell] = f"{cnv['chromosome']}:{cnv['start']}-{cnv['end']} ({cnv['type']})"
adata2_CNV.obs['simulated_cnvs'] = [cnv_labels.get(cell, '') for cell in adata2_CNV.obs_names]

# observed=False spells out the grouping behaviour this cell has relied on, so
# the table keeps its shape if the pandas default changes (assumes 'cell_type'
# may be categorical).
cnv_table = (
    adata2_CNV.obs
    .groupby(['simulated_cnvs', 'cell_type'], observed=False)
    .size()
    .unstack(fill_value=0)
)

print(cnv_table)
adata2_CNV.write('PBMC_simCNV_3.h5ad')

Input sequence provided is already in string format. No operation performed
Input sequence provided is already in string format. No operation performed
5 input query terms found dup hits:	[('ENSG00000234162', 2), ('ENSG00000227110', 2), ('ENSG00000249738', 2), ('ENSG00000280018', 2), ('E
411 input query terms found no hit:	['ENSG00000238009', 'ENSG00000230699', 'ENSG00000236948', 'ENSG00000277726', 'ENSG00000271895', 'ENS
  self.data *= other
  cnv_table = adata2_CNV.obs.groupby(['simulated_cnvs', 'cell_type']).size().unstack(fill_value=0)


cell_type                    B cell  CD14 monocyte  CD4 T cell  CD8 T cell  \
simulated_cnvs                                                               
                               1450           3128        3199        1029   
20:34285250-34774160 (loss)       0              0         355           0   

cell_type                    Dendritic  FCGR3A monocyte  Megakaryocyte  \
simulated_cnvs                                                           
                                   154              327             59   
20:34285250-34774160 (loss)          0                0              0   

cell_type                    NK cell  
simulated_cnvs                        
                                 608  
20:34285250-34774160 (loss)        0  


In [3]:
# Show the CNV-by-cell-type count table built in the previous cell as rich output.
cnv_table

cell_type,B cell,CD14 monocyte,CD4 T cell,CD8 T cell,Dendritic,FCGR3A monocyte,Megakaryocyte,NK cell
simulated_cnvs,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
,1450,3128,3199,1029,154,327,59,608
20:34285250-34774160 (loss),0,0,355,0,0,0,0,0
