In [1]:
import scanpy as sc
import pandas as pd
import numpy as np
import anndata
import matplotlib.pyplot as plt
import numpy as np
from scipy.sparse import csr_matrix

# Read adata

In [3]:
adata = sc.read_h5ad('/data/aneuploidy/cat_crop/fully_processed.h5ad')

In [5]:
adata

AnnData object with n_obs × n_vars = 204374 × 37001
    obs: 'batch', 'aggr_barcode', 'num_features', 'feature_call', 'num_umis', 'maxUmi1', 'maxUmi2', 'cell_barcode', 'guidePvalue', 'pvalueAssign', 'guides', 'multi_output', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt'
    var: 'gene_ids', 'feature_types', 'genome', 'mt', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts'

In [8]:
# make labels
adata.obs['guides'].value_counts()

Multiple Guides    144526
ADA2-1               2029
HBB-2                1958
USP22-1              1933
HBB-3                1743
                    ...  
HTT-1                   5
RAB11A-4                5
IL2RB-3                 4
PIK3CD-1                4
ERCC8-3                 2
Name: guides, Length: 401, dtype: int64

In [10]:
adata.obs.loc[adata.obs['guides'].str.startswith("Non"), "label"] = "CONTROL"
adata.obs.loc[adata.obs['guides'] == "Multiple Guides", "label"] = "NOISE"
adata.obs.loc[np.logical_and(~(adata.obs['guides'] == "Multiple Guides"),
                             ~(adata.obs['guides'].str.startswith("Non"))), "label"] = "GUIDE"

In [12]:
adata.obs['label'].value_counts()

NOISE      144526
GUIDE       57227
CONTROL      2621
Name: label, dtype: int64

# add qc columns

In [16]:
adata.obs['multi_output'].value_counts()

Multiplet     37132
CTJD02C       28298
CTJD02F       26520
CTJD02E       24020
CTJD02D       23912
CTJD02B       21080
CTJD02A       17280
Blank         13880
Unassigned     1718
Name: multi_output, dtype: int64

In [17]:
adata.obs['SingletorDoublet'] = adata.obs['multi_output'].apply(lambda x: "Singlet" if x in ["CTJD02A", "CTJD02B","CTJD02C","CTJD02D","CTJD02E","CTJD02F",] else "Multiplet")

In [18]:
adata.obs['SingletorDoublet'].value_counts()

Singlet      141110
Multiplet     52730
Name: SingletorDoublet, dtype: int64

In [20]:
adata.obs['<10_percent_mt'] = adata.obs['pct_counts_mt'] < 10

In [38]:
adata.obs['<100000_total_counts'] = adata.obs['total_counts'] < 100000

In [39]:
adata.obs['qc_pass'] = np.logical_and((adata.obs['SingletorDoublet'] == "Singlet"),
                                      adata.obs['<10_percent_mt'])
adata.obs['qc_pass'] = np.logical_and(adata.obs['qc_pass'],
                                      adata.obs['<100000_total_counts'])

In [40]:
adata.obs['qc_pass'].value_counts()

True     108458
False     95916
Name: qc_pass, dtype: int64

In [45]:
adata.obs['SingletorDoublet'].value_counts()

Singlet      141110
Multiplet     52730
Name: SingletorDoublet, dtype: int64

In [42]:
adata.obs['<10_percent_mt'].value_counts()

True     164741
False     39633
Name: <10_percent_mt, dtype: int64

In [44]:
adata.obs['<100000_total_counts'].value_counts()

True     204103
False       271
Name: <100000_total_counts, dtype: int64

In [46]:
# Save the quality-controlled h5ad
adata.write('/data/aneuploidy/cat_crop/qced.h5ad')

In [3]:
adata = sc.read_h5ad('/data/aneuploidy/cat_crop/qced.h5ad')

In [13]:
adata.obs['qc_pass'].value_counts()

True     108458
False     95916
Name: qc_pass, dtype: int64

In [16]:
adata.obs['label'].value_counts()

NOISE      144526
GUIDE       57227
CONTROL      2621
Name: label, dtype: int64

In [38]:
sc.pp.filter_genes(adata, min_cells=3)

In [55]:
adata = adata[:, adata.var['feature_types'] == "Gene Expression"]

In [56]:
adata

View of AnnData object with n_obs × n_vars = 204374 × 31506
    obs: 'batch', 'aggr_barcode', 'num_features', 'feature_call', 'num_umis', 'maxUmi1', 'maxUmi2', 'cell_barcode', 'guidePvalue', 'pvalueAssign', 'guides', 'multi_output', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'label', 'SingletorDoublet', '<10_percent_mt', '<10000_total_counts', 'qc_pass', '<100000_total_counts'
    var: 'gene_ids', 'feature_types', 'genome', 'mt', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts', 'n_cells'

# Divide 203K cells in batches of 23K cells

In [57]:
ctrl = adata[np.logical_and(adata.obs['label'] == 'CONTROL',
                            adata.obs['qc_pass']),:]

In [58]:
ctrl

View of AnnData object with n_obs × n_vars = 1854 × 31506
    obs: 'batch', 'aggr_barcode', 'num_features', 'feature_call', 'num_umis', 'maxUmi1', 'maxUmi2', 'cell_barcode', 'guidePvalue', 'pvalueAssign', 'guides', 'multi_output', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'label', 'SingletorDoublet', '<10_percent_mt', '<10000_total_counts', 'qc_pass', '<100000_total_counts'
    var: 'gene_ids', 'feature_types', 'genome', 'mt', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts', 'n_cells'

In [59]:
adataGuide = adata[adata.obs['label'] != 'CONTROL']

In [60]:
adataGuide

View of AnnData object with n_obs × n_vars = 201753 × 31506
    obs: 'batch', 'aggr_barcode', 'num_features', 'feature_call', 'num_umis', 'maxUmi1', 'maxUmi2', 'cell_barcode', 'guidePvalue', 'pvalueAssign', 'guides', 'multi_output', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'label', 'SingletorDoublet', '<10_percent_mt', '<10000_total_counts', 'qc_pass', '<100000_total_counts'
    var: 'gene_ids', 'feature_types', 'genome', 'mt', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts', 'n_cells'

In [61]:
batch1 = adataGuide[adataGuide.obs.index[0:23000].values, ]
batch1 = batch1.concatenate(ctrl)
batch1

  [AnnData(sparse.csr_matrix(a.shape), obs=a.obs) for a in all_adatas],


AnnData object with n_obs × n_vars = 24854 × 31506
    obs: 'batch', 'aggr_barcode', 'num_features', 'feature_call', 'num_umis', 'maxUmi1', 'maxUmi2', 'cell_barcode', 'guidePvalue', 'pvalueAssign', 'guides', 'multi_output', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'label', 'SingletorDoublet', '<10_percent_mt', '<10000_total_counts', 'qc_pass', '<100000_total_counts'
    var: 'gene_ids', 'feature_types', 'genome', 'mt', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts', 'n_cells'

In [62]:
# Performing batch1 with different labels
#batch1.obs.loc[batch1.obs['labels'].isin(['GUIDE','NOISE']), 'labels'] = 'NOT CONTROL'

In [63]:
batch1.obs['label'].value_counts()

NOISE      16780
GUIDE       6220
CONTROL     1854
Name: label, dtype: int64

In [64]:
batch2 = adataGuide[adataGuide.obs.index[23001:46000].values, ]
batch2 = batch2.concatenate(ctrl)
batch2

  [AnnData(sparse.csr_matrix(a.shape), obs=a.obs) for a in all_adatas],


AnnData object with n_obs × n_vars = 24853 × 31506
    obs: 'batch', 'aggr_barcode', 'num_features', 'feature_call', 'num_umis', 'maxUmi1', 'maxUmi2', 'cell_barcode', 'guidePvalue', 'pvalueAssign', 'guides', 'multi_output', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'label', 'SingletorDoublet', '<10_percent_mt', '<10000_total_counts', 'qc_pass', '<100000_total_counts'
    var: 'gene_ids', 'feature_types', 'genome', 'mt', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts', 'n_cells'

In [65]:
batch3 = adataGuide[adataGuide.obs.index[46001:69000], ]
batch3 = batch3.concatenate(ctrl)
batch3

  [AnnData(sparse.csr_matrix(a.shape), obs=a.obs) for a in all_adatas],


AnnData object with n_obs × n_vars = 24853 × 31506
    obs: 'batch', 'aggr_barcode', 'num_features', 'feature_call', 'num_umis', 'maxUmi1', 'maxUmi2', 'cell_barcode', 'guidePvalue', 'pvalueAssign', 'guides', 'multi_output', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'label', 'SingletorDoublet', '<10_percent_mt', '<10000_total_counts', 'qc_pass', '<100000_total_counts'
    var: 'gene_ids', 'feature_types', 'genome', 'mt', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts', 'n_cells'

In [66]:
batch4 = adataGuide[adataGuide.obs.index[69001:92000], ]
batch4 = batch4.concatenate(ctrl)
batch4

  [AnnData(sparse.csr_matrix(a.shape), obs=a.obs) for a in all_adatas],


AnnData object with n_obs × n_vars = 24853 × 31506
    obs: 'batch', 'aggr_barcode', 'num_features', 'feature_call', 'num_umis', 'maxUmi1', 'maxUmi2', 'cell_barcode', 'guidePvalue', 'pvalueAssign', 'guides', 'multi_output', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'label', 'SingletorDoublet', '<10_percent_mt', '<10000_total_counts', 'qc_pass', '<100000_total_counts'
    var: 'gene_ids', 'feature_types', 'genome', 'mt', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts', 'n_cells'

In [67]:
batch5 = adataGuide[adataGuide.obs.index[92001:115000], ]
batch5 = batch5.concatenate(ctrl)
batch5

  [AnnData(sparse.csr_matrix(a.shape), obs=a.obs) for a in all_adatas],


AnnData object with n_obs × n_vars = 24853 × 31506
    obs: 'batch', 'aggr_barcode', 'num_features', 'feature_call', 'num_umis', 'maxUmi1', 'maxUmi2', 'cell_barcode', 'guidePvalue', 'pvalueAssign', 'guides', 'multi_output', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'label', 'SingletorDoublet', '<10_percent_mt', '<10000_total_counts', 'qc_pass', '<100000_total_counts'
    var: 'gene_ids', 'feature_types', 'genome', 'mt', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts', 'n_cells'

In [68]:
batch6 = adataGuide[adataGuide.obs.index[115001:138000], ]
batch6 = batch6.concatenate(ctrl)
batch6

  [AnnData(sparse.csr_matrix(a.shape), obs=a.obs) for a in all_adatas],


AnnData object with n_obs × n_vars = 24853 × 31506
    obs: 'batch', 'aggr_barcode', 'num_features', 'feature_call', 'num_umis', 'maxUmi1', 'maxUmi2', 'cell_barcode', 'guidePvalue', 'pvalueAssign', 'guides', 'multi_output', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'label', 'SingletorDoublet', '<10_percent_mt', '<10000_total_counts', 'qc_pass', '<100000_total_counts'
    var: 'gene_ids', 'feature_types', 'genome', 'mt', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts', 'n_cells'

In [69]:
batch7 = adataGuide[adataGuide.obs.index[138001:161000], ]
batch7 = batch7.concatenate(ctrl)
batch7

  [AnnData(sparse.csr_matrix(a.shape), obs=a.obs) for a in all_adatas],


AnnData object with n_obs × n_vars = 24853 × 31506
    obs: 'batch', 'aggr_barcode', 'num_features', 'feature_call', 'num_umis', 'maxUmi1', 'maxUmi2', 'cell_barcode', 'guidePvalue', 'pvalueAssign', 'guides', 'multi_output', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'label', 'SingletorDoublet', '<10_percent_mt', '<10000_total_counts', 'qc_pass', '<100000_total_counts'
    var: 'gene_ids', 'feature_types', 'genome', 'mt', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts', 'n_cells'

In [70]:
batch8 = adataGuide[adataGuide.obs.index[161001:184000], ]
batch8 = batch8.concatenate(ctrl)
batch8

  [AnnData(sparse.csr_matrix(a.shape), obs=a.obs) for a in all_adatas],


AnnData object with n_obs × n_vars = 24853 × 31506
    obs: 'batch', 'aggr_barcode', 'num_features', 'feature_call', 'num_umis', 'maxUmi1', 'maxUmi2', 'cell_barcode', 'guidePvalue', 'pvalueAssign', 'guides', 'multi_output', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'label', 'SingletorDoublet', '<10_percent_mt', '<10000_total_counts', 'qc_pass', '<100000_total_counts'
    var: 'gene_ids', 'feature_types', 'genome', 'mt', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts', 'n_cells'

In [71]:
batch9 = adataGuide[adataGuide.obs.index[184001:len(adataGuide.obs.index)], ]
batch9 = batch9.concatenate(ctrl)
batch9

  [AnnData(sparse.csr_matrix(a.shape), obs=a.obs) for a in all_adatas],


AnnData object with n_obs × n_vars = 19606 × 31506
    obs: 'batch', 'aggr_barcode', 'num_features', 'feature_call', 'num_umis', 'maxUmi1', 'maxUmi2', 'cell_barcode', 'guidePvalue', 'pvalueAssign', 'guides', 'multi_output', 'n_genes_by_counts', 'total_counts', 'total_counts_mt', 'pct_counts_mt', 'label', 'SingletorDoublet', '<10_percent_mt', '<10000_total_counts', 'qc_pass', '<100000_total_counts'
    var: 'gene_ids', 'feature_types', 'genome', 'mt', 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts', 'n_cells'

# Get input files for all cells

In [73]:
def getInputInferCnv(adata, guides, filename):
    # Annotations
    adata.obs[guides] = adata.obs[guides].astype('str')
    annotations = pd.DataFrame(adata.obs[guides])
    annotations.reset_index(inplace = True)
#     print(annotations.head())
    annotations[guides] = annotations[guides].astype('str')
    annotations.to_csv(f'/data/aneuploidy/infercnv/annotations_{filename}.csv', index=False, header=False)
    # Counts 
    countMatrix = anndata.AnnData(
    X = adata.X,
    )
    countMatrix.write_h5ad(f'/data/aneuploidy/infercnv/counts_{filename}.h5ad')
    # Genes 
    geneDf = pd.DataFrame(adata.var['gene_ids'])
    geneDf.reset_index(inplace = True)
    geneDf.to_csv(f'/data/aneuploidy/infercnv/genes_{filename}.csv', index=False, header=False)

In [74]:
getInputInferCnv(batch1, 'label', 'Batch1')

In [75]:
getInputInferCnv(batch2, 'label', 'Batch2')

In [76]:
getInputInferCnv(batch3, 'label', 'Batch3')

In [77]:
getInputInferCnv(batch4, 'label', 'Batch4')

In [78]:
getInputInferCnv(batch5, 'label', 'Batch5')

In [79]:
getInputInferCnv(batch6, 'label', 'Batch6')

In [80]:
getInputInferCnv(batch7, 'label', 'Batch7')

In [81]:
getInputInferCnv(batch8, 'label', 'Batch8')

In [82]:
getInputInferCnv(batch9, 'label', 'Batch9')