# Labeling HBECs

In [1]:
## Initialization

In [2]:
import numpy as np
import pandas as pd
import scanpy as sc
from nero import Harmonia
import matplotlib.pyplot as plt
sc.settings.verbosity = 3             # verbosity: errors (0), warnings (1), info (2), hints (3)
# sc.logging.print_versions()  # uncomment to record package versions alongside the results
# NOTE(review): a commented-out results path used to sit here and pointed at
# './write/h1n1pbmc_final.h5ad', i.e. a different dataset. This notebook saves its
# result with adata.write(...) in the last cell instead.


Bad key "text.kerning_factor" on line 4 in
/data/home/anaconda3/envs/single_cell/lib/python3.7/site-packages/matplotlib/mpl-data/stylelib/_classic_test_patch.mplstyle.
You probably need to get an updated matplotlibrc file from
https://github.com/matplotlib/matplotlib/blob/v3.1.3/matplotlibrc.template
or from the matplotlib source distribution


In [3]:
from scipy.sparse import csr_matrix

In [4]:
# Figure defaults for any scanpy plots; only the resolution (dpi=80) is set explicitly.
sc.settings.set_figure_params(dpi=80)

In [5]:
# Load the filtered 10x feature-barcode matrix (HDF5).
# gex_only=False keeps every feature type in the file rather than only gene
# expression (presumably so the hashtag counts stay available — TODO confirm).
# No cache file is written by this call.
adata = sc.read_10x_h5(
    '/data_volume/ifn_hbec/preprocess/filtered_feature_bc_matrix.h5', gex_only=False)

reading /data_volume/ifn_hbec/preprocess/filtered_feature_bc_matrix.h5


Variable names are not unique. To make them unique, call `.var_names_make_unique`.


 (0:00:17)


In [6]:
# The loader warned that variable names are not unique; de-duplicate them so
# features can be addressed unambiguously by name.
adata.var_names_make_unique()

In [7]:
# (n_cells, n_obs_columns) right after loading; no per-cell annotations exist yet.
adata.obs.shape

(127203, 0)

In [8]:
# Add genetic demultiplexing (donor assignment) information

In [9]:
# Donor-demultiplexing results, keyed by lane name ('1_1', '1_2', ...).
donors = {}

In [13]:
# Peek at the per-cell table; the recorded output lists the cell barcodes (the obs index).
adata.obs

AAACCCAAGACATAAC-1
AAACCCAAGGAAGTAG-1
AAACCCAAGGGACTGT-1
AAACCCACAAACTAAG-1
AAACCCACAATCTGCA-1
...
TTTGTTGTCAAGCTTG-4
TTTGTTGTCCAAGCTA-4
TTTGTTGTCCGGTAAT-4
TTTGTTGTCGGACCAC-4
TTTGTTGTCTCTCCGA-4


In [10]:
# One donor-assignment table per lane; the dict order below is the load order.
donor_id_files = {
    '1_1': '/data_volume/ifn_hbec/preprocess/1_1_donor_ids.tsv',
    '1_2': '/data_volume/ifn_hbec/preprocess/1_2_donor_ids.tsv',
    '2_1': '/data_volume/ifn_hbec/preprocess/2_1_donor_ids.tsv',
    '2_2': '/data_volume/ifn_hbec/preprocess/2_2_donor_ids.tsv',
}
for lane, tsv_path in donor_id_files.items():
    donors[lane] = Harmonia.MuxOut(tsv_path)

In [11]:
# Attach each lane's donor calls to the count matrix, then stack the four results.
# The lane order is significant: AnnData.concatenate numbers its inputs in argument
# order, which is where the 'batch' values '0'..'3' used further down come from
# (NOTE(review): relies on concatenate's default batch labelling — confirm).
# index_unique=None leaves the barcodes untouched.
lane_order = ('1_1', '1_2', '2_1', '2_2')
merged_lanes = [Harmonia.ann_merge(adata, donors[lane]) for lane in lane_order]
adata = merged_lanes[0].concatenate(*merged_lanes[1:], index_unique=None).copy()

In [12]:
# Sanity check: (n_cells, n_features) after merging the donor calls.
adata.shape

(127203, 36741)

In [13]:
# Add hashtag (HTO) demultiplexing information

In [14]:
# Read the space-separated hashtag assignment table and keep its 'x' column,
# which holds the per-cell hashtag call.
hto_assignments = pd.read_csv('/data_volume/ifn_hbec/preprocess/HBEC_HTO_assignments.csv', sep=' ')
HTO_classification = hto_assignments['x']

In [15]:
# pandas aligns a Series on the index when it is assigned as a column, so each cell
# gets the value stored under its own barcode; barcodes absent from the Series become NaN.
# NOTE(review): assumes the CSV's row index holds the same barcodes as adata.obs — confirm.
adata.obs['HTO_classification'] = HTO_classification

In [16]:
# Sanity check: adding the hashtag column must not change (n_cells, n_features).
adata.shape

(127203, 36741)

In [17]:
# First rows of the per-cell annotations: donor calls, batch and hashtag call.
adata.obs.head(5)

Unnamed: 0,NUM.SNPS,BEST.GUESS,DROPLET.TYPE,batch,HTO_classification
AAACCCAAGACATAAC-1,1550,donor0,SNG,0,Negative
AAACCCAAGGAAGTAG-1,1700,donor0,SNG,0,hash-10
AAACCCAAGGGACTGT-1,839,donor0,SNG,0,hash-3
AAACCCACAAACTAAG-1,1547,donor0,SNG,0,Negative
AAACCCACAATCTGCA-1,1130,donor1,SNG,0,hash-1


In [18]:
# Cells per batch (batch labels come from the concatenate step).
adata.obs.batch.value_counts()

1    35731
0    35455
3    28507
2    27510
Name: batch, dtype: int64

In [19]:
# Remove pandas' row limit so long tables print in full.
# NOTE(review): with this set, displaying all of adata.obs would render every cell;
# prefer .head()/.value_counts() from here on.
pd.set_option('display.max_rows', None)

In [20]:
# Cells per batch again (same result as the attribute-style call two cells earlier).
adata.obs['batch'].value_counts()

1    35731
0    35455
3    28507
2    27510
Name: batch, dtype: int64

In [22]:
# Cells per donor call; the values listed here are the only labels the
# BEST.GUESS filters below can match.
adata.obs['BEST.GUESS'].value_counts()

donor0           54940
donor1           52954
donor0,donor1    16543
unassigned        2766
Name: BEST.GUESS, dtype: int64

In [50]:
# Spot check: cells from batches 0/1 carrying hash-11 and called donor0
# (the same filter that defines d2513_lambda_48).
adata[((adata.obs['batch'] == '0') | (adata.obs['batch'] == '1')) & (adata.obs['HTO_classification'] == 'hash-11')&(adata.obs['BEST.GUESS']=='donor0')]



View of AnnData object with n_obs × n_vars = 1967 × 36741
    obs: 'NUM.SNPS', 'BEST.GUESS', 'DROPLET.TYPE', 'batch', 'HTO_classification', 'condition'
    var: 'gene_ids', 'feature_types', 'genome'

In [52]:
# Spot check: cells from batches 2/3 carrying hash-11 and called donor1
# (the same filter that defines d2614_lambda_48).
# The donor label was previously 'ada', which is not a BEST.GUESS value and
# therefore matched no cells; 'donor1' is the label this check mirrors.
adata[((adata.obs['batch'] == '2') | (adata.obs['batch'] == '3')) & (adata.obs['HTO_classification'] == 'hash-11')&(adata.obs['BEST.GUESS']=='donor1')]




View of AnnData object with n_obs × n_vars = 1989 × 36741
    obs: 'NUM.SNPS', 'BEST.GUESS', 'DROPLET.TYPE', 'batch', 'HTO_classification', 'condition'
    var: 'gene_ids', 'feature_types', 'genome'

In [29]:
# Donor identifiers behind the demultiplexing labels: cells called 'donor0' are
# labelled d2513_* and cells called 'donor1' are labelled d2614_* below.
# These two variables are not referenced by any other visible cell.
donor0 = 2513
donor1 = 2614


In [30]:
# Default label; cells that match none of the (batch, hashtag, donor) rules below keep it.
adata.obs['condition']= 'unassigned'

In [53]:
# get subset of cells for each condition.
# A condition is one (batch pool, hashtag) pair among cells called 'donor1';
# each list below holds the barcodes of the matching cells.
obs = adata.obs
pool_01 = obs['batch'].isin(['0', '1'])
pool_23 = obs['batch'].isin(['2', '3'])
hto = obs['HTO_classification']
is_donor1 = obs['BEST.GUESS'] == 'donor1'

d2614_control = obs.index[pool_23 & (hto == 'hash-1') & is_donor1].tolist()
d2614_alpha_3 = obs.index[pool_01 & (hto == 'hash-1') & is_donor1].tolist()
d2614_alpha_6 = obs.index[pool_23 & (hto == 'hash-2') & is_donor1].tolist()
d2614_alpha_9 = obs.index[pool_01 & (hto == 'hash-2') & is_donor1].tolist()
d2614_alpha_24 = obs.index[pool_23 & (hto == 'hash-3') & is_donor1].tolist()
d2614_alpha_48 = obs.index[pool_01 & (hto == 'hash-3') & is_donor1].tolist()

In [54]:
# Barcodes of 'donor1' cells for the beta conditions, one (batch pool, hashtag) pair each.
obs = adata.obs
pool_01 = obs['batch'].isin(['0', '1'])
pool_23 = obs['batch'].isin(['2', '3'])
hto = obs['HTO_classification']
is_donor1 = obs['BEST.GUESS'] == 'donor1'

d2614_beta_3 = obs.index[pool_23 & (hto == 'hash-4') & is_donor1].tolist()
d2614_beta_6 = obs.index[pool_01 & (hto == 'hash-4') & is_donor1].tolist()
d2614_beta_9 = obs.index[pool_23 & (hto == 'hash-5') & is_donor1].tolist()
d2614_beta_24 = obs.index[pool_01 & (hto == 'hash-5') & is_donor1].tolist()
d2614_beta_48 = obs.index[pool_23 & (hto == 'hash-6') & is_donor1].tolist()

In [55]:
# Barcodes of 'donor1' cells for the gamma conditions, one (batch pool, hashtag) pair each.
obs = adata.obs
pool_01 = obs['batch'].isin(['0', '1'])
pool_23 = obs['batch'].isin(['2', '3'])
hto = obs['HTO_classification']
is_donor1 = obs['BEST.GUESS'] == 'donor1'

d2614_gamma_3 = obs.index[pool_01 & (hto == 'hash-6') & is_donor1].tolist()
d2614_gamma_6 = obs.index[pool_23 & (hto == 'hash-7') & is_donor1].tolist()
d2614_gamma_9 = obs.index[pool_01 & (hto == 'hash-7') & is_donor1].tolist()
d2614_gamma_24 = obs.index[pool_23 & (hto == 'hash-8') & is_donor1].tolist()
d2614_gamma_48 = obs.index[pool_01 & (hto == 'hash-8') & is_donor1].tolist()

In [56]:
# Barcodes of 'donor1' cells for the lambda conditions, one (batch pool, hashtag) pair each.
obs = adata.obs
pool_01 = obs['batch'].isin(['0', '1'])
pool_23 = obs['batch'].isin(['2', '3'])
hto = obs['HTO_classification']
is_donor1 = obs['BEST.GUESS'] == 'donor1'

d2614_lambda_3 = obs.index[pool_23 & (hto == 'hash-9') & is_donor1].tolist()
d2614_lambda_6 = obs.index[pool_01 & (hto == 'hash-9') & is_donor1].tolist()
d2614_lambda_9 = obs.index[pool_23 & (hto == 'hash-10') & is_donor1].tolist()
d2614_lambda_24 = obs.index[pool_01 & (hto == 'hash-10') & is_donor1].tolist()
d2614_lambda_48 = obs.index[pool_23 & (hto == 'hash-11') & is_donor1].tolist()

In [57]:
# Barcodes of 'donor0' cells for control and the alpha conditions.
# Note the batch pools are swapped relative to the d2614_* lists for the same hashtag.
obs = adata.obs
pool_01 = obs['batch'].isin(['0', '1'])
pool_23 = obs['batch'].isin(['2', '3'])
hto = obs['HTO_classification']
is_donor0 = obs['BEST.GUESS'] == 'donor0'

d2513_control = obs.index[pool_01 & (hto == 'hash-1') & is_donor0].tolist()
d2513_alpha_3 = obs.index[pool_23 & (hto == 'hash-1') & is_donor0].tolist()
d2513_alpha_6 = obs.index[pool_01 & (hto == 'hash-2') & is_donor0].tolist()
d2513_alpha_9 = obs.index[pool_23 & (hto == 'hash-2') & is_donor0].tolist()
d2513_alpha_24 = obs.index[pool_01 & (hto == 'hash-3') & is_donor0].tolist()
d2513_alpha_48 = obs.index[pool_23 & (hto == 'hash-3') & is_donor0].tolist()

In [58]:
# Barcodes of 'donor0' cells for the beta conditions, one (batch pool, hashtag) pair each.
obs = adata.obs
pool_01 = obs['batch'].isin(['0', '1'])
pool_23 = obs['batch'].isin(['2', '3'])
hto = obs['HTO_classification']
is_donor0 = obs['BEST.GUESS'] == 'donor0'

d2513_beta_3 = obs.index[pool_01 & (hto == 'hash-4') & is_donor0].tolist()
d2513_beta_6 = obs.index[pool_23 & (hto == 'hash-4') & is_donor0].tolist()
d2513_beta_9 = obs.index[pool_01 & (hto == 'hash-5') & is_donor0].tolist()
d2513_beta_24 = obs.index[pool_23 & (hto == 'hash-5') & is_donor0].tolist()
d2513_beta_48 = obs.index[pool_01 & (hto == 'hash-6') & is_donor0].tolist()


In [59]:
# Barcodes of 'donor0' cells for the gamma conditions, one (batch pool, hashtag) pair each.
obs = adata.obs
pool_01 = obs['batch'].isin(['0', '1'])
pool_23 = obs['batch'].isin(['2', '3'])
hto = obs['HTO_classification']
is_donor0 = obs['BEST.GUESS'] == 'donor0'

d2513_gamma_3 = obs.index[pool_23 & (hto == 'hash-6') & is_donor0].tolist()
d2513_gamma_6 = obs.index[pool_01 & (hto == 'hash-7') & is_donor0].tolist()
d2513_gamma_9 = obs.index[pool_23 & (hto == 'hash-7') & is_donor0].tolist()
d2513_gamma_24 = obs.index[pool_01 & (hto == 'hash-8') & is_donor0].tolist()
d2513_gamma_48 = obs.index[pool_23 & (hto == 'hash-8') & is_donor0].tolist()


In [60]:
# Barcodes of 'donor0' cells for the lambda conditions, one (batch pool, hashtag) pair each.
obs = adata.obs
pool_01 = obs['batch'].isin(['0', '1'])
pool_23 = obs['batch'].isin(['2', '3'])
hto = obs['HTO_classification']
is_donor0 = obs['BEST.GUESS'] == 'donor0'

d2513_lambda_3 = obs.index[pool_01 & (hto == 'hash-9') & is_donor0].tolist()
d2513_lambda_6 = obs.index[pool_23 & (hto == 'hash-9') & is_donor0].tolist()
d2513_lambda_9 = obs.index[pool_01 & (hto == 'hash-10') & is_donor0].tolist()
d2513_lambda_24 = obs.index[pool_23 & (hto == 'hash-10') & is_donor0].tolist()
d2513_lambda_48 = obs.index[pool_01 & (hto == 'hash-11') & is_donor0].tolist()



In [61]:
#assign subset to condition
# Each (label, barcodes) pair stamps the label onto the rows whose barcode is in the list.
for label, barcodes in [
    ('d2614_control', d2614_control),
    ('d2614_alpha_3', d2614_alpha_3),
    ('d2614_alpha_6', d2614_alpha_6),
    ('d2614_alpha_9', d2614_alpha_9),
    ('d2614_alpha_24', d2614_alpha_24),
    ('d2614_alpha_48', d2614_alpha_48),
]:
    adata.obs.loc[adata.obs.index.isin(barcodes), "condition"] = label

In [62]:
# Stamp the d2614 beta labels onto the rows whose barcode is in the matching list.
for label, barcodes in [
    ('d2614_beta_3', d2614_beta_3),
    ('d2614_beta_6', d2614_beta_6),
    ('d2614_beta_9', d2614_beta_9),
    ('d2614_beta_24', d2614_beta_24),
    ('d2614_beta_48', d2614_beta_48),
]:
    adata.obs.loc[adata.obs.index.isin(barcodes), "condition"] = label

In [63]:
# Stamp the d2614 gamma labels onto the rows whose barcode is in the matching list.
for label, barcodes in [
    ('d2614_gamma_3', d2614_gamma_3),
    ('d2614_gamma_6', d2614_gamma_6),
    ('d2614_gamma_9', d2614_gamma_9),
    ('d2614_gamma_24', d2614_gamma_24),
    ('d2614_gamma_48', d2614_gamma_48),
]:
    adata.obs.loc[adata.obs.index.isin(barcodes), "condition"] = label

In [64]:
# Stamp the d2614 lambda labels onto the rows whose barcode is in the matching list.
for label, barcodes in [
    ('d2614_lambda_3', d2614_lambda_3),
    ('d2614_lambda_6', d2614_lambda_6),
    ('d2614_lambda_9', d2614_lambda_9),
    ('d2614_lambda_24', d2614_lambda_24),
    ('d2614_lambda_48', d2614_lambda_48),
]:
    adata.obs.loc[adata.obs.index.isin(barcodes), "condition"] = label

In [65]:
#assign subset to condition
# Each (label, barcodes) pair stamps the label onto the rows whose barcode is in the list.
for label, barcodes in [
    ('d2513_control', d2513_control),
    ('d2513_alpha_3', d2513_alpha_3),
    ('d2513_alpha_6', d2513_alpha_6),
    ('d2513_alpha_9', d2513_alpha_9),
    ('d2513_alpha_24', d2513_alpha_24),
    ('d2513_alpha_48', d2513_alpha_48),
]:
    adata.obs.loc[adata.obs.index.isin(barcodes), "condition"] = label

In [66]:
# Stamp the d2513 beta labels onto the rows whose barcode is in the matching list.
for label, barcodes in [
    ('d2513_beta_3', d2513_beta_3),
    ('d2513_beta_6', d2513_beta_6),
    ('d2513_beta_9', d2513_beta_9),
    ('d2513_beta_24', d2513_beta_24),
    ('d2513_beta_48', d2513_beta_48),
]:
    adata.obs.loc[adata.obs.index.isin(barcodes), "condition"] = label

In [67]:
# Stamp the d2513 gamma labels onto the rows whose barcode is in the matching list.
for label, barcodes in [
    ('d2513_gamma_3', d2513_gamma_3),
    ('d2513_gamma_6', d2513_gamma_6),
    ('d2513_gamma_9', d2513_gamma_9),
    ('d2513_gamma_24', d2513_gamma_24),
    ('d2513_gamma_48', d2513_gamma_48),
]:
    adata.obs.loc[adata.obs.index.isin(barcodes), "condition"] = label

In [68]:
# Stamp the d2513 lambda labels onto the rows whose barcode is in the matching list.
for label, barcodes in [
    ('d2513_lambda_3', d2513_lambda_3),
    ('d2513_lambda_6', d2513_lambda_6),
    ('d2513_lambda_9', d2513_lambda_9),
    ('d2513_lambda_24', d2513_lambda_24),
    ('d2513_lambda_48', d2513_lambda_48),
]:
    adata.obs.loc[adata.obs.index.isin(barcodes), "condition"] = label

In [74]:
# How many cells received a condition: the first number is the count of rows whose
# label is no longer the 'unassigned' default.
adata.obs.query('condition != "unassigned"').shape

(80654, 6)

In [77]:
# Persist the labelled object as .h5ad; string obs/var columns are stored as
# categoricals on write (see the log lines below).
# NOTE(review): hardcoded absolute output path.
adata.write("/data_volume/ifn_hbec/HBEC_condition_assigned_deep.h5ad")

... storing 'BEST.GUESS' as categorical
... storing 'DROPLET.TYPE' as categorical
... storing 'HTO_classification' as categorical
... storing 'condition' as categorical
... storing 'feature_types' as categorical
... storing 'genome' as categorical
