# Assemble Perturb-seq BMDC data

In [12]:
import scanpy as sc
import pandas as pd
import scipy.io as io

In [13]:
# Root directory of the BMDC data: the raw_0/ and raw_3/ inputs are read from
# here and the combined bmdc.h5ad is written here.
data_path = '/data_volume/memento/bmdc/'

### Process time 0

In [14]:
# Gene table: each entry of column '1' is split on '_'; the second piece becomes
# the var index and the first piece is stored in the `gene_id` column.
genes = pd.read_csv(data_path + '/raw_0/genes.tsv', sep='\t', index_col=0)
var_df = pd.DataFrame(index=genes['1'].str.split('_').str[1])
var_df['gene_id'] = genes['1'].str.split('_').str[0].tolist()

# Cell table: column '0' holds the cell identifiers, kept in file order.
# NOTE(review): this order is assumed to match the columns of matrix.mtx -- confirm.
cells = pd.read_csv(data_path + 'raw_0/cells.tsv', index_col=0)
obs_df = pd.DataFrame(index=cells['0'])
obs_df['cell'] = cells['0'].tolist()

# Guide dictionary: column 0 is the guide, column 1 a ', '-separated list of
# cells. It is exploded into one (cell, guide) row per assignment.
mapping = pd.read_csv(data_path + '/raw_0/GSM2396857_dc_0hr_cbc_gbc_dict.csv', header=None)
mapping['cell'] = mapping[1].str.split(', ')
mapping = mapping.explode(column='cell').rename(columns={0:'guide'})[['cell', 'guide']]
guides = mapping['guide'].drop_duplicates().tolist()


print(obs_df.shape)
# The left merge keeps every cell; astype(str) turns a missing guide into 'nan'.
obs_df = obs_df.merge(mapping, on='cell', how='left').astype(str)
# Collapse to one row per cell holding the list of its guides. sort=False keeps
# the cells in cells.tsv order; the default sort=True would reorder the rows by
# cell id and break the positional alignment with the expression matrix.
obs_df = pd.DataFrame(obs_df.groupby('cell', sort=False).guide.apply(list))
assert obs_df.index.tolist() == cells['0'].astype(str).tolist(), (
    'obs_df rows must follow cells.tsv order to stay aligned with the matrix')
obs_df['guide_string'] = obs_df['guide'].apply(lambda x: '-'.join(x))
print(obs_df.shape)

# One boolean column per guide. `guide` holds Python lists, so exact list
# membership is used: the .str accessor yields NaN for list elements, and a
# substring/regex match could also confuse guides whose names overlap.
for g in guides:
    obs_df[g] = obs_df['guide'].apply(lambda assigned, g=g: g in assigned)

(33063, 1)
(33063, 2)


In [15]:
# Read the 0hr matrix from its Matrix Market file; it is transposed later, when
# the AnnData object is built.
X = io.mmread(data_path + '/raw_0/matrix.mtx')

In [16]:
# Build the 0hr AnnData from the transposed matrix (converted to CSC) with
# obs_df as the per-row metadata and var_df as the per-column metadata.
# NOTE(review): rows of obs_df are matched to matrix rows by position, so
# obs_df must be in the same cell order as the matrix -- confirm.
adata0 = sc.AnnData(X=X.T.tocsc(), obs=obs_df, var=var_df)

Variable names are not unique. To make them unique, call `.var_names_make_unique`.


In [6]:
# Save the 0hr object alongside its raw inputs.
adata0.write(data_path + '/raw_0/tp0.h5ad')

... storing 'guide' as categorical
... storing 'guide_string' as categorical


### Process time 3

In [8]:
# Gene table: each entry of column '0' is split on '_'; the second piece becomes
# the var index and the first piece is stored in the `gene_id` column.
genes = pd.read_csv(data_path + '/raw_3/genes.tsv', index_col=0)
var_df = pd.DataFrame(index=genes['0'].str.split('_').str[1])
var_df['gene_id'] = genes['0'].str.split('_').str[0].tolist()

# Cell table: column '0' holds the cell identifiers, kept in file order.
# NOTE(review): this order is assumed to match the columns of matrix.mtx -- confirm.
cells = pd.read_csv(data_path + 'raw_3/cells.tsv', index_col=0)
obs_df = pd.DataFrame(index=cells['0'])
obs_df['cell'] = cells['0'].tolist()

# Guide dictionary: column 0 is the guide, column 1 a ', '-separated list of
# cells. It is exploded into one (cell, guide) row per assignment.
mapping = pd.read_csv(data_path + '/raw_3/GSM2396856_dc_3hr_cbc_gbc_dict_lenient.csv', header=None)
mapping['cell'] = mapping[1].str.split(', ')
mapping = mapping.explode(column='cell').rename(columns={0:'guide'})[['cell', 'guide']]
guides = mapping['guide'].drop_duplicates().tolist()


print(obs_df.shape)
# The left merge keeps every cell; astype(str) turns a missing guide into 'nan'.
obs_df = obs_df.merge(mapping, on='cell', how='left').astype(str)
# Collapse to one row per cell holding the list of its guides. sort=False keeps
# the cells in cells.tsv order; the default sort=True would reorder the rows by
# cell id and break the positional alignment with the expression matrix.
obs_df = pd.DataFrame(obs_df.groupby('cell', sort=False).guide.apply(list))
assert obs_df.index.tolist() == cells['0'].astype(str).tolist(), (
    'obs_df rows must follow cells.tsv order to stay aligned with the matrix')
obs_df['guide_string'] = obs_df['guide'].apply(lambda x: '-'.join(x))
print(obs_df.shape)

# One boolean column per guide. `guide` holds Python lists, so exact list
# membership is used: the .str accessor yields NaN for list elements, and a
# substring/regex match could also confuse guides whose names overlap.
for g in guides:
    obs_df[g] = obs_df['guide'].apply(lambda assigned, g=g: g in assigned)

(32777, 1)
(32777, 2)


In [9]:
# Read the 3hr matrix from its Matrix Market file; it is transposed later, when
# the AnnData object is built. This rebinds X, replacing the 0hr matrix.
X = io.mmread(data_path + '/raw_3/matrix.mtx')

In [11]:
# Build the 3hr AnnData from the transposed matrix (converted to CSC) with
# obs_df as the per-row metadata and var_df as the per-column metadata.
# NOTE(review): rows of obs_df are matched to matrix rows by position, so
# obs_df must be in the same cell order as the matrix -- confirm.
adata3 = sc.AnnData(X=X.T.tocsc(), obs=obs_df, var=var_df)
# Save the 3hr object alongside its raw inputs.
adata3.write(data_path + '/raw_3/tp3.h5ad')

Variable names are not unique. To make them unique, call `.var_names_make_unique`.
... storing 'guide' as categorical
... storing 'guide_string' as categorical


### Combine

In [17]:
# Tag every cell with its time point so the two sets remain distinguishable
# once they are combined into a single object.
# NOTE(review): in this notebook's cell order the per-time-point .h5ad files
# are written before this label is set, so they would not contain `tp` -- confirm.
adata0.obs['tp'] = '0hr'
adata3.obs['tp'] = '3hr'

In [70]:
# De-duplicate the variable names of both objects so they can act as a join key.
for _adata in (adata0, adata3):
    _adata.var_names_make_unique()

# Inner-join the two column-less var tables on their index; what survives is the
# set of genes present at both time points, in the 0hr object's order.
var_index_0hr = adata0.var[[]].copy()
var_index_3hr = adata3.var[[]].copy()
overlap_genes = var_index_0hr.join(var_index_3hr, how='inner').index.tolist()

In [71]:
# Restrict both time points to the shared genes, using the same column order
# for each so the two objects line up variable-for-variable.
adata0, adata3 = (adata[:, overlap_genes] for adata in (adata0, adata3))

In [72]:
# Combine the 0hr and 3hr objects into a single AnnData.
# NOTE(review): newer anndata releases appear to deprecate AnnData.concatenate
# in favour of anndata.concat; the two differ in how they name observations and
# label batches, so check the output before switching -- confirm.
adata_combined = adata0.concatenate(adata3)

In [74]:
# Save the combined object at the top level of the data directory.
adata_combined.write(data_path + 'bmdc.h5ad')

... storing 'guide' as categorical
... storing 'guide_string' as categorical
... storing 'tp' as categorical
