## CT Split

This notebook combines the ADT and mRNA results into one object, then splits the cells into 3 broad cell types for further processing.

Originally, this notebook used only the aggr projection; I have since adjusted the code to also use ComBat.

In [None]:
import scanpy as sc
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
import warnings
import itertools as it
import json
import requests
import seaborn as sns
import pickle as pkl
from functools import reduce

In [None]:
# Global scanpy configuration for this notebook.
sc.settings.verbosity = 4  # highest logging level used in this notebook; temporarily set to 0 around noisy calls further down
sc.settings.set_figure_params(dpi=80)
print(sc.__version__)  # record which scanpy version produced the outputs
sc.settings.n_jobs=16  # NOTE(review): presumably matches the core count of the machine this ran on -- confirm

In [None]:
# Root directory that the input/output paths in the cells below are built from.
prefix = '/data/codec/production.run/mrna/'

### Load in Data

In [None]:
# The per-well objects are cached in one pickle; the commented-out lines show
# how that pickle was written.
path = prefix + 'pkls/aggr/wells.sng.w_covars.pkl'

# with open(path, 'wb') as handle:
#     pkl.dump(wells, handle)

with open(path, mode='rb') as handle:
    wells = pkl.load(handle)

### Adjust Cell Barcodes, Filter

I'm adjusting the cell barcodes to make them match their well number, which I also did with the ADTs.

In [None]:
# Keep the first 16 characters of each barcode and append "-<well>" so that
# every barcode carries its well number.
for well, entry in wells.items():
    well_adata = entry['adata']
    well_adata.obs_names = ['%s-%s' % (barcode[:16], well) for barcode in well_adata.obs_names]

### Concatenate

In [None]:
# Stop-gap: the 12 wells (keys 0..11) are simply concatenated; the proper
# route would be to go back and run `cellranger aggr`.
first_well, *remaining_wells = [wells[idx]['adata'] for idx in range(12)]
concat = first_well.concatenate(*remaining_wells)

In [None]:
# Total counts per gene (one value per row of `concat.var`).
# The matrix is summed directly instead of calling `.toarray()` first, which
# would allocate a full dense copy of the matrix only to add up its columns.
concat.var['n_counts'] = np.asarray(concat.X.sum(axis=0)).ravel()

### Filter Genes, Transform Data

Drop genes with very low counts.

In [None]:
# Histogram of per-gene total counts with 1000 log-spaced bins between 10 and 2e5.
fig, ax = plt.subplots(figsize=(8, 6))
log_bins = np.logspace(np.log10(10), np.log10(2e5), 1000)
ax.hist(concat.var['n_counts'].values, bins=log_bins)
ax.grid(False)
ax.grid(True, 'both', 'both')
ax.set_xscale('log')
# ax.set_yscale('log')

# Run On Separate Machine:

In [None]:
### Load in Data

path = prefix + 'pkls/aggr/wells.sng.w_covars.pkl'

with open(path,'rb') as file:
    wells = pkl.load(file)

### Adjust Cell Barcodes, Filter

# Rewrite every barcode as "<first 16 characters>-<well>" so that it carries
# its well number, which was also done for the ADTs.

for well in wells:
    wells[well]['adata'].obs_names = [i[:16] + '-%s' % well for i in wells[well]['adata'].obs_names]

### Concatenate

# Stop-gap: the 12 wells are simply concatenated; the proper route would be to
# go back and run cellranger aggr.
concat = wells[0]['adata'].concatenate(*[wells[i]['adata'] for  i in range(1, 12)])

# Total counts per gene. The matrix is summed directly rather than through
# `.toarray()`, which would allocate a full dense copy just to add up columns.
concat.var['n_counts'] = np.asarray(concat.X.sum(axis=0)).ravel()

### Filter Genes, Transform Data

# Drop genes with very low counts.

# plt.figure(figsize=(8,6))
# plt.hist(concat.var['n_counts'].values, bins=np.logspace(np.log10(10),np.log10(2e5), 1000))
# plt.grid(False)
# plt.grid(True, 'both', 'both')
# plt.xscale('log')
# # plt.yscale('log')

# keep only genes with at least 100 counts in total (min_counts=100); `concat` is filtered in place
sc.pp.filter_genes(concat, min_counts=100, inplace=True)

# normalise each cell to 1e6 total counts
sc.pp.normalize_per_cell(concat, counts_per_cell_after=1e6)

# log(1 + x) transform of the normalised values
sc.pp.log1p(concat)

# ### Store Some Data

# For posterity...

# path = prefix + 'pkls/aggr/concat.norm.log.pkl'

# # with open(path,'wb') as file:
# #     pkl.dump(concat, file)
    
# with open(path,'rb') as file:
#     concat = pkl.load(file)

# path = prefix + 'obs/aggr/concat.obs.csv'
# concat['adata'].obs.to_csv(path)

# path = prefix + 'obs/aggr/concat.bcs.txt'

# with open(path,'w') as file:
#     for bc in concat['adata'].obs_names:
#         file.write(bc + '\n')

# Scale, batch-correct with ComBat (batch key 'batch', covariates 'cond' and
# 'free_id'), scale the corrected values again and compute 200 principal components.
sc.pp.scale(concat)
sc.pp.combat(concat, key='batch',covariates=['cond','free_id'])
sc.pp.scale(concat)
sc.pp.pca(concat, n_comps=200)

path = prefix + 'pkls/aggr/ct.split.pkl'

# protocol=4 matches the other pickle dumps of this project and supports
# objects larger than 4 GiB, which the default protocol of Python < 3.8 does not.
with open(path,'wb') as file:
    pkl.dump(concat, file, protocol=4)

In [None]:
# Reload the object that the "Run On Separate Machine" cell pickled after
# scaling, ComBat and PCA.
path = prefix + 'pkls/aggr/ct.split.pkl'
with open(path, mode='rb') as handle:
    concat = pkl.load(handle)

Original code I ran on a separate machine:

In [None]:
# ### Dimensionality Reduction, Visualization and Clustering

# Perform on a separate, more powerful machine.

# # import scanpy as sc
# # import pickle as pkl
# # import warnings
# # warnings.filterwarnings('ignore')

# # path = '/data/codec/production.run/mrna/pkls/aggr/concat.norm.log.pkl'
# # with open(path,'rb') as file:
# #     concat = pkl.load(file)

# # concat['unscaled'] = concat['adata'].copy()
# # sc.pp.scale(concat['adata'])
# # sc.settings.verbosity = 4
# # sc.settings.n_jobs=30
# # sc.pp.combat(concat['adata'], key='batch',covariates=['cond','free_id'])
# # sc.pp.pca(concat['adata'],n_comps=200)
# # sc.pp.neighbors(concat['adata'],n_neighbors=15,n_pcs=100)
# # sc.tl.umap(concat['adata'])

# # path = '/data/codec/production.run/mrna/pkls/aggr/concat.aggr.combat.dimred.pkl'
# # warnings.filterwarnings('default')
# # with open(path,'wb') as file:
# #     pkl.dump(concat, file, protocol=4)

# path = prefix + 'pkls/aggr/concat.aggr.combat.dimred.pkl'

# with open(path,'rb') as file:
#     concat = pkl.load(file)

In [None]:
# Plot the variance ratio of the 200 computed PCs on a log scale.
sc.pl.pca_variance_ratio(concat, log=True, n_pcs=200)

In [None]:
# Build the neighbour graph (15 neighbours, 150 PCs). Warnings are silenced for
# this call only: the context manager restores the previous warning filters
# afterwards, even if `neighbors` raises.
with warnings.catch_warnings():
    warnings.simplefilter('ignore')
    sc.pp.neighbors(concat, n_neighbors=15, n_pcs=150)

In [None]:
# Compute the UMAP embedding used by the `sc.pl.umap` plots below.
sc.tl.umap(concat)

In [None]:
# Leiden clustering at resolution 1; the labels are read from `concat.obs['leiden']` in the cells below.
sc.tl.leiden(concat, resolution=1)

In [None]:
# Side-by-side UMAPs coloured by 'cond' and by Leiden cluster.
fig, axes = plt.subplots(1, 2, figsize=(20, 10))
for panel, color in zip(axes, ['cond', 'leiden']):
    sc.pl.umap(concat, color=color, ax=panel, show=False, return_fig=False, size=2)

In [None]:
# Re-cluster only the cells of cluster '24' at resolution 0.3.
# NOTE(review): the following cells expect the result (e.g. a '24,1' label) in
# obs['leiden']; some scanpy versions may store restricted clusterings under a
# different key such as 'leiden_R' -- confirm for the installed version.
sc.tl.leiden(concat, resolution=0.3, restrict_to=('leiden',['24']))

In [None]:
# Same two-panel UMAP as before, redrawn after sub-clustering cluster '24'.
fig, axes = plt.subplots(1, 2, figsize=(20, 10))
for panel, color in zip(axes, ['cond', 'leiden']):
    sc.pl.umap(concat, color=color, ax=panel, show=False, return_fig=False, size=2)

In [None]:
# Re-cluster only the cells labelled '24,1' at resolution 0.3; the marker-gene
# cell below then refers to a resulting '24,1,2' label.
# NOTE(review): requires '24,1' to be a category of obs['leiden'] -- confirm that
# the previous restricted run wrote its labels to that key.
sc.tl.leiden(concat, resolution=0.3, restrict_to=('leiden',['24,1']))

In [None]:
# Same two-panel UMAP, redrawn after sub-clustering '24,1'.
fig, axes = plt.subplots(1, 2, figsize=(20, 10))
for panel, color in zip(axes, ['cond', 'leiden']):
    sc.pl.umap(concat, color=color, ax=panel, show=False, return_fig=False, size=2)

In [None]:
# Rank genes for group '24,1,2' (top 100 kept) and plot the top 20.
# scanpy logging is switched off for the duration and always set back to 4;
# warnings are silenced only around the ranking call, and the previous warning
# filters are restored afterwards instead of being overwritten.
sc.settings.verbosity = 0
try:
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        sc.tl.rank_genes_groups(concat, groupby='leiden', n_genes=100, groups=['24,1,2'])
    sc.pl.rank_genes_groups(concat, ncols=5, n_genes=20)
finally:
    sc.settings.verbosity = 4

# Single Gene Plotter

In [None]:
# Gene(s) that the next cell colours the UMAP by.
f = ['SOX4']

In [None]:
# UMAP coloured by the gene(s) in `f`, on a black background.
# The original plotted `acg_t`, which is never defined in this notebook (NameError
# on a fresh kernel); `concat` is the object every other cell here plots.
# `use_raw` is left at scanpy's default rather than forced to True.
# NOTE(review): nothing visible in this notebook sets `concat.raw` -- confirm
# which values should be shown.
fig, ax = plt.subplots(1,1,figsize=(5,5))
ax.set_facecolor('black')
sc.pl.umap(concat, color=f, ax=ax, show=False, return_fig=False, size=5);

In [None]:
# Side-by-side UMAPs coloured by 'percent_mito' and by 'n_counts'.
fig, axes = plt.subplots(1, 2, figsize=(20, 10))
for panel, color in zip(axes, ['percent_mito', 'n_counts']):
    sc.pl.umap(concat, color=color, ax=panel, show=False, return_fig=False, size=2)

In [None]:
# Distinct batch labels; the cells below create one indicator column and one UMAP panel per label.
batches = concat.obs['batch'].unique() # get a list of the batches

In [None]:
# Add one 0/1 integer column per batch to `.obs`. Being of dtype int, these
# columns get plotted as continuous variables instead of categorical ones.
for batch in batches:
    in_batch = concat.obs['batch'] == batch
    concat.obs['batch_%s' % batch] = in_batch.astype(int)

In [None]:
# One UMAP panel per batch indicator column, 4 panels per row.
# sort_order=True should apply to these continuous 0/1 columns.
sc.pl.umap(concat, color=['batch_%s' % i for i in batches],sort_order=True, ncols=4)

In [None]:
# One UMAP panel per gene in this hand-picked list.
sc.pl.umap(concat, color=['APAF1','BAK1','BAX','FAS','TNFRSF10B','TNFRSF10A','ANXA5','TP53', ])

In [None]:
# Highlight one Leiden cluster on the UMAP through a temporary boolean column.
# `concat` is the AnnData loaded from ct.split.pkl here (not a dict), so it is
# used directly instead of `concat['adata']`.
val = '12'
concat.obs['val'] = concat.obs['leiden'] == val
fig, ax = plt.subplots(1,1)
ax.set_facecolor('gray')
try:
    sc.pl.umap(concat, color='val', ax=ax)
finally:
    # always remove the helper column, even if plotting fails
    concat.obs.drop(columns='val', inplace=True)

In [None]:
# Independent copies of the cells of clusters '12' and '22', keyed by the
# parent cluster label. `concat` is an AnnData here (loaded from ct.split.pkl),
# so it is subset directly instead of through `concat['adata']`.
sub_adatas = {
    clust: concat[concat.obs['leiden'] == clust].copy()
    for clust in ('12', '22')
}

In [None]:
# Sub-cluster the cells of cluster '12' (resolution 0.5) and show the new labels on the UMAP.
sc.tl.leiden(sub_adatas['12'], resolution=0.5) # subcluster them using Leiden
sc.pl.umap(sub_adatas['12'], color='leiden', size=2)

In [None]:
# Sub-clusters of '12' to merge: each inner list becomes one group. Any
# sub-cluster not listed is appended as a group of its own.
groupings = [[0, 1, 3, 4, 5, 6, 7], [2]]
grouped_clusts = list(it.chain.from_iterable(groupings))
numclusts = np.unique(sub_adatas['12'].obs['leiden'].values.astype(int))
groupings.extend([leftover] for leftover in np.setdiff1d(numclusts, grouped_clusts))

In [None]:
# Relabel the sub-clusters of '12' by merge group: every label listed in
# groupings[k] becomes 'ctk' in `celltype` and 'k' in `leiden`.
#
# An exact label -> group lookup is used instead of repeated
# `replace(..., regex=True)` calls: a regex replace substitutes substrings, so
# it also rewrites digits inside labels that were already renamed (the '0' in
# 'ct0') and inside multi-digit labels such as '10'.
ctdict = {'ct%s' % group_idx: group for group_idx, group in enumerate(groupings)}
group_of = {str(clust): ct for ct, group in ctdict.items() for clust in group}

old_labels = sub_adatas['12'].obs['leiden'].astype(str)
unassigned = sorted(set(old_labels) - set(group_of))
assert not unassigned, 'sub-clusters missing from groupings: %s' % unassigned

sub_adatas['12'].obs['celltype'] = old_labels.map(group_of)
sub_adatas['12'].obs['leiden'] = [ct[len('ct'):] for ct in sub_adatas['12'].obs['celltype']]

In [None]:
# Re-plot cluster '12' with the merged sub-cluster labels.
sc.pl.umap(sub_adatas['12'], color='leiden', size=2)

In [None]:
# Sub-cluster the cells of cluster '22' (resolution 0.1) and show the new labels on the UMAP.
sc.tl.leiden(sub_adatas['22'], resolution=0.1) # subcluster them using Leiden
sc.pl.umap(sub_adatas['22'], color='leiden', size=2)

Map the sub-clusters back onto the clusters of the original adata.

In [None]:
def sub_cluster_mapper(adata, sub_adatas):
    '''
    Fold sub-clusterings back into the `leiden` column of the parent object.

    Parameters
    ----------
    adata
        Object whose `.obs['leiden']` holds integer-like cluster labels.
        It is modified in place (no copy is made).
    sub_adatas : dict
        Maps a parent cluster label (str) to an object whose `.obs` is indexed
        by the cells of that parent cluster and has its own `leiden` column.

    Returns
    -------
    The same `adata` object. Cells of clusters that were not sub-clustered, and
    cells whose sub-cluster label is '0', keep their label. Every other
    (parent, sub-cluster) combination receives a new integer label, numbered
    consecutively after the current maximum in sorted order of the intermediate
    '<parent>.<sub>' names. The labels are stored as strings; conversion to a
    categorical is left to scanpy when plotting.

    Raises
    ------
    KeyError
        If a cell of a sub-clustered parent is missing from the matching entry
        of `sub_adatas`.
    ValueError
        If an existing label in `adata.obs['leiden']` is not integer-like.
    '''
    parent_labels = adata.obs['leiden'].astype(object)
    # first free integer label; also validates that the labels are integer-like
    next_label = parent_labels.astype(int).max() + 1

    # tag every split-off cell with an intermediate '<parent>.<sub>' name,
    # one vectorised assignment per parent instead of one lookup per cell
    new_leiden = parent_labels.copy()
    for parent, sub_adata in sub_adatas.items():
        members = parent_labels.index[parent_labels == parent]
        sub_labels = sub_adata.obs.loc[members, 'leiden'].astype(object)
        split_off = sub_labels[sub_labels != '0']
        if split_off.empty:
            continue
        new_leiden.loc[split_off.index] = parent + '.' + split_off.astype(str)

    # rename the intermediate names to consecutive new integer labels
    added_clusts = np.setdiff1d(new_leiden.astype(str).to_numpy(),
                                parent_labels.astype(str).to_numpy())
    rename = {old: str(next_label + offset) for offset, old in enumerate(added_clusts)}
    new_leiden = new_leiden.map(lambda label: rename.get(label, label))

    adata.obs['leiden'] = new_leiden.astype(str)
    return adata

In [None]:
# Write the sub-cluster labels back into `concat.obs['leiden']`.
# `concat` is an AnnData here (loaded from ct.split.pkl), not a dict, so it is
# passed directly instead of `concat['adata']`.
concat = sub_cluster_mapper(concat, sub_adatas)

In [None]:
# Side-by-side UMAPs coloured by 'percent_mito' and by the updated Leiden labels.
# `concat` is an AnnData here (not a dict), so it is plotted directly; the loop
# variable no longer shadows the array of axes.
fig, axes = plt.subplots(1, 2, figsize=(20,10))
for color, ax in zip(['percent_mito', 'leiden'], axes):
    sc.pl.umap(concat, color=color, ax=ax, show=False, return_fig=False, size=2)

In [None]:
# Distribution of 'percent_mito' per Leiden cluster.
# `concat` is an AnnData here (not a dict), so `.obs` is read from it directly.
fig, ax = plt.subplots(1,1,figsize=(20,4))
sns.violinplot(data=concat.obs[['leiden','percent_mito']], x='leiden',y='percent_mito', ax=ax);

In [None]:
# Hand-picked cluster numbers to discard; every other label in 0..30 is kept.
# NOTE(review): range(31) assumes the cluster labels run from 0 to 30 -- confirm against obs['leiden'].
remove_clusts = [2, 11, 13, 18, 19, 22, 24]
keep_clusts = np.setdiff1d(range(31), remove_clusts).astype(str)  # strings, to match the 'leiden' labels

In [None]:
# Barcodes of all cells whose Leiden cluster is in `keep_clusts`.
# `concat` is an AnnData here (not a dict), so it is indexed directly; one
# boolean mask replaces the per-cell `.loc` lookups of the original loop.
keep_cells = concat.obs_names[concat.obs['leiden'].isin(keep_clusts)].tolist()

In [None]:
# The kept barcodes are stored one per line. Reading the file replaces any
# `keep_cells` list already in memory; the commented-out lines show how the
# file was written.
path = prefix + 'obs/aggr/keep.bcs.txt'

# with open(path, 'w') as handle:
#     for bc in keep_cells:
#         handle.write(bc + '\n')

with open(path, 'r') as handle:
    keep_cells = [line.strip() for line in handle]

Ran on separate machine:

In [None]:
# import scanpy as sc
# import pickle as pkl
# import warnings
# warnings.filterwarnings('ignore')
# sc.settings.verbosity = 4
# sc.settings.n_jobs=30

# path = '/data/codec/production.run/mrna/pkls/aggr/concat.norm.log.pkl'
# with open(path,'rb') as file:
#     concat = pkl.load(file)

# path = '/data/codec/production.run/mrna/obs/aggr/keep.bcs.txt'

# with open(path,'r') as file:
#     keep_cells = [i.strip() for i in file.readlines()]


# concat['adata'] = concat['adata'][keep_cells,:].copy()

# sc.pp.scale(concat['adata'])
# sc.pp.combat(concat['adata'], key='batch',covariates=['cond','free_id'])
# sc.pp.pca(concat['adata'],n_comps=200)
# sc.pp.neighbors(concat['adata'],n_neighbors=15,n_pcs=100)
# sc.tl.umap(concat['adata'])

# path = '/data/codec/production.run/mrna/pkls/aggr/concat.nomito.pkl'
# warnings.filterwarnings('default')
# with open(path,'wb') as file:
#     pkl.dump(concat, file, protocol=4)

In [None]:
# Load the object produced on the separate machine. From here on `concat` is
# indexed as `concat['adata']` by the cells below.
path = prefix + 'pkls/aggr/concat.nomito.pkl'
with open(path, mode='rb') as handle:
    concat = pkl.load(handle)

In [None]:
# Plot the variance ratio of 200 PCs of the reloaded object on a log scale.
sc.pl.pca_variance_ratio(concat['adata'],log=True, n_pcs=200)

In [None]:
# Leiden clustering of the reloaded object at resolution 0.8.
sc.tl.leiden(concat['adata'], resolution=0.8)

In [None]:
# Side-by-side UMAPs of the reloaded object, coloured by 'cond' and by Leiden cluster.
fig, axes = plt.subplots(1, 2, figsize=(20, 10))
for panel, color in zip(axes, ['cond', 'leiden']):
    sc.pl.umap(concat['adata'], color=color, ax=panel, show=False, return_fig=False, size=2)

In [None]:
# Independent copies of the cells of each cluster that gets sub-clustered
# below, keyed by the parent cluster label.
parent_adata = concat['adata']
sub_adatas = {
    clust: parent_adata[parent_adata.obs['leiden'] == clust].copy()
    for clust in ('14', '15', '20', '22', '24')
}

In [None]:
# Sub-cluster the cells of cluster '14' (resolution 0.3) and show the new labels on the UMAP.
sc.tl.leiden(sub_adatas['14'], resolution=0.3) # subcluster them using Leiden
sc.pl.umap(sub_adatas['14'], color='leiden', size=15)

In [None]:
# Sub-clusters of '14' to merge: each inner list becomes one group. Any
# sub-cluster not listed is appended as a group of its own.
groupings = [[0, 1, 2]]
grouped_clusts = list(it.chain.from_iterable(groupings))
numclusts = np.unique(sub_adatas['14'].obs['leiden'].values.astype(int))
groupings.extend([leftover] for leftover in np.setdiff1d(numclusts, grouped_clusts))

In [None]:
# Relabel the sub-clusters of '14' by merge group: every label listed in
# groupings[k] becomes 'ctk' in `celltype` and 'k' in `leiden`.
#
# An exact label -> group lookup is used instead of repeated
# `replace(..., regex=True)` calls: a regex replace substitutes substrings, so
# it also rewrites digits inside labels that were already renamed (the '0' in
# 'ct0') and inside multi-digit labels such as '10'.
ctdict = {'ct%s' % group_idx: group for group_idx, group in enumerate(groupings)}
group_of = {str(clust): ct for ct, group in ctdict.items() for clust in group}

old_labels = sub_adatas['14'].obs['leiden'].astype(str)
unassigned = sorted(set(old_labels) - set(group_of))
assert not unassigned, 'sub-clusters missing from groupings: %s' % unassigned

sub_adatas['14'].obs['celltype'] = old_labels.map(group_of)
sub_adatas['14'].obs['leiden'] = [ct[len('ct'):] for ct in sub_adatas['14'].obs['celltype']]

In [None]:
# Re-plot cluster '14' with the merged sub-cluster labels.
sc.pl.umap(sub_adatas['14'], color='leiden', size=15)

In [None]:
# Sub-cluster the cells of cluster '15' (resolution 0.1) and show the new labels on the UMAP.
sc.tl.leiden(sub_adatas['15'], resolution=0.1) # subcluster them using Leiden
sc.pl.umap(sub_adatas['15'], color='leiden', size=15)

In [None]:
# Sub-cluster the cells of cluster '20' (resolution 1) and show the new labels on the UMAP.
sc.tl.leiden(sub_adatas['20'], resolution=1) # subcluster them using Leiden
sc.pl.umap(sub_adatas['20'], color='leiden', size=10)

In [None]:
# Sub-clusters of '20' to merge: each inner list becomes one group. Any
# sub-cluster not listed is appended as a group of its own.
groupings = [[2, 4, 6, 7]]
grouped_clusts = list(it.chain.from_iterable(groupings))
numclusts = np.unique(sub_adatas['20'].obs['leiden'].values.astype(int))
groupings.extend([leftover] for leftover in np.setdiff1d(numclusts, grouped_clusts))

In [None]:
# Relabel the sub-clusters of '20' by merge group: every label listed in
# groupings[k] becomes 'ctk' in `celltype` and 'k' in `leiden`.
#
# An exact label -> group lookup is used instead of repeated
# `replace(..., regex=True)` calls: a regex replace substitutes substrings, so
# with the groups used here replacing sub-cluster '0' would also rewrite the
# '0' inside the already-assigned 'ct0', merging groups that should stay apart.
ctdict = {'ct%s' % group_idx: group for group_idx, group in enumerate(groupings)}
group_of = {str(clust): ct for ct, group in ctdict.items() for clust in group}

old_labels = sub_adatas['20'].obs['leiden'].astype(str)
unassigned = sorted(set(old_labels) - set(group_of))
assert not unassigned, 'sub-clusters missing from groupings: %s' % unassigned

sub_adatas['20'].obs['celltype'] = old_labels.map(group_of)
sub_adatas['20'].obs['leiden'] = [ct[len('ct'):] for ct in sub_adatas['20'].obs['celltype']]

In [None]:
# Re-plot cluster '20' with the merged sub-cluster labels.
sc.pl.umap(sub_adatas['20'], color='leiden', size=10)

In [None]:
# Rank genes per sub-cluster of '20' (top 50 kept) and plot the top 20.
# scanpy logging is switched off for the duration and always set back to 4;
# warnings are silenced only around the ranking call, and the previous warning
# filters are restored afterwards instead of being overwritten.
sc.settings.verbosity = 0
try:
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        sc.tl.rank_genes_groups(sub_adatas['20'], groupby='leiden', n_genes=50)
    sc.pl.rank_genes_groups(sub_adatas['20'], ncols=5, n_genes=20)
finally:
    sc.settings.verbosity = 4

In [None]:
# UMAP of cluster '20' coloured by 'cond'.
sc.pl.umap(sub_adatas['20'], color=['cond'], size=10)

In [None]:
# Sub-cluster the cells of cluster '22' and plot the labels next to 'cond'.
# NOTE(review): the very low resolution (0.001) is presumably meant to yield as few sub-clusters as possible -- confirm.
sc.tl.leiden(sub_adatas['22'], resolution=0.001) # subcluster them using Leiden
sc.pl.umap(sub_adatas['22'], color=['leiden', 'cond'], size=10)

In [None]:
# Sub-cluster the cells of cluster '24' (resolution 0.3) and show the new labels on the UMAP.
sc.tl.leiden(sub_adatas['24'], resolution=0.3) # subcluster them using Leiden
sc.pl.umap(sub_adatas['24'], color='leiden', size=10)

In [None]:
# UMAP of cluster '24' coloured by 'cond'.
sc.pl.umap(sub_adatas['24'], color='cond', size=10)

In [None]:
# Rank genes per sub-cluster of '24' (top 20 kept) and plot them.
# scanpy logging is switched off for the duration and always set back to 4;
# warnings are silenced only around the ranking call, and the previous warning
# filters are restored afterwards instead of being overwritten.
sc.settings.verbosity = 0
try:
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        sc.tl.rank_genes_groups(sub_adatas['24'], groupby='leiden', n_genes=20)
    sc.pl.rank_genes_groups(sub_adatas['24'], ncols=5, n_genes=20)
finally:
    sc.settings.verbosity = 4

Map the sub-clusters back onto the clusters of the original adata.

In [None]:
def sub_cluster_mapper(adata, sub_adatas):
    '''
    Fold sub-clusterings back into the `leiden` column of the parent object.

    Parameters
    ----------
    adata
        Object whose `.obs['leiden']` holds integer-like cluster labels.
        It is modified in place (no copy is made).
    sub_adatas : dict
        Maps a parent cluster label (str) to an object whose `.obs` is indexed
        by the cells of that parent cluster and has its own `leiden` column.

    Returns
    -------
    The same `adata` object. Cells of clusters that were not sub-clustered, and
    cells whose sub-cluster label is '0', keep their label. Every other
    (parent, sub-cluster) combination receives a new integer label, numbered
    consecutively after the current maximum in sorted order of the intermediate
    '<parent>.<sub>' names. The labels are stored as strings; conversion to a
    categorical is left to scanpy when plotting.

    Raises
    ------
    KeyError
        If a cell of a sub-clustered parent is missing from the matching entry
        of `sub_adatas`.
    ValueError
        If an existing label in `adata.obs['leiden']` is not integer-like.
    '''
    parent_labels = adata.obs['leiden'].astype(object)
    # first free integer label; also validates that the labels are integer-like
    next_label = parent_labels.astype(int).max() + 1

    # tag every split-off cell with an intermediate '<parent>.<sub>' name,
    # one vectorised assignment per parent instead of one lookup per cell
    new_leiden = parent_labels.copy()
    for parent, sub_adata in sub_adatas.items():
        members = parent_labels.index[parent_labels == parent]
        sub_labels = sub_adata.obs.loc[members, 'leiden'].astype(object)
        split_off = sub_labels[sub_labels != '0']
        if split_off.empty:
            continue
        new_leiden.loc[split_off.index] = parent + '.' + split_off.astype(str)

    # rename the intermediate names to consecutive new integer labels
    added_clusts = np.setdiff1d(new_leiden.astype(str).to_numpy(),
                                parent_labels.astype(str).to_numpy())
    rename = {old: str(next_label + offset) for offset, old in enumerate(added_clusts)}
    new_leiden = new_leiden.map(lambda label: rename.get(label, label))

    adata.obs['leiden'] = new_leiden.astype(str)
    return adata

In [None]:
# Write the sub-cluster labels of '14', '15', '20', '22' and '24' back into the 'leiden' column of concat['adata'].
concat['adata'] = sub_cluster_mapper(concat['adata'], sub_adatas)

In [None]:
# Side-by-side UMAPs coloured by 'free_id' and by 'batch'.
fig, axes = plt.subplots(1, 2, figsize=(20, 10))
for panel, color in zip(axes, ['free_id', 'batch']):
    sc.pl.umap(concat['adata'], color=color, ax=panel, show=False, return_fig=False, size=2)

In [None]:
# UMAP coloured by the Leiden labels after mapping the sub-clusters back.
fig, ax = plt.subplots(1, 1, figsize=(10,10))
sc.pl.umap(concat['adata'], color='leiden', ax=ax, show=False, return_fig=False, size=2);

In [None]:
# Hand-curated assignment of Leiden clusters to broad cell-type groups.
ct_tnk = [0, 1, 2, 3, 4, 5, 6, 8, 10, 11, 16, 19]
ct_b = [9, 12, 18]
ct_m = [7, 14, 15, 17]
ct_other = [22, 23, 24, 25, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37]
ct_drop = [13, 20, 21, 26, 27]

cts = ['tnk', 'b', 'm', 'other', 'drop']

ctdict = {'tnk': ct_tnk, 'b': ct_b, 'm': ct_m, 'other': ct_other, 'drop': ct_drop}

# cluster label -> group name; if a cluster were listed twice, the first group wins
ct_of_cluster = dict()
for ct, clusts in ctdict.items():
    for clust in clusts:
        ct_of_cluster.setdefault(str(clust), ct)

# One lookup per cell, assigned back to the column. The original called
# `obs['ct1'].replace(..., inplace=True)` once per cluster, a chained in-place
# update that is not guaranteed to reach the DataFrame under pandas copy-on-write.
# Labels that appear in no list are kept unchanged.
leiden_labels = concat['adata'].obs['leiden'].astype(str)
concat['adata'].obs['ct1'] = (
    leiden_labels.map(lambda clust: ct_of_cluster.get(clust, clust)).astype('category')
)

In [None]:
# UMAP coloured by the broad cell-type column 'ct1', using scanpy's default_20 palette.
fig, ax = plt.subplots(1, 1, figsize=(10,10))
sc.pl.umap(concat['adata'], color='ct1', ax=ax, show=False, return_fig=False, size=2, palette=sc.pl.palettes.default_20);

Looks good. Because I couldn't export the objects here, I ran the export on a separate machine.

In [None]:
# import scanpy as sc
# import pickle as pkl
# import pandas as pd
# import numpy as np

# sc.settings.verbosity = 4
# sc.settings.n_jobs=30

# path = '/data/codec/production.run/mrna/pkls/aggr/concat.nomito.pkl'

# with open(path,'rb') as file:
#     concat = pkl.load(file)

# path = '/data/codec/production.run/adts/pkls/combat/concat.combat.adts.norm.log.pkl'

# with open(path,'rb') as file:
#     concat_adts = pkl.load(file)

# transcripts = concat['adata'].var_names
# proteins = concat_adts['adata'].var_names

# adts_df = pd.DataFrame(concat_adts['adata'].X, columns=proteins, index=concat_adts['adata'].obs_names)

# concat['adata'].obs = concat['adata'].obs.join(adts_df)

# sc.tl.leiden(concat['adata'], resolution=0.8)

# sub_adatas = dict() # put the new subsetted adata objects in a dictionary of adatas
# sub_adatas['14'] = concat['adata'][concat['adata'].obs['leiden'] == '14'].copy()
# sub_adatas['15'] = concat['adata'][concat['adata'].obs['leiden'] == '15'].copy()
# sub_adatas['20'] = concat['adata'][concat['adata'].obs['leiden'] == '20'].copy()
# sub_adatas['22'] = concat['adata'][concat['adata'].obs['leiden'] == '22'].copy()
# sub_adatas['24'] = concat['adata'][concat['adata'].obs['leiden'] == '24'].copy()

# sc.tl.leiden(sub_adatas['14'], resolution=0.3) # subcluster them using Leiden

# groupings = [[0, 1, 2],
#              ]
# grouped_clusts = [i for j in groupings for i in j]
# numclusts = np.unique(sub_adatas['14'].obs['leiden'].values.astype(int))
# for i in np.setdiff1d(numclusts, grouped_clusts):
#     groupings.append([i])

# ctdict = dict()
# for i in range(len(groupings)):
#     ctdict['ct%s' % str(i)] = groupings[i]

# sub_adatas['14'].obs['celltype'] = sub_adatas['14'].obs['leiden']
# for ct in ctdict:
#     for clust in ctdict[ct]:
#         sub_adatas['14'].obs['celltype'].replace(str(clust), ct, regex=True, inplace=True)
# sub_adatas['14'].obs['leiden'] = [i.strip('ct') for i in sub_adatas['14'].obs['celltype'].astype('category')]

# sc.tl.leiden(sub_adatas['15'], resolution=0.1) # subcluster them using Leiden

# sc.tl.leiden(sub_adatas['20'], resolution=1) # subcluster them using Leiden

# groupings = [[2, 4, 6, 7],
#              ]
# grouped_clusts = [i for j in groupings for i in j]
# numclusts = np.unique(sub_adatas['20'].obs['leiden'].values.astype(int))
# for i in np.setdiff1d(numclusts, grouped_clusts):
#     groupings.append([i])

# ctdict = dict()
# for i in range(len(groupings)):
#     ctdict['ct%s' % str(i)] = groupings[i]

# sub_adatas['20'].obs['celltype'] = sub_adatas['20'].obs['leiden']
# for ct in ctdict:
#     for clust in ctdict[ct]:
#         sub_adatas['20'].obs['celltype'].replace(str(clust), ct, regex=True, inplace=True)
# sub_adatas['20'].obs['leiden'] = [i.strip('ct') for i in sub_adatas['20'].obs['celltype'].astype('category')]

# sc.tl.leiden(sub_adatas['22'], resolution=0.001) # subcluster them using Leiden

# sc.tl.leiden(sub_adatas['24'], resolution=0.3) # subcluster them using Leiden

# def sub_cluster_mapper(adata, sub_adatas):
#     '''
#     This takes in the adata object and inserts a new leiden column in the `.obs`.

#     This function is really convoluted and there's probably a better, simpler way to do it,
#     but it should theoretically work for any number of subclusters
#     '''
#     # ideally you'd make a copy of the adata object here, so we don't have to change the original
#     # this would be in case we want to run it multiple times, perhaps the resolutions we put in didn't subset the clusters like we had hoped
#     # and we need to run multiple times to adjust the resolution slightly

#     # this block is to figure out that there are two new subclusters and they should be named 8, 9
#     total_new_clusts = 0
#     old_clusts = sub_adatas.keys()
#     for sub_adata in sub_adatas:
#         total_new_clusts += sub_adatas[sub_adata].obs['leiden'].astype(int).unique().max() + 1
#     total_added_clusts = total_new_clusts - len(sub_adatas)
#     new_clust_names_start = max(adata.obs['leiden'].astype(int))+1
#     new_added_clust_names = [str(i) for i in range(new_clust_names_start,
#                                                    new_clust_names_start + total_added_clusts)]

#     # this block is to build a new list of leiden clusters from the old one
#     new_leiden = list()
#     leiden_col = adata.obs['leiden'].copy()

#     # this builds the new leiden cluster list, now adding a .1, .2, etc. to each new cluster
#     for obs in leiden_col.index:
#         clust_name = leiden_col.loc[obs]
#         if clust_name not in old_clusts or sub_adatas[clust_name].obs.loc[obs, 'leiden'] == '0':
#             new_leiden.append(clust_name)
#         else:
#             new_leiden.append(clust_name + '.%s' % sub_adatas[clust_name].obs.loc[obs,'leiden'])

#     # this renames the .1, .2, etc clusters to the new, better names I came up with above (8 and 9)
#     new_leiden = pd.Series(new_leiden, index=adata.obs_names)
#     added_clusts = np.setdiff1d(new_leiden,adata.obs['leiden'])
#     new_leiden.replace(dict(zip(added_clusts, new_added_clust_names)), inplace=True)

#     # replace the old leiden column, must do these steps sequentially
#     adata.obs['leiden'] = new_leiden.astype(int) # to order the clusters by number
#     adata.obs['leiden'] = new_leiden.astype(str) # to convert to string as normal
# #     adata.obs['leiden'] = new_leiden.astype('category') # don't do this, it messes things up, just let scanpy do it as it plots
#     return adata

# concat['adata'] = sub_cluster_mapper(concat['adata'], sub_adatas)

# ct_tnk = [0, 1, 2, 3, 4, 5, 6, 8, 10, 11, 16, 19]
# ct_b = [9, 12, 18]
# ct_m = [7, 14, 15, 17]
# ct_other = [22, 23, 24, 25, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37]
# ct_drop = [13, 20, 21, 26, 27]

# cts = ['tnk', 'b', 'm', 'other', 'drop']

# ctdict = dict()
# ctdict['tnk'] = ct_tnk
# ctdict['b'] = ct_b
# ctdict['m'] = ct_m
# ctdict['other'] = ct_other
# ctdict['drop'] = ct_drop
# concat['adata'].obs['ct1'] = concat['adata'].obs['leiden']
# for ct in ctdict:
#     for clust in ctdict[ct]:
#         concat['adata'].obs['ct1'].replace(str(clust), ct, inplace=True)
# concat['adata'].obs['ct1'] = concat['adata'].obs['ct1'].astype('category')

# for ct in cts:
#     ct_dict = dict()
#     ct_dict['adata'] = concat['adata'][concat['adata'].obs['ct1'] == ct].copy()

#     path = '/data/codec/production.run/mrna/pkls/aggr/%s.pkl' % ct

#     with open(path,'wb') as file:
#                 pkl.dump(ct_dict, file, protocol=4)