# Here's the new plan:

1. Rerun the adts.clr.ipynb to make a new adts with the correct features we want to keep (95 of them) and the correct `adts_n_counts` in the `.obs`.
2. Load that in and perform the clr normalization, scaling, combat, scaling, neighbors, UMAP and leiden as below.
3. Remove _only_ the cluster that shows the very high counts, should be around number 17 or so.
4. Rerun the processing without those cells present. 
5. Run Leiden (and leiden with restrictions) to cluster the cells as best as possible without splitting hairs. Maybe make a lower limit of 0.5% of cells can be split further. It doesn't _exactly_ have to match the UMAP, and actually if a cluster is spread across multiple distinct locations across the UMAP, it probably means it's a doublet or multiplet anyway.
6. 

In [None]:
import scanpy as sc
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
import warnings
import itertools as it
import json
import requests
import seaborn as sns
import pickle as pkl
from functools import reduce

In [None]:
# Global scanpy configuration for this notebook: logging verbosity, figure
# defaults, and the number of parallel jobs. The version is printed for the record.
sc.settings.verbosity = 4
sc.settings.set_figure_params(dpi=80)
print(sc.__version__)
sc.settings.n_jobs=16

In [None]:
# Root directory of the run; every input path below is built by appending to it.
prefix = '/data/codec/production.run/'

### Load in Data

In [None]:
# Load the pickled per-well mRNA data into `wells`; later cells index it as
# wells[<well>]['adata'].
# NOTE(review): pickle.load can execute arbitrary code -- only load trusted files.
path = prefix + 'mrna/pkls/aggr/wells.sng.w_covars.pkl'

# Write step, kept for reference but disabled:
# with open(path,'wb') as file:
#     pkl.dump(wells, file)
    
with open(path,'rb') as file:
    wells = pkl.load(file)

### Adjust Cell Barcodes, Filter

I'm adjusting the cell barcodes to make them match their well number, which I also did with the ADTs.

In [None]:
# Rename every cell to "<first 16 characters of its barcode>-<well>" so the
# names carry the well they came from.
for well in wells:
    well_adata = wells[well]['adata']
    well_adata.obs_names = [f'{barcode[:16]}-{well}' for barcode in well_adata.obs_names]

### Concatenate

In [None]:
# Concatenate the 'adata' objects of wells 0 through 11 into one object.
# NOTE(review): the well range is hard-coded (0 plus range(1, 12)); adjust it if
# `wells` holds a different number of wells.
concat = wells[0]['adata'].concatenate(*[wells[i]['adata'] for  i in range(1, 12)])

In [None]:
# Load the pickled, concatenated ADT object into `raw_adts`.
# NOTE(review): pickle.load can execute arbitrary code -- only load trusted files.
path = prefix + 'adts/pkls/concat.adts.pkl'

# Write step, kept for reference but disabled.
# NOTE(review): as written it would dump `wells`, not the ADT object -- confirm before re-enabling.
# with open(path,'wb') as file:
#     pkl.dump(wells, file)
    
with open(path,'rb') as file:
    raw_adts = pkl.load(file)

In [None]:
# Work on a copy so `raw_adts` stays untouched by the processing below.
adts = raw_adts.copy()

In [None]:
# Log-log histogram of obs['adts_n_counts'], using 1000 log-spaced bin edges
# between 10 and 1e6.
plt.figure(figsize=(20,4))
plt.hist(adts.obs['adts_n_counts'].values, bins=np.logspace(np.log10(10),np.log10(1e6), 1000))
# Grid is switched off, then re-enabled for both major/minor ticks on both axes.
plt.grid(False)
plt.grid(True, 'both', 'both')
plt.xscale('log')
plt.yscale('log')

In [None]:
def clr_normalize_column(x):
    """Apply a centred-log-ratio style transform to one 1-D vector.

    Computes ``log1p(x / exp(sum(log1p(x[x > 0])) / len(x)))``: the sum of
    ``log1p`` over the strictly positive entries is divided by the *full*
    length of ``x`` (zeros count towards the denominator), exponentiated, and
    used to scale ``x`` before a final ``log1p``.

    Parameters
    ----------
    x : array-like, 1-D
        Values to normalise.

    Returns
    -------
    numpy.ndarray
        Transformed vector with the same length as ``x``.
    """
    x = np.asarray(x)
    # np.sum instead of the builtin sum: same value, without a Python-level
    # loop over array elements.
    mean_log1p = np.sum(np.log1p(x[x > 0])) / len(x)
    return np.log1p(x / np.exp(mean_log1p))


def clr_normalize(x):
    """Apply :func:`clr_normalize_column` to every row of a 2-D array.

    The transform is applied along axis 1, i.e. to each 1-D slice ``x[i, :]``
    (despite the helper's name, these are the rows of ``x``).

    Parameters
    ----------
    x : numpy.ndarray, 2-D (dense)

    Returns
    -------
    numpy.ndarray
        Array of the same shape as ``x``.
    """
    return np.apply_along_axis(clr_normalize_column, 1, x)

In [None]:
# Per-cell count normalisation with a target of 1e6 counts per cell, applied
# before the CLR transform in the next cell.
sc.pp.normalize_per_cell(adts, counts_per_cell_after=1e6);

In [None]:
# Densify X (.toarray()) and replace it with its CLR-normalised version.
adts.X = clr_normalize(adts.X.toarray())

In [None]:
# First scaling pass, on the CLR-normalised values (a second pass follows ComBat).
sc.pp.scale(adts)

In [None]:
# Bring selected per-cell covariates over from the mRNA object; the join is on the
# obs index, so it relies on the cell names matching between `adts` and `concat`.
adts.obs = adts.obs.join(concat.obs[['cond','free_id', 'percent_mito', 'viability_eq_ctrl', 'viability_sample']])

In [None]:
# Two ComBat passes, first keyed on 'batch' and then on 'cond', each with
# 'free_id' as a covariate, followed by a second scaling pass.
sc.pp.combat(adts, key='batch',covariates=['free_id'])
sc.pp.combat(adts, key='cond',covariates=['free_id'])
sc.pp.scale(adts)

In [None]:
# Build the neighbour graph with warnings silenced. catch_warnings() restores the
# previous warning-filter state on exit, instead of leaving an extra global
# 'default' filter installed as the ignore/default pair did.
with warnings.catch_warnings():
    warnings.simplefilter('ignore')
    sc.pp.neighbors(adts, n_neighbors=15, n_pcs=0) # use .X because all the surface markers we're looking at are probably pretty important

In [None]:
# Compute the UMAP embedding used by all sc.pl.umap plots below.
sc.tl.umap(adts)

In [None]:
# Leiden clustering at resolution 1; later cells read the labels from obs['leiden'].
sc.tl.leiden(adts, resolution=1)

In [None]:
# Side-by-side UMAPs coloured by 'cond' (no explicit palette) and 'leiden' (default_20).
# NOTE: the loop rebinds `ax` and leaves `palette` set to its last value; later
# cells reuse that leftover `palette`, so do not rename these loop variables.
fig, ax = plt.subplots(1, 2, figsize=(20,10))
for color, ax, palette in zip(['cond', 'leiden'], ax, [None, sc.pl.palettes.default_20]):
    ax.set_facecolor('whitesmoke')
    sc.pl.umap(adts,color=color, ax=ax, show=False, return_fig=False, palette=palette)

In [None]:
# One panel per 'cond' category, highlighting the cells of that category.
# zip() stops at the shorter input, so at most 7 categories are drawn.
fig, ax = plt.subplots(1,7, figsize=(30,4))
for val, ax in tqdm(zip(adts.obs['cond'].dtype.categories, np.ravel(ax))):
    # Temporary boolean column: True for cells of the current category.
    adts.obs['val'] = adts.obs['cond'] == val
    ax.set_facecolor('black')
    sc.pl.umap(adts,color='val', ax=ax, show=False, return_fig=False, title=val)
    adts.obs.drop(columns='val', inplace=True)
plt.tight_layout()

In [None]:
# One panel per Leiden cluster, highlighting the cells of that cluster.
# The 4x6 grid has 24 axes; zip() silently drops any clusters beyond that.
fig, ax = plt.subplots(4,6,figsize=(20,12))
for val, ax in tqdm(zip(adts.obs['leiden'].dtype.categories, np.ravel(ax))):
    # Temporary boolean column: True for cells of the current cluster.
    adts.obs['val'] = adts.obs['leiden'] == val
    ax.set_facecolor('black')
    sc.pl.umap(adts,color='val', ax=ax, show=False, return_fig=False, title=val)
    adts.obs.drop(columns='val', inplace=True)
plt.tight_layout()

In [None]:
# Highlight cells with adts_n_counts < 1000 on the UMAP via a temporary 'val' column.
# NOTE(review): `palette` is not set in this cell; it is whatever an earlier
# plotting loop left behind.
fig, ax = plt.subplots(1, 1, figsize=(10,10))
adts.obs['val'] = adts.obs['adts_n_counts'] < 1000
ax.set_facecolor('black')
sc.pl.umap(adts,color='val', ax=ax, show=False, return_fig=False, palette=palette, size=5);
adts.obs.drop(columns='val', inplace=True)

In [None]:
# Checkpoint: load a previously saved processed object, replacing the `adts`
# computed above.
# NOTE(review): pickle.load can execute arbitrary code -- only load trusted files.
path = prefix + 'adts/pkls/adts.clr.combat.dim.1.pkl'

# Write step, kept for reference but disabled:
# with open(path,'wb') as file:
#     pkl.dump(adts, file)
    
with open(path,'rb') as file:
    adts = pkl.load(file)

Remove Contamination

In [None]:
# Drop every cell in Leiden cluster '15', then keep only cells with
# adts_n_counts strictly greater than 1000.
adts = adts[~adts.obs['leiden'].isin(['15']),:].copy()
adts = adts[adts.obs['adts_n_counts'] > 1000].copy()

In [None]:
# Same cond/leiden UMAP pair as before, now on the filtered cells, with point size 5.
# NOTE: leaves `palette` bound to its last loop value (default_20).
fig, ax = plt.subplots(1, 2, figsize=(20,10))
for color, ax, palette in zip(['cond', 'leiden'], ax, [None, sc.pl.palettes.default_20]):
    ax.set_facecolor('whitesmoke')
    sc.pl.umap(adts,color=color, ax=ax, show=False, return_fig=False, palette=palette, size=5)

In [None]:
# Divide every column of the table by its column sum, overwriting the counts.
# NOTE(review): `cond_leiden_df` is only created in a later cell of this notebook,
# so this cell fails on a top-to-bottom run; a later cell stores the same
# quantity separately as `cond_leiden_df_div`.
cond_leiden_df = cond_leiden_df.divide(cond_leiden_df.sum(0),1)

In [None]:
# List every feature name, one per line.
for i in adts.var_names:
    print(i)

In [None]:
# One UMAP panel per feature, coloured by that feature's value in .X (use_raw=False).
# The 12x8 grid has 96 axes; zip() drops any features beyond that.
fig, ax = plt.subplots(12,8,figsize=(30,40))
for p, ax in tqdm(zip(adts.var_names, np.ravel(ax))):
    ax.set_facecolor('black')
    sc.pl.umap(adts, color=p, size=3, ncols=8, ax=ax, show=False, return_fig=False, use_raw=False)
plt.tight_layout()

In [None]:
# Category labels of the two groupings, used as the rows (conds) and columns
# (clusts) of the count table built below.
conds = adts.obs['cond'].dtype.categories
clusts = adts.obs['leiden'].dtype.categories

In [None]:
# Condition x cluster count table, zero-initialised; the next cell fills it in.
# Uses the builtin `int`: the `np.int` alias was removed in NumPy 1.24.
cond_leiden_df = pd.DataFrame(0, index=conds, columns=clusts, dtype=int)

In [None]:
# Fill the table with the number of cells in each (condition, cluster) pair.
# Counting True values in a boolean mask gives the same number as the row count
# of the subset, without building a subset object for every pair.
cond_labels = adts.obs['cond']
leiden_labels = adts.obs['leiden']
for cond in tqdm(conds):
    in_cond = cond_labels == cond
    for clust in clusts:
        cond_leiden_df.loc[cond, clust] = int((in_cond & (leiden_labels == clust)).sum())

In [None]:
# Column-normalised copy: each cluster column is divided by its total, giving the
# proportion of each condition within every cluster.
cond_leiden_df_div = cond_leiden_df.divide(cond_leiden_df.sum(0),1)

In [None]:
# Stacked bar charts per cluster: raw counts on the left, within-cluster
# proportions on the right. Each condition is one layer of the stack.
fig, ax = plt.subplots(1,2,figsize=(20,6))
for ax, df, ylabel in zip(ax, [cond_leiden_df, cond_leiden_df_div], ['Number','Proportion']):
    labels = clusts
    props = dict()

    # Row of the table for each condition (one value per cluster).
    for cond in conds:
        props[cond] = df.loc[cond].values
    # Running top of the stack for every cluster bar.
    lastpos = [0]*len(clusts)
    for cond in props:
        ax.bar(labels, props[cond], label=cond, bottom=lastpos)
        lastpos = props[cond] + lastpos

    ax.set_ylabel(ylabel)
    ax.legend()

In [None]:
# Stacked bar chart of counts per column of `deconvolution`, one layer per condition.
# NOTE(review): `deconvolution` and `scheme` are not defined anywhere in the visible
# notebook -- this cell looks carried over from elsewhere and will raise NameError
# unless they are defined first.
labels = deconvolution.columns.values
counts = dict()

# Select, per condition, the entries where `scheme` equals that condition.
for cond in conds:
    counts[cond] = deconvolution.values.T[(scheme==cond).values.T]

fig, ax = plt.subplots(figsize=(30,5))
# Running top of the stack; the length 64 is hard-coded.
lastpos = [0]*64
for cond in counts:
    ax.bar(labels, counts[cond], label=cond, bottom=lastpos)
    lastpos = counts[cond] + lastpos

ax.set_ylabel('Counts')
ax.legend()

plt.show()

In [None]:
# Features plotted and scored in the next cells as the contamination signature.
plt_contam_proteins = ['CD9|CD9', 'CD226|CD226', 'CD61|ITGB3', 'CD49b|ITGA2', 'CD29|ITGB1']

In [None]:
# One UMAP panel per contamination feature, coloured by its value in .X.
fig, ax = plt.subplots(1,5,figsize=(30,6))
for p, ax in tqdm(zip(plt_contam_proteins, np.ravel(ax))):
    ax.set_facecolor('black')
    sc.pl.umap(adts, color=p, size=3, ncols=8, ax=ax, show=False, return_fig=False, use_raw=False)
plt.tight_layout()

In [None]:
# Score every cell on the contamination feature set, using .X (use_raw=False).
# Later cells read the result from obs['score'].
sc.tl.score_genes(adts, gene_list=plt_contam_proteins, use_raw=False)

In [None]:
# Two UMAPs: 'CD94|KLRD1' on black (size 5, no explicit palette) and 'leiden' on
# whitesmoke (size 2, default_102 palette).
# NOTE: leaves `palette` bound to default_102, which the next plotting cell reuses.
fig, ax = plt.subplots(1, 2, figsize=(20,10))
for color, ax, s, fcolor, palette in zip(['CD94|KLRD1', 'leiden'], ax, [5, 2], ['black', 'whitesmoke'], [None, sc.pl.palettes.default_102]):
    ax.set_facecolor(fcolor)
    sc.pl.umap(adts,color=color, ax=ax, show=False, return_fig=False, palette=palette, size=s)

In [None]:
# UMAP coloured by obs['percent_mito'].
# NOTE(review): `palette` is not set in this cell; it is a leftover from an
# earlier plotting loop.
fig, ax = plt.subplots(1, 1, figsize=(10,10))
ax.set_facecolor('black')
sc.pl.umap(adts,color='percent_mito', ax=ax, show=False, return_fig=False, palette=palette, size=5);

In [None]:
# Median 'score' of every Leiden cluster (in category order), then a violin plot
# of the per-cluster score distributions with a dashed reference line at y = 1.
meds = [
    np.median(adts.obs[adts.obs['leiden'] == clust]['score'].values)
    for clust in adts.obs['leiden'].dtype.categories
]
plt.figure(figsize=(20,5))
sns.violinplot(data=adts.obs[['score', 'leiden']], y='score', x='leiden');
plt.plot([0, 46],[1, 1], linestyle='--');

In [None]:
# Distribution of the per-cell contamination score (100 bins).
plt.hist(adts.obs['score'].values,bins=100);

In [None]:
# Distribution of the per-cluster median scores collected in `meds`.
plt.figure(figsize=(8,3))
plt.hist(meds, bins=100);

# Grouping and Subclustering
Going to combine or break up those clusters further.

In [None]:
# Integer positions of the Leiden categories that are NOT in `exclude`.
# NOTE: this rebinds `clusts`, which previously held the category labels themselves.
exclude = [0, 4, 13, 17, 18, 19, 20, 21]
clusts = [i for i in range(len(adts.obs['leiden'].dtype.categories)) if i not in exclude]

In [None]:
# Split the data into one independent copy per Leiden cluster, keyed by the
# cluster label as a string.
sub_adts = {
    str(label): adts[adts.obs['leiden'] == str(label)].copy()
    for label in adts.obs['leiden'].dtype.categories
}

In [None]:
# Subcluster cluster '1': a coarse Leiden pass, then a second pass restricted to
# subcluster '0' of that result.
sc.tl.leiden(sub_adts['1'], resolution=0.25) # subcluster them using Leiden
sc.tl.leiden(sub_adts['1'], resolution=0.5, restrict_to=('leiden',['0'])) # subcluster them using Leiden
# sc.tl.leiden(sub_adts['1'], resolution=0.5, restrict_to=('leiden',['0'])) # subcluster them using Leiden
# sc.tl.leiden(sub_adts['1'], resolution=0.3, restrict_to=('leiden',['0,4'])) # subcluster them using Leiden
stupid_name_clusts = sub_adts['1'].obs['leiden'].dtype.categories
# Relabel every category with its position in the category list ('0', '1', ...).
# The category codes give that position directly; the previous chained
# `obs['leiden'].replace(..., inplace=True)` acted on a temporary Series and is a
# no-op under pandas copy-on-write.
sub_adts['1'].obs['leiden'] = sub_adts['1'].obs['leiden'].cat.codes.astype(int).astype(str)

In [None]:
# UMAP of cluster '1' coloured by its relabelled subclusters.
sc.pl.umap(sub_adts['1'], color='leiden')

In [None]:
# One panel per subcluster of cluster '1', highlighting its cells (at most 5 panels).
# NOTE(review): this iterates `.dtype.categories`, which only exists while
# obs['leiden'] is categorical -- confirm the column's dtype at this point.
fig, ax = plt.subplots(1,5,figsize=(20,4))
for val, ax in tqdm(zip(sub_adts['1'].obs['leiden'].dtype.categories, np.ravel(ax))):
    # Temporary boolean column: True for cells of the current subcluster.
    sub_adts['1'].obs['val'] = sub_adts['1'].obs['leiden'] == val
    ax.set_facecolor('black')
    sc.pl.umap(sub_adts['1'],color='val', ax=ax, show=False, return_fig=False, title=val)
    sub_adts['1'].obs.drop(columns='val', inplace=True)
plt.tight_layout()

In [None]:
# Subclusters of cluster '1' to merge: each inner list becomes one group.
groupings = [[0, 1, 3],
            ]
# Every subcluster not named above is appended as its own single-member group.
grouped_clusts = [i for j in groupings for i in j]
numclusts = np.unique(sub_adts['1'].obs['leiden'].values.astype(int))
for i in np.setdiff1d(numclusts, grouped_clusts):
    groupings.append([i])

In [None]:
# Name the groups 'ct0', 'ct1', ... in the order they appear in `groupings`.
ctdict = {'ct%s' % i: group for i, group in enumerate(groupings)}

# Exact lookup from subcluster label to group name. The previous
# `replace(..., regex=True)` matched substrings, so a label such as '10' matched
# both '1' and '0', and already-written 'ctN' names could be rewritten again.
cluster_to_ct = {str(clust): ct for ct, members in ctdict.items() for clust in members}

sub_adts['1'].obs['celltype'] = sub_adts['1'].obs['leiden'].astype(str).map(cluster_to_ct)
# The merged cluster label is the group name without its 'ct' prefix.
sub_adts['1'].obs['leiden'] = [ct[len('ct'):] for ct in sub_adts['1'].obs['celltype']]

In [None]:
# UMAP of cluster '1' coloured by the merged subcluster labels.
fig, ax = plt.subplots(1,1,figsize=(5.25, 5))
for ax, f in zip(np.ravel(ax), ['leiden']):
    ax.set_facecolor('gray')
    sc.pl.umap(sub_adts['1'], color=f, ax=ax,show=False, return_fig=False)
plt.tight_layout()

In [None]:
# Subcluster cluster '2' at resolution 0.5 and plot the result.
sc.tl.leiden(sub_adts['2'], resolution=0.5) # subcluster them using Leiden
fig, ax = plt.subplots(1,1,figsize=(5.25, 5))
for ax, f in zip(np.ravel(ax), ['leiden']):
    ax.set_facecolor('gray')
    sc.pl.umap(sub_adts['2'], color=f, ax=ax, show=False, return_fig=False)
plt.tight_layout()

In [None]:
# Subclusters of cluster '2' to merge: each inner list becomes one group.
groupings = [[0, 5],
             [1, 2],
            ]
# Every subcluster not named above is appended as its own single-member group.
grouped_clusts = [i for j in groupings for i in j]
numclusts = np.unique(sub_adts['2'].obs['leiden'].values.astype(int))
for i in np.setdiff1d(numclusts, grouped_clusts):
    groupings.append([i])

In [None]:
# Name the groups 'ct0', 'ct1', ... in the order they appear in `groupings`.
ctdict = {'ct%s' % i: group for i, group in enumerate(groupings)}

# Exact lookup from subcluster label to group name. The previous
# `replace(..., regex=True)` matched substrings, so a label such as '10' matched
# both '1' and '0', and already-written 'ctN' names could be rewritten again.
cluster_to_ct = {str(clust): ct for ct, members in ctdict.items() for clust in members}

sub_adts['2'].obs['celltype'] = sub_adts['2'].obs['leiden'].astype(str).map(cluster_to_ct)
# The merged cluster label is the group name without its 'ct' prefix.
sub_adts['2'].obs['leiden'] = [ct[len('ct'):] for ct in sub_adts['2'].obs['celltype']]

In [None]:
# UMAP of cluster '2' coloured by the merged subcluster labels.
fig, ax = plt.subplots(1,1,figsize=(5.25, 5))
for ax, f in zip(np.ravel(ax), ['leiden']):
    ax.set_facecolor('gray')
    sc.pl.umap(sub_adts['2'], color=f, ax=ax,show=False, return_fig=False)
plt.tight_layout()

In [None]:
# Subcluster cluster '3' at resolution 0.2 and plot the result.
sc.tl.leiden(sub_adts['3'], resolution=0.2) # subcluster them using Leiden
sc.pl.umap(sub_adts['3'],color='leiden', size=5)

In [None]:
# Subcluster cluster '5' at resolution 0.5 and plot the result.
sc.tl.leiden(sub_adts['5'], resolution=0.5) # subcluster them using Leiden
sc.pl.umap(sub_adts['5'],color='leiden', size=5)

In [None]:
# Subcluster entry '5' of `sub_acg_t` at resolution 0.5 and plot the result.
# NOTE(review): `sub_acg_t` is not defined anywhere in the visible notebook (the
# cells above build `sub_adts`); confirm which object is intended.
sc.tl.leiden(sub_acg_t['5'], resolution=0.5) # subcluster them using Leiden
sc.pl.umap(sub_acg_t['5'],color='leiden', size=5)

In [None]:
# Subcluster entry '7' of `sub_acg_t` at resolution 0.2 and plot the result.
# NOTE(review): `sub_acg_t` is not defined anywhere in the visible notebook.
sc.tl.leiden(sub_acg_t['7'], resolution=0.2) # subcluster them using Leiden
sc.pl.umap(sub_acg_t['7'],color='leiden', size=5)

In [None]:
# Subcluster entry '10' of `sub_acg_t` at resolution 0.2 and plot the result.
# NOTE(review): `sub_acg_t` is not defined anywhere in the visible notebook.
sc.tl.leiden(sub_acg_t['10'], resolution=0.2) # subcluster them using Leiden
sc.pl.umap(sub_acg_t['10'],color='leiden', size=5)

In [None]:
def sub_cluster_mapper(adata, sub_adatas):
    '''
    Fold subclustering results back into ``adata.obs['leiden']``.

    For every parent cluster that has an entry in ``sub_adatas``, cells whose
    subcluster label is ``'0'`` keep the parent's label. Every other
    (parent, subcluster) combination becomes a brand-new cluster, numbered
    consecutively after the highest existing label, in sorted order of the
    intermediate ``"<parent>.<subcluster>"`` names.

    Parameters
    ----------
    adata
        Object with an ``.obs`` DataFrame holding a ``'leiden'`` column of
        integer-like labels. It is modified in place (no copy is made).
    sub_adatas
        Mapping from parent cluster label (str) to an object whose
        ``.obs['leiden']`` holds the subcluster label of each cell of that parent
        cluster, indexed by the same cell names as ``adata.obs``.

    Returns
    -------
    The same ``adata`` object, with ``obs['leiden']`` replaced by string labels.

    Raises
    ------
    KeyError
        If a cell of a subclustered parent is missing from the matching
        ``sub_adatas`` entry.
    ValueError
        If new clusters must be numbered but an existing label is not
        integer-like.
    '''
    parent_labels = adata.obs['leiden'].astype(str)
    new_leiden = parent_labels.copy()

    # Tag split-off cells with a temporary "<parent>.<subcluster>" name, one
    # vectorised assignment per parent cluster instead of a lookup per cell.
    for clust_name, sub_adata in sub_adatas.items():
        members = parent_labels.index[parent_labels == clust_name]
        if len(members) == 0:
            continue
        sub_labels = sub_adata.obs.loc[members, 'leiden'].astype(str)
        split_off = sub_labels[sub_labels != '0']
        new_leiden.loc[split_off.index] = clust_name + '.' + split_off

    # Give every temporary name a fresh integer label. The number of new labels
    # comes from the names actually present, not from an estimate.
    added_clusts = np.setdiff1d(new_leiden.to_numpy(), parent_labels.to_numpy())
    if len(added_clusts):
        first_new = parent_labels.astype(int).max() + 1
        renames = {old: str(first_new + offset) for offset, old in enumerate(added_clusts)}
        new_leiden = new_leiden.map(lambda label: renames.get(label, label))

    # Stored as plain strings; scanpy converts to categorical when it plots.
    adata.obs['leiden'] = new_leiden
    return adata

In [None]:
# Merge the subclustering results back into the parent object's 'leiden' column.
# NOTE(review): `acg_t` and `sub_acg_t` are not defined anywhere in the visible
# notebook; confirm whether `adts` / `sub_adts` are intended.
acg_t = sub_cluster_mapper(acg_t, sub_acg_t)

In [None]:
# Side-by-side UMAPs of `acg_t` coloured by 'cond' and by 'leiden'.
# NOTE(review): `acg_t` is not defined anywhere in the visible notebook.
fig, ax = plt.subplots(1, 2, figsize=(20,10))
for color, ax, palette in zip(['cond', 'leiden'], ax, [None, sc.pl.palettes.default_20]):
    ax.set_facecolor('black')
    sc.pl.umap(acg_t,color=color, ax=ax, show=False, return_fig=False, size=10, palette=palette)

# Single Gene Plotter

In [None]:
# Look up the full feature name(s) containing a substring (here 'CD34').
[i for i in adts.var_names if 'CD34' in i]

In [None]:
# Feature(s) to plot in the next cell.
# NOTE(review): the feature list further down names this marker 'CD16|FCGR3A';
# confirm that 'CD16' alone is a valid key.
f = ['CD16']

In [None]:
# UMAP coloured by the feature(s) currently held in `f`.
fig, ax = plt.subplots(1,1,figsize=(5,5))
ax.set_facecolor('black')
sc.pl.umap(adts, color=f, ax=ax,show=False, return_fig=False);

In [None]:
# The 25 features shown in the 5x5 panel grid of the next cell.
features = ['CD3|CD3E', 'CD20|MS4A1', 'IgD|IGHD', 'CD27|CD27', 'CD69|CD69', 
            'CD8|CD8A', 'CD4|CD4', 'CD45RO|PTPRC', 'CD45RA|PTPRC', 'CD185|CXCR5', 
            'CD62L|SELL', 'CD26|DPP4', 'CD103|ITGAE', 'CD56|NCAM1','CD16|FCGR3A',
           'CD14|CD14', 'CD25|IL2RA', 'CD94|KLRD1', 'CD11c|ITGAX', 'CD197|CCR7',
           'CD194|CCR4', 'TCRab|TRA_TRB', 'CD183|CXCR3', 'TCRgd|TRD_TRG', 'CD34|CD34']

In [None]:
# One UMAP panel per entry of `features`, coloured by its value in .X.
# NOTE: the loop leaves `f` bound to the last feature name, which the following
# cell then plots.
fig, ax = plt.subplots(5,5,figsize=(25,24))
for ax, f in tqdm(zip(np.ravel(ax), features)):
    ax.set_facecolor('black')
    sc.pl.umap(adts, color=f, ax=ax,show=False, return_fig=False, use_raw=False)
plt.tight_layout()

In [None]:
# UMAP coloured by whatever `f` currently holds. Run right after the grid cell
# above, that is the last entry of `features`.
fig, ax = plt.subplots(1,1,figsize=(5,5))
ax.set_facecolor('black')
sc.pl.umap(adts, color=f, ax=ax,show=False, return_fig=False);

In [None]:
# Drop every cell in Leiden cluster '17' and keep an independent copy of the rest.
adts = adts[~adts.obs['leiden'].isin(['17']),:].copy()