In [None]:
import scanpy as sc
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
import warnings
import itertools as it
import json
import requests
import seaborn as sns
import pickle as pkl
from functools import reduce

import matplotlib as mpl
import matplotlib.patches as patches
from matplotlib.gridspec import GridSpec
from shapely.geometry import Point
from shapely.geometry.polygon import Polygon

In [None]:
sc.settings.verbosity = 4
sc.settings.set_figure_params(dpi=80)
print(sc.__version__)
sc.settings.n_jobs=12

In [None]:
prefix = '/data/codec/production.run/mrna/'

In [None]:
# Location of the pickled `tnk` object, relative to the shared `prefix` directory.
path = prefix + 'pkls/aggr/tnk.2.pkl'

# Kept for reference: how the pickle at `path` was written.
# with open(path,'wb') as file:
#     pkl.dump(tnk, file, protocol=4)

# NOTE: unpickling can execute arbitrary code, so only load files from a trusted source.
with open(path,'rb') as file:
    tnk = pkl.load(file)

In [None]:
# Output file, one directory above `prefix`.
path = prefix + '../cond.colors.pkl'

# Pair each category of the `cond` column with the colour at the same position in
# `tnk.uns['cond_colors']`.
# NOTE(review): presumably `tnk.uns['cond_colors']` was filled by an earlier scanpy plot of `cond`
# that was saved with the pickle — confirm it exists on a fresh load.
cond_colors = dict(zip(tnk.obs['cond'].dtype.categories, tnk.uns['cond_colors']))

# Persist the condition -> colour mapping.
with open(path,'wb') as file:
    pkl.dump(cond_colors, file)

In [None]:
fig, ax = plt.subplots(1, 2, figsize=(20,10))
for color, ax, palette in zip(['cond', 'leiden'], ax, [None, sc.pl.palettes.default_20]):
    sc.pl.umap(tnk,color=color, ax=ax, show=False, return_fig=False, size=2, palette=palette)

In [None]:
clusts = [6, 10]

In [None]:
sub_tnk = dict()
for clust in clusts:
    sub_tnk[str(clust)] = tnk[tnk.obs['leiden'] == str(clust)].copy()

In [None]:
sc.tl.leiden(sub_tnk['6'], resolution=0.1) # subcluster them using Leiden
sc.pl.umap(sub_tnk['6'],color='leiden', size=0.6)

In [None]:
sc.tl.leiden(sub_tnk['10'], resolution=0.2) # subcluster them using Leiden
sc.pl.umap(sub_tnk['10'],color='leiden', size=0.6)

In [None]:
groupings = [[0, 2],
             ]
grouped_clusts = [i for j in groupings for i in j]
numclusts = np.unique(sub_tnk['10'].obs['leiden'].values.astype(int))
for i in np.setdiff1d(numclusts, grouped_clusts):
    groupings.append([i])

In [None]:
# Name each merged group "ct<N>", where N is the group's position in `groupings`.
ctdict = {'ct%s' % i: group for i, group in enumerate(groupings)}

# Exact-match lookup from an old cluster label to its merged group name.
# The previous `.replace(..., regex=True, inplace=True)` did substring matching on plain
# string columns (so '1' would also rewrite '10' or '11') and edited a selected column
# in place, which newer pandas versions do not propagate back to `.obs`.
clust_to_ct = {str(clust): ct for ct, members in ctdict.items() for clust in members}

sub_tnk['10'].obs['celltype'] = (sub_tnk['10'].obs['leiden']
                                 .astype(str)
                                 .map(clust_to_ct)
                                 .astype('category'))
# The new leiden label is the group number, i.e. the group name without its 'ct' prefix.
sub_tnk['10'].obs['leiden'] = [ct[len('ct'):] for ct in sub_tnk['10'].obs['celltype']]

In [None]:
sc.pl.umap(sub_tnk['10'],color=['leiden'], size=3)

In [None]:
def sub_cluster_mapper(adata, sub_adatas):
    '''
    Fold sub-clustering results back into the `leiden` column of `adata.obs`.

    Parameters
    ----------
    adata : object with `.obs` (DataFrame holding a `leiden` column of integer-like
        labels) and `.obs_names`.
    sub_adatas : dict mapping a parent cluster label (string key, e.g. '6') to an object
        whose `.obs['leiden']` holds the sub-cluster label of every cell of that parent.

    Cells in sub-cluster '0' keep the parent label. Every other (parent, sub-cluster)
    pair becomes a new cluster, numbered consecutively after the largest existing label.
    New numbers are handed out in the lexicographic order of the intermediate
    '<parent>.<sub>' names (so '10.1' is numbered before '6.1').

    `adata` is modified in place (its `leiden` column is overwritten with plain strings;
    scanpy converts it to a categorical when plotting) and is also returned. Work on a
    copy if the function may need to be re-run with different resolutions.

    Raises KeyError if a cell belonging to a listed parent cluster is missing from the
    corresponding entry of `sub_adatas`.
    '''
    leiden_col = adata.obs['leiden']

    # first integer label not yet used by an existing cluster
    new_clust_names_start = leiden_col.astype(int).max() + 1

    # start from the current labels and rewrite only the cells of sub-clustered parents
    labels = leiden_col.astype(object).to_numpy().copy()
    for clust_name, sub_adata in sub_adatas.items():
        in_clust = (leiden_col == clust_name).to_numpy()
        sub_leiden = sub_adata.obs.loc[leiden_col.index[in_clust], 'leiden']
        # temporary '<parent>.<sub>' names; sub-cluster '0' keeps the parent's name
        suffixed = np.array(clust_name + '.' + sub_leiden.astype(str), dtype=object)
        suffixed[(sub_leiden == '0').to_numpy()] = clust_name
        labels[in_clust] = suffixed

    # rename the temporary '<parent>.<sub>' labels to the next free integers
    new_leiden = pd.Series(labels, index=adata.obs_names)
    added_clusts = np.setdiff1d(new_leiden.astype(str), leiden_col.astype(str))
    renames = {old: str(new_clust_names_start + offset) for offset, old in enumerate(added_clusts)}
    new_leiden = new_leiden.map(lambda label: renames.get(label, label))

    # store as plain strings; do not convert to 'category' here, let scanpy do it as it plots
    adata.obs['leiden'] = new_leiden.astype(str)
    return adata

In [None]:
tnk = sub_cluster_mapper(tnk, sub_tnk)

In [None]:
sc.pl.umap(tnk,color=['leiden'], size=3)

In [None]:
groupings = [[1, 3, 4, 7, 8, 11],
             [0, 5, 9],
             [2, 10]
             ]
grouped_clusts = [i for j in groupings for i in j]
numclusts = np.unique(tnk.obs['leiden'].values.astype(int))
for i in np.setdiff1d(numclusts, grouped_clusts):
    groupings.append([i])

In [None]:
# Name each merged group "ct<N>", where N is the group's position in `groupings`.
ctdict = {'ct%s' % i: group for i, group in enumerate(groupings)}

# Exact-match lookup from an old cluster label to its merged group name.
# The previous `.replace(..., regex=True, inplace=True)` did substring matching on plain
# string columns (so '1' would also rewrite '10' or '11') and edited a selected column
# in place, which newer pandas versions do not propagate back to `.obs`.
clust_to_ct = {str(clust): ct for ct, members in ctdict.items() for clust in members}

tnk.obs['celltype'] = tnk.obs['leiden'].astype(str).map(clust_to_ct).astype('category')
# The new leiden label is the group number, i.e. the group name without its 'ct' prefix.
tnk.obs['leiden'] = [ct[len('ct'):] for ct in tnk.obs['celltype']]

In [None]:
sc.pl.umap(tnk, color=['leiden', 'CD3|CD3E','CD4|CD4','CD8|CD8A','CD56|NCAM1','CD16|FCGR3A'], size=3)

In [None]:
# Silence scanpy logging and Python warnings while ranking genes for clusters '4' and '5'.
sc.settings.verbosity = 0 
warnings.filterwarnings('ignore')
sc.tl.rank_genes_groups(tnk, groupby='leiden', n_genes=20, groups=['4','5'])
# NOTE: 'default' installs a new catch-all filter; it does not restore the filters that were active before.
warnings.filterwarnings('default')
sc.pl.rank_genes_groups(tnk, ncols=5, n_genes=20)
# Back to the verbose logging level set at the top of the notebook.
sc.settings.verbosity = 4

Cluster 4 is probably B/T multiplets and 5 is probably T/M multiplets.

In [None]:
# Copy the cells of one merged leiden cluster out of `tnk` for separate analysis; only the
# cluster-'3' subset (`acg_nk`) is active, the other subsets are left commented out.
# NOTE(review): `acg_t` is only defined in the commented-out line below, yet later cells use it
# (e.g. `acg_t[acg_t.obs['leiden'] == str(clust)]`). On a fresh kernel those cells raise NameError
# unless `acg_t` is created or loaded elsewhere — confirm where it should come from.
# acg_t = tnk[tnk.obs['leiden'] == '0'].copy()
acg_nk = tnk[tnk.obs['leiden'] == '3'].copy()
# br_t = tnk[tnk.obs['leiden'] == '1'].copy()
# br_nk = tnk[tnk.obs['leiden'] == '7'].copy()
# p_t = tnk[tnk.obs['leiden'] == '2'].copy()
# p_nk = tnk[tnk.obs['leiden'] == '6'].copy()

In [None]:
path = prefix + 'obs/acg.nk.txt'

with open(path,'w') as file:
    for bc in acg_nk.obs_names:
        file.write(bc + '\n')
        
# with open(path,'r') as file:
#     tnk_cells = [i.strip() for i in file.readlines()]

# Cell Type Split

In [None]:
def hv_run(adata, flavor='cell_ranger', min_mean=0.0125, min_disp=0.5, max_mean=3, bins=500):
    '''
    Run scanpy's highly variable genes annotation on a copy of `adata` and return the copy.

    The input object is never modified and no genes are removed. On top of the columns
    written by `sc.pp.highly_variable_genes`, the returned object gets two extra `.var`
    columns:

    - `log_means`: log10 of `means` after shifting them to be strictly positive
    - `log_disps`: log10 of `dispersions_norm` after the same kind of shift

    `flavor`, `min_mean`, `min_disp` and `max_mean` are passed straight through to
    `sc.pp.highly_variable_genes`; `bins` is passed as its `n_bins`.

    Raises IndexError if `means` or `dispersions_norm` holds a single distinct value
    that is zero or negative (no shift can be derived).
    '''
    hv_adata = adata.copy() # make a copy because I don't want to change the original just yet
    sc.pp.highly_variable_genes(hv_adata, flavor=flavor, inplace=True,
                                min_mean=min_mean,
                                min_disp=min_disp,
                                max_mean=max_mean,
                                n_bins=bins)

    means = hv_adata.var['means'].values
    disps = hv_adata.var['dispersions_norm'].values

    hv_adata.var['log_means'] = np.log10(means + _log10_shift(means))
    hv_adata.var['log_disps'] = np.log10(disps + _log10_shift(disps))

    return hv_adata

def _log10_shift(values):
    '''
    Offset to add to `values` so that their minimum is strictly positive before log10.

    Returns 0 when the minimum is already positive. Otherwise the offset places the
    minimum at the gap between the two smallest distinct values (minimum negative) or at
    the second-smallest distinct value (minimum exactly zero).
    '''
    distinct = np.unique(values) # already sorted ascending
    lowest = distinct[0]
    if lowest == 0:
        return distinct[1]
    if lowest < 0:
        return distinct[1] - 2*lowest
    return 0

def hv_plot(hv_adata, gate=None, highlight_genes=None, bw='scott'):
    '''
    Plot the `log_means` / `log_disps` columns of `hv_adata.var` (as written by `hv_run`).

    Four panels are drawn: a 2D density of log dispersions vs log means, a scatter of
    the same points, and 1D densities of the log means and of the log dispersions.

    Parameters
    ----------
    hv_adata : object whose `.var` holds `log_means` and `log_disps`.
    gate : optional sequence of (log_mean, log_disp) vertices; drawn as a translucent
        polygon on the scatter panel.
    highlight_genes : optional collection of gene names; matching genes are circled in
        red on the scatter panel.
    bw : bandwidth argument handed to every `sns.kdeplot` call.

    Returns nothing; the figure is left as the current matplotlib figure.
    '''
    # The values are already log10-transformed and are plotted on linear axes. Using log
    # axes instead would break gates containing sloped (non axis-parallel) segments.
    log_means = hv_adata.var['log_means'].values
    log_disps = hv_adata.var['log_disps'].values

    fig = plt.figure(figsize=(19, 6))
    gs = GridSpec(2, 3, figure=fig)
    ax1 = fig.add_subplot(gs[:, 0])
    ax2 = fig.add_subplot(gs[:, 1])
    ax3 = fig.add_subplot(gs[0, 2])
    ax4 = fig.add_subplot(gs[1, 2])

    # 2D density
    ax1.get_xaxis().set_minor_locator(mpl.ticker.AutoMinorLocator())
    ax1.get_yaxis().set_minor_locator(mpl.ticker.AutoMinorLocator())
    ax1.grid(True, which='both', axis='both', alpha=0.4)
    ax1.set_ylabel('Log Dispersions')
    ax1.set_xlabel('Log Means')
    # NOTE(review): positional x/y, `bw`, `shade` and `shade_lowest` are the older seaborn
    # kdeplot API; kept as-is to match the seaborn version this notebook was written for.
    sns.kdeplot(log_means, log_disps, bw=bw, shade=True, shade_lowest=False, ax=ax1, color='skyblue')

    # scatter, with minor ticks and a grid to help choose gate coordinates
    ax2.minorticks_on()
    ax2.grid(True, which='both', axis='both')
    ax2.scatter(log_means, log_disps, s=0.1, c='b')
    ax2.set_ylabel('Log Dispersions')
    ax2.set_xlabel('Log Means')

    # 1D density of the log means
    ax3.minorticks_on()
    ax3.grid(True, which='both', axis='both')
    ax3.set_title('Log Means')
    sns.kdeplot(log_means, bw=bw, ax=ax3, color='blue')

    # 1D density of the log dispersions (ticks and grid previously went to ax3 by mistake)
    ax4.minorticks_on()
    ax4.grid(True, which='both', axis='both')
    ax4.set_title('Log Dispersions')
    sns.kdeplot(log_disps, bw=bw, ax=ax4, color='blue')

    plt.tight_layout()

    if highlight_genes is not None: # do you want to highlight some genes?
        mask = [name in highlight_genes for name in hv_adata.var_names] # boolean mask of genes to highlight
        # red circle around the blue dot of each highlighted gene
        ax2.scatter(log_means[mask], log_disps[mask], s=30, facecolors='none', edgecolors='r')

    if gate is not None:
        # Draw the gate as a translucent patch. See `hv_genes` for the caveat about gates
        # with angled edges when the same polygon is used for subsetting.
        gatepatch = patches.Polygon(gate, linewidth=1, edgecolor='teal', facecolor='turquoise', alpha=0.5)
        ax2.add_patch(gatepatch)

def hv_genes(hv_adata, gate, adata=None):
    '''
    Select the genes whose (log_means, log_disps) point lies inside the polygon `gate`.

    Returns a list with one boolean per gene of `hv_adata` when `adata` is omitted;
    otherwise returns a copy of `adata` restricted to the genes inside the gate.
    '''
    # Author's caveat: ideally only rectangular gates (or polygons with right angles) would be
    # accepted here, because angled edges drawn in log space were observed to select the wrong points.
    gate_polygon = Polygon(gate) # the gate as a shapely polygon
    xs = hv_adata.var['log_means'].values
    ys = hv_adata.var['log_disps'].values

    # the workhorse: one containment test per gene
    inside = [gate_polygon.contains(Point(x, y)) for x, y in zip(xs, ys)]

    if adata is None: # nothing to subset, hand back the mask itself
        return inside
    return adata[:, inside].copy()

Generate a "highly variable adata" object, i.e. a copy of the data on which the highly variable genes extraction has been run. I generate a separate object because I don't necessarily want to change the original object.

In [None]:
def add_raw(adata, transformed=True, n_wells=12):
    '''
    Attach per-well data, restricted to the cells of `adata`, as `adata.raw` (in place).

    The per-well objects are read from the pickle at
    `prefix + 'pkls/aggr/wells.sng.w_covars.pkl'`, which is expected to be a mapping whose
    integer keys 0 .. n_wells-1 each hold a dict with an 'adata' entry.

    Parameters
    ----------
    adata : object whose `obs_names` select the cells to keep; its `.raw` is overwritten.
    transformed : when truthy, the attached data is normalized per cell to 1e6 counts
        and log1p-transformed; otherwise it is attached as loaded.
    n_wells : number of wells to concatenate (defaults to the 12 wells used so far).

    Returns None.
    '''
    path = prefix + 'pkls/aggr/wells.sng.w_covars.pkl'

    # NOTE: unpickling can execute arbitrary code, so only load files from a trusted source.
    with open(path, 'rb') as file:
        wells = pkl.load(file)

    # rebuild each barcode as <first 16 characters>-<well key>
    for well in wells:
        well_adata = wells[well]['adata']
        well_adata.obs_names = [bc[:16] + '-%s' % well for bc in well_adata.obs_names]

    well_adatas = [wells[i]['adata'] for i in range(n_wells)]
    # explicit copy: the normalization below works in place and must not act on a view
    raw = well_adatas[0].concatenate(*well_adatas[1:])[adata.obs_names, :].copy()

    if transformed:
        sc.pp.normalize_per_cell(raw, counts_per_cell_after=1e6)
        sc.pp.log1p(raw)

    adata.raw = raw
    return

In [None]:
add_raw(acg_nk)

In [None]:
hv_adata = hv_run(acg_nk.raw.to_adata())

In [None]:
hv_plot(hv_adata, highlight_genes= ['KIR2DL4'])

In [None]:
acg_nk

In [None]:
total_pcs = 150

In [None]:
acg_nk.shape

In [None]:
# Remove two genes before the PCA / neighbours / UMAP steps below.
# `.copy()` turns the sliced view into a stand-alone object, so the in-place steps that
# follow do not have to convert a view on the fly.
acg_nk = acg_nk[:, ~acg_nk.var_names.isin(['KIR2DL4', 'PDGFD'])].copy()

In [None]:
acg_nk.shape

In [None]:
warnings.filterwarnings('ignore')
sc.settings.n_jobs = 12
sc.pp.pca(acg_nk,n_comps=total_pcs)
warnings.filterwarnings('default')

In [None]:
sc.pl.pca_variance_ratio(acg_nk,log=True, n_pcs=150)

In [None]:
warnings.filterwarnings('ignore')
sc.pp.neighbors(acg_nk, n_neighbors=15, n_pcs=50)
warnings.filterwarnings('default')

In [None]:
sc.tl.umap(acg_nk)

In [None]:
sc.tl.leiden(acg_nk, resolution=0.3)

In [None]:
fig, ax = plt.subplots(1, 2, figsize=(20,10))
for color, ax, palette in zip(['cond', 'leiden'], ax, [None, sc.pl.palettes.default_20]):
    ax.set_facecolor('whitesmoke')
    sc.pl.umap(acg_nk,color=color, ax=ax, show=False, return_fig=False, size=20, palette=palette)

In [None]:
# Single-panel UMAP coloured by the `percent_mito` column.
# (The previous loop zipped the axes with `features`, which is not defined yet at this
# point of the notebook and was never used for the plot.)
fig, ax = plt.subplots(1, 1, figsize=(5.5, 5))
ax.set_facecolor('black')
sc.pl.umap(acg_nk, color='percent_mito', ax=ax, show=False, return_fig=False, size=20)

In [None]:
clusts = ['0','1']

In [None]:
sub_acg_nk = dict()
for clust in clusts:
    sub_acg_nk[str(clust)] = acg_nk[acg_nk.obs['leiden'] == str(clust)].copy()

In [None]:
sc.tl.leiden(sub_acg_nk['0'], resolution=0.5) # subcluster them using Leiden
sc.pl.umap(sub_acg_nk['0'],color='leiden', size=20)

In [None]:
sc.tl.leiden(sub_acg_nk['1'], resolution=0.4) # subcluster them using Leiden
sc.pl.umap(sub_acg_nk['1'],color='leiden', size=20)

In [None]:
def sub_cluster_mapper(adata, sub_adatas):
    '''
    Fold sub-clustering results back into the `leiden` column of `adata.obs`.

    Parameters
    ----------
    adata : object with `.obs` (DataFrame holding a `leiden` column of integer-like
        labels) and `.obs_names`.
    sub_adatas : dict mapping a parent cluster label (string key, e.g. '6') to an object
        whose `.obs['leiden']` holds the sub-cluster label of every cell of that parent.

    Cells in sub-cluster '0' keep the parent label. Every other (parent, sub-cluster)
    pair becomes a new cluster, numbered consecutively after the largest existing label.
    New numbers are handed out in the lexicographic order of the intermediate
    '<parent>.<sub>' names (so '10.1' is numbered before '6.1').

    `adata` is modified in place (its `leiden` column is overwritten with plain strings;
    scanpy converts it to a categorical when plotting) and is also returned. Work on a
    copy if the function may need to be re-run with different resolutions.

    Raises KeyError if a cell belonging to a listed parent cluster is missing from the
    corresponding entry of `sub_adatas`.
    '''
    leiden_col = adata.obs['leiden']

    # first integer label not yet used by an existing cluster
    new_clust_names_start = leiden_col.astype(int).max() + 1

    # start from the current labels and rewrite only the cells of sub-clustered parents
    labels = leiden_col.astype(object).to_numpy().copy()
    for clust_name, sub_adata in sub_adatas.items():
        in_clust = (leiden_col == clust_name).to_numpy()
        sub_leiden = sub_adata.obs.loc[leiden_col.index[in_clust], 'leiden']
        # temporary '<parent>.<sub>' names; sub-cluster '0' keeps the parent's name
        suffixed = np.array(clust_name + '.' + sub_leiden.astype(str), dtype=object)
        suffixed[(sub_leiden == '0').to_numpy()] = clust_name
        labels[in_clust] = suffixed

    # rename the temporary '<parent>.<sub>' labels to the next free integers
    new_leiden = pd.Series(labels, index=adata.obs_names)
    added_clusts = np.setdiff1d(new_leiden.astype(str), leiden_col.astype(str))
    renames = {old: str(new_clust_names_start + offset) for offset, old in enumerate(added_clusts)}
    new_leiden = new_leiden.map(lambda label: renames.get(label, label))

    # store as plain strings; do not convert to 'category' here, let scanpy do it as it plots
    adata.obs['leiden'] = new_leiden.astype(str)
    return adata

In [None]:
acg_nk = sub_cluster_mapper(acg_nk, sub_acg_nk)

In [None]:
fig, ax = plt.subplots(1, 2, figsize=(15,7))
for color, ax, palette in zip(['batch', 'leiden'], ax, [None, sc.pl.palettes.default_20]):
    ax.set_facecolor('whitesmoke')
    sc.pl.umap(acg_nk,color=color, ax=ax, show=False, return_fig=False, size=30, palette=palette)

In [None]:
def add_raw(adata, transformed=True, n_wells=12):
    '''
    Attach per-well data, restricted to the cells of `adata`, as `adata.raw` (in place).

    The per-well objects are read from the pickle at
    `prefix + 'pkls/aggr/wells.sng.w_covars.pkl'`, which is expected to be a mapping whose
    integer keys 0 .. n_wells-1 each hold a dict with an 'adata' entry.

    Parameters
    ----------
    adata : object whose `obs_names` select the cells to keep; its `.raw` is overwritten.
    transformed : when truthy, the attached data is normalized per cell to 1e6 counts
        and log1p-transformed; otherwise it is attached as loaded.
    n_wells : number of wells to concatenate (defaults to the 12 wells used so far).

    Returns None.
    '''
    path = prefix + 'pkls/aggr/wells.sng.w_covars.pkl'

    # NOTE: unpickling can execute arbitrary code, so only load files from a trusted source.
    with open(path, 'rb') as file:
        wells = pkl.load(file)

    # rebuild each barcode as <first 16 characters>-<well key>
    for well in wells:
        well_adata = wells[well]['adata']
        well_adata.obs_names = [bc[:16] + '-%s' % well for bc in well_adata.obs_names]

    well_adatas = [wells[i]['adata'] for i in range(n_wells)]
    # explicit copy: the normalization below works in place and must not act on a view
    raw = well_adatas[0].concatenate(*well_adatas[1:])[adata.obs_names, :].copy()

    if transformed:
        sc.pp.normalize_per_cell(raw, counts_per_cell_after=1e6)
        sc.pp.log1p(raw)

    adata.raw = raw
    return

In [None]:
add_raw(acg_nk)

In [None]:
sc.settings.verbosity = 0
warnings.filterwarnings('ignore')
sc.tl.rank_genes_groups(acg_nk, groupby='leiden', n_genes=100, use_raw=True)
warnings.filterwarnings('default')
sc.pl.rank_genes_groups(acg_nk, ncols=4, n_genes=20)
sc.settings.verbosity = 4

In [None]:
fig, ax = plt.subplots(1,6, figsize=(30,5))
for val, ax in tqdm(zip(acg_nk.obs['cond'].dtype.categories, np.ravel(ax))):
    acg_nk.obs['val'] = acg_nk.obs['cond'] == val
    ax.set_facecolor('black')
    sc.pl.umap(acg_nk,color='val', ax=ax, show=False, return_fig=False, title=val)
    acg_nk.obs.drop(columns='val', inplace=True)
plt.tight_layout()

In [None]:
fig, ax = plt.subplots(1,4, figsize=(30,7))
for val, ax in tqdm(zip(acg_nk.obs['leiden'].dtype.categories, np.ravel(ax))):
    acg_nk.obs['val'] = acg_nk.obs['leiden'] == val
    ax.set_facecolor('black')
    sc.pl.umap(acg_nk,color='val', ax=ax, show=False, return_fig=False, title=val)
    acg_nk.obs.drop(columns='val', inplace=True)
plt.tight_layout()

In [None]:
# Build `a`, a (number of conditions) x (number of clusters) array of cell counts:
# group the cells by (cond, leiden), count each group, sort by cond then leiden, and reshape the
# flat counts so that rows follow `conds` and columns follow `clusts`.
# NOTE(review): the reshape only works if the groupby yields every (cond, leiden) combination,
# including empty ones — presumably guaranteed by both columns being categorical; confirm.
conds = acg_nk.obs['cond'].dtype.categories
clusts = acg_nk.obs['leiden'].dtype.categories
a = acg_nk.obs.groupby(["cond", "leiden"]).size().reset_index(name='count').sort_values(['cond','leiden'])['count'].values.reshape((len(conds),(len(clusts))))

In [None]:
acg_nk.uns['cond_colors']

In [None]:
# Stacked bar chart: one bar per cluster, one coloured segment per condition.
labels = clusts
counts = dict()

# Row i of `a` holds the per-cluster counts of the i-th condition.
for cond, i in zip(conds, range(len(conds))):
    counts[cond] = a[i,:]

fig, ax = plt.subplots(figsize=(10,5))
# Running height of each bar; every new segment is drawn on top of it.
lastpos = [0]*len(clusts)
# Colours are paired with conditions by position in `acg_nk.uns['cond_colors']`.
for cond, color in zip(counts, acg_nk.uns['cond_colors']):
    ax.bar(labels, counts[cond], label=cond, bottom=lastpos, color=color)
    lastpos = counts[cond] + lastpos

ax.set_ylabel('Counts')
ax.legend()

plt.show()

In [None]:
raw_transcripts = acg_nk.raw.var_names
transcripts = acg_nk.var_names
proteins = [i for i in acg_nk.obs.columns if '|' in i]

In [None]:
name = 'TNFR'
print([i for i in proteins if name in i])
print([i for i in transcripts if name in i])
print([i for i in raw_transcripts if name in i])

In [None]:
proteins.index('CD4|CD4') 
# proteins.index('CD94|KLRD1')

In [None]:
features = ['CD56|NCAM1','NCAM1','CD16|FCGR3A','FCGR3A','NKG7']

In [None]:
fig, ax = plt.subplots(1,5,figsize=(30,5))
for ax, f in zip(np.ravel(ax), features):
    ax.set_facecolor('black')
    sc.pl.umap(acg_nk, color=f, ax=ax,show=False, return_fig=False, size=20, use_raw=True)

In [None]:
# For every leiden cluster, keep the first 5 ranked gene names that do not start with 'MT'.
# NOTE(review): the bare 'MT' prefix also drops any other gene whose name starts with MT
# (e.g. MTOR); if only mitochondrial genes are meant, the prefix is presumably 'MT-' — confirm.
plot_genes = list()
clusts = acg_nk.obs['leiden'].dtype.categories
for j in clusts:
    plot_genes.append([i for i in acg_nk.uns['rank_genes_groups']['names'][j] if not i.startswith('MT')][:5])

In [None]:
features = [i for j in plot_genes for i in j]

In [None]:
len(features)

In [None]:
fig, ax = plt.subplots(7,5,figsize=(30,31))
for ax, f in zip(np.ravel(ax), features):
    ax.set_facecolor('black')
    sc.pl.umap(acg_nk, color=f, ax=ax,show=False, return_fig=False, size=20, use_raw=True)

In [None]:
plot_genes = list()
clusts = acg_nk.obs['leiden'].dtype.categories
for j in clusts:
    plot_genes.append([i for i in acg_nk.uns['rank_genes_groups']['names'][j]])

In [None]:
features = [i for j in plot_genes for i in j]

In [None]:
sc.settings.verbosity = 0
warnings.filterwarnings('ignore')
sc.tl.rank_genes_groups(acg_nk, groupby='leiden', n_genes=1000, use_raw=True, groups=['3'], reference='0')
warnings.filterwarnings('default')
sc.pl.rank_genes_groups(acg_nk, ncols=4, n_genes=20)
sc.settings.verbosity = 4

In [None]:
features = [i for i in acg_nk.uns['rank_genes_groups']['names']['3']]

In [None]:
sc.pl.heatmap(acg_nk[(acg_nk.obs['leiden'] == '0') | (acg_nk.obs['leiden'] == '3')],groupby='leiden', var_names=features, use_raw=True, figsize=(30,15))

In [None]:
def get_dge(adata):
    '''
    Collect the differential expression results stored in `adata.uns['rank_genes_groups']`
    into a DataFrame with the columns scores, names, logfoldchanges, pvals and pvals_adj.

    Each stored field is converted to a plain array and flattened, so with several groups
    in the result the values of all groups end up in the same column.
    '''
    results = adata.uns['rank_genes_groups']
    fields = ('scores', 'names', 'logfoldchanges', 'pvals', 'pvals_adj')
    columns = {}
    for field in fields:
        columns[field] = np.array(results[field].tolist()).flatten()
    return pd.DataFrame(columns)

In [None]:
deg_data = get_dge(acg_nk)

In [None]:
# Volcano-style scatter: log fold change (x) against -log10 adjusted p-value (y).
plt.scatter(deg_data['logfoldchanges'].values, -np.log10(deg_data['pvals_adj'].values), s=1)
# plt.scatter(deg_data[deg_data['names'].str.contains('MT')]['logfoldchanges'].values, 
#             -np.log10(deg_data[deg_data['names'].str.contains('MT')]['pvals_adj'].values), s=1, color='r');
# NOTE(review): both axes are switched to log scale although the values are already log-transformed;
# a log axis cannot show zero or negative values, so genes with a non-positive log fold change
# disappear from the plot — confirm this is intended.
plt.yscale('log')
plt.xscale('log')

In [None]:
def grouped_rank(adata, groups, n_genes=20):
    '''
    Compare two sets of leiden clusters and plot the top-ranked genes of the first set.

    Parameters
    ----------
    adata : annotated data object with a `leiden` column in `.obs` and a UMAP embedding.
    groups : sequence of exactly two collections of cluster labels (ints or strings).
        The first collection is ranked against the second; all remaining clusters form
        a third group that takes no part in the comparison. The caller's sequence is
        not modified.
    n_genes : number of top-ranked genes to compute and show (default 20).

    Two figures are produced: a pair of UMAPs highlighting the two sets, and a panel with
    the names of the top genes placed at their scores. The temporary `val` and
    `rank_compare` columns are removed from `adata.obs` again before returning, and
    scanpy's verbosity is set to 4 on exit.

    Returns None.
    '''
    assert len(groups) == 2
    # string copies of the two requested sets, leaving the caller's list untouched
    groups = [np.array(group).astype(str) for group in groups]

    # every cluster not named by the caller goes into a third, "everything else" group
    grouped_clusts = [i for j in groups for i in j]
    numclusts = np.unique(adata.obs['leiden'].values)
    groups.append(np.setdiff1d(numclusts, grouped_clusts).astype('<U21'))

    # highlight the two requested sets on the UMAP (zip stops before the third group)
    fig, axes = plt.subplots(1, 2, figsize=(8, 3.5))
    for clusts, title, ax in zip(groups, ['0','1'], axes):
        adata.obs['val'] = adata.obs['leiden'].isin(clusts).values
        ax.set_facecolor('gray')
        sc.pl.umap(adata, color='val', ax=ax, size=10, show=False, return_fig=False, title=title)
    adata.obs.drop(columns='val', inplace=True)

    # Exact-match lookup from cluster label to group number ('0', '1' or '2'). A regex
    # replace would also rewrite labels that merely contain another label, e.g. '1' inside '10'.
    group_of = dict()
    for idx, clusts in enumerate(groups):
        for clust in clusts:
            group_of.setdefault(str(clust), str(idx))
    adata.obs['rank_compare'] = adata.obs['leiden'].astype(str).map(group_of).astype('category')

    fig, ax = plt.subplots(1, 1, figsize=(5,5))
    sc.settings.verbosity = 0
    try:
        # rank on a copy so the stored `rank_genes_groups` results of `adata` are not overwritten
        rank_adata = adata.copy()
        sc.tl.rank_genes_groups(rank_adata, groupby='rank_compare', n_genes=n_genes, groups=['0'], reference='1', use_raw=True)
        x = [i[0] for i in rank_adata.uns['rank_genes_groups']['scores'].tolist()]
        txts = [i[0] for i in rank_adata.uns['rank_genes_groups']['names'].tolist()]
        y = list(range(len(x)))[::-1] # best-ranked gene at the top
        ax.scatter(x, y, s=0) # invisible points, only there to set the axis limits
        for i, txt in enumerate(txts):
            ax.annotate(txt, (x[i], y[i]), rotation=0, size=10)
        ax.set_yticklabels([])
    finally:
        # clean up even if the ranking fails
        sc.settings.verbosity = 4
        adata.obs.drop(columns='rank_compare', inplace=True)
    return

In [None]:
grouped_rank(acg_nk,groups=[[0],[3]])

In [None]:
# Single-panel UMAP coloured by SLC35B4, read from `.raw`.
# (The previous loop zipped the single axis with the unrelated `features` list, which was
# never used for the plot.)
fig, ax = plt.subplots(1, 1, figsize=(5.5, 5))
ax.set_facecolor('black')
sc.pl.umap(acg_nk, color='SLC35B4', ax=ax, show=False, return_fig=False, size=20, use_raw=True)

In [None]:
# Rank genes of cluster compare[0] against reference cluster compare[1] and show the
# names of the top 20 genes placed at their scores.
compare = ['4','3']
by = 'leiden'
sc.settings.verbosity = 0
warnings.filterwarnings('ignore')
fig, ax = plt.subplots(1, 1, figsize=(5,5))
for ax in np.ravel(ax):
    # rank on a copy so the `rank_genes_groups` results stored on `acg_nk` are not overwritten
    rank_adata = acg_nk.copy()
    sc.tl.rank_genes_groups(rank_adata, groupby=by, n_genes=20, groups=[compare[0]], reference=compare[1])
    y = range(20)[::-1]
    x = [i[0] for i in rank_adata.uns['rank_genes_groups']['scores'].tolist()]
    txts = [i[0] for i in rank_adata.uns['rank_genes_groups']['names'].tolist()]
    # invisible points (size 0), only there to set the axis limits for the annotations
    ax.scatter(x, y, s=0)
    for i, txt in enumerate(txts):
        ax.annotate(txt, (x[i], y[i]), rotation=0, size=10)
    ax.set_title('%s vs %s' % (compare[0], compare[1]))
    ax.set_yticklabels([])
sc.settings.verbosity = 4
warnings.filterwarnings('default')

In [None]:
# Rank genes of cluster compare[0] against reference cluster compare[1] and show the
# names of the top 20 genes placed at their scores.
compare = ['0','4']
by = 'leiden'
sc.settings.verbosity = 0
warnings.filterwarnings('ignore')
fig, ax = plt.subplots(1, 1, figsize=(5,5))
for ax in np.ravel(ax):
    # rank on a copy so the `rank_genes_groups` results stored on `acg_nk` are not overwritten
    rank_adata = acg_nk.copy()
    sc.tl.rank_genes_groups(rank_adata, groupby=by, n_genes=20, groups=[compare[0]], reference=compare[1])
    y = range(20)[::-1]
    x = [i[0] for i in rank_adata.uns['rank_genes_groups']['scores'].tolist()]
    txts = [i[0] for i in rank_adata.uns['rank_genes_groups']['names'].tolist()]
    # invisible points (size 0), only there to set the axis limits for the annotations
    ax.scatter(x, y, s=0)
    for i, txt in enumerate(txts):
        ax.annotate(txt, (x[i], y[i]), rotation=0, size=10)
    ax.set_title('%s vs %s' % (compare[0], compare[1]))
    ax.set_yticklabels([])
sc.settings.verbosity = 4
warnings.filterwarnings('default')

In [None]:
fig, ax = plt.subplots(12,8,figsize=(30,40))
for p, ax in tqdm(zip(proteins, np.ravel(ax))):
    ax.set_facecolor('black')
    sc.pl.umap(acg_nk, color=p, size=10, ncols=8, ax=ax, show=False, return_fig=False)
plt.tight_layout()

In [None]:
clusts = [1, 2, 3, 4, 5, 6, 7]

In [None]:
sub_acgt = dict()
for clust in clusts:
    sub_acgt[str(clust)] = acg_t[acg_t.obs['leiden'] == str(clust)].copy()

In [None]:
sc.tl.leiden(sub_acgt['1'], resolution=0.7) # subcluster them using Leiden
sc.pl.umap(sub_acgt['1'],color='leiden', size=3)

In [None]:
groupings = [[2, 4],
            ]
grouped_clusts = [i for j in groupings for i in j]
numclusts = np.unique(sub_acgt['1'].obs['leiden'].values.astype(int))
for i in np.setdiff1d(numclusts, grouped_clusts):
    groupings.append([i])

In [None]:
# Name each merged group "ct<N>", where N is the group's position in `groupings`.
ctdict = {'ct%s' % i: group for i, group in enumerate(groupings)}

# Exact-match lookup from an old cluster label to its merged group name.
# The previous `.replace(..., regex=True, inplace=True)` did substring matching on plain
# string columns (so '1' would also rewrite '10' or '11') and edited a selected column
# in place, which newer pandas versions do not propagate back to `.obs`.
clust_to_ct = {str(clust): ct for ct, members in ctdict.items() for clust in members}

sub_acgt['1'].obs['celltype'] = (sub_acgt['1'].obs['leiden']
                                 .astype(str)
                                 .map(clust_to_ct)
                                 .astype('category'))
# The new leiden label is the group number, i.e. the group name without its 'ct' prefix.
sub_acgt['1'].obs['leiden'] = [ct[len('ct'):] for ct in sub_acgt['1'].obs['celltype']]

In [None]:
sc.pl.umap(sub_acgt['1'],color='leiden', size=5)

In [None]:
sc.tl.leiden(sub_acgt['2'], resolution=0.2) # subcluster them using Leiden
sc.pl.umap(sub_acgt['2'],color='leiden', size=5)

In [None]:
sc.tl.leiden(sub_acgt['3'], resolution=0.3) # subcluster them using Leiden
sc.pl.umap(sub_acgt['3'],color='leiden', size=5)

In [None]:
sc.tl.leiden(sub_acgt['4'], resolution=0.3) # subcluster them using Leiden
sc.pl.umap(sub_acgt['4'],color='leiden', size=5)

In [None]:
sc.tl.leiden(sub_acgt['5'], resolution=0.3) # subcluster them using Leiden
sc.pl.umap(sub_acgt['5'],color='leiden', size=5)

In [None]:
sc.tl.leiden(sub_acgt['6'], resolution=0.3) # subcluster them using Leiden
sc.pl.umap(sub_acgt['6'],color='leiden', size=5)

In [None]:
sc.tl.leiden(sub_acgt['7'], resolution=0.3) # subcluster them using Leiden
sc.pl.umap(sub_acgt['7'],color='leiden', size=5)

In [None]:
def sub_cluster_mapper(adata, sub_adatas):
    '''
    Fold sub-clustering results back into the `leiden` column of `adata.obs`.

    Parameters
    ----------
    adata : object with `.obs` (DataFrame holding a `leiden` column of integer-like
        labels) and `.obs_names`.
    sub_adatas : dict mapping a parent cluster label (string key, e.g. '6') to an object
        whose `.obs['leiden']` holds the sub-cluster label of every cell of that parent.

    Cells in sub-cluster '0' keep the parent label. Every other (parent, sub-cluster)
    pair becomes a new cluster, numbered consecutively after the largest existing label.
    New numbers are handed out in the lexicographic order of the intermediate
    '<parent>.<sub>' names (so '10.1' is numbered before '6.1').

    `adata` is modified in place (its `leiden` column is overwritten with plain strings;
    scanpy converts it to a categorical when plotting) and is also returned. Work on a
    copy if the function may need to be re-run with different resolutions.

    Raises KeyError if a cell belonging to a listed parent cluster is missing from the
    corresponding entry of `sub_adatas`.
    '''
    leiden_col = adata.obs['leiden']

    # first integer label not yet used by an existing cluster
    new_clust_names_start = leiden_col.astype(int).max() + 1

    # start from the current labels and rewrite only the cells of sub-clustered parents
    labels = leiden_col.astype(object).to_numpy().copy()
    for clust_name, sub_adata in sub_adatas.items():
        in_clust = (leiden_col == clust_name).to_numpy()
        sub_leiden = sub_adata.obs.loc[leiden_col.index[in_clust], 'leiden']
        # temporary '<parent>.<sub>' names; sub-cluster '0' keeps the parent's name
        suffixed = np.array(clust_name + '.' + sub_leiden.astype(str), dtype=object)
        suffixed[(sub_leiden == '0').to_numpy()] = clust_name
        labels[in_clust] = suffixed

    # rename the temporary '<parent>.<sub>' labels to the next free integers
    new_leiden = pd.Series(labels, index=adata.obs_names)
    added_clusts = np.setdiff1d(new_leiden.astype(str), leiden_col.astype(str))
    renames = {old: str(new_clust_names_start + offset) for offset, old in enumerate(added_clusts)}
    new_leiden = new_leiden.map(lambda label: renames.get(label, label))

    # store as plain strings; do not convert to 'category' here, let scanpy do it as it plots
    adata.obs['leiden'] = new_leiden.astype(str)
    return adata

In [None]:
acg_t = sub_cluster_mapper(acg_t, sub_acgt)

In [None]:
fig, ax = plt.subplots(1, 2, figsize=(20,10))
for color, ax, palette in zip(['cond', 'leiden'], ax, [None, sc.pl.palettes.default_20]):
    sc.pl.umap(acg_t,color=color, ax=ax, show=False, return_fig=False, size=4, palette=palette)

In [None]:
sc.settings.verbosity = 0
warnings.filterwarnings('ignore')
sc.tl.rank_genes_groups(acg_t, groupby='leiden', n_genes=1000, use_raw=True)
warnings.filterwarnings('default')
sc.pl.rank_genes_groups(acg_t, ncols=5, n_genes=20)
sc.settings.verbosity = 4

In [None]:
sc.pl.umap(acg_t, color=['CD4|CD4','CD8|CD8A'], size=2, ncols=5, use_raw=True) 

In [None]:
sc.pl.umap(acg_t, color=['CCR7', 'CD45RA|PTPRC','CD8|CD8A','CD8A', 'CD8B'], size=2, ncols=5, use_raw=True) 

In [None]:
fig, ax = plt.subplots(1,1,figsize=(8,8))
ax.set_facecolor('black')
sc.pl.umap(acg_t, color='leiden', size=10, ax=ax)

In [None]:
path = prefix + 'pkls/aggr/tnk.3.pkl'

with open(path,'wb') as file:
    pkl.dump(acg_t, file, protocol=4)

# with open(path,'rb') as file:
#     acg_t = pkl.load(file)

In [None]:
def add_raw(adata, transformed=True):
    """Attach per-well data to ``adata`` as its ``.raw`` attribute.

    Loads the pickled ``wells`` mapping from
    ``prefix + 'pkls/aggr/wells.sng.w_covars.pkl'``, rewrites each well's
    ``obs_names`` to ``<first 16 characters>-<well key>``, concatenates wells
    ``0`` through ``11`` and keeps only the rows named in ``adata.obs_names``.

    Parameters
    ----------
    adata
        Object whose ``.raw`` is set in place. Its ``obs_names`` are used to
        index the concatenated wells.
    transformed
        If truthy (the default), the selected rows are normalized per cell to
        1e6 total counts and log1p-transformed before being stored; otherwise
        they are stored as loaded.

    Returns
    -------
    None
    """
    path = prefix + 'pkls/aggr/wells.sng.w_covars.pkl'

    # NOTE: pickle.load can execute arbitrary code; only point this at trusted files.
    with open(path, 'rb') as file:
        wells = pkl.load(file)

    for well in wells:
        well_adata = wells[well]['adata']
        well_adata.obs_names = [i[:16] + '-%s' % well for i in well_adata.obs_names]

    # NOTE(review): assumes exactly twelve wells keyed 0..11 - confirm against the pickle.
    combined = wells[0]['adata'].concatenate(*[wells[i]['adata'] for i in range(1, 12)])
    raw = combined[adata.obs_names, :]

    if transformed:
        # Indexing returns a view; take an explicit copy so the in-place normalization
        # below does not rely on an implicit view-to-copy conversion.
        raw = raw.copy()
        sc.pp.normalize_per_cell(raw, counts_per_cell_after=1e6)
        sc.pp.log1p(raw)

    adata.raw = raw
    return

In [None]:
# Set acg_t.raw from the per-well data; add_raw's default transformed=True applies
# the per-cell normalization and log1p before storing.
add_raw(acg_t)

In [None]:
# Name pools used for lookups below: var names of acg_t.raw, var names of acg_t,
# and the obs columns whose name contains '|' (collected as `proteins`).
raw_transcripts = acg_t.raw.var_names
transcripts = acg_t.var_names
proteins = [i for i in acg_t.obs.columns if '|' in i]

In [None]:
# Substring search for `name` in each pool, printed in the order
# proteins, transcripts, raw_transcripts.
name = 'CCR7'
for pool in (proteins, transcripts, raw_transcripts):
    matches = [entry for entry in pool if name in entry]
    print(matches)

In [None]:
# Position of 'TCRgd|TRD_TRG' in the `proteins` list (list.index raises ValueError if absent).
proteins.index('TCRgd|TRD_TRG') 
# proteins.index('CD94|KLRD1')

In [None]:
# The 25 features drawn on the 5x5 UMAP grid in the next cell.
# Names containing '|' follow the naming of the `proteins` obs columns.
features = ['CD3|CD3E', 'CD4|CD4','CD8|CD8A', 'CD45RA|PTPRC','CD45RO|PTPRC',
            'CD185|CXCR5', 'CD62L|SELL', 'CXCR3','CCR4', 'CCR6', 
            'LAT','RGS10','CD69|CD69', 'PITPNC1', 'SLC5A3',
            'TRDC', 'TRDV2', 'IKZF2', 'FOXP3', 'CD94|KLRD1',
            'CD26|DPP4', 'CD103|ITGAE', 'CCR7', 'NKG7','CCL5'
           ]

In [None]:
# Draw each entry of `features` on its own black-background UMAP panel in a 5x5 grid.
fig, axes = plt.subplots(5, 5, figsize=(30,27.5))
for feature, panel in zip(features, axes.ravel()):
    panel.set_facecolor('black')
    sc.pl.umap(acg_t, color=feature, use_raw=True, size=5,
               ax=panel, show=False, return_fig=False)

In [None]:
# Cell-type label for each cluster; order matters, because the next cell pairs
# entry i of this list with cluster number i.
# NOTE(review): "Na誰ve" looks like a mis-encoded "Naïve". good_cts spells it with the same
# bytes, so the labels still match - if it is corrected, correct both lists together.
cts = ["Na誰ve CD4", "Th2", "Early Activated CD4", "Na誰ve CD8", 
       "Effector Memory RA CD8", "Effector Memory CD8", "Resting Treg", "Mito", 
       "Th17", "Tfh", "Tfh/Proliferation", "Doublet? 1", 
       "R848 Contam 1", "Early Activated CD8", "Central Memory CD8", "Gamma Delta Treg?", 
       "Gamma Delta Type2", "R848 Contam? 2", "Activated Treg", "Doublet? 2"]

In [None]:
# Map each cell-type label to the one-element list holding its cluster number
# (the label's position in `cts`). enumerate() keeps the mapping in step with
# len(cts); the former hard-coded range(20) would silently drop labels past the 20th.
ctdict = {ct: [clust] for clust, ct in enumerate(cts)}

In [None]:
# Label every cell in obs['ct1'] with the cell type of its 'leiden' cluster.
# Clusters are matched by exact label. The previous substring replace (regex=True) could
# rewrite the '1' inside '10'..'19' and the digits inside names already substituted
# (e.g. 'Th2', 'Th17'), and the chained inplace replace is not guaranteed to write
# back into acg_t.obs.
clust_to_ct = {str(clust): ct for ct in ctdict for clust in ctdict[ct]}
leiden_labels = acg_t.obs['leiden'].astype(str)
# Clusters that have no entry in ctdict keep their leiden label, as before.
acg_t.obs['ct1'] = leiden_labels.map(clust_to_ct).fillna(leiden_labels)

In [None]:
# UMAP of acg_t coloured by 'ct1', with the labels drawn on the data (legend_loc='on data').
fig, ax = plt.subplots(1,1,figsize=(10,10))
ax.set_facecolor('whitesmoke')
sc.pl.umap(acg_t, color='ct1', size=10, ax=ax, legend_loc='on data', legend_fontsize=10)

In [None]:
# Same 'ct1' UMAP as the on-data version, but with scanpy's default legend placement.
fig, ax = plt.subplots(1,1,figsize=(10,10))
ax.set_facecolor('whitesmoke')
sc.pl.umap(acg_t, color='ct1', size=10, ax=ax)

In [None]:
# One UMAP panel per category of obs['cond'], colouring cells by membership in that category.
conds = acg_t.obs['cond'].dtype.categories
# Size the row from the data: a hard-coded panel count lets zip() silently drop categories.
fig, axes = plt.subplots(1, len(conds), figsize=(30,4))
try:
    for val, panel in tqdm(zip(conds, np.ravel(axes)), total=len(conds)):
        # Scratch boolean column: True for the cells belonging to the current category.
        acg_t.obs['val'] = acg_t.obs['cond'] == val
        panel.set_facecolor('black')
        sc.pl.umap(acg_t, color='val', ax=panel, show=False, return_fig=False, title=val)
finally:
    # Remove the scratch column even if a plot call fails part-way through.
    acg_t.obs.drop(columns='val', inplace=True, errors='ignore')
plt.tight_layout()

In [None]:
# Rank genes for cond 'A' against reference 'C' within acg_t_subs['Naive CD8'], then plot them.
# NOTE(review): check that acg_t_subs is defined before this cell on a fresh kernel; its key
# is spelled 'Naive CD8', which differs from the spelling used in `cts`.
sc.settings.verbosity = 0
try:
    # catch_warnings() restores the previous filters on exit; filterwarnings('default')
    # did not restore them, it installed a show-everything filter instead.
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        sc.tl.rank_genes_groups(acg_t_subs['Naive CD8'], groupby='cond', n_genes=100, groups=['A'], reference='C')
    sc.pl.rank_genes_groups(acg_t_subs['Naive CD8'])
finally:
    # 4 is the notebook-wide verbosity chosen in the setup cell; restore it even on failure.
    sc.settings.verbosity = 4

In [None]:
# Subset of `cts` used for the per-cell-type comparisons below: everything except
# 'Mito' and the 'Doublet?' / 'R848 Contam' entries (15 labels, one per panel of a 3x5 grid).
good_cts = ["Na誰ve CD4", "Th2", "Early Activated CD4", "Na誰ve CD8", 
            "Effector Memory RA CD8", "Effector Memory CD8", "Resting Treg", 
            "Th17", "Tfh", "Tfh/Proliferation", "Early Activated CD8", 
            "Central Memory CD8", "Gamma Delta Treg?", 
            "Gamma Delta Type2", "Activated Treg"
           ]

In [None]:
# For each cell type in good_cts: rank genes for cond 'A' against reference 'C' within that
# cell type and write the top gene names at (rank, score) on the cell type's own panel.
n_top = 20  # genes ranked and annotated per panel
sc.settings.verbosity = 0
try:
    # Scope the warning suppression to this cell; it used to stay active (together with
    # verbosity 0) for every cell that ran afterwards.
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        fig, axes = plt.subplots(3, 5, figsize=(30,18))
        panels = np.ravel(axes)
        for ct, panel in tqdm(zip(good_cts, panels), total=min(len(good_cts), len(panels))):
            rank_adata = acg_t[acg_t.obs['ct1'] == ct].copy()
            sc.tl.rank_genes_groups(rank_adata, groupby='cond', n_genes=n_top, groups=['A'], reference='C')
            ranked = rank_adata.uns['rank_genes_groups']
            # Each row holds one value per ranked group; only one group is requested, so take [0].
            y = [i[0] for i in ranked['scores'].tolist()]
            txts = [i[0] for i in ranked['names'].tolist()]
            # Derive the rank axis from what was returned instead of repeating the 20.
            x = range(len(y))
            # Zero-size markers: the scatter only sets the axis limits for the text labels.
            panel.scatter(x, y, s=0)
            for i, txt in enumerate(txts):
                panel.annotate(txt, (x[i], y[i]), rotation=45, size=10)
            panel.set_title(ct)
finally:
    # 4 is the notebook-wide verbosity chosen in the setup cell.
    sc.settings.verbosity = 4

In [None]:
# Compare one feature between conditions: UMAP of the cond == 'C' cells (left panel)
# and the cond == 'A' cells (right panel), both coloured by `gene`.
gene = 'CD137|TNFRSF9'
fig, ax = plt.subplots(1,2,figsize=(12,5))
for cond, ax in zip(['C','A'], ax):
    ax.set_facecolor('black')
    sc.pl.umap(acg_t[acg_t.obs['cond'] == cond], ax=ax, show=False, return_fig=False, color=gene, size=10, use_raw=True)

In [None]:
# For each cell type in good_cts: rank genes for cond 'G' against reference 'C' within that
# cell type and write the top gene names at (score, rank) on the cell type's own panel,
# with the best-ranked gene at the top.
n_top = 20  # genes ranked and annotated per panel
sc.settings.verbosity = 0
try:
    # Scope the warning suppression to this cell; it used to stay active (together with
    # verbosity 0) for every cell that ran afterwards.
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        fig, axes = plt.subplots(3, 5, figsize=(30,18))
        panels = np.ravel(axes)
        for ct, panel in tqdm(zip(good_cts, panels), total=min(len(good_cts), len(panels))):
            rank_adata = acg_t[acg_t.obs['ct1'] == ct].copy()
            sc.tl.rank_genes_groups(rank_adata, groupby='cond', n_genes=n_top, groups=['G'], reference='C')
            ranked = rank_adata.uns['rank_genes_groups']
            # Each row holds one value per ranked group; only one group is requested, so take [0].
            x = [i[0] for i in ranked['scores'].tolist()]
            txts = [i[0] for i in ranked['names'].tolist()]
            # Reversed rank axis, sized from what was returned instead of repeating the 20.
            y = range(len(x))[::-1]
            # Zero-size markers: the scatter only sets the axis limits for the text labels.
            panel.scatter(x, y, s=0)
            for i, txt in enumerate(txts):
                panel.annotate(txt, (x[i], y[i]), rotation=0, size=10)
            panel.set_title(ct)
finally:
    # 4 is the notebook-wide verbosity chosen in the setup cell.
    sc.settings.verbosity = 4

In [None]:
# Compare one feature between conditions: UMAP of the cond == 'C' cells (left panel)
# and the cond == 'G' cells (right panel), both coloured by `gene`.
gene = 'AHNAK'
fig, ax = plt.subplots(1,2,figsize=(12,5))
for cond, ax in zip(['C','G'], ax):
    ax.set_facecolor('black')
    sc.pl.umap(acg_t[acg_t.obs['cond'] == cond], ax=ax, show=False, return_fig=False, color=gene, size=10, use_raw=True)

In [None]:
# One UMAP panel per entry of `proteins` (the obs columns containing '|'), eight per row.
sc.pl.umap(acg_t, color=proteins, size=2, ncols=8)

In [None]:
# Rank the top 40 genes for 'leiden' group '18' against reference group '6', then plot the
# top 20. This overwrites acg_t.uns['rank_genes_groups'], which the following cells read.
sc.settings.verbosity = 0
try:
    # catch_warnings() restores the previous filters on exit; filterwarnings('default')
    # did not restore them, it installed a show-everything filter instead.
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        sc.tl.rank_genes_groups(acg_t, groupby='leiden', n_genes=40, groups=['18'],reference='6')
    sc.pl.rank_genes_groups(acg_t, ncols=5, n_genes=20)
finally:
    # 4 is the notebook-wide verbosity chosen in the setup cell; restore it even on failure.
    sc.settings.verbosity = 4

In [None]:
# First field of every row of the ranked-names table stored by the last rank_genes_groups
# call on acg_t - presumably the group '18' vs '6' ranking from the cell above; confirm.
[i[0] for i in acg_t.uns['rank_genes_groups']['names']]

In [None]:
# UMAP panels for the first 20 names of the current ranking in acg_t.uns['rank_genes_groups'].
sc.pl.umap(acg_t, color=[i[0] for i in acg_t.uns['rank_genes_groups']['names'][:20]], size=2, ncols=5)

In [None]:
# UMAP panels for the remaining names (index 20 onward) of the current ranking; with the
# n_genes=40 ranking of '18' vs '6' that is names 21-40.
sc.pl.umap(acg_t, color=[i[0] for i in acg_t.uns['rank_genes_groups']['names'][20:]], size=2, ncols=5)