In [None]:
import scanpy as sc
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
import warnings
import itertools as it
import json
import requests
import seaborn as sns
import pickle as pkl
from functools import reduce

import matplotlib as mpl
import matplotlib.patches as patches
from matplotlib.gridspec import GridSpec
from shapely.geometry import Point
from shapely.geometry.polygon import Polygon

In [None]:
sc.settings.verbosity = 4
sc.settings.set_figure_params(dpi=80)
print(sc.__version__)
sc.settings.n_jobs=12

In [None]:
prefix = '/data/codec/production.run/mrna/'

In [None]:
# Text file holding the barcodes of the cells to keep, one barcode per line.
path = prefix + 'obs/acg.nk.txt'

# Kept for reference: code that writes the barcode list to `path`.
# with open(path,'w') as file:
#     for bc in acg_nk.obs_names:
#         file.write(bc + '\n')

with open(path,'r') as file:
    # strip surrounding whitespace (including the trailing newline) from every line
    acg_nk_cells = list(map(str.strip, file))

In [None]:
# Load `wells` from a pickle under `prefix`. The cells below index it by well key
# and read an AnnData from each entry's 'adata' key.
path = prefix + 'pkls/aggr/wells.sng.w_covars.pkl'

# Kept for reference: code that writes the pickle.
# with open(path,'wb') as file:
#     pkl.dump(wells, file)
    
# NOTE(review): pickle.load can execute arbitrary code; only load files from a trusted source.
with open(path,'rb') as file:
    wells = pkl.load(file)

### Adjust Cell Barcodes, Filter

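Each cell barcode is truncated to its first 16 characters and suffixed with its well key (`<barcode>-<well>`), so that every barcode identifies the well it came from. I made the same adjustment to the ADT barcodes.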

In [None]:
# Rename every cell to <first 16 characters of its barcode>-<well key>.
for well in wells:
    well_adata = wells[well]['adata']
    suffix = '-%s' % well
    well_adata.obs_names = [barcode[:16] + suffix for barcode in well_adata.obs_names]

### Concatenate

In [None]:
# Concatenate the AnnData of wells 0-11, keep only the cells listed in acg_nk_cells, and copy the result.
# I really shouldn't do this, I should go back and run cellranger aggr, but for now just concatenate.
# NOTE(review): the number of wells is hard-coded through range(1, 12); update it if `wells` changes size.
acg_nk = wells[0]['adata'].concatenate(*[wells[i]['adata'] for  i in range(1, 12)])[acg_nk_cells].copy()

In [None]:
# Total counts per gene (column sums). The sparse matrix is summed directly instead of
# calling .toarray() first, which would materialise the full cells x genes array in memory.
# np.asarray(...).ravel() flattens the 1 x n_genes result into a plain 1-D array.
acg_nk.var['n_counts'] = np.asarray(acg_nk.X.sum(axis=0)).ravel()

### Filter Genes, Transform Data

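Inspect the distribution of total counts per gene to spot genes with very low counts. No genes are actually dropped in this section; gene selection happens in the highly variable genes step below. Then normalize each cell to 1e6 total counts and log-transform.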

In [None]:
# Histogram of the total counts per gene computed above.
plt.figure(figsize=(8,6))
plt.hist(acg_nk.var['n_counts'].values)
plt.grid(True, which='both', axis='both')  # a preceding grid(False) was redundant and has been dropped
plt.xlabel('Total counts per gene')
plt.ylabel('Number of genes')
# Uncomment to view on log axes:
# plt.xscale('log')
# plt.yscale('log')

In [None]:
sc.pp.normalize_per_cell(acg_nk, counts_per_cell_after=1e6)

In [None]:
sc.pp.log1p(acg_nk)

In [None]:
# Load the pickled ADT object and attach its per-cell values to acg_nk.obs as extra columns.
path = '/data/codec/production.run/adts/pkls/combat/concat.combat.adts.norm.log.pkl'
# NOTE(review): pickle.load can execute arbitrary code; only load files from a trusted source.
with open(path,'rb') as file:
    concat_adts = pkl.load(file)
transcripts = acg_nk.var_names  # gene names, captured before the gene subsetting further down
proteins = concat_adts['adata'].var_names
# NOTE(review): assumes concat_adts['adata'].X is a dense 2-D array — TODO confirm
adts_df = pd.DataFrame(concat_adts['adata'].X, columns=proteins, index=concat_adts['adata'].obs_names)
# Joined on the cell-barcode index. NOTE(review): re-running this cell presumably fails because the
# ADT columns would already exist in .obs — confirm, and reload acg_nk before re-running.
acg_nk.obs = acg_nk.obs.join(adts_df)

### Highly Variable Genes Extraction

In [None]:
def _log_shift(values):
    '''
    Return the offset to add to `values` so its finite entries are strictly positive
    and can be passed to log10.

    The offset is 0 when the smallest finite value is already positive, the second
    smallest distinct value when the smallest is exactly 0, and
    (second smallest - 2 * smallest) when the smallest is negative.
    NaN and infinite entries are ignored when choosing the offset, so a single
    -inf entry cannot turn the offset (and therefore every result) into inf/NaN.

    Raises ValueError if a shift is needed but there are fewer than two distinct finite values.
    '''
    values = np.asarray(values, dtype=float)
    distinct = np.unique(values[np.isfinite(values)]) # np.unique returns the values sorted ascending
    if distinct.size == 0 or distinct[0] > 0:
        return 0
    if distinct.size < 2:
        raise ValueError('need at least two distinct finite values to choose a log shift')
    if distinct[0] == 0:
        return distinct[1]
    return distinct[1] - 2*distinct[0]


def hv_run(adata, flavor='cell_ranger', min_mean=0.0125, min_disp=0.5, max_mean=3, bins=500):    # Extract out highly variable genes, but don't subset just yet
    '''
    Run scanpy's highly variable genes on a copy of `adata` and return that copy.

    The input `adata` is not modified. `flavor`, `min_mean`, `min_disp` and `max_mean`
    are forwarded to sc.pp.highly_variable_genes unchanged; `bins` is forwarded as `n_bins`.

    In addition to what scanpy writes, the returned object's `.var` gets two columns:
      - 'log_means': log10 of the 'means' column after shifting it to be positive
      - 'log_disps': log10 of the 'dispersions_norm' column after shifting it to be positive
    Entries that are NaN or infinite before the shift come out as NaN/inf.
    '''
    hv_adata = adata.copy() # make a copy because I don't want to change the original just yet
    sc.pp.highly_variable_genes(hv_adata, flavor=flavor, inplace=True,
                                min_mean=min_mean,
                                min_disp=min_disp,
                                max_mean=max_mean,
                                n_bins=bins)

    means = hv_adata.var['means'].values
    disps = hv_adata.var['dispersions_norm'].values

    # shift each quantity so it is positive before taking log10; the plotting and
    # gating helpers work on these log-space columns
    hv_adata.var['log_means'] = np.log10(means + _log_shift(means))
    hv_adata.var['log_disps'] = np.log10(disps + _log_shift(disps))

    return hv_adata

def hv_plot(hv_adata, gate=None, highlight_genes=None, bw='scott'):
    '''
    Plot the log means and log normalized dispersions stored in `hv_adata.var`.

    Panels: a 2-D density (left), a scatter of every gene (middle), and the 1-D
    densities of the log means (top right) and log dispersions (bottom right).

    hv_adata: object whose `.var` has 'log_means' and 'log_disps' columns.
    gate: optional sequence of (x, y) vertices, drawn as a polygon on the scatter panel.
    highlight_genes: optional collection of gene names, circled in red on the scatter panel.
    bw: bandwidth argument forwarded to seaborn's kdeplot.
    '''
    # do what the scanpy function does, plotting normalized dispersions with means as blue dots,
    # I don't want to plot in logspace but I don't want to use the log function because then the gates don't work
    # if they contain segments with fractional slopes (i.e. non-straight lines). To be robust to these gates, I therefore
    # will log10 everything and just plot in linear space
    log_means = hv_adata.var['log_means'].values
    log_disps = hv_adata.var['log_disps'].values

    fig = plt.figure(figsize=(19, 6))
    gs = GridSpec(2, 3, figure=fig)
    ax1 = fig.add_subplot(gs[:, 0])
    ax2 = fig.add_subplot(gs[:, 1])
    ax3 = fig.add_subplot(gs[0, 2])
    ax4 = fig.add_subplot(gs[1, 2])

    # NOTE(review): the kdeplot calls below use the positional y argument and the bw / shade /
    # shade_lowest keywords; these look like the older seaborn API — confirm against the installed version.
    ax1.get_xaxis().set_minor_locator(mpl.ticker.AutoMinorLocator())
    ax1.get_yaxis().set_minor_locator(mpl.ticker.AutoMinorLocator())
    ax1.grid(True, which='both',axis='both', alpha=0.4)
    ax1.set_ylabel('Log Dispersions')
    ax1.set_xlabel('Log Means')
    sns.kdeplot(log_means, log_disps, bw=bw, shade=True, shade_lowest=False, ax=ax1, color='skyblue')

    ax2.minorticks_on() # throw on the minor ticks for use with the grid, will help with subsetting
    ax2.grid(True,which='both',axis='both') # turn the grid on
    ax2.scatter(log_means, log_disps, s=0.1, c='b')
    ax2.set_ylabel('Log Dispersions')
    ax2.set_xlabel('Log Means')

    ax3.minorticks_on()
    ax3.grid(True,which='both',axis='both')
    ax3.set_title('Log Means')
    sns.kdeplot(log_means, bw=bw, ax=ax3, color='blue')

    # previously these two calls were applied to ax3 a second time, leaving ax4 without ticks or grid
    ax4.minorticks_on()
    ax4.grid(True,which='both',axis='both')
    ax4.set_title('Log Dispersions')
    sns.kdeplot(log_disps, bw=bw, ax=ax4, color='blue')

    plt.tight_layout();

    if highlight_genes is not None: # do you want to highlight some genes?
        wanted = set(highlight_genes) # set membership keeps the mask construction linear in the number of genes
        mask = np.array([name in wanted for name in hv_adata.var_names], dtype=bool) # which genes to highlight
        highlight_means = log_means[mask] # subset only those means
        highlight_disps = log_disps[mask] # subset only those dispersions
        ax2.scatter(highlight_means,highlight_disps,s=30, facecolors='none', edgecolors='r'); # plot with a red circle around the blue dot

    if gate is not None:
        # You can draw a gate around the genes you want. There should be a check for using only rectangular gates (or only polygons with right angles).
        # I have noted that if you try to draw angled lines in log space using the shapely package, the points_in_poly function does not return the right subset of points within the polygon.
        gatepatch = patches.Polygon(gate,linewidth=1,edgecolor='teal',facecolor='turquoise',alpha=0.5) # create a matplotlib patch for the gate to the plot
        ax2.add_patch(gatepatch); # add the gate to the plot

def hv_genes(hv_adata, gate, adata=None):
    '''
    Select the genes whose (log mean, log dispersion) point lies inside `gate`.

    hv_adata: object whose `.var` has 'log_means' and 'log_disps' columns.
    gate: sequence of (x, y) vertices describing the gate polygon.
    adata: optional object to subset; it is indexed as adata[:, mask].

    Returns a list of booleans (one per gene, in `.var` order) when `adata` is None,
    otherwise a copy of `adata` restricted to the gated genes.
    '''
    # You can subset the genes you want using the gate. Again, there should be a check for using only
    # rectangular gates (or only polygons with right angles). I have noted that angled lines drawn in
    # log space did not return the right subset of points within the polygon.
    var = hv_adata.var
    coords = zip(var['log_means'].values, var['log_disps'].values)

    region = Polygon(gate) # the gate as a shapely polygon
    # the workhorse: one containment test per gene, each gene wrapped as a shapely Point
    inside = [region.contains(Point(x, y)) for x, y in coords]

    if adata is None:
        return inside # boolean list noting which genes would be kept
    return adata[:, inside].copy()

Generate a "highly variable adata" object by running the highly variable genes extraction on a copy of the data. I use a separate object because I don't necessarily want to change the original.

In [None]:
hv_adata = hv_run(acg_nk)

In [None]:
hv_plot(hv_adata, highlight_genes=[i for i in acg_nk.var_names if i.startswith('RP')])

In [None]:
# Rectangular gate in (log mean, log dispersion) space.
xrange = (-1.4, 1.2)   # (min, max) log mean
yrange = (0.65, 1.4)   # (min, max) log dispersion
# Corners in drawing order: lower-left, upper-left, upper-right, lower-right.
gate = np.array([(xrange[i], yrange[j]) for i, j in ((0, 0), (0, 1), (1, 1), (1, 0))])

In [None]:
hv_plot(hv_adata, gate=gate)

In [None]:
genes = hv_genes(hv_adata, gate=gate)

In [None]:
sum(genes)

You can get a new adata object with your genes now subsetted:

In [None]:
acg_nk = hv_genes(hv_adata, gate=gate, adata=acg_nk)

In [None]:
acg_nk.shape

In [None]:
# Distributions of the two per-cell covariates that are regressed out in the next cell.
fig, ax = plt.subplots(1,2,figsize=(10,4))
# the loop variable gets its own name so it no longer rebinds (and clobbers) the `ax` array
for vals, panel in zip(['percent_mito','n_counts'], np.ravel(ax)):
    panel.hist(acg_nk.obs[vals].values,bins=100)
    panel.set_xlabel(vals)
    panel.set_ylabel('Number of cells')

In [None]:
sc.pp.regress_out(acg_nk, ['percent_mito','n_counts'],n_jobs=1)

In [None]:
sc.pp.scale(acg_nk)

In [None]:
sc.pp.combat(acg_nk, key='batch',covariates=['cond','free_id'])

In [None]:
sc.pp.pca(acg_nk,n_comps=150)

In [None]:
sc.pl.pca_variance_ratio(acg_nk,log=True, n_pcs=50)

In [None]:
sc.pp.neighbors(acg_nk,n_neighbors=15,n_pcs=20)

In [None]:
sc.tl.umap(acg_nk)

In [None]:
sc.tl.leiden(acg_nk, resolution=0.6)

In [None]:
acg_nk.uns['cond_colors']

In [None]:
# NOTE(review): `tnk` is not defined anywhere above this cell, so this raises NameError on a fresh
# kernel (Restart & Run All) unless it is defined elsewhere — load/define it first or remove this cell.
tnk.uns['cond_colors']

In [None]:
# UMAP coloured by 'cond' (left) and by 'leiden' (right).
fig, ax = plt.subplots(1, 2, figsize=(20,10))
# the loop variable gets its own name so it no longer rebinds (and clobbers) the `ax` array
for color, panel, palette in zip(['cond', 'leiden'], ax, [None, sc.pl.palettes.default_20]):
    sc.pl.umap(acg_nk,color=color, ax=panel, show=False, return_fig=False, size=20, palette=palette)

In [None]:
fig, ax = plt.subplots(1,1,figsize=(5.5,5))
ax.set_facecolor('black')
sc.pl.umap(acg_nk, color='PDGFD', ax=ax,show=False, return_fig=False, size=20);

In [None]:
clusts = ['1','3']

In [None]:
# One independent copy of the data per cluster to be subclustered, keyed by the cluster label.
sub_acg_nk = {}
for clust in clusts:
    label = str(clust)
    in_cluster = acg_nk.obs['leiden'] == label
    sub_acg_nk[label] = acg_nk[in_cluster].copy()

In [None]:
sc.tl.leiden(sub_acg_nk['1'], resolution=0.4) # subcluster them using Leiden
sc.pl.umap(sub_acg_nk['1'],color='leiden', size=20)

In [None]:
sc.tl.leiden(sub_acg_nk['3'], resolution=0.3) # subcluster them using Leiden
sc.pl.umap(sub_acg_nk['3'],color='leiden', size=20)

In [None]:
def sub_cluster_mapper(adata, sub_adatas):
    '''
    Fold subclustering results back into `adata.obs['leiden']`.

    adata: object whose `.obs['leiden']` holds integer-like string cluster labels.
    sub_adatas: dict mapping a parent cluster label (string) to the object holding that
        cluster's cells, whose own `.obs['leiden']` holds the subcluster labels.

    Cells in subcluster '0' keep their parent label. Every other (parent, subcluster)
    pair receives a brand-new label, numbered consecutively from max(existing label) + 1
    in the sorted order of the temporary 'parent.sub' names (e.g. '1.1', '1.2', '3.1').

    `adata` is modified in place (its 'leiden' column is replaced by a string column)
    and is also returned. Raises KeyError if a cell belonging to a parent cluster is
    missing from the corresponding entry of `sub_adatas`.
    '''
    parent_labels = adata.obs['leiden'].astype(str)
    new_leiden = parent_labels.copy()

    # Build temporary 'parent.sub' names, one vectorised lookup per subclustered parent
    # instead of one .loc lookup per cell.
    for parent, sub_adata in sub_adatas.items():
        members = parent_labels.index[parent_labels == parent]
        sub_labels = sub_adata.obs.loc[members, 'leiden'].astype(str)
        split_off = sub_labels[sub_labels != '0'] # subcluster 0 keeps the parent label
        if len(split_off) > 0:
            new_leiden.loc[split_off.index] = parent + '.' + split_off

    # Rename the temporary names to fresh integer labels following the existing ones.
    # The number of new labels is taken from the names actually created.
    added_clusts = np.setdiff1d(new_leiden, parent_labels) # sorted, unique
    first_new = parent_labels.astype(int).max() + 1
    renames = {temp: str(first_new + i) for i, temp in enumerate(added_clusts)}
    new_leiden = new_leiden.map(lambda label: renames.get(label, label))

    # Store as plain strings and let scanpy convert to a categorical when plotting.
    # (An earlier intermediate astype(int) assignment was immediately overwritten and had no effect.)
    adata.obs['leiden'] = new_leiden.astype(str)
    return adata

In [None]:
acg_nk = sub_cluster_mapper(acg_nk, sub_acg_nk)

In [None]:
# UMAP coloured by 'batch' (left) and by the updated 'leiden' labels (right).
fig, ax = plt.subplots(1, 2, figsize=(15,7))
# the loop variable gets its own name so it no longer rebinds (and clobbers) the `ax` array
for color, panel, palette in zip(['batch', 'leiden'], ax, [None, sc.pl.palettes.default_20]):
    panel.set_facecolor('whitesmoke')
    sc.pl.umap(acg_nk,color=color, ax=panel, show=False, return_fig=False, size=30, palette=palette)

In [None]:
def add_raw(adata, transformed=True):
    '''
    Attach the un-subsetted expression matrix for the cells of `adata` as `adata.raw`.

    Re-reads the per-well pickle under the notebook-level `prefix`, applies the same
    barcode renaming (<first 16 characters>-<well key>) and 12-well concatenation used
    for the main object, and keeps only the cells in `adata.obs_names`.

    transformed: if truthy, the matrix is normalized per cell to 1e6 counts and
        log1p-transformed before being stored.

    Modifies `adata` in place and returns None.
    '''
    path = prefix + 'pkls/aggr/wells.sng.w_covars.pkl'

    # NOTE(review): pickle.load can execute arbitrary code; only load files from a trusted source.
    with open(path,'rb') as file:
        wells = pkl.load(file)

    for well in wells:
        wells[well]['adata'].obs_names = [i[:16] + '-%s' % well for i in wells[well]['adata'].obs_names]

    # .copy() so the in-place normalization below operates on a real object rather than on a
    # view of the concatenated data
    raw = wells[0]['adata'].concatenate(*[wells[i]['adata'] for i in range(1, 12)])[adata.obs_names,:].copy()

    if transformed:
        sc.pp.normalize_per_cell(raw,counts_per_cell_after=1e6)
        sc.pp.log1p(raw)

    adata.raw = raw

In [None]:
add_raw(acg_nk)

In [None]:
# Rank marker genes per Leiden cluster using adata.raw, then plot the top 20 per cluster.
# Logging is silenced for the whole cell; warnings are silenced only around the ranking call.
sc.settings.verbosity = 0
try:
    # catch_warnings restores the previous warning filters on exit, instead of
    # installing a blanket 'default' filter afterwards
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        sc.tl.rank_genes_groups(acg_nk, groupby='leiden', n_genes=100, use_raw=True)
    sc.pl.rank_genes_groups(acg_nk, ncols=4, n_genes=20)
finally:
    # put the notebook's verbosity back even if ranking or plotting fails
    sc.settings.verbosity = 4

In [None]:
# UMAP coloured by 'cond' (left) and by the final 'leiden' labels (right).
fig, ax = plt.subplots(1, 2, figsize=(20,10))
# the loop variable gets its own name so it no longer rebinds (and clobbers) the `ax` array
for color, panel, palette in zip(['cond', 'leiden'], ax, [None, sc.pl.palettes.default_20]):
    sc.pl.umap(acg_nk,color=color, ax=panel, show=False, return_fig=False, size=20, palette=palette)