In [None]:
import scanpy as sc
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
import warnings
import itertools as it
import json
import requests
import seaborn as sns
import pickle as pkl
from functools import reduce

import matplotlib as mpl
import matplotlib.patches as patches
from matplotlib.gridspec import GridSpec
from shapely.geometry import Point
from shapely.geometry.polygon import Polygon

In [None]:
sc.settings.verbosity = 4
sc.settings.set_figure_params(dpi=80)
print(sc.__version__)
sc.settings.n_jobs=12

In [None]:
prefix = '/data/codec/production.run/mrna/'

In [None]:
path = prefix + 'obs/cts/acg.nk.txt'

# with open(path,'w') as file:
#     for bc in acg_nk.obs_names:
#         file.write(bc + '\n')
        
with open(path,'r') as file:
    acg_nk_cells = [i.strip() for i in file.readlines()]

In [None]:
path = prefix + '../cond.colors.pkl'

with open(path,'rb') as file:
    cond_colors = pkl.load(file)

In [None]:
path = prefix + 'pkls/aggr/wells.sng.w_covars.pkl'

# with open(path,'wb') as file:
#     pkl.dump(wells, file)
    
with open(path,'rb') as file:
    wells = pkl.load(file)

# Processing

### Adjust Cell Barcodes, Filter

I'm adjusting the cell barcodes to make them match their well number, which I also did with the ADTs.

In [None]:
# Rebuild every barcode as <first 16 characters>-<well key> so the suffix identifies the well.
# NOTE(review): assumes the barcode sequence is exactly the first 16 characters of each name — confirm.
for well in wells:
    wells[well]['adata'].obs_names = [i[:16] + '-%s' % well for i in wells[well]['adata'].obs_names]

### Concatenate

In [None]:
acg_nk = wells[0]['adata'].concatenate(*[wells[i]['adata'] for  i in range(1, 12)])[acg_nk_cells].copy() # I really shouldn't do this, I should go back and run cellranger aggr, but for now just concatenate

In [None]:
# Total counts per gene (column sums of X). Summing the sparse matrix directly avoids
# building a dense copy of the whole matrix just to reduce it; the result of a sparse
# `.sum(axis=0)` is a 1 x n_genes matrix, so it is flattened to a 1-D array.
acg_nk.var['n_counts'] = np.asarray(acg_nk.X.sum(axis=0)).ravel()

### Filter Genes, Transform Data

Drop genes with very low counts.

In [None]:
# Histogram of per-gene total counts with log-spaced bins from 1 to 1e5 and a log x axis.
fig, ax = plt.subplots(figsize=(8, 6))
ax.hist(acg_nk.var['n_counts'].values, bins=np.logspace(np.log10(1), np.log10(1e5), 200))
ax.grid(True, which='both', axis='both')  # major and minor grid lines on both axes
ax.set_xscale('log')
ax.set_xlabel('total counts per gene')
ax.set_ylabel('number of genes')
ax.set_title('Per-gene total counts');

In [None]:
# drop genes with fewer than 20 total counts in acg_nk (modifies acg_nk in place)
sc.pp.filter_genes(acg_nk, min_counts=20, inplace=True)

In [None]:
sc.pp.normalize_per_cell(acg_nk, counts_per_cell_after=1e6)

In [None]:
sc.pp.log1p(acg_nk)

In [None]:
path = '/data/codec/production.run/adts/pkls/combat/concat.combat.adts.norm.log.pkl'
with open(path,'rb') as file:
    concat_adts = pkl.load(file)
transcripts = acg_nk.var_names
proteins = concat_adts['adata'].var_names
adts_df = pd.DataFrame(concat_adts['adata'].X, columns=proteins, index=concat_adts['adata'].obs_names)
acg_nk.obs = acg_nk.obs.join(adts_df)

In [None]:
# Per-cell distributions of `percent_mito` and `n_counts`, one panel each.
# The axes array keeps its own name so the loop variable does not overwrite it.
fig, axes = plt.subplots(1, 2, figsize=(10, 4))
for covariate, panel in zip(['percent_mito', 'n_counts'], np.ravel(axes)):
    panel.hist(acg_nk.obs[covariate].values, bins=100)
    panel.set_xlabel(covariate)
    panel.set_ylabel('number of cells')

In [None]:
sc.pp.regress_out(acg_nk, ['percent_mito', 'n_counts'],n_jobs=1)

In [None]:
sc.pp.scale(acg_nk)

In [None]:
sc.pp.combat(acg_nk, key='batch',covariates=['cond','free_id'])

In [None]:
sc.pp.scale(acg_nk)

In [None]:
total_pcs = 150

In [None]:
sc.pp.pca(acg_nk,n_comps=total_pcs)

In [None]:
sc.pl.pca_variance_ratio(acg_nk,log=True, n_pcs=50)

In [None]:
df_loadings = pd.DataFrame(acg_nk.varm['PCs'], index=acg_nk.var_names)
df_rankings = pd.DataFrame((-1 * df_loadings.values).argsort(0).argsort(0), index=df_loadings.index, columns=df_loadings.columns)

In [None]:
# For every principal component, take the `num` genes with the largest positive ('corr')
# and the most negative ('anti-corr') loadings, and record which fraction of them are
# ribosomal-protein genes and which fraction are mitochondrial genes.
num = 50
# Prefixes assume human gene symbols (NOTE(review): confirm against the reference used).
# A bare 'RP' also matches unrelated genes such as RPA1 or RPE; a bare 'MT' also matches
# nuclear genes such as MTOR or MT2A.
ribo_prefixes = ('RPS', 'RPL')
mito_prefix = 'MT-'
percent_ribos = dict()
percent_mitos = dict()
for direction, ascending in zip(['corr', 'anti-corr'], [False, True]):
    percent_ribos[direction] = list()
    percent_mitos[direction] = list()
    for pc in range(total_pcs):
        top_genes = df_loadings[pc].sort_values(ascending=ascending)[:num].index
        percent_ribos[direction].append(sum(g.startswith(ribo_prefixes) for g in top_genes) / num)
        percent_mitos[direction].append(sum(g.startswith(mito_prefix) for g in top_genes) / num)

In [None]:
fig, ax = plt.subplots(1, 2, figsize=(20, 5))
for direction, ax in zip(['corr', 'anti-corr'], ax):
    ax.bar(range(len(percent_ribos[direction])), percent_ribos[direction], alpha=0.5)
    ax.bar(range(len(percent_mitos[direction])), percent_mitos[direction], alpha=0.5)
    ax.set_title(direction)
    ax.set_ylim((0, 1))

In [None]:
# Store the per-cell scores on PCA components 0 and 2 (zero-based columns of X_pca) in .obs.
# NOTE(review): the component indices are hard-coded — presumably picked by eye from the
# ribosomal-fraction bar plot above; re-check them whenever an upstream step changes.
acg_nk.obs['ribo_pc_1'] = acg_nk.obsm['X_pca'][:,0]
acg_nk.obs['ribo_pc_2'] = acg_nk.obsm['X_pca'][:,2]

In [None]:
# Regress the two stored PC covariates out of the expression matrix.
# Warnings are silenced only inside the context manager, which puts the previous
# warning filters back on exit — also when the call raises.
with warnings.catch_warnings():
    warnings.simplefilter('ignore')
    sc.pp.regress_out(acg_nk, ['ribo_pc_1','ribo_pc_2'], n_jobs=1)

In [None]:
sc.pp.scale(acg_nk)

In [None]:
sc.pp.pca(acg_nk,n_comps=total_pcs)

In [None]:
sc.pl.pca_variance_ratio(acg_nk,log=True, n_pcs=50)

In [None]:
df_loadings = pd.DataFrame(acg_nk.varm['PCs'], index=acg_nk.var_names)
df_rankings = pd.DataFrame((-1 * df_loadings.values).argsort(0).argsort(0), index=df_loadings.index, columns=df_loadings.columns)

In [None]:
# Same summary as before the regression: for every principal component, take the `num`
# genes with the largest positive ('corr') and the most negative ('anti-corr') loadings,
# and record which fraction are ribosomal-protein genes and which are mitochondrial genes.
num = 50
# Prefixes assume human gene symbols (NOTE(review): confirm against the reference used).
# A bare 'RP' also matches unrelated genes such as RPA1 or RPE; a bare 'MT' also matches
# nuclear genes such as MTOR or MT2A.
ribo_prefixes = ('RPS', 'RPL')
mito_prefix = 'MT-'
percent_ribos = dict()
percent_mitos = dict()
for direction, ascending in zip(['corr', 'anti-corr'], [False, True]):
    percent_ribos[direction] = list()
    percent_mitos[direction] = list()
    for pc in range(total_pcs):
        top_genes = df_loadings[pc].sort_values(ascending=ascending)[:num].index
        percent_ribos[direction].append(sum(g.startswith(ribo_prefixes) for g in top_genes) / num)
        percent_mitos[direction].append(sum(g.startswith(mito_prefix) for g in top_genes) / num)

In [None]:
fig, ax = plt.subplots(1, 2, figsize=(20, 5))
for direction, ax in zip(['corr', 'anti-corr'], ax):
    ax.bar(range(len(percent_ribos[direction])), percent_ribos[direction], alpha=0.5)
    ax.bar(range(len(percent_mitos[direction])), percent_mitos[direction], alpha=0.5)
    ax.set_title(direction)
    ax.set_ylim((0, 1))

In [None]:
# Build the neighbour graph from the first 20 PCs with 15 neighbours per cell.
# Warnings are silenced only inside the context manager, which restores the previous
# warning filters on exit — also when the call raises.
with warnings.catch_warnings():
    # numba warning outlined here is not to be of concern: https://github.com/lmcinnes/umap/issues/252
    warnings.simplefilter('ignore')
    sc.pp.neighbors(acg_nk,n_neighbors=15,n_pcs=20)

In [None]:
sc.tl.umap(acg_nk)

In [None]:
sc.tl.leiden(acg_nk, resolution=0.4)

# Visualization

In [None]:
fig, ax = plt.subplots(1, 2, figsize=(20,10))
for color, ax, palette in zip(['cond', 'leiden'], ax, [None, sc.pl.palettes.default_20]):
    sc.pl.umap(acg_nk,color=color, ax=ax, show=False, return_fig=False, size=20, palette=palette)

Set the condition colors explicitly, in case the color scheme is not consistent.

In [None]:
# NOTE(review): the next two expressions are not the last statement of the cell, so under
# default notebook settings they display nothing, and they change nothing.
acg_nk.uns['cond_colors']
cond_colors.values()
# Overwrite the stored `cond` palette with a fixed list of six hex colours.
# NOTE(review): the list is hard-coded rather than derived from `cond_colors`; confirm that
# its order matches the order of the `cond` categories.
acg_nk.uns['cond_colors'] = ['#ff7f0e', '#279e68', '#d62728', '#aa40fc', '#8c564b', '#e377c2'];

In [None]:
fig, ax = plt.subplots(1,4,figsize=(22,5))
for f, ax in zip(['batch', 'percent_mito', 'n_counts', 'free_id'], np.ravel(ax)):
    ax.set_facecolor('black')
    sc.pl.umap(acg_nk, color=f, ax=ax,show=False, return_fig=False, size=20);

No significant mitochondrial effect. That's great!

In [None]:
fig, ax = plt.subplots(1,3, figsize=(15,5))
for val, ax in tqdm(zip(['A','C','G'], np.ravel(ax))):
    acg_nk.obs['val'] = acg_nk.obs['cond'] == val
    ax.set_facecolor('black')
    sc.pl.umap(acg_nk,color='val', ax=ax, show=False, return_fig=False, title=val)
    acg_nk.obs.drop(columns='val', inplace=True)
plt.tight_layout()

### Subclustering
Going to break up those clusters further.

In [None]:
clusts = ['1','2']

In [None]:
sub_acg_nk = dict()
for clust in clusts:
    sub_acg_nk[str(clust)] = acg_nk[acg_nk.obs['leiden'] == str(clust)].copy()

In [None]:
sc.tl.leiden(sub_acg_nk['1'], resolution=0.36) # subcluster them using Leiden
sc.pl.umap(sub_acg_nk['1'],color='leiden', size=20)

In [None]:
sc.tl.leiden(sub_acg_nk['2'], resolution=0.5) # subcluster them using Leiden
sc.pl.umap(sub_acg_nk['2'],color='leiden', size=20)

In [None]:
def sub_cluster_mapper(adata, sub_adatas):
    '''
    Fold sub-clustering results back into `adata.obs['leiden']`.

    `sub_adatas` maps a parent cluster label (a string such as '1') to an object whose
    `.obs['leiden']` holds the sub-cluster label of each cell of that parent cluster.
    Cells in sub-cluster '0' keep the parent label. Every other (parent, sub-cluster)
    pair gets a fresh integer label, numbered consecutively after the largest label
    already present in `adata.obs['leiden']`; fresh labels are handed out in the sorted
    order of the intermediate '<parent>.<sub>' names.

    Parameters
    ----------
    adata
        Object exposing `.obs` (a DataFrame with an integer-like 'leiden' column) and
        `.obs_names` (the cell names indexing `.obs`).
    sub_adatas
        dict of parent label -> object exposing `.obs['leiden']`, indexed by cell names.

    Returns
    -------
    The same `adata` object. Its `obs['leiden']` column is overwritten in place with
    string labels; nothing is copied, so call this once per set of sub-clusterings.

    Raises
    ------
    KeyError
        If a cell belonging to a sub-clustered parent cluster is absent from the
        matching entry of `sub_adatas`.
    '''
    parent = pd.Series(adata.obs['leiden'].astype(str).values, index=adata.obs_names)
    first_new_label = parent.astype(int).max() + 1

    # Tag every split-off cell with a temporary '<parent>.<sub>' name, one parent at a time.
    relabelled = parent.copy()
    for clust_name, sub_adata in sub_adatas.items():
        members = parent.index[(parent == clust_name).values]
        sub_labels = sub_adata.obs['leiden'].astype(str).reindex(members)
        if sub_labels.isna().any():
            missing = list(sub_labels.index[sub_labels.isna()][:5])
            raise KeyError('cells of cluster %s missing from its sub-clustering: %s' % (clust_name, missing))
        split_off = sub_labels[sub_labels != '0']
        relabelled.loc[split_off.index] = (clust_name + '.') + split_off

    # Rename the temporary names to consecutive new integer labels.
    added_clusts = np.setdiff1d(relabelled.values.astype(str), parent.values.astype(str))
    renames = {str(old): str(first_new_label + offset) for offset, old in enumerate(added_clusts)}
    new_leiden = relabelled.map(lambda label: renames.get(label, label))

    # Stored as plain strings; scanpy converts the column to a categorical when plotting.
    adata.obs['leiden'] = new_leiden.astype(str)
    return adata

In [None]:
acg_nk = sub_cluster_mapper(acg_nk, sub_acg_nk)

In [None]:
fig, ax = plt.subplots(1, 2, figsize=(15,7))
for color, ax, palette in zip(['cond', 'leiden'], ax, [None, sc.pl.palettes.default_20]):
    ax.set_facecolor('whitesmoke')
    sc.pl.umap(acg_nk,color=color, ax=ax, show=False, return_fig=False, size=30, palette=palette)

# Rank Genes

Add the `.raw` attribute for gene tests.

In [None]:
def add_raw(adata, transformed=True):
    '''
    Attach the unfiltered expression matrix of the cells in `adata` as `adata.raw`.

    The per-well objects are re-read from the pickle under the module-level `prefix`,
    their barcodes are rewritten as <first 16 characters>-<well key>, wells 0..11 are
    concatenated, and the result is subset to `adata.obs_names`.

    Parameters
    ----------
    adata
        Object whose `obs_names` select the cells to keep; its `.raw` is set in place.
    transformed
        If truthy, the matrix is normalised per cell to 1e6 counts and log1p-transformed
        before being stored; otherwise it is stored as loaded.

    Returns
    -------
    None
    '''
    path = prefix + 'pkls/aggr/wells.sng.w_covars.pkl'

    # NOTE: unpickling can execute arbitrary code; only load files written by this pipeline.
    with open(path, 'rb') as file:
        wells = pkl.load(file)

    for well in wells:
        wells[well]['adata'].obs_names = [i[:16] + '-%s' % well for i in wells[well]['adata'].obs_names]

    # NOTE(review): the well keys 0..11 are hard-coded here, as in the concatenation cell above.
    # `.copy()` turns the subset into a standalone object so the in-place transforms below
    # do not operate on a view of the concatenated data.
    raw = wells[0]['adata'].concatenate(*[wells[i]['adata'] for i in range(1, 12)])[adata.obs_names, :].copy()

    if transformed:
        sc.pp.normalize_per_cell(raw, counts_per_cell_after=1e6)
        sc.pp.log1p(raw)

    adata.raw = raw

In [None]:
add_raw(acg_nk)

In [None]:
# Rank genes for every leiden cluster on the .raw matrix, then plot the top 20 per cluster.
# Warnings are silenced only around the ranking call; the context manager restores the
# previous warning filters on exit, also when the call raises.
sc.settings.verbosity = 0
with warnings.catch_warnings():
    warnings.simplefilter('ignore')
    sc.tl.rank_genes_groups(acg_nk, groupby='leiden', n_genes=100, use_raw=True)
sc.pl.rank_genes_groups(acg_nk, ncols=6, n_genes=20)
sc.settings.verbosity = 4

# Gene Visualization

### Check Gene Names

In [None]:
raw_transcripts = acg_nk.raw.var_names
transcripts = acg_nk.var_names
proteins = [i for i in acg_nk.obs.columns if '|' in i]

In [None]:
name = 'ICAM1'
print([i for i in proteins if name in i])
print([i for i in transcripts if name in i])
print([i for i in raw_transcripts if name in i])

In [None]:
# proteins.index('CD4|CD4') 
# proteins.index('CD94|KLRD1')

### Marker Genes

Find out where our expected cell types are.

In [None]:
features = ['CD56|NCAM1','NCAM1','CD16|FCGR3A','FCGR3A']

In [None]:
fig, ax = plt.subplots(2,2,figsize=(12,12))
for ax, f in zip(np.ravel(ax), features):
    ax.set_facecolor('black')
    sc.pl.umap(acg_nk, color=f, ax=ax,show=False, return_fig=False, size=20, use_raw=False)

### Proteins

In [None]:
fig, ax = plt.subplots(12,8,figsize=(30,40))
for p, ax in tqdm(zip(proteins, np.ravel(ax))):
    ax.set_facecolor('black')
    sc.pl.umap(acg_nk, color=p, size=10, ncols=8, ax=ax, show=False, return_fig=False)
plt.tight_layout()

### Ranked Genes

In [None]:
# One entry per leiden category: the first five gene names stored for that cluster
# in the rank_genes_groups result.
clusts = acg_nk.obs['leiden'].dtype.categories
ranked_names = acg_nk.uns['rank_genes_groups']['names']
plot_genes = [list(ranked_names[label])[:5] for label in clusts]

In [None]:
features = [i for j in plot_genes for i in j]

In [None]:
len(features)

In [None]:
fig, ax = plt.subplots(6,5,figsize=(30,31))
for ax, f in zip(np.ravel(ax), features):
    ax.set_facecolor('black')
    sc.pl.umap(acg_nk, color=f, ax=ax,show=False, return_fig=False, size=20, use_raw=True)

# Specific Comparisons

### Cluster 5 vs Cluster 2

In [None]:
# Rank genes for cluster '5' against cluster '2' on the .raw matrix, then plot the top 20.
# Warnings are silenced only around the ranking call; the context manager restores the
# previous warning filters on exit, also when the call raises.
sc.settings.verbosity = 0
with warnings.catch_warnings():
    warnings.simplefilter('ignore')
    sc.tl.rank_genes_groups(acg_nk, groupby='leiden', n_genes=100, groups=['5'], reference='2', use_raw=True)
sc.pl.rank_genes_groups(acg_nk, ncols=4, n_genes=20)
sc.settings.verbosity = 4

In [None]:
features = [i for i in acg_nk.uns['rank_genes_groups']['names']['5']][:5]

In [None]:
fig, ax = plt.subplots(1,5,figsize=(30,5))
for ax, f in zip(np.ravel(ax), features):
    ax.set_facecolor('black')
    sc.pl.umap(acg_nk, color=f, ax=ax,show=False, return_fig=False, size=20, use_raw=True)

### TNF Signature Clusters vs. No Signature

In [None]:
def grouped_rank(adata, groups, return_genes=True):
    '''
    Rank genes between two user-defined sets of leiden clusters.

    The clusters in `groups[0]` are pooled as group '0', those in `groups[1]` as group
    '1', and all remaining clusters as group '2'. Two UMAP panels highlight the cells
    of group '0' and group '1', then genes are ranked for group '0' against group '1'
    on `adata.raw`, and the 20 top-ranked gene names are drawn at their scores.

    Parameters
    ----------
    adata
        Annotated data with an `obs['leiden']` column, a UMAP embedding and `.raw`.
        Temporary `obs` columns 'val' and 'rank_compare' are added and removed again.
    groups
        Sequence of exactly two collections of cluster labels (ints or strings).
        The argument itself is not modified.
    return_genes
        If truthy, return the ranked gene names; otherwise return None.

    Returns
    -------
    list of gene names ranked for group '0' versus group '1' (up to 100), or None.
    '''
    assert len(groups) == 2
    # Normalise to string arrays on a new list so the caller's list is left untouched.
    groups = [np.array(group).astype(str) for group in groups]

    leiden = adata.obs['leiden'].astype(str)
    grouped_clusts = np.concatenate(groups)
    groups.append(np.setdiff1d(leiden.unique().astype(str), grouped_clusts))

    # Highlight the two requested groups on the UMAP (the remainder group is not drawn).
    fig, axes = plt.subplots(1, 2, figsize=(9, 4))
    try:
        for clusts, title, ax in zip(groups, ['0', '1'], axes):
            adata.obs['val'] = leiden.isin(clusts).values
            ax.set_facecolor('black')
            sc.pl.umap(adata, color='val', ax=ax, size=20, show=False, return_fig=False, title=title)
    finally:
        if 'val' in adata.obs.columns:
            adata.obs.drop(columns='val', inplace=True)

    # Map each cluster label to its group index by exact match. A regex-based replace on
    # string labels can also hit substrings (the '0' inside an already written 'ct0', or
    # the '1' inside '10'), which would silently move cells between groups.
    group_of = {str(clust): str(idx) for idx, members in enumerate(groups) for clust in members}
    adata.obs['rank_compare'] = leiden.map(group_of).astype('category')

    previous_verbosity = sc.settings.verbosity
    sc.settings.verbosity = 0
    try:
        # Rank on a copy so the caller's stored rank_genes_groups result is not overwritten.
        rank_adata = adata.copy()
        sc.tl.rank_genes_groups(rank_adata, groupby='rank_compare', n_genes=100, groups=['0'], reference='1', use_raw=True)
    finally:
        sc.settings.verbosity = previous_verbosity
        adata.obs.drop(columns='rank_compare', inplace=True)

    ranked = rank_adata.uns['rank_genes_groups']
    genes = [row[0] for row in ranked['names'].tolist()]
    x = [row[0] for row in ranked['scores'].tolist()][:20]
    y = range(len(x))[::-1]  # best-ranked gene at the top
    txts = genes[:20]

    fig, ax = plt.subplots(1, 1, figsize=(5, 5))
    ax.scatter(x, y, s=0)  # invisible points, only there to set the axis limits
    for i, txt in enumerate(txts):
        ax.annotate(txt, (x[i], y[i]), rotation=0, size=10)
    ax.set_yticklabels([])
    ax.set_title('0 vs 1')

    return genes if return_genes else None

In [None]:
genes = grouped_rank(acg_nk,groups=[[1, 3, 4, 5], [0, 2]])

In [None]:
genes = grouped_rank(acg_nk,groups=[[0, 2],[1, 3, 4, 5]])

### KIR Genes

In [None]:
features = ['KIRREL2', 'KIR3DX1', 'KIR3DL3', 'KIR2DL3', 'KIR2DL1', 'KIR2DL4', 'KIR3DL1', 'KIR3DL2']

In [None]:
fig, ax = plt.subplots(1,8,figsize=(30,3))
for ax, f in zip(np.ravel(ax), features):
    ax.set_facecolor('black')
    sc.pl.umap(acg_nk, color=f, ax=ax,show=False, return_fig=False, size=20, use_raw=True)

### TRAF Genes

In [None]:
features = ['TRAF3IP3', 'TRAF4', 'TRAF1']

In [None]:
# For every cluster in `clusts` and every gene in `features`, compute the fraction of the
# cluster's cells whose .raw value for that gene is above zero, and tag the row with
# whether the cluster is listed in `tnfa`.
# NOTE(review): `tnfa` is not defined in any cell visible in this notebook, so unless it is
# defined elsewhere this cell raises NameError on a fresh kernel. Presumably it should hold
# the labels of the clusters carrying the TNF signature — define it before this loop.
percent_positives = list()
for clust in clusts:
    for f in features:
        v = acg_nk[acg_nk.obs['leiden'] == clust].raw.obs_vector(f)
        p = (v > 0).sum()/len(v)
        if clust in tnfa:
            percent_positives.append([clust, f, p, True])
        else:
            percent_positives.append([clust, f, p, False])

In [None]:
df = pd.DataFrame(percent_positives, columns = ['clust','gene','p', 'response'])
fig, ax = plt.subplots(1,1,figsize=(4,4))
sns.barplot(data=df, x='gene', hue='response', y='p', ax=ax);

### Cluster 4 Phenotype

In [None]:
genes = grouped_rank(acg_nk,groups=[[4],[1, 3]])

In [None]:
genes[:6]

In [None]:
features = genes[:6]

In [None]:
fig, ax = plt.subplots(3,2,figsize=(10,15))
for ax, f in zip(np.ravel(ax), features):
    ax.set_facecolor('black')
    sc.pl.umap(acg_nk, color=f, ax=ax,show=False, return_fig=False, size=20, use_raw=True)

# Single Gene Plotter

In [None]:
f = ['PRDM1']

In [None]:
fig, ax = plt.subplots(1,1,figsize=(5,5))
ax.set_facecolor('black')
sc.pl.umap(acg_nk, color=f, ax=ax,show=False, return_fig=False, size=20, use_raw=True);

In [None]:
cts = pd.read_csv(prefix + 'ct.tsvs/acg.nk.tsv', sep='\t', dtype='|S').set_index('cluster')

In [None]:
s = pd.Series(acg_nk.obs['leiden'].values)

In [None]:
for ct in cts.columns:
    acg_nk.obs[ct] = s.map(dict(cts[ct])).values

In [None]:
fig, ax = plt.subplots(1,3, figsize=(17,4.5))
for val, ax in tqdm(zip(['ct1','ct2','ct3'], np.ravel(ax))):
    ax.set_facecolor('whitesmoke')
    sc.pl.umap(acg_nk,color=val, ax=ax, show=False, return_fig=False)
plt.tight_layout()