### Import scanpy

In [1]:
import scanpy as sc
import scanpy.external as sce
#sc.logging.print_versions()
#sc.logging.print_memory_usage()
#sc.settings.verbosity = 2
import os,sys
import datetime
import numpy as np
import pandas as pd
import math
import matplotlib.ticker as mticker
from adjustText import adjust_text

In [2]:
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib as mpl
import matplotlib.patches as patches
from matplotlib import cm
from matplotlib.lines import Line2D
from matplotlib.offsetbox import AnchoredOffsetbox, TextArea, HPacker, VPacker

In [3]:
# add the utility function folder to PATH
# (the relative folder name is resolved to an absolute path and appended to sys.path
#  so that the rz_* helper modules imported below can be found)
sys.path.append(os.path.abspath("utility_functions_190403_12h24/"))

# NOTE(review): star import — the names this adds to the notebook namespace are not visible from here
from rz_import_statements import *
import rz_functions as rz
import rz_fig_params # this adjust mpl.rcParams, almost nothing to import, import after scanpy to overwrite rc.Params
import rz_utility_spring as srz

python version: 3.8.12


# Load data

In [4]:
# Read the AnnData object from the local backup folder (path is relative to the working directory).
# The file name suggests a batch-corrected 50236-cell x 2000-gene matrix — TODO confirm.
adata = sc.read_h5ad('backups_JZ_2022/clean_kidney_304_15tr_25c_batch_corrected_50236x2000_220315_12h14.h5ad') 

In [5]:
# Replace adata.obs with the most recent saved version of the per-cell annotation table.
# NOTE(review): allow_pickle=True lets np.load unpickle arbitrary objects — only use on trusted files.
filename, encoding = 'backups_JZ_2022/corrected_obs_info_50236x32_221012_12h04.npz', 'latin1'

with np.load(filename, encoding=encoding, allow_pickle=True) as f:
    # every entry stored in the archive is forwarded as a keyword argument to the DataFrame constructor
    obs = pd.DataFrame(**f)

adata.obs = obs

In [6]:
# Display the freshly assigned per-cell annotation table as a sanity check
adata.obs

Unnamed: 0,library,total_counts,pct_counts_mito,library2,sample,patient,pT stage,seq_date,beads,operation,...,n_counts,no_dblt_no_rbc,sp_cl_43,cell_type,broad_cell_type,color,cell_group,stage_color,patient_color,group_color
2,N14,449,0.668151,T2_1,T2,P2,pT3a,20_11_12,old,Open,...,449,True,40,TAM 4,Immune,#9a5ce0,Myeloid cells,#8c1a1a,#e87f7f,#85619c
19,N14,449,4.23163,T2_1,T2,P2,pT3a,20_11_12,old,Open,...,449,True,37,TAM 3,Immune,#943886,Myeloid cells,#8c1a1a,#e87f7f,#85619c
363,N14,1229,10.6591,T2_1,T2,P2,pT3a,20_11_12,old,Open,...,1229,True,22,Tumor cells 2,Tumor,#b06c6c,Tumor cells,#8c1a1a,#e87f7f,#db2f2c
433,N14,432,6.94444,T2_1,T2,P2,pT3a,20_11_12,old,Open,...,432,True,27,Tumor cells 3,Tumor,#db2f2c,Tumor cells,#8c1a1a,#e87f7f,#db2f2c
444,N14,502,4.98008,T2_1,T2,P2,pT3a,20_11_12,old,Open,...,502,True,28,Tumor vasculature 4,Endothelial,#a35927,Endothelium,#8c1a1a,#e87f7f,#e07c57
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4865536,Tumor0228,683,9.37042,T4_old,T4,P4,pT3a,old,old,Open,...,683,True,30,TAM 2,Immune,#532b6b,Myeloid cells,#8c1a1a,#70c3d4,#85619c
4865584,Tumor0228,1473,6.51731,T4_old,T4,P4,pT3a,old,old,Open,...,1473,True,30,TAM 2,Immune,#532b6b,Myeloid cells,#8c1a1a,#70c3d4,#85619c
4865642,Tumor0228,498,9.43775,T4_old,T4,P4,pT3a,old,old,Open,...,498,True,37,TAM 3,Immune,#943886,Myeloid cells,#8c1a1a,#70c3d4,#85619c
4865726,Tumor0228,421,19.2399,T4_old,T4,P4,pT3a,old,old,Open,...,421,True,27,Tumor cells 3,Tumor,#db2f2c,Tumor cells,#8c1a1a,#70c3d4,#db2f2c


### Scale (normalize) data

In [7]:
# Rebuild the AnnData object from its .raw slot and rebind the name `adata` to the result.
# NOTE(review): the cell rebinds its own input, so it is presumably not safe to run twice
# in one kernel session (the rebuilt object may no longer carry a .raw slot) — confirm.
adata = adata.raw.to_adata()

In [9]:
# turn into counts per 10k
# The per-cell totals of the first five cells are printed before and after scaling as a sanity check.
# NOTE(review): scanpy's documentation lists normalize_per_cell as deprecated in favour of
# normalize_total — confirm for the installed version before switching, the two may not be drop-in equivalent.
print(adata.X[:5,:].sum(axis=1))
print()
sc.pp.normalize_per_cell(adata, counts_per_cell_after=1e4)
print(adata.X[:5,:].sum(axis=1))

[[ 449.]
 [ 449.]
 [1229.]
 [ 432.]
 [ 502.]]

[[10000.001]
 [10000.   ]
 [10000.001]
 [10000.   ]
 [10000.   ]]


In [17]:
#select populations of interest
# list the distinct cell_type labels so that the subsets used below can be picked by name
adata.obs['cell_type'].unique()

array(['TAM 4', 'TAM 3', 'Tumor cells 2', 'Tumor cells 3',
       'Tumor vasculature 4', 'TAM 1', 'CD8 T cells', 'TAM 2',
       'Resting/memory T cells', 'Cytotoxic T cells',
       'Tumor vasculature 2', 'Tumor vasculature 1',
       'Non-classical monocytes', 'Mesangial/vSMCs',
       'IGHG-high plasma cells', 'Mast cells', 'Tumor vasculature 3',
       'Plasma cells', 'Tumor AVR-like vasculature', 'Cycling', 'vSMCs',
       'NK cells', 'Classical monocytes', 'B cells', 'Myofibroblasts',
       'Regulatory T cells', 'Tumor cells 1', 'Proximal tubule', 'AVR',
       'Epithelial progenitor-like cells', 'tAL of LOH',
       'Glomerular endothelium', 'Principal cells', 'TAL of LOH', 'DVR',
       'Type A-ICs', 'OM Type A-ICs', 'Type B-IC', 'Podocytes', 'DCT/CNT'],
      dtype=object)

In [18]:
# cell_type labels kept for the vasculature comparison
interest = [
    'Tumor vasculature 4',
    'Tumor vasculature 2',
    'Tumor vasculature 1',
    'Tumor vasculature 3',
    'Tumor AVR-like vasculature',
    'AVR',
    'Glomerular endothelium',
    'DVR',
]

In [19]:
# Boolean per-cell mask: True where the cell_type label is one of the entries of `interest`
cmask = adata.obs['cell_type'].isin(interest)

In [20]:
# number of cells selected by cmask (vectorised count, shown as a plain int)
int(cmask.sum())

6394

In [21]:
# Thresholds for the label-free abundance filter that removes very low abundance genes
# before the differential expression test. They are handed to srz.filter_abund_genes.
min_counts, min_cells = 15, 15

In [22]:
# Abundance filter computed on the selected cells only (rows of adata.X where cmask is True).
# presumably returns a per-gene boolean mask — it is used below to index the columns of adata.X
# and adata.var_names; verify against srz.filter_abund_genes.
gmask = srz.filter_abund_genes(adata.X[cmask], min_counts, min_cells)

6758 genes passing abundance filter


In [23]:
# Folder that receives the tables written by this notebook (relative to the working directory).
outdir = 'outputs_JZ_2022/'
# Create it up front so the to_excel / to_csv calls further down do not fail on a fresh checkout;
# exist_ok keeps the cell safe to re-run.
os.makedirs(outdir, exist_ok=True)

In [24]:
import matplotlib as mpl

# Arial everywhere, and font type 42 for the PDF and PS backends
mpl.rcParams.update({
    'font.family': 'Arial',
    'pdf.fonttype': 42,
    'ps.fonttype': 42,
})

## Volcano plot: tumor vasculature vs healthy vasculature

In [25]:
# (cells, genes) of the expression matrix restricted to the cells selected by cmask
adata.X[cmask].shape

(6394, 33538)

In [26]:
# adata.obs column holding the labels used to define the two groups compared below
labcol = 'cell_type'

In [42]:
# Mann-Whitney U test with FDR: tumor vasculature (group 1) against healthy vasculature (group 2).
# gmask is the abundance filter computed earlier; run help(rz.mwu) for details on the function parameters.
healthy_vasculature = ['AVR', 'DVR', 'Glomerular endothelium']
tumor_vasculature = [
    'Tumor vasculature 1',
    'Tumor vasculature 2',
    'Tumor vasculature 4',
    'Tumor vasculature 3',
    'Tumor AVR-like vasculature',
]

cmask1 = adata.obs[labcol].isin(tumor_vasculature).values    # tumor vasculature
cmask2 = adata.obs[labcol].isin(healthy_vasculature).values  # healthy vasculature
print(cmask1.sum(), cmask2.sum())

# differential gene expression analysis on dense copies of the two (cells x filtered genes) sub-matrices
dge = rz.mwu(
    adata.X[cmask1, :][:, gmask].toarray(),
    adata.X[cmask2, :][:, gmask].toarray(),
    adata.var_names[gmask],
)

4638 1756
1000
2000
3000
4000
5000
6000


In [44]:
# Fold-change between the two group means.
# A pseudovalue is added to numerator and denominator to avoid division by zero;
# 1 seems to be the scanpy convention.
pseudo = 1
dge['fc'] = dge['mean1'].add(pseudo).div(dge['mean2'].add(pseudo))

In [45]:
# log2 of the fold-change column: positive where mean1 exceeds mean2
# (presumably mean1 belongs to the first group passed to rz.mwu — confirm)
dge['log2FC'] = np.log2(dge['fc'])

In [47]:
# A gene is considered differentially expressed when its fold-change exceeds 2
# (in either direction) and its adjusted p-value is below 0.05.

fc_ts = 2      # fold-change threshold
fdr_ts = 0.05  # adjusted p-value threshold
dge['is_DGE'] = dge['log2FC'].abs().gt(np.log2(fc_ts)) & dge['fdr'].lt(fdr_ts)
dge

Unnamed: 0,U_statistic,p,fdr,mean1,mean2,fc,log2FC,is_DGE
NOC2L,4095494.5,2.025230e-01,3.054342e-01,0.207651,0.200825,1.005685,0.008179,False
HES4,4486332.0,5.791571e-30,1.863783e-28,1.257467,0.401048,1.611270,0.688199,False
ISG15,4428010.0,9.917145e-11,1.045555e-09,4.647995,4.336897,1.058292,0.081737,False
AGRN,4448295.0,2.198662e-31,7.392317e-30,0.893393,0.151532,1.644238,0.717419,False
TNFRSF4,4845289.0,1.145770e-65,1.046367e-63,2.930599,0.511635,2.600230,1.378639,True
...,...,...,...,...,...,...,...,...
MT-ND4L,3799056.0,5.729144e-10,5.507476e-09,1.653327,2.973209,0.667805,-0.582502,False
MT-ND4,2694553.5,3.334261e-97,5.121122e-95,37.169853,59.805523,0.627737,-0.671769,False
MT-ND5,3315981.5,7.629344e-32,2.603995e-30,11.927350,17.814468,0.687096,-0.541416,False
MT-ND6,3901898.0,2.706902e-06,1.594877e-05,1.072271,1.772967,0.747312,-0.420217,False


In [48]:
# Some of the adjusted p-values (fdr) are exactly zero, which causes problems downstream
# (-log10(0) is infinite). Replace them with the smallest strictly positive fdr in the table.
zero_fdr = dge['fdr'] == 0
if zero_fdr.any():
    positive_fdr = dge.loc[dge['fdr'] > 0, 'fdr']
    # if every fdr is zero there is no observed floor; fall back to the smallest positive normal float
    fdr_floor = positive_fdr.min() if len(positive_fdr) else np.finfo(float).tiny
    dge.loc[zero_fdr, 'fdr'] = fdr_floor
        

In [49]:
# Smallest distinct fdr value in the table — a check that no zero is left after the replacement
sorted(set(dge['fdr']))[0]

2.8624920647782902e-282

In [50]:
# Boolean mask of the genes flagged as differentially expressed
mask = dge['is_DGE'].eq(True)

In [51]:
# Keep only the rows flagged by `mask`.
# NOTE(review): this rebinds `dge`, so every later cell (including the volcano plot) only sees
# the differentially expressed genes and the full table is no longer available — confirm intended.
dge = dge.loc[mask]
dge

Unnamed: 0,U_statistic,p,fdr,mean1,mean2,fc,log2FC,is_DGE
TNFRSF4,4845289.0,1.145770e-65,1.046367e-63,2.930599,0.511635,2.600230,1.378639,True
VWA1,5267842.5,3.023715e-103,4.983967e-101,6.132065,1.519589,2.830646,1.501131,True
HSPG2,5461821.5,4.077799e-121,8.350839e-119,8.092089,2.255594,2.792758,1.481691,True
PDZK1IP1,3384649.0,1.744535e-117,3.467520e-115,0.367021,3.549050,0.300507,-1.734529,True
JUN,3247626.5,5.977698e-47,3.366440e-45,4.181340,10.394905,0.454707,-1.136992,True
...,...,...,...,...,...,...,...,...
EVI5L,3735598.0,5.036943e-56,3.583122e-54,0.191240,3.675243,0.254798,-1.972577,True
PLVAP,6057337.0,2.118203e-209,1.192901e-206,21.928204,6.724150,2.968379,1.569675,True
PPP1R14A,3383013.5,5.523876e-91,7.466071e-89,0.438580,2.746226,0.384008,-1.380792,True
EMP3,3439310.0,1.507806e-60,1.213066e-58,0.873097,3.565980,0.410229,-1.285499,True


In [52]:
#saving the file
# writes the current contents of `dge` (at this point only the rows kept by the filter above) into outdir
dge.to_excel(outdir + 'tumor_vs_healthy_endo.xlsx')

# Plot volcano plot

Here I plot a volcano plot in which significant, differentially expressed genes are highlighted in red (up-regulated) or blue (down-regulated). To avoid the clutter of printing every gene name, each highlighted gene is labelled with a number instead. I also save a dictionary that maps each gene to its number, so that later on I can edit the image in Illustrator and label only the genes I wish to emphasize.

In [None]:
# Volcano plot of the DGE table: x = log2 fold-change, y = -log10(adjusted p-value).
# Hits (is_DGE) are drawn in red (x > 0) or blue (x < 0) and labelled with a running number.
# The gene -> number lookups are kept in numberTogeneUp / numberTogeneDo.
x = np.log2(dge['fc'])
y = -np.log10(dge['fdr'])

s = 3 #dotsize
c = '0.2'#dotcolor
alpha = 0.5 #dot transparency
lw = 0
# select thresholds for what to consider "hits"
# (the is_DGE column used below was computed earlier from the same two values)
fc_ts = 2 #
fdr_ts = 0.05 # convention

xlabel = 'log2[(Tumor vasculature)\n/(Healthy vasculature)]'
ylabel = '-log10(adj. p-value)'

# plot dots
a,fig,gs = rz.startfig(14,10)
a.scatter(x,y,c=c,s=s,lw=lw,alpha=alpha)

# center fold-change around zero:
xmax = abs(np.array(a.get_xlim())).max()
a.set_xlim(-xmax,xmax)


# threshold positions in plot coordinates (computed here, not drawn in this cell):
(left,right) = a.get_xlim()
top = a.get_ylim()[1]
logfcts = np.log2(fc_ts)
logfdrts = -np.log10(fdr_ts)


# save hits as a list, color hits in red:
hitmask = dge['is_DGE']
hits = dge.index[hitmask]

# also split hit in those up left and up right
upleft = hits[dge.loc[hits,'fc']<1]
upright = hits[dge.loc[hits,'fc']>1]

# NOTE: despite their names, both dicts map gene name -> label number.
numberTogeneUp = {} # genes up, saved to a text file to make your life easier in Illustrator/Keynote
numberTogeneDo = {} # genes down, saved to a text file to make your life easier in Illustrator/Keynote

textsUp = []
textsDo = []

iup = 1
ido = 1
for gene,m,n in zip(hits,x[hitmask],y[hitmask]):
    if m > 0:
        a.scatter(m,n,lw=lw,s=s*4.5,c='r',alpha=1, zorder = 6)
        numberTogeneUp[gene] = iup
        # draw on `a` explicitly instead of relying on pyplot's current axes
        textsUp.append(a.text(m,n,str(iup),fontsize=12.5, zorder = 8))
        iup += 1

    elif m < 0:
        a.scatter(m,n,lw=lw,s=s*4.5,c='#008ad4',alpha=1, zorder = 6)
        numberTogeneDo[gene] = ido
        textsDo.append(a.text(m,n,str(ido),fontsize=12.5, zorder = 8))
        ido += 1

# Finalize labels, limits and spines BEFORE adjusting the number labels: adjust_text places
# the labels relative to the axes as they are when it runs, so changing the y-limits
# afterwards would invalidate the placement.
a.set_xlabel(xlabel, fontsize=16)
a.set_ylabel(ylabel, fontsize =16)

a.set_ylim(0,290)

rz.showspines(a,left=True,bottom=True)

# Annotate significant DGE with numbers in plot (essential that adjust_text() is called last)
# Each call is skipped when there is nothing to adjust.
if textsUp:
    adjust_text(textsUp,
                ax=a,
                arrowprops=dict(arrowstyle='-', color='k', zorder = 4, lw = 0.75),
                precision = 0.0000000001,
                force_text=(0.2, 0.45),
                )
if textsDo:
    adjust_text(textsDo,
                ax=a,
                arrowprops=dict(arrowstyle='-', color='k', zorder = 4, lw = 0.75),
                precision = 0.0000000001,
                force_text=(0.2, 0.45),
                )

#plt.savefig(outdir+'volcano_tumor_vasculature_vs_healthy.pdf', dpi=600, bbox_inches='tight')

# print the number of hits:
print('# gene up left',len(upleft))
print('# genes up right',len(upright))

In [31]:
# rows of the table whose log2 fold-change exceeds 1.5
dge[dge['log2FC']>1.5]

Unnamed: 0,U_statistic,p,fdr,mean1,mean2,fc,log2FC,is_DGE
VWA1,5267842.5,3.023715e-103,4.983967e-101,6.132065,1.519589,2.830646,1.501131,True
INHBB,4840706.0,6.350843e-81,7.035900999999999e-79,2.280245,0.062248,3.088021,1.626682,True
IGFBP7,6868030.0,0.0,2.862492e-282,97.637604,26.680046,3.563491,1.833291,True
SPARCL1,6372203.5,2.541425e-285,2.862492e-282,25.330284,4.90882,4.456099,2.155781,True
SPRY1,6014161.0,1.519694e-204,7.335778e-202,22.339014,5.879164,3.392711,1.762439,True
LAMA4,4992530.0,3.095876e-93,4.2697809999999996e-91,2.577341,0.20117,2.978213,1.574447,True
IGFBP3,5294444.0,5.425068999999999e-137,1.410101e-134,10.878271,0.306484,9.091784,3.184563,True
ANGPT2,5313972.5,1.33844e-138,3.618071e-136,5.204752,0.257791,4.933055,2.302481,True
STC1,4964428.0,5.1179120000000004e-95,7.358904000000001e-93,3.460555,0.230238,3.625765,1.858285,True
ENPP2,4775476.0,7.040058e-57,5.061352e-55,5.85641,0.833809,3.738891,1.90261,True


In [33]:
# hits with fc > 1, i.e. the points on the right-hand side of the volcano plot
upright

Index(['TNFRSF4', 'VWA1', 'HSPG2', 'ACKR1', 'RGS5', 'IVNS1ABP', 'CYTOR',
       'MIR4435-2HG', 'INHBB', 'CXCR4', 'COL8A1', 'MGLL', 'IGFBP7', 'SPARCL1',
       'SPRY1', 'ESM1', 'SPARC', 'LAMA4', 'GJA1', 'IGFBP3', 'GRB10', 'CXorf36',
       'ANGPT2', 'STC1', 'CA2', 'ENPP2', 'COL15A1', 'TP53I11', 'KCNE3',
       'PRSS23', 'MCAM', 'THY1', 'VIM', 'UNC5B', 'HTRA1', 'VWF', 'NDUFA4L2',
       'MLEC', 'RGCC', 'EDNRB', 'COL4A1', 'COL4A2', 'CCL3', 'INSR', 'PLVAP'],
      dtype='object')

In [34]:
# gene name -> label number used on the plot for the hits with positive log2 fold-change
numberTogeneUp

{'TNFRSF4': 1,
 'VWA1': 2,
 'HSPG2': 3,
 'ACKR1': 4,
 'RGS5': 5,
 'IVNS1ABP': 6,
 'CYTOR': 7,
 'MIR4435-2HG': 8,
 'INHBB': 9,
 'CXCR4': 10,
 'COL8A1': 11,
 'MGLL': 12,
 'IGFBP7': 13,
 'SPARCL1': 14,
 'SPRY1': 15,
 'ESM1': 16,
 'SPARC': 17,
 'LAMA4': 18,
 'GJA1': 19,
 'IGFBP3': 20,
 'GRB10': 21,
 'CXorf36': 22,
 'ANGPT2': 23,
 'STC1': 24,
 'CA2': 25,
 'ENPP2': 26,
 'COL15A1': 27,
 'TP53I11': 28,
 'KCNE3': 29,
 'PRSS23': 30,
 'MCAM': 31,
 'THY1': 32,
 'VIM': 33,
 'UNC5B': 34,
 'HTRA1': 35,
 'VWF': 36,
 'NDUFA4L2': 37,
 'MLEC': 38,
 'RGCC': 39,
 'EDNRB': 40,
 'COL4A1': 41,
 'COL4A2': 42,
 'CCL3': 43,
 'INSR': 44,
 'PLVAP': 45}

In [35]:
# Wrap each gene -> number lookup in a one-row DataFrame (one column per gene) for export
df_up = pd.DataFrame([numberTogeneUp])
df_do = pd.DataFrame([numberTogeneDo])

In [36]:
# write the lookup for the up-regulated hits into outdir
df_up.to_csv(outdir + 'tumor_vasc_up.csv')

In [37]:
# write the lookup for the down-regulated hits into outdir
df_do.to_csv(outdir + 'tumor_vasc_do.csv')