## velocity analysis for myeloid cells

**Author: Dr. Xi Li**

**Created: 07/28/2021**

**Language: Python**



In [1]:
from matplotlib import rcParams
import matplotlib.pyplot as plt
import scvelo as scv
import loompy
import pandas as pd
import numpy as np
import os
import scanpy as sc
import scipy.stats as stats
from io import StringIO
from sklearn import linear_model
from sklearn.svm import SVR
import seaborn as sns
import scanpy.external as sce
%matplotlib inline

In [2]:
# scVelo logging level: 0 = errors, 1 = warnings, 2 = info, 3 = hints
scv.settings.verbosity = 3
# use the maximum figure width intended for presenter view
scv.settings.presenter_view = True
# apply the 'scvelo' figure style for nicer-looking plots
scv.settings.set_figure_params('scvelo')

# Font type 42 for both PDF and PostScript output
# (presumably so text stays editable in vector-graphics editors - confirm).
plt.rcParams.update({'pdf.fonttype': 42, 'ps.fonttype': 42})

## input sample path

First, let's find the directory that we are working in.

**Note:  SigsDir must be changed depending on the user to allow for proper use of this script**

In [3]:
# Show the current working directory; the relative output paths used later
# in this notebook (f'{project_ID}_figures', f'{project_ID}_DEG') resolve against it.
os.getcwd()

'/mnt/c9b6130c-37e5-4f62-becc-dd4240b42021/T127_T22_ola_2'

In [None]:
# Directory where the signature gene lists are located.
# These are absolute, machine-specific paths and must be adapted by each user.
SigsDir="/mnt/533ee9c3-18c0-4c72-a09e-d9ce5a10ef9e/sig"
# Presumably the directory holding annotation reference files - confirm;
# it is not used in the visible part of this notebook.
AnnoRefDir="/mnt/533ee9c3-18c0-4c72-a09e-d9ce5a10ef9e/T_anno_ref"

## Unique Gene Names

First we want to select the genes that we wish to look at and format them into a data frame that we can use later on

In [None]:
# Project identifier; it is the prefix of every input/output path below
# (f'{project_ID}_figures/...' and f'{project_ID}_DEG/...').
project_ID="pool_M1_M2_MKI67"
# Tell scVelo to save figures into '<project_ID>_figures'.
scv.settings.figdir=f'{project_ID}_figures'

In [None]:
# Load the previously saved AnnData object '<project_ID>_anno_dy_reg4.h5ad'
# from the project's figures directory.
adata=scv.read (f"{project_ID}_figures/{project_ID}_anno_dy_reg4.h5ad")

In [None]:
# Make the gene (variable) names unique, using "_" as the join string.
adata.var_names_make_unique("_")

# Make sure both output folders exist:
#   <project_ID>_figures  and  <project_ID>_DEG
# exist_ok=True makes "already exists" a no-op, while genuine failures
# (e.g. missing permissions) are raised here instead of being silently
# swallowed and surfacing later as a confusing save error.
for _suffix in ("_figures", "_DEG"):
    directoryName = project_ID + _suffix
    os.makedirs(directoryName, exist_ok=True)


In [None]:
import densmap

In [None]:
# Optional: compute a densMAP embedding from the "Ms" layer of adata.
densmap_model = densmap.densMAP(
    n_neighbors=150,
    n_epochs=500,
    min_dist=0.2,
    spread=2,
    dens_frac=0.3,
    dens_lambda=0.7,
)
# fit_transform returns three values; only the embedding is used below
# (ro and re are kept under their original names).
embedding, ro, re = densmap_model.fit_transform(adata.to_df(layer="Ms"))

# store the embedding on adata under the key "X_densmap"
adata.obsm["X_densmap"] = embedding

# TRAJECTORY

In [None]:
# Load '<project_ID>_anno.h5ad' and rebind `adata` to it.
# NOTE(review): this replaces the object loaded from '..._anno_dy_reg4.h5ad' above, so
# anything added to that object in memory (e.g. obsm["X_densmap"]) is only available
# afterwards if this file already contains it - confirm.
adata = sc.read(f'{project_ID}_figures/{project_ID}_anno.h5ad')

In [None]:
# First pass: estimate velocities in stochastic mode, then build the
# velocity graph from them.
scv.tl.velocity(
    adata,
    mode='stochastic',
    use_latent_time=False,
)
scv.tl.velocity_graph(adata)

In [None]:
# Compute velocity pseudotime and plot it on the UMAP embedding.
# (Terminal states are not recomputed here; the call is left disabled.)
#scv.tl.terminal_states(adata, self_transitions=False)
scv.tl.velocity_pseudotime(adata)
scv.pl.scatter(
    adata,
    basis="umap",
    color='velocity_pseudotime',
    cmap='gnuplot',
    perc=(2, 98),
    save="pseudotime.pdf",
)

In [None]:
# Recover the full splicing dynamics; per the original author's note,
# velocity has to be calculated (above) before this step.
scv.tl.recover_dynamics(
    adata,
    use_raw=False,
    fit_scaling=True,
    fit_connected_states=True,
    plot_results=False,
    n_jobs=20,
)

In [None]:
# Second pass: recompute velocities in dynamical mode and rebuild the
# velocity graph.
scv.tl.velocity(
    adata,
    mode='dynamical',
    use_latent_time=True,
)
scv.tl.velocity_graph(adata)

In [None]:
# Recompute velocity pseudotime from the dynamical-mode velocities and plot
# it on the UMAP embedding.
# NOTE(review): this saves to the same file name ("pseudotime.pdf") as the
# earlier pseudotime plot in this notebook, so that figure is overwritten -
# confirm this is intended.
#scv.tl.terminal_states(adata, self_transitions=False)
scv.tl.velocity_pseudotime(adata)
scv.pl.scatter(
    adata,
    basis="umap",
    color='velocity_pseudotime',
    cmap='gnuplot',
    perc=(2, 98),
    save="pseudotime.pdf",
)

In [None]:
# Latent time is an alternative to velocity pseudotime that can help when the
# transition between populations is not pronounced; see the original scVelo
# paper for the differences between pseudotime and latent time.
scv.tl.terminal_states(adata, self_transitions=True)
scv.tl.latent_time(
    adata,
    vkey='velocity',
    min_likelihood=0.58,
    min_confidence=0.9,
    min_corr_diffusion=0.1,
    weight_diffusion=True,
    root_key="root_cells",
    end_key="end_points",
    t_max=50,
    copy=False,
)

# Recompute dynamical velocities and the velocity graph now that latent time
# has been calculated.
scv.tl.velocity(adata, mode='dynamical', use_latent_time=True)
scv.tl.velocity_graph(adata)

# Plot latent time on the "densmap" basis
# (presumably read from adata.obsm["X_densmap"] - confirm it is present).
scv.pl.scatter(
    adata,
    basis="densmap",
    color='latent_time',
    cmap='YlOrRd',
    perc=(1, 99),
    smooth=5,
    save="latent_time_tmp.pdf",
)

In [None]:
# Summarise the per-cell velocity arrows into one directed transition per cluster (PAGA).

# Copy the neighbour graphs from adata.obsp into adata.uns['neighbors']
# (presumably where scv.tl.paga looks for them - confirm).
neighbor_graphs = adata.uns['neighbors']
neighbor_graphs['distances'] = adata.obsp['distances']
neighbor_graphs['connectivities'] = adata.obsp['connectivities']

# use_time_prior: use "velocity_pseudotime" for a pseudotime-only trajectory;
# switch to "latent_time" (as here) for a latent-time trajectory.
# Alternative grouping left disabled by the author: groups='velocity_clusters'.
scv.tl.paga(
    adata,
    groups="anno_clusters_sub",
    use_time_prior="latent_time",
    root_key="root_cells",
    end_key="end_points",
    threshold_root_end_prior=0.95,
)

# Transposed table of transition confidences, shown as a blue heat-map.
df = scv.get_df(adata, 'paga/transitions_confidence', precision=2).T
df.style.background_gradient(cmap='Blues').format('{:.2g}')

In [None]:
# Draw the PAGA graph on the UMAP embedding with nodes coloured by "anno_clusters".
# Per the original author's note, colouring the nodes by a sample-level column
# (e.g. Sample_Cluster) instead gives a pie chart of the treatment composition
# of each cluster; "velocity_clusters" is another option left disabled.
# NOTE(review): the output file name mentions "densmap" while basis='umap' -
# confirm which one is intended.
scv.pl.paga(
    adata,
    basis='umap',
    color="anno_clusters",
    size=20,
    alpha=0.7,
    min_edge_width=1.5,
    node_size_scale=1,
    threshold=0.1,
    minimum_spanning_tree=False,
    save="transition_densmap_anno.svg",
)

In [None]:
# Velocity and expression panels for each gene in `asymm`, two panels per row.
# NOTE(review): `asymm` is not defined in the visible part of this notebook -
# it must be set to the gene list of interest before running this cell.
# Saving is left disabled (previously: save="gene_seleted_merge.pdf").
scv.pl.velocity(
    adata,
    asymm,
    basis="umap",
    color="clusters",
    ncols=2,
    figsize=(7, 6),
)

In [None]:
# Save the current state of adata to '<project_ID>_anno_dy.h5ad'
# in the project's figures directory.
adata.write(f'{project_ID}_figures/{project_ID}_anno_dy.h5ad')

## correlation heatmap

In [None]:
# Densify the matrix stored in adata.raw into a cells x genes DataFrame
# (.toarray() assumes adata.raw.X is a sparse matrix).
# Fix: the matrix was previously read from `data.adata.raw`, but no `data`
# object exists in this notebook; index and columns already come from `adata`.
raw = pd.DataFrame(
    adata.raw.X.toarray(),
    index=adata.obs_names,
    columns=adata.raw.var_names,
)

In [73]:
# Wrap the raw expression table in its own AnnData object and copy every
# cell-level annotation column over from adata.obs.
adata_raw = sc.AnnData(raw)
obs_columns = np.unique(adata.obs.columns)
for column in obs_columns:
    adata_raw.obs[column] = adata.obs[column]

AnnData object with n_obs × n_vars = 8411 × 32285

In [76]:
# Library-size normalise the raw matrix to 10,000 counts per cell, then
# log1p-transform it.
# Fix: log1p was previously applied to `adata`, which left `adata_raw`
# un-logged and log-transformed the main object a second time.
sc.pp.normalize_total(adata_raw, target_sum=1e4)
sc.pp.log1p(adata_raw)

In [77]:
# Replace `raw` with the processed values of adata_raw as a DataFrame;
# the correlation cells below read gene expression from it.
raw=adata_raw.to_df()

In [None]:
# Cell-level scores to correlate; each name is looked up as a column of
# adata_s.obs in the correlation cells below.
score_list=["latent_time","Regulon(Nfkb1(+))","Regulon(Mef2c(+))","Regulon(Irf8(+))","Regulon(Mef2a(+))","Regulon(Bcl11a(+))",
                      "Regulon(Bhlhe40(+))","Regulon(E2f7(+))","Regulon(Ezh2(+))",
                      "Regulon(Klf2(+))","Regulon(Maf(+))","Regulon(Spic(+))","Regulon(Tcf4(+))","protumor_cytokine_score",
            "reactome_phagocytosis_score","KEGG_phagocytosis_score","MM_GO_antigen_presentation_score","antitumor_cytokine_score",]
# Genes to correlate; each name is looked up as a column of `raw`.
# NOTE: pair labels are later built as "<x>-<y>" and split on "-", so names
# must not contain a hyphen.
gene_list=["Cd209a","Csf1r","Il12b","Cd86","Itgax","Cd74","Cd44","Mrc1","Arg1","Msr1","Il10","Siglec1","Ccl8","C5ar1",]

In [None]:
# Column of adata_s.obs used to group cells; scores and gene expression are
# averaged per level of this column before correlations are computed.
group="Sample"

In [None]:
import math

In [None]:
# Pearson r between every ordered pair of scores (score x score), computed on
# per-`group` means: one data point per level of adata_s.obs[group].
# NOTE(review): `adata_s` is not defined in the visible part of this notebook -
# confirm it is the intended object.
group_labels = adata_s.obs[group]

# Per-group mean of each score, computed once instead of once per pair.
score_means = {
    name: pd.DataFrame(adata_s.obs[name].values, columns=["score"], index=group_labels)
    .groupby(level=0).mean()["score"].values
    for name in score_list
}

pairs = [(a, b) for a in score_list for b in score_list]
corr_values = []
for a, b in pairs:
    r, p = stats.pearsonr(score_means[a], score_means[b])
    corr_values.append(r)

# Labels have the form "<x>-<y>"; they are split on "-" again further down.
corr_list = np.array(corr_values)
index = np.array([a + "-" + b for a, b in pairs])
corr = pd.Series(corr_list, index=index)

In [None]:
# Pearson r between every score (x) and every gene (y), computed on
# per-`group` means: one data point per level of adata_s.obs[group].
# NOTE(review): `adata_s` is not defined in the visible part of this notebook,
# and `raw` was built from `adata` - confirm both hold the same cells in the
# same order.
group_labels = adata_s.obs[group]

# Per-group means, computed once per score / gene instead of once per pair.
score_means = {
    name: pd.DataFrame(adata_s.obs[name].values, columns=["score"], index=group_labels)
    .groupby(level=0).mean()["score"].values
    for name in score_list
}
gene_means = {}
for name in gene_list:
    gene_frame = raw.loc[:, [name]]
    gene_frame.index = group_labels
    gene_means[name] = gene_frame.groupby(level=0).mean().loc[:, name].values

pairs = [(a, b) for a in score_list for b in gene_list]
corr_values = []
for a, b in pairs:
    r, p = stats.pearsonr(score_means[a], gene_means[b])
    corr_values.append(r)

# Labels have the form "<x>-<y>"; they are split on "-" again further down.
corr_list = np.array(corr_values)
index = np.array([a + "-" + b for a, b in pairs])
corr2 = pd.Series(corr_list, index=index)

In [None]:
# Pearson r between every ordered pair of genes (gene x gene), computed on
# per-`group` means: one data point per level of adata_s.obs[group].
# NOTE(review): `adata_s` is not defined in the visible part of this notebook,
# and `raw` was built from `adata` - confirm both hold the same cells in the
# same order.
group_labels = adata_s.obs[group]

# Per-group mean of each gene, computed once instead of once per pair.
gene_means = {}
for name in gene_list:
    gene_frame = raw.loc[:, [name]]
    gene_frame.index = group_labels
    gene_means[name] = gene_frame.groupby(level=0).mean().loc[:, name].values

pairs = [(a, b) for a in gene_list for b in gene_list]
corr_values = []
for a, b in pairs:
    r, p = stats.pearsonr(gene_means[a], gene_means[b])
    corr_values.append(r)

# Labels have the form "<x>-<y>"; they are split on "-" again further down.
corr_list = np.array(corr_values)
index = np.array([a + "-" + b for a, b in pairs])
corr3 = pd.Series(corr_list, index=index)

In [None]:
# Pearson r between every gene (x) and every score (y), computed on
# per-`group` means: one data point per level of adata_s.obs[group].
# NOTE(review): `adata_s` is not defined in the visible part of this notebook,
# and `raw` was built from `adata` - confirm both hold the same cells in the
# same order.
group_labels = adata_s.obs[group]

# Per-group means, computed once per gene / score instead of once per pair.
gene_means = {}
for name in gene_list:
    gene_frame = raw.loc[:, [name]]
    gene_frame.index = group_labels
    gene_means[name] = gene_frame.groupby(level=0).mean().loc[:, name].values
score_means = {
    name: pd.DataFrame(adata_s.obs[name].values, columns=["score"], index=group_labels)
    .groupby(level=0).mean()["score"].values
    for name in score_list
}

pairs = [(a, b) for a in gene_list for b in score_list]
corr_values = []
for a, b in pairs:
    r, p = stats.pearsonr(gene_means[a], score_means[b])
    corr_values.append(r)

# Labels have the form "<x>-<y>"; they are split on "-" again further down.
corr_list = np.array(corr_values)
index = np.array([a + "-" + b for a, b in pairs])
corr4 = pd.Series(corr_list, index=index)

In [None]:
# Pearson p-value for every ordered pair of scores (score x score), computed
# on per-`group` means and stored as log10(p + math.exp(-7)).
# NOTE(review): math.exp(-7) is e**-7 (about 9.1e-4), not 1e-7 - confirm the
# intended offset; it caps the stored value at roughly -3.04.
# NOTE(review): `adata_s` is not defined in the visible part of this notebook -
# confirm it is the intended object.
group_labels = adata_s.obs[group]

# Per-group mean of each score, computed once instead of once per pair.
score_means = {
    name: pd.DataFrame(adata_s.obs[name].values, columns=["score"], index=group_labels)
    .groupby(level=0).mean()["score"].values
    for name in score_list
}

pairs = [(a, b) for a in score_list for b in score_list]
log_p_values = []
for a, b in pairs:
    r, p = stats.pearsonr(score_means[a], score_means[b])
    log_p_values.append(math.log10(p + math.exp(-7)))

# Labels have the form "<x>-<y>", matching the labels of the r-value series.
corr_list = np.array(log_p_values)
index = np.array([a + "-" + b for a, b in pairs])
pvalues = pd.Series(corr_list, index=index)

In [None]:
# Pearson p-value for every score (x) / gene (y) pair, computed on
# per-`group` means and stored as log10(p + math.exp(-7)).
# NOTE(review): math.exp(-7) is e**-7 (about 9.1e-4), not 1e-7 - confirm the
# intended offset; it caps the stored value at roughly -3.04.
# NOTE(review): `adata_s` is not defined in the visible part of this notebook,
# and `raw` was built from `adata` - confirm both hold the same cells in the
# same order.
group_labels = adata_s.obs[group]

# Per-group means, computed once per score / gene instead of once per pair.
score_means = {
    name: pd.DataFrame(adata_s.obs[name].values, columns=["score"], index=group_labels)
    .groupby(level=0).mean()["score"].values
    for name in score_list
}
gene_means = {}
for name in gene_list:
    gene_frame = raw.loc[:, [name]]
    gene_frame.index = group_labels
    gene_means[name] = gene_frame.groupby(level=0).mean().loc[:, name].values

pairs = [(a, b) for a in score_list for b in gene_list]
log_p_values = []
for a, b in pairs:
    r, p = stats.pearsonr(score_means[a], gene_means[b])
    log_p_values.append(math.log10(p + math.exp(-7)))

# Labels have the form "<x>-<y>", matching the labels of the r-value series.
corr_list = np.array(log_p_values)
index = np.array([a + "-" + b for a, b in pairs])
pvalues2 = pd.Series(corr_list, index=index)

In [None]:
# Pearson p-value for every ordered pair of genes (gene x gene), computed on
# per-`group` means and stored as log10(p + math.exp(-7)).
# NOTE(review): math.exp(-7) is e**-7 (about 9.1e-4), not 1e-7 - confirm the
# intended offset; it caps the stored value at roughly -3.04.
# NOTE(review): `adata_s` is not defined in the visible part of this notebook,
# and `raw` was built from `adata` - confirm both hold the same cells in the
# same order.
group_labels = adata_s.obs[group]

# Per-group mean of each gene, computed once instead of once per pair.
gene_means = {}
for name in gene_list:
    gene_frame = raw.loc[:, [name]]
    gene_frame.index = group_labels
    gene_means[name] = gene_frame.groupby(level=0).mean().loc[:, name].values

pairs = [(a, b) for a in gene_list for b in gene_list]
log_p_values = []
for a, b in pairs:
    r, p = stats.pearsonr(gene_means[a], gene_means[b])
    log_p_values.append(math.log10(p + math.exp(-7)))

# Labels have the form "<x>-<y>", matching the labels of the r-value series.
corr_list = np.array(log_p_values)
index = np.array([a + "-" + b for a, b in pairs])
pvalues3 = pd.Series(corr_list, index=index)

In [None]:
# Pearson p-value for every gene (x) / score (y) pair, computed on
# per-`group` means and stored as log10(p + math.exp(-7)).
# NOTE(review): math.exp(-7) is e**-7 (about 9.1e-4), not 1e-7 - confirm the
# intended offset; it caps the stored value at roughly -3.04.
# NOTE(review): `adata_s` is not defined in the visible part of this notebook,
# and `raw` was built from `adata` - confirm both hold the same cells in the
# same order.
group_labels = adata_s.obs[group]

# Per-group means, computed once per gene / score instead of once per pair.
gene_means = {}
for name in gene_list:
    gene_frame = raw.loc[:, [name]]
    gene_frame.index = group_labels
    gene_means[name] = gene_frame.groupby(level=0).mean().loc[:, name].values
score_means = {
    name: pd.DataFrame(adata_s.obs[name].values, columns=["score"], index=group_labels)
    .groupby(level=0).mean()["score"].values
    for name in score_list
}

pairs = [(a, b) for a in gene_list for b in score_list]
log_p_values = []
for a, b in pairs:
    r, p = stats.pearsonr(gene_means[a], score_means[b])
    log_p_values.append(math.log10(p + math.exp(-7)))

# Labels have the form "<x>-<y>", matching the labels of the r-value series.
corr_list = np.array(log_p_values)
index = np.array([a + "-" + b for a, b in pairs])
pvalues4 = pd.Series(corr_list, index=index)

In [None]:
# Stack the four correlation series into one long table with column "R",
# indexed by the "<x>-<y>" pair label.
corr_total = pd.concat([corr, corr2, corr3, corr4])
corr_total = pd.DataFrame(corr_total, columns=["R"])

# Recover the two variable names from each pair label.
# NOTE: this relies on neither name containing "-"; a hyphenated score or
# gene name would be split in the wrong place.
pair_parts = [label.split("-") for label in corr_total.index]
X = np.array([parts[0] for parts in pair_parts])
Y = np.array([parts[1] for parts in pair_parts])
corr_total.loc[:, "X"] = X
corr_total.loc[:, "Y"] = Y

# The p-value series carry the same pair labels, so this assignment is
# aligned on the index.
corr_total.loc[:, "pvalues"] = pd.concat([pvalues, pvalues2, pvalues3, pvalues4])

In [None]:
# Marker size of the significance star for each pair: 60 where the pair is
# significant, 0 (invisible) otherwise.
# "pvalues" holds log10-transformed p-values, so -1.3 corresponds to
# p of about 0.05; |R| must additionally exceed 0.3.
# Computed column-wise instead of with Series[i]: integer positional lookup
# on a string-labelled Series is deprecated in recent pandas.
significant = (corr_total["pvalues"] < -1.3) & (corr_total["R"].abs() > 0.3)
star = np.where(significant, 60.0, 0.0)
corr_total.loc[:, "star"] = star

In [None]:
# Dot-matrix of all pairwise correlations: dot area encodes |R|, dot colour
# encodes -log10-transformed p-value, a white star marks significant pairs.
plt.figure(figsize=(3.5, 3.5))  # set figure size

x_names = corr_total.loc[:, "X"]
y_names = corr_total.loc[:, "Y"]
r_values = corr_total.loc[:, "R"]
neg_log_p = -corr_total.loc[:, "pvalues"]

# Positive and negative correlations are drawn in two passes with different
# colormaps. Sizes are clipped at 0 so the pairs belonging to the other pass
# get an explicit zero-size (invisible) marker instead of a negative size.
plt.scatter(x_names, y_names, c=neg_log_p,
            s=(r_values * 200).clip(lower=0), alpha=1, linewidths=0,
            cmap="YlOrRd", marker="o")  # R > 0
plt.scatter(x_names, y_names, c=neg_log_p,
            s=(-r_values * 200).clip(lower=0), alpha=1, linewidths=0,
            cmap="GnBu", marker="o")  # R < 0
# white star on top of the pairs flagged in the "star" column
plt.scatter(x_names, y_names, c="white", s=corr_total.loc[:, "star"],
            alpha=1, linewidths=0, marker="*")

# NOTE: no colorbar is drawn for either colormap.

# Size legend: empty scatters used as handles; marker size = |R| * 200.
s4 = plt.scatter([], [], s=40, marker='o', color='#555555')
s6 = plt.scatter([], [], s=60, marker='o', color='#555555')
s10 = plt.scatter([], [], s=100, marker='o', color='#555555')
plt.legend((s4, s6, s10),
           ('0.2', '0.3', '0.5'),
           scatterpoints=1,
           loc='best',
           ncol=1,
           fontsize=8)

# Make sure the output folder exists; exist_ok=True makes "already exists"
# a no-op while genuine failures (e.g. missing permissions) are raised.
directoryName = project_ID + "_DEG"
os.makedirs(directoryName, exist_ok=True)

plt.savefig(f"{project_ID}_DEG/pearson_panel_total_mRNA_macrophage.pdf", dpi=300)
plt.show()  # show plots

In [None]:
# Export selected per-cell metadata columns to CSV for downstream use in R packages.
meta_columns = ["anno_clusters", "Sample", "treatment", "term", "strain"]
meta_sample_clusters = pd.concat(
    [adata.obs[column_name] for column_name in meta_columns],
    axis=1,
)
meta_sample_clusters.to_csv(f"{project_ID}_DEG/meta_reconcat.csv")