In [1]:
import os
import sys
import scipy
import numpy as np
import pandas as pd
import scanpy as sc
import seaborn as sns
import scipy.io as sio
import anndata as ad
import matplotlib.pyplot as plt

# Every relative path used below (./Analysis_result, ./Alldata_anno.h5ad,
# ./Plot/Attn_plot) is resolved against this working directory.
os.chdir("/data/wuqinhua/phase/covid19")

# 1. Cell-level

In [2]:
# Per-cell attention table; later cells read its 'sample_id' and 'attn' columns.
attnData = pd.read_csv(
    filepath_or_buffer="./Analysis_result/Attn_result/attn_cell_PHASE.csv",
)

In [3]:
# Per-cell log2 fold of the attention weight over the uniform per-sample
# baseline, i.e. log2(attn / (1 / number of cells in the same sample)).
#
# Changes relative to the previous per-sample loop:
#   * a single groupby pass replaces two full boolean scans of the table
#     for every sample;
#   * no loop variable named `id`, which shadowed the Python builtin;
#   * the z-scored and [-1, 1]-clipped intermediates were computed but never
#     stored anywhere, so they are no longer computed.
# NOTE(review): the column is called 'attn_scaled' but, exactly as before, it
# holds the un-standardised log2 fold. Confirm that this (and not the
# z-scored / clipped variant) is what the UMAPs are meant to display.
idList = attnData['sample_id'].unique()
cells_per_sample = attnData.groupby('sample_id')['attn'].transform('size')
avgScore = 1 / cells_per_sample
attnData['attn_scaled'] = np.log2(attnData['attn'] / avgScore)


In [4]:
# Load the annotated AnnData object from the working directory; the bare
# `adata` expression displays its summary as the cell output.
adata = ad.read_h5ad('./Alldata_anno.h5ad')
adata

AnnData object with n_obs × n_vars = 2540586 × 5000
    obs: 'batch', 'sample_id', 'group', 'n_genes', 'n_genes_by_counts', 'total_counts', 'total_counts_MT', 'pct_counts_MT', 'leiden', 'predicted_labels', 'over_clustering', 'majority_voting'
    var: 'n_cells_by_counts', 'mean_counts', 'pct_dropout_by_counts', 'total_counts', 'highly_variable', 'means', 'dispersions', 'dispersions_norm', 'highly_variable_nbatches', 'highly_variable_intersection', 'mean', 'std'
    uns: 'hvg', 'leiden', 'leiden_colors', 'log1p', 'majority_voting_colors', 'neighbors', 'pca', 'predicted_labels_colors', 'sample_id_colors', 'umap'
    obsm: 'X_pca', 'X_pca_harmony', 'X_umap'
    varm: 'PCs'
    obsp: 'connectivities', 'distances'

In [6]:
# Display the per-cell metadata table (one row per cell barcode).
adata.obs

Unnamed: 0,batch,sample_id,group,n_genes,n_genes_by_counts,total_counts,total_counts_MT,pct_counts_MT,leiden,predicted_labels,over_clustering,majority_voting
AAACCCACATCTCAAG-1-Arunachalam2020-0,0,cov01,S,1032,1032,1839.0,140.0,7.612833499908447,4,NK_16hi,430,B_switched_memory
AAACGCTGTAGCTTGT-1-Arunachalam2020-0,0,cov01,S,1850,1850,5966.0,226.0,3.788132667541504,5,B_switched_memory,307,B_switched_memory
AAACGCTGTTTGACAC-1-Arunachalam2020-0,0,cov01,S,5905,5905,63097.0,3995.0,6.331521034240723,9,B_naive,597,Plasma_cell_IgG
AAAGAACCATCAGCAT-1-Arunachalam2020-0,0,cov01,S,726,726,1738.0,79.0,4.545454502105713,5,B_immature,136,B_naive
AAAGGATAGGCTATCT-1-Arunachalam2020-0,0,cov01,S,3960,3960,20807.0,1123.0,5.39722204208374,12,DC2,704,DC1
...,...,...,...,...,...,...,...,...,...,...,...,...
GACAAGCCTAGCTGCTCTTC_19-Zhu2020-11,11,COV-4-D16,M,569,569,901.0,8.0,0.8879023307436182,2,gdT,105,CD14_mono
AGACTGGAGCACGGCCTGTA_19-Zhu2020-11,11,COV-4-D16,M,592,592,1011.0,4.0,0.3956478733926805,3,CD4.CM,298,CD4.Naive
AGATGTATATCCGTGCAACC_19-Zhu2020-11,11,COV-4-D16,M,537,537,893.0,2.0,0.22396416573348266,0,CD4.Tfh,60,CD4.Naive
AGATGTATATCCATCGTGAC_19-Zhu2020-11,11,COV-4-D16,M,554,554,770.0,7.0,0.9090909090909091,4,NK_16hi,83,CD8.TE


In [5]:
# Attach the attention columns to the per-cell metadata.
# The copy is positional (`.values` discards the index), so it is only correct
# when attnData rows are in exactly the same cell order as adata.obs. Fail
# loudly instead of silently attaching scores to the wrong cells.
if len(attnData) != adata.n_obs:
    raise ValueError(
        f"attnData has {len(attnData)} rows but adata has {adata.n_obs} cells"
    )
# Necessary (not sufficient) alignment check: both tables must agree on the
# sample of every row.
if not np.array_equal(
    attnData["sample_id"].astype(str).to_numpy(),
    adata.obs["sample_id"].astype(str).to_numpy(),
):
    raise ValueError(
        "attnData and adata.obs disagree on per-row sample_id; "
        "attention scores cannot be assigned positionally"
    )
adata.obs["attn_scaled"] = attnData["attn_scaled"].values
adata.obs["attn"] = attnData["attn"].values

In [6]:
# Split the dataset by the 'group' label: adata1 <- "H", adata2 <- "M", adata3 <- "S".
adata1, adata2, adata3 = (
    adata[adata.obs['group'] == group_label]
    for group_label in ("H", "M", "S")
)

In [7]:
# Global scanpy plotting configuration: log verbosity, the directory figures
# are written to, and the default figure parameters.
sc.settings.verbosity = 1
sc.settings.figdir = './Plot/Attn_plot'
sc.settings.set_figure_params(
    dpi=100,
    dpi_save=400,
    fontsize=10,
    figsize=(6, 6),
    facecolor='white',
    format='png',
)
def one_col_lgd(umap):
    """Attach a frameless, single-column legend to the right of a plot.

    Parameters
    ----------
    umap
        Object exposing ``legend(**kwargs)`` (the axes returned by the
        plotting call in this notebook).

    Returns
    -------
    The legend object created by ``umap.legend``. Its frame line width is set
    to 0 and every legend handle is resized to marker size 25.
    """
    legend = umap.legend(bbox_to_anchor=[1.00, 0.5],
                         loc='center left', ncol=1, prop={'size': 6})
    legend.get_frame().set_linewidth(0.0)
    # Matplotlib renamed Legend.legendHandles to Legend.legend_handles
    # (old name deprecated in 3.7 and later removed); support both spellings.
    handles = getattr(legend, 'legend_handles', None)
    if handles is None:
        handles = legend.legendHandles
    for handle in handles:
        handle.set_sizes([25.0])
    return legend

In [None]:
# UMAP of all cells coloured by predicted cell type, with a one-column legend
# placed outside the axes, exported as a 5x5 inch PDF.
leiden_umap = sc.pl.umap(adata, color=['predicted_labels'],
                         show=False, palette=sns.color_palette("husl", 24),
                         legend_fontsize=6, frameon=True, title='celltype')
lgd = one_col_lgd(leiden_umap)
fig = leiden_umap.get_figure()
fig.set_size_inches(5, 5)
# fig.savefig() does not create missing directories, and this is the first
# cell that writes into figdir, so make sure it exists before saving.
os.makedirs(str(sc.settings.figdir), exist_ok=True)
fig.savefig(str(sc.settings.figdir) + '/umap_celltype.pdf',
            format='pdf', bbox_extra_artists=(lgd,), bbox_inches='tight')

In [13]:
# UMAP of the group "H" subset (adata1) coloured by 'attn_scaled'; the figure
# is written by scanpy through the `save` suffix.
leiden_umap = sc.pl.umap(
    adata1,
    color='attn_scaled',
    color_map='viridis',
    title='Attention Score of H',
    legend_fontsize=6,
    frameon=True,
    show=False,
    save="_attn_H.pdf",
)



In [14]:
# UMAP of the group "M" subset (adata2) coloured by 'attn_scaled'; the figure
# is written by scanpy through the `save` suffix.
leiden_umap = sc.pl.umap(
    adata2,
    color='attn_scaled',
    color_map='viridis',
    title='Attention Score of M',
    legend_fontsize=6,
    frameon=True,
    show=False,
    save="_attn_M.pdf",
)



In [15]:
# UMAP of the group "S" subset (adata3) coloured by 'attn_scaled'; the figure
# is written by scanpy through the `save` suffix.
leiden_umap = sc.pl.umap(
    adata3,
    color='attn_scaled',
    color_map='viridis',
    title='Attention Score of S',
    legend_fontsize=6,
    frameon=True,
    show=False,
    save="_attn_S.pdf",
)



# 2. Cell-type level (the cells below are R code — run them with an R kernel)

### 2.1 Boxplots of per-sample cell-type attention scores by group

In [None]:
library(tidyr)
library(ggplot2)
library(forestploter)
library(gridExtra)
library(tidyverse)
library(dplyr)
library(broom)
library(ggpubr)
library(randomForest)
library(mice)
library(reshape2)
library(gghalves)
library(cowplot)
library(patchwork)

# Relative paths used below (./Analysis_result, ./COVID19_sample_condition_560.csv,
# ./Plot/Attn_plot) are resolved against this working directory.
setwd("/data/wuqinhua/phase/covid19")

In [None]:
# Read the per-cell attention table, then preview its first rows and column names.
attnData <- read.csv('./Analysis_result/Attn_result/attn_cell_PHASE.csv')
head(attnData)
names(attnData)

In [5]:
# Build a long table (id, celltype, fold): for every sample, the median over
# each cell type of log2(attn / (1 / number of cells in the sample)),
# standardised across the cell types of that sample.
nameAll = unique(attnData$predicted_labels)
nameList = sort(nameAll)  # sort() drops NA labels, so the filter below removes them

idList = unique(attnData$sample_id)
# Collect the per-sample pieces in a list and bind once at the end; calling
# rbind() on the growing frame inside the loop re-copies it on every iteration.
# The empty template keeps the result a 3-column data frame even with no samples.
foldPieces = list(data.frame(id = character(), celltype = character(), fold = numeric()))
for (id in idList) {
  attnTmp = attnData %>% filter(sample_id == id)
  avgScore = 1 / nrow(attnTmp)
  foldRes = attnTmp %>% group_by(predicted_labels) %>% summarise(res = median(log2(attn/avgScore)))
  dataTmp = data.frame(id = rep(id, nrow(foldRes)),
                       celltype = foldRes$predicted_labels,
                       fold = foldRes$res)
  dataTmp_s = dataTmp %>% filter(celltype %in% nameList)
  # scale() returns a one-column matrix carrying centre/scale attributes;
  # as.numeric() keeps `fold` a plain numeric column for dcast()/melt() later.
  dataTmp_s$fold = as.numeric(scale(dataTmp_s$fold))
  foldPieces[[length(foldPieces) + 1]] = dataTmp_s
}
sampleFold = do.call(rbind, foldPieces)

In [None]:
# Pivot the long table into one row per sample id and one column per cell type.
# value.var is named explicitly so dcast() does not have to guess which column
# holds the values.
sampleFold.Table = dcast(sampleFold, id ~ celltype, value.var = "fold")
# Move the sample id from a column into the row names.
rownames(sampleFold.Table) = sampleFold.Table$id
sampleFold.Table$id = NULL
print(colnames(sampleFold.Table))

In [None]:
# Load the sample condition sheet and reorder it to follow the rows of
# sampleFold.Table (lookup by sample_id through the row names).
sampleInfo <- read.csv('./COVID19_sample_condition_560.csv')
rownames(sampleInfo) <- sampleInfo[["sample_id"]]
sampleInfo <- sampleInfo[rownames(sampleFold.Table), ]

# Cell-type column names, captured before the annotation columns are added.
predicted_labelss <- colnames(sampleFold.Table)

sampleFold.Table
# Annotated copy: the fold matrix plus each sample's group and id.
sampleFold.Table_s <- sampleFold.Table
sampleFold.Table_s[["group"]] <- sampleInfo[["group"]]
sampleFold.Table_s[["id"]] <- sampleInfo[["sample_id"]]

In [None]:
# Reshape the annotated table to long form, keeping 'group' and 'id' as
# identifier columns, and preview the result.
sampleFold.Table_st <- melt(
  sampleFold.Table_s,
  id.vars = c('group', 'id')
)
head(sampleFold.Table_st)


In [None]:
# One boxplot per cell type of the per-sample attention score split by group,
# annotated with the Kruskal-Wallis p-value; each plot is saved on its own and
# all plots are also combined into a single 8-column figure.
ordercolors <- c("olivedrab3", "skyblue1", "goldenrod1")

# Pin every group label to a fixed colour. With an unnamed colour vector the
# colours are assigned by position, so a cell type lacking one group would
# shift the colours of the remaining groups.
groupLevels <- sort(unique(na.omit(as.character(sampleInfo$group))))
stopifnot(length(groupLevels) <= length(ordercolors))
groupColors <- setNames(ordercolors[seq_along(groupLevels)], groupLevels)

# ggsave() does not create missing directories by default.
dir.create("./Plot/Attn_plot/boxplot", recursive = TRUE, showWarnings = FALSE)

plot_list <- list()

for (cell in predicted_labelss) {

  dataTmp <- data.frame(atten = sampleFold.Table[[cell]],
                        group = sampleInfo$group)
  dataTmp_s <- na.omit(dataTmp)

  # kruskal.test() stops with an error when fewer than two groups are present,
  # which would abort the whole loop; skip such cell types with a warning.
  if (length(unique(dataTmp_s$group)) < 2) {
    warning("Skipping ", cell, ": fewer than two groups with non-missing scores")
    next
  }

  # Kruskal-Wallis-Test
  kruskal_test <- kruskal.test(atten ~ group, data = dataTmp_s)
  print(paste("Kruskal-Wallis test for", cell, ":"))
  print(kruskal_test)

  p_value <- kruskal_test$p.value
  p_label <- ifelse(p_value < 0.001, "p < 0.001", sprintf("p=%.3f", p_value))

  p <- ggplot(dataTmp_s, aes(x = group, y = atten, fill = group)) +
    geom_boxplot(outlier.shape = 16, outlier.colour = "lightgray") +
    scale_fill_manual(values = groupColors) +
    scale_y_continuous(expand = c(0, 0)) +
    labs(y = "Celltype Attention Scores", x = NULL) +
    # x = Inf / y = Inf with hjust/vjust places the label in the top-right corner.
    annotate("text", x = Inf, y = Inf, label = paste(cell,": ", p_label),
             hjust = 1.1, vjust = 2, size = 7, color = "black") +
    theme_classic() +
    theme(axis.text = element_text(size = 12, color = "black"))

  plot_list[[cell]] <- p

  filename <- paste0("./Plot/Attn_plot/boxplot/boxplot_", cell, ".pdf")
  ggsave(filename, plot = p, width = 5, height = 4)
  # print(p)

}

combine_plot = wrap_plots(plot_list,ncol=8)
ggsave("./Plot/Attn_plot/boxplot_all.pdf",combine_plot,width = 40,height = 28)