In [1]:
%load_ext rpy2.ipython

In [3]:
%%R
lapply(c("dplyr","Seurat","HGNChelper","openxlsx"), library, character.only = T)

load('../data/reference.RData')

sample_meta<-'AD_HS_00001	Molecular characterization of selectively vulnerable neurons in Alzheimer’s Disease	20	H.Sapiens	AD	Superior frontal gyrus;Superior frontal gyrus;Superior frontal gyrus;Superior frontal gyrus;Superior frontal gyrus;Superior frontal gyrus;Superior frontal gyrus;Superior frontal gyrus;Superior frontal gyrus;Superior frontal gyrus;Entorhinal cortex;Entorhinal cortex;Entorhinal cortex;Entorhinal cortex;Entorhinal cortex;Entorhinal cortex;Entorhinal cortex;Entorhinal cortex;Entorhinal cortex;Entorhinal cortex	Braak:0;0;0;2;2;2;2;6;6;6;0;0;0;2;2;2;2;6;6;6	ExcitatoryNeuron;InhibitoryNeuron;Oligodendrocyte;OPC;Astrocyte;Microglia;Endothelia;Pericyte	GSE147528	Single-nucleus RNA-seq	10x Genomics	Male	60;50;71;72;87;91;77;82;72;82;60;50;71;72;87;91;77;82;72;82	SRR11422700;SRR11422701;SRR11422702;SRR11422703;SRR11422704;SRR11422705;SRR11422706;SRR11422707;SRR11422708;SRR11422709;SRR11422710;SRR11422711;SRR11422712;SRR11422713;SRR11422714;SRR11422715;SRR11422716;SRR11422717;SRR11422718;SRR11422719'
split_meta<-unlist(strsplit(sample_meta, '\t'))
samples<-unlist(strsplit(split_meta[14],';'))
disease<-rep(split_meta[5],length(samples))
stage<-unlist(strsplit(unlist(strsplit(split_meta[7],':'))[2],';'))
gender<-rep(split_meta[12],length(samples))
age<-unlist(strsplit(split_meta[13],';'))

sample_meta<-data.frame(samples=samples,disease=disease,stage=stage,gender=gender,age=age)
sample_meta[sample_meta$stage=='0','disease']<-'Control'
rownames(sample_meta)<-sample_meta$samples

In [None]:
%%R
library(stringr)
# args <- commandArgs(trailingOnly=TRUE)
args<-c('../data/3/scte','../data/3/cell_umap.txt','SRR11422700','/home/wdeng3/workspace/software/sc-type','../../universal_data/rmsk/rmsk_GRCh38.txt')
data_path<-args[1]
out_path<-args[2]
sample_tag<-args[3]
sctype_path<-args[4]
rmsk_path<-args[5]
rmsk<-read.table(rmsk_path,sep='\t',header=F)
to_remove<-unlist(rmsk[!(rmsk$V12 %in% c('LINE','SINE','LTR')),'V11'])
to_remove<-str_replace_all(to_remove,'-','\\.')

reps<-unlist(rmsk[rmsk$V12 %in% c('LINE','SINE','LTR'),'V11'])
reps<-str_replace_all(reps,'-','\\.')

create_seurat<-function(sample_tag){
    ## load data
    scte.data <- t(read.csv(paste0(data_path,'/',sample_tag,'/',sample_tag,'.csv'),check.names=F, row.names = 1))
    # remove repeats from gene expression matrix

    scte.gene<-scte.data[!(row.names(scte.data) %in% to_remove),]
    # normalize and scale data
    scte <- CreateSeuratObject(counts = scte.gene, project = sample_tag, min.cells = 3, min.features = 200)
    # normalize data
    scte[["percent.mt"]] <- PercentageFeatureSet(scte, pattern = "^MT.|^MT-")
    scte <- subset(scte, subset = nFeature_RNA > 200 & nFeature_RNA < 2500 & percent.mt < 5)
    scte <- SCTransform(scte, vars.to.regress = "percent.mt", verbose = FALSE)

    cell_meta<-scte@meta.data
    cell_meta$disease<-sample_meta[cell_meta$orig.ident,'disease']

    cell_meta$stage<-sample_meta[cell_meta$orig.ident,'stage']
    cell_meta$gender<-sample_meta[cell_meta$orig.ident,'gender']
    cell_meta$age<-sample_meta[cell_meta$orig.ident,'age']
    scte@meta.data<-cell_meta
    scte
}

scte<-create_seurat(sample_tag)

scte <- FindVariableFeatures(scte, selection.method = "vst", nfeatures = 2000)
# scale and run PCA
scte <- ScaleData(scte, features = rownames(scte))
scte <- RunPCA(scte, features = VariableFeatures(object = scte))

sdv<-Stdev(scte,reduction='pca')
sdv<-sdv[sdv>2]
npca<-length(sdv)

scte <- FindNeighbors(scte, dims = 1:npca)
scte <- FindClusters(scte, resolution = 0.8)
scte <- RunUMAP(scte, dims = 1:npca)


scte.anchors <- FindTransferAnchors(reference = reference, query = scte,
    dims = 1:30, reference.reduction = "pca")
scte.query <- MapQuery(anchorset = scte.anchors, reference = reference, query = scte,
    refdata = list(celltype = "broad.cell.type"), reference.reduction = "pca", reduction.model = "umap")
cell_meta<-scte.query@meta.data

cell_meta$UMAP_1<-scte.query@reductions[['umap']]@cell.embeddings[,1]
cell_meta$UMAP_2<-scte.query@reductions[['umap']]@cell.embeddings[,2]
scte.query@meta.data<-cell_meta
write.table(cell_meta,out_path,sep='\t',quote=F,row.names=T)

## get gene expression from scte_query
scte.query.gene<-as.data.frame(t(as.data.frame(GetAssayData(scte.query, slot = "data"))))
scte.query.gene$UMAP_1<-scte.query@meta.data$UMAP_1
scte.query.gene$UMAP_2<-scte.query@meta.data$UMAP_2
write.table(scte.query.gene,'../data/3/cell_exp.txt',sep='\t',quote=F,row.names=T)

In [102]:
%%R
## get gene expression from scte_query
# scte.query.gene<-as.data.frame(t(as.data.frame(GetAssayData(scte.query, slot = "data"))))
# scte.query.gene$UMAP_1<-scte.query@meta.data$UMAP_1
# scte.query.gene$UMAP_2<-scte.query@meta.data$UMAP_2
write.table(scte.query.gene,'../data/3/cell_exp.txt',sep='\t',quote=F,row.names=T)

In [None]:
fh1,fh2=['1.fq','2.fq']


read1=['','','','']
read2=['','','','']

from collection import defaultdict

out_files=defaultdict(lambda x:open(x,'w'))

count=0
for line1,line2 in zip(open(fh1),open(fh2)):
    if count==4:
        count=0
        ## output two files
        for i in range(4):
            out_files[read1[2]].write("\n".join(read2))
        read1=[line1,'','','','']
        read2=[line2,'','','','']

    else:
        read1[count]=line1
        read2[count]=line2
    count+=1
for i in range(4):
    out_files[read1[2]].write("\n".join(read2))

In [1]:
%load_ext rpy2.ipython

#############################################################

In [2]:
%%R
library(Seurat)
data_folder<-'../data/seurat_objs'
out_folder<-'../data/all_datasets'
exp_mtxs<-list()
for(file_name in list.files(data_folder,pattern = 'rds')){
    dataset<-strsplit(file_name,'\\.')[[1]][1]
    print(dataset)
    seurat_obj<-readRDS(paste0(data_folder,'/',file_name))
    exp_mtx<-as.data.frame(t(as.data.frame(GetAssayData(seurat_obj,slot = 'data'))))
    exp_mtx$UMAP_1<-seurat_obj@reductions[['umap']]@cell.embeddings[,1]
    exp_mtx$UMAP_2<-seurat_obj@reductions[['umap']]@cell.embeddings[,2]

    exp_mtxs[[length(exp_mtxs)+1]]<-exp_mtx
}
# all_genes<-colnames(exp_mtxs[1])
# for(i in 2:length(exp_mtxs)){
#     all_genes<-union(all_genes,colnames(exp_mtxs[i]))
# }
# all_genes<-sort(unique(all_genes))
# for(i in 1:length(exp_mtxs)){
#     exp_mtxs[i]<-exp_mtxs[i][,all_genes]
#     exp_mtxs[i].fillna(0,inplace = T)
# }
# exp_mtx<-do.call(rbind,exp_mtxs)
# write.table(exp_mtx,paste0(out_folder,'/cell_exp.txt'),row.names = T,quote = F,sep = '\t',col.names = T)



    consider that it could be called from a Python process. This
    results in a quasi-obligatory segfault when rpy2 is evaluating
    R code using it. On the hand, rpy2 is accounting for the
    fact that it might already be running embedded in a Python
    process. This is why:
    - Python -> rpy2 -> R -> reticulate: crashes
    - R -> reticulate -> Python -> rpy2: works

    The issue with reticulate is tracked here:
    https://github.com/rstudio/reticulate/issues/208
    

R[write to console]: Attaching SeuratObject



[1] "AD_HS_00001"
[1] "AD_HS_00003"
[1] "AD_HS_00006"


In [5]:
%%R
all_genes<-colnames(exp_mtxs[[1]])
for(i in 2:length(exp_mtxs)){
    all_genes<-union(all_genes,colnames(exp_mtxs[[i]]))
}
all_genes<-sort(unique(all_genes))
length(all_genes)
for(i in 1:length(exp_mtxs)){
    print("Processing exp_mtxs[[i]]")
    # create columns not existing in exp_mtxs[[i]]
    for(j in setdiff(all_genes,colnames(exp_mtxs[[i]]))){
        exp_mtxs[[i]][,j]<-0
    }
    exp_mtxs[[i]]<-exp_mtxs[[i]][,all_genes]
}
exp_mtx2<-do.call(rbind,exp_mtxs)
write.table(exp_mtx2,paste0(out_folder,'/cell_exp.txt'),row.names = T,quote = F,sep = '\t',col.names = T)
print('Done writing expression file')


[1] "Processing exp_mtxs[[i]]"
[1] "Processing exp_mtxs[[i]]"
[1] "Processing exp_mtxs[[i]]"
[1] "Done writing expression file"


In [6]:
import pandas as pd
dct=pd.read_csv('../data/all_datasets/cell_umap.txt',sep='\t',index_col=0)['dataset'].to_dict()
dct

  dct=pd.read_csv('../data/all_datasets/cell_umap.txt',sep='\t',index_col=0)['dataset'].to_dict()


{'D17_8753_AAAACCGAGTGATCGG': 'AD_HS_00001',
 'D17_8753_AAAATGAGTTCGTTGA': 'AD_HS_00001',
 'D17_8753_AAACATCGTATCTGCA': 'AD_HS_00001',
 'D17_8753_AAACCAATCAGCTTAG': 'AD_HS_00001',
 'D17_8753_AAAGAGAAGAGACGAA': 'AD_HS_00001',
 'D17_8753_AAAGAGAGTGCAGGTA': 'AD_HS_00001',
 'D17_8753_AAAGATGCACGGTGTC': 'AD_HS_00001',
 'D17_8753_AAAGTTGTCCTGCCAT': 'AD_HS_00001',
 'D17_8753_AAATGCCCCCAATGGT': 'AD_HS_00001',
 'D17_8753_AACAATGTCTGTACGA': 'AD_HS_00001',
 'D17_8753_AACACGATCGATAGAA': 'AD_HS_00001',
 'D17_8753_AACACGCAGCACCGCT': 'AD_HS_00001',
 'D17_8753_AACACGCCAGATCGGA': 'AD_HS_00001',
 'D17_8753_AACAGAGAGGAGTTGC': 'AD_HS_00001',
 'D17_8753_AACAGAGCATGCAATC': 'AD_HS_00001',
 'D17_8753_AACAGTGTCATCGGAT': 'AD_HS_00001',
 'D17_8753_AACATCTTCACAAACC': 'AD_HS_00001',
 'D17_8753_AACATGGTCTGCCAGG': 'AD_HS_00001',
 'D17_8753_AACCACTGTAAATACG': 'AD_HS_00001',
 'D17_8753_AACCAGGTCCGTAGTA': 'AD_HS_00001',
 'D17_8753_AACCATGCCTGTACGA': 'AD_HS_00001',
 'D17_8753_AACCATGTCTGTACGC': 'AD_HS_00001',
 'D17_8753

In [17]:
%%R
# R program to access components
# of a list of data frames

# Create dataframe
df1 = data.frame(
y1 = c(1, 2, 3),
y2 = c(4, 5, 6)
)

# Create another dataframe
df2 = data.frame(
y1 = c(7, 8, 9),
y2 = c(1, 4, 6)
)

x<-list()
x[[1]]<-df1
x[[2]]<-df2
x

[[1]]
  y1 y2
1  1  4
2  2  5
3  3  6

[[2]]
  y1 y2
1  7  1
2  8  4
3  9  6



In [None]:
# ## assign cell type according to ScType tuturial

## load sc type and database
# source(paste0(sctype_path,"/R/gene_sets_prepare.R"))
# source(paste0(sctype_path,"/R/sctype_score_.R"))
# gs_list = gene_sets_prepare(paste0(sctype_path,'/ScTypeDB_full.xlsx'), "Brain" )
# # get cell-type by cell matrix
# es.max = sctype_score(scRNAseqData = scte[["RNA"]]@scale.data, scaled = TRUE, 
#                       gs = gs_list$gs_positive, gs2 = gs_list$gs_negative) 

# # merge by cluster
# cL_resutls = do.call("rbind", lapply(unique(scte@meta.data$seurat_clusters), function(cl){
#     es.max.cl = sort(rowSums(es.max[ ,rownames(scte@meta.data[scte@meta.data$seurat_clusters==cl, ])]), decreasing = !0)
#     head(data.frame(cluster = cl, type = names(es.max.cl), scores = es.max.cl, ncells = sum(scte@meta.data$seurat_clusters==cl)), 10)
# }))
# sctype_scores = cL_resutls %>% group_by(cluster) %>% top_n(n = 1, wt = scores)  

# # set low-confident (low ScType score) clusters to "unknown"
# sctype_scores$type[as.numeric(as.character(sctype_scores$scores)) < sctype_scores$ncells/4] = "Unknown"


# scte@meta.data$customclassif = ""
# for(j in unique(sctype_scores$cluster)){
#   cl_type = sctype_scores[sctype_scores$cluster==j,]; 
#   scte@meta.data$customclassif[scte@meta.data$seurat_clusters == j] = as.character(cl_type$type[1])
# }

# ## save cell type
# cell_types<-scte[['customclassif']]
