#### Load packages

In [3]:
library(Biobase)
library(GEOquery)
library(limma)
library(repr)
library(edgeR)
library(preprocessCore)
library(MetaIntegrator)
library(org.Hs.eg.db)
library(Jmisc)
library(biomaRt)

#### Parameter setting

In [6]:
# Directory where the normalized gene expression data is stored.
# The trailing slash is kept as-is because the value is handed unchanged to
# the processing helpers below.
norm_exp_data <- "../validation-dataset/viral-infection-dataset/"
# recursive = TRUE also creates missing parent directories; without it the
# call fails silently (warnings are suppressed) when "../validation-dataset"
# does not exist yet.
dir.create(norm_exp_data, showWarnings = FALSE, recursive = TRUE)
# Load the self-defined functions (every R file under fun/R)
sourceAll("fun/R")
# Enlarge the vroom connection buffer so getGEO can process large data
Sys.setenv("VROOM_CONNECTION_SIZE" = 524288 * 2)

Loading...
  fun/R/aliasMapping.R 
  fun/R/array_normalize.r 
  fun/R/array_process_qc_alt.R 
  fun/R/array_process_qc.R 
  fun/R/dataset_info.R 
  fun/R/diff_exp_array_analysis.R 
  fun/R/diff_exp_seq_analysis.R 
  fun/R/draw_boxplot.R 
  fun/R/ensembl2genesymbol.R 
  fun/R/geneLength.r 
  fun/R/geo_process.R 
  fun/R/idsmap.r 
  fun/R/probe2genesymbol.R 
  fun/R/seq_process_qc.R 
  fun/R/UniqueGene_probe.r 
Done


In [7]:
# Print the R version, platform, locale and attached packages of this session
sessionInfo()

R version 4.0.3 (2020-10-10)
Platform: x86_64-conda-linux-gnu (64-bit)
Running under: Amazon Linux 2

Matrix products: default
BLAS/LAPACK: /shared/software/anaconda3/envs/tb-gene-signature-update/lib/libopenblasp-r0.3.18.so

locale:
 [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
 [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
 [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
 [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
 [9] LC_ADDRESS=C               LC_TELEPHONE=C            
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       

attached base packages:
[1] stats4    parallel  stats     graphics  grDevices utils     datasets 
[8] methods   base     

other attached packages:
 [1] viridis_0.6.2         viridisLite_0.4.0     biomaRt_2.46.3       
 [4] Jmisc_0.3.1           org.Hs.eg.db_3.12.0   AnnotationDbi_1.52.0 
 [7] IRanges_2.24.1        S4Vectors_0.28.1      MetaIntegrator_2.1.3 
[10] preprocessCore_1.55.2 edgeR_3.32.1          r

In [None]:
# Read the viral infection dataset sheet and collect the GEO GSE ids
df_viral_dataset <- readxl::read_xlsx("viral infection datasets.xlsx", skip = 1)
# Drop the first data row of the sheet
# NOTE(review): presumably a sub-header row -- confirm against the spreadsheet
df_viral_dataset <- df_viral_dataset[-1, ]

# Keep only microarray datasets whose sample type is "WB" or "PBMC".
# which() is used so rows where the comparison is NA are dropped rather than
# turned into all-NA rows.
is_microarray <- df_viral_dataset$`RNA-Seq or Microarray` == "Microarray"
is_wb_or_pbmc <- df_viral_dataset$`Sample Type` %in% c("WB", "PBMC")
df_viral_dataset <- df_viral_dataset[which(is_microarray & is_wb_or_pbmc), ]
# NOTE(review): this removes the first row that survived the filter, i.e. one
# dataset is skipped. Kept to preserve the original behavior -- confirm that
# it is intentional.
df_viral_dataset <- df_viral_dataset[-1, ]

# Extract the GSE accession from every row. Each row must yield exactly one
# id, otherwise the ids would no longer line up with the rows they came from.
gse_matches <- stringr::str_extract_all(df_viral_dataset$Dataset, "GSE\\d+")
gse_ids <- as.character(unlist(gse_matches))
if (any(lengths(gse_matches) != 1L) || anyNA(gse_ids)) {
  stop(
    "Every row of the 'Dataset' column must contain exactly one GSE id",
    call. = FALSE
  )
}
df_viral_dataset[, "GSE_ID"] <- gse_ids
# One entry per distinct (GSE id, platform id) combination
gse_id_v <- unique(df_viral_dataset[, c("GSE_ID", "platform id")])

# Loop over each dataset; seq_len() makes the loop a no-op for an empty table
for (j in seq_len(nrow(gse_id_v))) {
  print(j)
  GSE_ID <- as.character(gse_id_v[["GSE_ID"]][j])
  platform_id <- as.character(gse_id_v[["platform id"]][j])
  # Remove any spaces from the platform id
  platform_id <- stringr::str_replace_all(platform_id, stringr::fixed(" "), "")
  print(GSE_ID)
  # Download microarray data from GEO (helper sourced from fun/R; the meaning
  # of the fourth argument is defined there)
  array_process_qc_alt(GSE_ID, platform_id, norm_exp_data, TRUE)
}