# Download data to transcriptomic papers
## 1. Data import

In [3]:
library(ArrayExpress, warn.conflicts = FALSE)
library(parallel)
library(foreach, warn.conflicts = FALSE)
library(doParallel)
library(dplyr, warn.conflicts = FALSE)
setwd("/nfs/pgsb/projects/comparative_triticeae/phenotype/flower_development/refpapers/")

In [4]:
pmid_transcript <- readRDS("PMIDlist_transcript.rds")
pmid_trans <- as.data.frame(pmid_transcript, rownames = FALSE)
colnames(pmid_trans) <- "PubmedID"
str(pmid_trans)

'data.frame':	53 obs. of  1 variable:
 $ PubmedID: Factor w/ 53 levels "10852485","18539775",..: 51 49 47 42 43 45 24 50 46 32 ...


## 2. Create query in ArrayExpress, intersect with PMID_transcriptomic list
* **Issues:** original set of searchwords from `EasyPubmed` is not possible to load to `ArrayExpress`, as there are emtpy search-results that cause an error, thus I have been making a subset of existing searchwords:
    - `queryAE()` takes 2 arguments: `keywords` that are in dataset description and `species` that are part of the submitted organisms
    - using the 2 arguments at the same time is limiting the search
    - I used them separately:
        * `keywords = '"wheat" OR "Triticum+aestivum" OR "barley" OR "Hordeum+vulgare" OR "Triticeae" OR "Hordeinae" OR "Triticum" OR "Hordeum"'`
        * `species = '"Triticum" OR "Hordeum" OR "Secale" OR "Triticinae" OR "Aegilops"'`
    - only 6-6 PMIDs are found from the list 53 transcriptomic papers
    - as it is shown below, the whole *Triticum aestivum* database is just 190 experiments, "wheat" appears in 292 experiments

In [5]:
# original searchwords <- ("wheat", "barley", "triticeae", "hordeinae", "agropyron", "anthosachne", "australopyrum", "campeiostachys", "connorochloa", "critesion", "crithopsis", "douglasdeweya", "elymus", "eremopyrum", "festucopsis", "henrardia", "heteranthelium", "hordelymus", "hordeum", "hystrix", "kengyilia", "leymus", "pascopyrum", "peridictyon", "psathyrostachys", "pseudoroegneria", "secale", "stenostachys", "taeniatherum", "triticinae", "aegilops", "amblyopyrum", "dasypyrum", "haynaldia", "lophopyrum", "thinopyrum")
# starting from Agropyron the names have 0 result, and interrupt the command
# here keywords search gave 439 results
setwd("/nfs/pgsb/projects/comparative_triticeae/phenotype/flower_development/refsets/")
triticeae <- '"wheat" OR "Triticum+aestivum" OR "barley" OR "Hordeum+vulgare" OR "Triticeae" OR "Hordeinae" OR "Triticum" OR "Hordeum"' 
cl <- makeCluster(30)
registerDoParallel(cl)
query1 <- foreach(i=triticeae, .packages ="ArrayExpress", .combine = rbind) %dopar% {
    queryAE(keywords = i)
    }
stopCluster(cl)
glimpse(query1)

Rows: 439
Columns: 8
$ ID                <fct> E-MTAB-8971, E-MTAB-8675, E-MTAB-8469, E-MTAB-6109,…
$ Raw               <fct> no, no, no, no, no, no, no, no, no, yes, yes, yes, …
$ Processed         <fct> no, no, no, no, no, no, no, no, no, no, no, no, no,…
$ ReleaseDate       <fct> 2020-05-01, 2020-02-11, 2020-02-07, 2020-01-09, 202…
$ PubmedID          <fct> NA, NA, NA, 30748050, NA, NA, NA, NA, NA, NA, NA, N…
$ Species           <fct> Streptomyces coelicolor A3(2), Triticum aestivum, T…
$ ExperimentDesign  <fct> growth condition design, organism part comparison d…
$ ExperimentFactors <fct> "growth condition=glucose | growth condition=wheat …


In [6]:
# here species search gave 339 results
species <- '"Triticum" OR "Hordeum" OR "Secale" OR "Triticinae" OR "Aegilops"'
cl <- makeCluster(30)
registerDoParallel(cl)
query2 <- foreach(i=species, .packages ="ArrayExpress", .combine = rbind) %dopar% {
    queryAE(species = i)
    }
stopCluster(cl)
glimpse(query2)

Rows: 339
Columns: 8
$ ID                <fct> E-MTAB-8675, E-MTAB-8469, E-MTAB-5701, E-MTAB-8520,…
$ Raw               <fct> no, no, no, no, no, no, no, yes, yes, yes, yes, no,…
$ Processed         <fct> no, no, no, no, no, no, no, no, no, no, no, no, no,…
$ ReleaseDate       <fct> 2020-02-11, 2020-02-07, 2020-01-01, 2019-12-19, 201…
$ PubmedID          <fct> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
$ Species           <fct> Triticum aestivum, Triticum durum, Triticum monococ…
$ ExperimentDesign  <fct> organism part comparison design, disease state desi…
$ ExperimentFactors <fct> "organism part=pistil | organism part=stamen", "phe…


In [7]:
# intersection of both queries with pmid_transcript list
# inner join with pmid-list & keywordsAE
AEtranscript1 <- inner_join(query1, pmid_trans, by = "PubmedID")
glimpse(AEtranscript1)
head(AEtranscript1)
# only 6 datasets are in intersection & speciesAE
AEtranscript2 <- inner_join(query2, pmid_trans, by = "PubmedID")
glimpse(AEtranscript2)
# only 4 datasets are in intersection

“Column `PubmedID` joining factors with different levels, coercing to character vector”

Rows: 6
Columns: 8
$ ID                <fct> E-MTAB-4114, E-MTAB-4469, E-GEOD-37134, E-MTAB-4401…
$ Raw               <fct> no, no, no, no, no, no
$ Processed         <fct> no, no, no, no, no, no
$ ReleaseDate       <fct> 2015-03-10, 2014-03-14, 2012-04-11, 2012-04-04, 201…
$ PubmedID          <chr> "26307377", "25886815", "22508932", "22443345", "22…
$ Species           <fct> Hordeum vulgare, Triticum aestivum, Triticum aestiv…
$ ExperimentDesign  <fct> NA, NA, NA, NA, NA, NA
$ ExperimentFactors <fct> "developmental stage=early reproductive phase | dev…


Unnamed: 0_level_0,ID,Raw,Processed,ReleaseDate,PubmedID,Species,ExperimentDesign,ExperimentFactors
Unnamed: 0_level_1,<fct>,<fct>,<fct>,<fct>,<chr>,<fct>,<fct>,<fct>
1,E-MTAB-4114,no,no,2015-03-10,26307377,Hordeum vulgare,,developmental stage=early reproductive phase | developmental stage=end of vegetative phase | developmental stage=stamen primordium visible stage | developmental stage=vegetative phase | environmental stress=long day length regimen | environmental stress=short day length regimen | genotype=ppd-H1 mutant | genotype=wild type | organism part=leaf | organism part=shoot apex
2,E-MTAB-4469,no,no,2014-03-14,25886815,Triticum aestivum,,genotype=control | genotype=HTS-1 mutant | organism part=carpel | organism part=carpel-like structure | organism part=stamen
3,E-GEOD-37134,no,no,2012-04-11,22508932,Triticum aestivum,,environmental stress=cold temperature regimen | environmental stress=control
4,E-MTAB-4401,no,no,2012-04-04,22443345,Brachypodium distachyon,,organism part=anther | organism part=early inflorescence | organism part=emerging inflorescence | organism part=endosperm | organism part=leaf | organism part=pistil | organism part=plant embryo | organism part=seed 10 days after pollination | organism part=seed 5 days after pollination
5,E-MTAB-4400,no,no,2012-04-04,22443345,Sorghum bicolor,,organism part=anther | organism part=early inflorescence | organism part=emerging inflorescence | organism part=endosperm | organism part=leaf | organism part=pistil | organism part=plant embryo | organism part=seed 10 days after pollination | organism part=seed 5 days after pollination
6,E-GEOD-36867,no,no,2012-03-29,22508932,Triticum aestivum,,"developmental stage=pollen development stage, anther length 1.0 mm | developmental stage=pollen development stage, anther length 1.5 mm | developmental stage=pollen development stage, anther length 2.2 mm | developmental stage=pollen development stage, anther length 3.0 mm | environmental stress=cold temperature regimen | environmental stress=control"


“Column `PubmedID` joining factors with different levels, coercing to character vector”

Rows: 4
Columns: 8
$ ID                <fct> E-MTAB-4114, E-MTAB-4469, E-GEOD-37134, E-GEOD-36867
$ Raw               <fct> no, no, no, no
$ Processed         <fct> no, no, no, no
$ ReleaseDate       <fct> 2015-03-10, 2014-03-14, 2012-04-11, 2012-03-29
$ PubmedID          <chr> "26307377", "25886815", "22508932", "22508932"
$ Species           <fct> Hordeum vulgare, Triticum aestivum, Triticum aestiv…
$ ExperimentDesign  <fct> NA, NA, NA, NA
$ ExperimentFactors <fct> "developmental stage=early reproductive phase | dev…


In [8]:
wheat1 <- queryAE(keywords ="wheat")
glimpse(wheat1)
wheat2 <- queryAE(species = "Triticum+aestivum")
glimpse(wheat2)

Rows: 292
Columns: 8
$ ID                <fct> E-MTAB-8971, E-MTAB-8675, E-MTAB-8469, E-MTAB-5701,…
$ Raw               <fct> no, no, no, no, no, no, no, no, yes, no, no, no, no…
$ Processed         <fct> no, no, no, no, no, no, no, no, no, no, no, no, no,…
$ ReleaseDate       <fct> 2020-05-01, 2020-02-11, 2020-02-07, 2020-01-01, 201…
$ PubmedID          <fct> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, 311…
$ Species           <fct> Streptomyces coelicolor A3(2), Triticum aestivum, T…
$ ExperimentDesign  <fct> growth condition design, organism part comparison d…
$ ExperimentFactors <fct> "growth condition=glucose | growth condition=wheat …
Rows: 190
Columns: 8
$ ID                <fct> E-MTAB-8675, E-MTAB-8520, E-MTAB-6140, E-MTAB-8397,…
$ Raw               <fct> no, no, no, no, no, yes, no, no, no, no, yes, yes, …
$ Processed         <fct> no, no, no, no, no, no, no, no, no, no, no, no, no,…
$ ReleaseDate       <fct> 2020-02-11, 2019-12-19, 2019-12-16, 2019-11-01, 201…
$ PubmedID

## 3. Direct search with 53 PMIDs

In [9]:
# according to AEwebsite, search can be narrowed into specific field like "pmid:"
str(pmid_transcript)
pmid <- rep("pmid:", each = 53)
str(pmid)
trans <- paste(pmid, pmid_transcript, sep = "", collapse = NULL)
str(trans)

 chr [1:53] "32066846" "32019527" "31910796" "31165748" "31185903" ...
 chr [1:53] "pmid:" "pmid:" "pmid:" "pmid:" "pmid:" "pmid:" "pmid:" "pmid:" ...
 chr [1:53] "pmid:32066846" "pmid:32019527" "pmid:31910796" "pmid:31165748" ...


In [10]:
cl <- makeCluster(30)
registerDoParallel(cl)
query3 <- tryCatch(
    {foreach(i=trans, 
             .packages ="ArrayExpress", 
             .combine = rbind) %dopar% queryAE(keywords = i)}, 
    error = function(e) {message('Caught an error!')
            print(e)},
    finally = {stopCluster(cl)})
str(query3)

Caught an error!


<simpleError in queryAE(keywords = i): task 1 failed - "subscript out of bounds">
List of 2
 $ message: chr "task 1 failed - \"subscript out of bounds\""
 $ call   : language queryAE(keywords = i)
 - attr(*, "class")= chr [1:3] "simpleError" "error" "condition"


In [11]:
# some PMIDs work for this way of search, some doesnt...very annoying but couldnt find out the reason
# worked example from AE website: pmid:16553887
# doesnt work, although it is from the intersected list: pmid:22508932 - but it gives results on the website!
trans <- "pmid:22508932"
cl <- makeCluster(3)
registerDoParallel(cl)
query4 <- tryCatch(
    {foreach(i=trans, 
             .packages ="ArrayExpress", 
             .combine = rbind) %dopar% queryAE(keywords = i)}, 
    error = function(e) {message('Caught an error!')
            print(e)},
    finally = {stopCluster(cl)})
str(query4)

Caught an error!


<simpleError in queryAE(keywords = i): task 1 failed - "missing value where TRUE/FALSE needed">
List of 2
 $ message: chr "task 1 failed - \"missing value where TRUE/FALSE needed\""
 $ call   : language queryAE(keywords = i)
 - attr(*, "class")= chr [1:3] "simpleError" "error" "condition"


In [12]:
sessionInfo()

R version 3.6.3 (2020-02-29)
Platform: x86_64-conda_cos6-linux-gnu (64-bit)
Running under: CentOS Linux 7 (Core)

Matrix products: default
BLAS/LAPACK: /home/vanda.marosi/anaconda3/envs/r/lib/libopenblasp-r0.3.9.so

locale:
 [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
 [3] LC_TIME=de_DE.UTF-8        LC_COLLATE=en_US.UTF-8    
 [5] LC_MONETARY=de_DE.UTF-8    LC_MESSAGES=en_US.UTF-8   
 [7] LC_PAPER=de_DE.UTF-8       LC_NAME=C                 
 [9] LC_ADDRESS=C               LC_TELEPHONE=C            
[11] LC_MEASUREMENT=de_DE.UTF-8 LC_IDENTIFICATION=C       

attached base packages:
[1] parallel  stats     graphics  grDevices utils     datasets  methods  
[8] base     

other attached packages:
[1] dplyr_0.8.5         doParallel_1.0.14   iterators_1.0.12   
[4] foreach_1.5.0       ArrayExpress_1.46.0 Biobase_2.46.0     
[7] BiocGenerics_0.32.0

loaded via a namespace (and not attached):
 [1] Rcpp_1.0.4.6                lattice_0.20-41            
 [3] Biostrings_2.54.0    