# Download data to transcriptomic papers
## 1. Data import

In [55]:
library(parallel)
library(foreach, warn.conflicts = FALSE)
library(doParallel)
library(dplyr,  warn.conflicts = FALSE)
library(ArrayExpress,  warn.conflicts = FALSE)
setwd("/nfs/pgsb/projects/comparative_triticeae/phenotype/flower_development/refpapers/")

In [56]:
pmid_transcript <- readRDS("PMIDlist_transcript.rds")
pmid_transcript <- as.data.frame(pmid_transcript, rownames = FALSE)
colnames(pmid_transcript) <- "PubmedID"
str(pmid_transcript)
print(pmid_transcript)

'data.frame':	53 obs. of  1 variable:
 $ PubmedID: Factor w/ 53 levels "10852485","18539775",..: 51 49 47 42 43 45 24 50 46 32 ...
   PubmedID
1  32066846
2  32019527
3  31910796
4  31165748
5  31185903
6  31628536
7  28484474
8  32060572
9  31729646
10 29259104
11 29068370
12 28428792
13 29143606
14 25966171
15 25886815
16 19255779
17 10852485
18 31076487
19 29581960
20 22443345
21 29211012
22 29064514
23 28241738
24 27458461
25 25898130
26 23821595
27 23293955
28  7437483
29 31469444
30 31134117
31 30276910
32 31076750
33 30843087
34 29790929
35 29757397
36 28848574
37 29739311
38 28515146
39 27898828
40 28531235
41 27163605
42 26982202
43 26307377
44 25886913
45 25707852
46 25660346
47 25562483
48 25517485
49 22508932
50 18539775
51  9207838
52 31963482
53 26055625


## 2. Create query in ArrayExpress, intersect with PMID_transcriptomic list

In [52]:
#searchwords <- ("wheat", "barley", "triticeae", "hordeinae", "agropyron", "anthosachne", "australopyrum", "campeiostachys", "connorochloa", "critesion", "crithopsis", "douglasdeweya", "elymus", "eremopyrum", "festucopsis", "henrardia", "heteranthelium", "hordelymus", "hordeum", "hystrix", "kengyilia", "leymus", "pascopyrum", "peridictyon", "psathyrostachys", "pseudoroegneria", "secale", "stenostachys", "taeniatherum", "triticinae", "aegilops", "amblyopyrum", "dasypyrum", "haynaldia", "lophopyrum", "thinopyrum")
#searchwords <- '"wheat" OR "barley"'
# starting from Agropyron the names have 0 result, and interrupt the command
#OR "Agropyron" OR "Anthosachne" OR "Australopyrum" OR "Campeiostachys" OR "Connorochloa" OR "Critesion" OR "Crithopsis" OR "Douglasdeweya" OR "Elymus" OR "Eremopyrum" OR "Festucopsis" OR "Henrardia" OR "Heteranthelium" OR "Hordelymus" OR "Hordeum" OR "Hystrix" OR "Kengyilia" OR "Leymus" OR "Pascopyrum" OR "Peridictyon" OR "Psathyrostachys" OR "Pseudoroegneria" OR "Secale" OR "Stenostachys" OR "Taeniatherum" OR "Triticinae" OR "Aegilops" OR "Amblyopyrum" OR "Dasypyrum" OR "Haynaldia" OR "Lophopyrum" OR "Thinopyrum" OR "Triticum"
triticeae <- '"wheat" OR "Triticum+aestivum" OR "barley" OR "Hordeum+vulgare" OR "Triticeae" OR "Hordeinae" OR "Triticum" OR "comparative" OR "transcriptomics" OR "sequencing" OR "rna-seq+analysis" OR "differential+expression"' 
cl <- makeCluster(60)
registerDoParallel(cl)
system.time({
query1 <- foreach(i=triticeae, .packages ="ArrayExpress", .combine = rbind) %dopar% {
    queryAE(keywords = i)
    }
        })
stopCluster(cl)
str(query1)

   user  system elapsed 
  0.087   0.026 276.263 

'data.frame':	14115 obs. of  8 variables:
 $ ID               : Factor w/ 14115 levels "E-AFMX-3","E-ATMX-22",..: 13885 13837 13888 13887 13886 13884 13883 13713 13543 13397 ...
 $ Raw              : Factor w/ 2 levels "no","yes": 1 1 1 1 1 2 1 1 1 1 ...
 $ Processed        : Factor w/ 1 level "no": 1 1 1 1 1 1 1 1 1 1 ...
 $ ReleaseDate      : Factor w/ 3312 levels "2002-03-19","2002-06-27",..: 3312 3311 3310 3310 3310 3310 3310 3310 3310 3310 ...
 $ PubmedID         : Factor w/ 5210 levels "10471496","11116097",..: 5204 5199 5204 5204 5204 5204 5204 5204 5204 5204 ...
 $ Species          : Factor w/ 1520 levels "[Candida] glabrata",..: 739 739 739 739 739 739 1378 739 655 739 ...
 $ ExperimentDesign : Factor w/ 966 levels "all pairs","all pairs | binding site identification | organism part comparison",..: 877 680 189 189 189 877 832 879 903 762 ...
 $ ExperimentFactors: Factor w/ 11666 levels "-BARCODE=CTG | -BARCODE=GAT | INFECTED WITH=MCMV | INFECTED WITH=mock (control) | IP ANTIBO

In [46]:
species <- '"Triticum" OR "Hordeum" OR "Agropyron" OR "Anthosachne" OR "Critesion" OR "Crithopsis" OR "Elymus" OR "Eremopyrum" OR "Festucopsis" OR "Henrardia" OR "Heteranthelium" OR "Hordelymus" OR "Hystrix" OR "Kengyilia" OR "Leymus" OR "Pascopyrum" OR "Peridictyon" OR "Psathyrostachys" OR "Pseudoroegneria" OR "Secale" OR "Stenostachys" OR "Taeniatherum" OR "Triticinae" OR "Aegilops" OR "Amblyopyrum" OR "Dasypyrum" OR "Haynaldia" OR "Lophopyrum" OR "Thinopyrum"'
cl <- makeCluster(30)
registerDoParallel(cl)
system.time({
query2 <- foreach(i=species, .packages ="ArrayExpress", .combine = rbind) %dopar% {
    queryAE(species = i)
    }
        })
stopCluster(cl)
str(query2) 

ERROR: Error in {: task 1 failed - "XML content does not seem to be XML: 'mus" OR "Eremopyrum" OR "Festucopsis" OR "Henrardia" OR "Heteranthelium" OR "Hordelymus" OR "Hordeum" OR "Hystrix" OR "Kengyilia" OR "Leymus" OR "Pascopyrum" OR "Peridictyon" OR "Psathyrostachys" OR "Pseudoroegneria" OR "Secale" OR "Stenostachys" OR "Taeniatherum" OR "Triticinae" OR "Aegilops" OR "Amblyopyrum" OR "Dasypyrum" OR "Haynaldia" OR "Lophopyrum" OR "Thinopyrum".xml'"


Timing stopped at: 0.016 0.007 7.787


In [58]:
#inner join with pmid-list & keywordsAE
AEtranscript1 <- inner_join(query1, pmid_transcript, by = "PubmedID")
str(AEtranscript1)
#only 6 datasets are in intersection & speciesAE
AEtranscript2 <- inner_join(query2, pmid_transcript, by = "PubmedID")
str(AEtranscript2)
#only 6 datasets are in intersection

“Column `PubmedID` joining factors with different levels, coercing to character vector”

'data.frame':	8 obs. of  8 variables:
 $ ID               : Factor w/ 14115 levels "E-AFMX-3","E-ATMX-22",..: 12588 12592 12819 4247 12777 12776 12136 4193
 $ Raw              : Factor w/ 2 levels "no","yes": 1 1 1 1 1 1 1 1
 $ Processed        : Factor w/ 1 level "no": 1 1 1 1 1 1 1 1
 $ ReleaseDate      : Factor w/ 3312 levels "2002-03-19","2002-06-27",..: 2284 2172 1949 1321 1315 1315 1315 1309
 $ PubmedID         : chr  "26307377" "25517485" "25886815" "22508932" ...
 $ Species          : Factor w/ 1520 levels "[Candida] glabrata",..: 777 1056 1440 1440 238 1338 1056 1440
 $ ExperimentDesign : Factor w/ 966 levels "all pairs","all pairs | binding site identification | organism part comparison",..: 877 877 877 877 877 877 822 877
 $ ExperimentFactors: Factor w/ 11666 levels "-BARCODE=CTG | -BARCODE=GAT | INFECTED WITH=MCMV | INFECTED WITH=mock (control) | IP ANTIBODY=anti-Ago1 mAb (2A"| __truncated__,..: 5042 8934 6705 5966 8935 8935 8935 5148


“Column `PubmedID` joining factors with different levels, coercing to character vector”

'data.frame':	4 obs. of  8 variables:
 $ ID               : Factor w/ 337 levels "E-AFMX-3","E-GEOD-10328",..: 258 278 102 101
 $ Raw              : Factor w/ 2 levels "no","yes": 1 1 1 1
 $ Processed        : Factor w/ 1 level "no": 1 1 1 1
 $ ReleaseDate      : Factor w/ 249 levels "2004-09-01","2005-05-01",..: 188 153 115 113
 $ PubmedID         : chr  "26307377" "25886815" "22508932" "22508932"
 $ Species          : Factor w/ 34 levels "Aegilops speltoides | Aegilops tauschii | Secale cereale | Triticum aestivum | Triticum urartu",..: 16 21 21 21
 $ ExperimentDesign : Factor w/ 84 levels "all pairs | co-expression | development or differentiation design | time series design",..: 70 70 70 70
 $ ExperimentFactors: Factor w/ 278 levels "age/developmental stage=3-week old | age/developmental stage=booting stage | organism part=leaf | organism part"| __truncated__,..: 91 135 103 93


In [57]:
wheat1 <- quearyAE(keywords = "wheat")
str(wheat1)
wheat2 <- quearyAE(species = "Triticum+aestivum")
str(wheat2)

ERROR: Error in quearyAE(keywords = "wheat"): could not find function "quearyAE"
