# `easyPubMed` download of all *Triticeae* abstracts on 02.04.2020 (2 April 2020)

In [1]:
library(easyPubMed)
library(parallel)
library(foreach)
library(doParallel)
library(dplyr, warn.conflicts = FALSE)

Loading required package: iterators


In [2]:
# Make the reference-paper directory the working directory. The relative file
# names used when the final table is saved ("final_triticeae.tsv",
# "final_triticeae.rds") are resolved against this directory.
setwd("/nfs/pgsb/projects/comparative_triticeae/phenotype/flower_development/refpapers/")
# Echo the working directory to confirm that the change took effect.
getwd()

In [3]:
# PubMed search terms: common names and taxon names covering the Triticeae.
# Each term is wrapped in double quotes and the quoted terms are joined with
# " OR ", giving one query string of the form '"wheat" OR "bread wheat" OR ...'.
triticeae_terms <- c(
  "wheat", "bread wheat", "common wheat", "Triticum aestivum",
  "barley", "domestic barley", "Hordeum vulgare",
  "Triticeae", "Hordeinae",
  "Agropyron", "Anthosachne", "Australopyrum", "Campeiostachys",
  "Connorochloa", "Critesion", "Crithopsis", "Douglasdeweya",
  "Elymus", "Eremopyrum", "Festucopsis", "Henrardia",
  "Heteranthelium", "Hordelymus", "Hordeum", "Hystrix",
  "Kengyilia", "Leymus", "Pascopyrum", "Peridictyon",
  "Psathyrostachys", "Pseudoroegneria", "Secale", "Stenostachys",
  "Taeniatherum", "Triticinae",
  "Aegilops", "Amblyopyrum", "Dasypyrum", "Haynaldia",
  "Lophopyrum", "Thinopyrum", "Triticum"
)
triticeae <- paste0('"', triticeae_terms, '"', collapse = " OR ")

# The NCBI API key is a personal credential and must not be stored in the
# notebook. It is read from the NCBI_API_KEY environment variable (e.g. set in
# ~/.Renviron); when the variable is unset the request is sent without a key.
ncbi_api_key <- Sys.getenv("NCBI_API_KEY", unset = "")
if (!nzchar(ncbi_api_key)) {
  ncbi_api_key <- NULL
}

# Run the search; the result reports the total hit count ("Count") together
# with the first 20 PMIDs ("RetMax" = 20).
triticeae_id <- get_pubmed_ids(triticeae, api_key = ncbi_api_key)
str(triticeae_id)
# Recorded run (02.04.2020): 81 083 PMIDs found, i.e. 17 batches of up to 5000
# records each (ceiling(81083 / 5000) = 17).

List of 10
 $ Count           : chr "81083"
 $ RetMax          : chr "20"
 $ RetStart        : chr "0"
 $ QueryKey        : chr "1"
 $ WebEnv          : chr "NCID_1_101345233_130.14.18.97_9001_1585821879_2146311834_0MetA0_S_MegaStore"
 $ IdList          :List of 20
  ..$ Id: chr "32231884"
  ..$ Id: chr "32231040"
  ..$ Id: chr "32230597"
  ..$ Id: chr "32230394"
  ..$ Id: chr "32230099"
  ..$ Id: chr "32229608"
  ..$ Id: chr "32229393"
  ..$ Id: chr "32229337"
  ..$ Id: chr "32229335"
  ..$ Id: chr "32229110"
  ..$ Id: chr "32229014"
  ..$ Id: chr "32228729"
  ..$ Id: chr "32227938"
  ..$ Id: chr "32227786"
  ..$ Id: chr "32227622"
  ..$ Id: chr "32226881"
  ..$ Id: chr "32226438"
  ..$ Id: chr "32226437"
  ..$ Id: chr "32225158"
  ..$ Id: chr "32224845"
 $ QueryTranslation: chr "\"wheat\"[All Fields] OR \"bread wheat\"[All Fields] OR \"common wheat\"[All Fields] OR \"Triticum aestivum\"[A"| __truncated__
 $ TranslationSet  : list()
 $ OriginalQuery   : chr "\"wheat\"+OR+\"bread+wheat

In [4]:
# Download every record matching the `triticeae` query as XML, 5000 records
# per batch, into the reference-paper directory. Each batch is written to its
# own file named with the "pubmed_triticeae_articles" prefix, and the file
# names are returned in `trit`.

# The NCBI API key is a personal credential and must not be stored in the
# notebook. It is read from the NCBI_API_KEY environment variable (e.g. set in
# ~/.Renviron); when the variable is unset the request is sent without a key.
ncbi_api_key <- Sys.getenv("NCBI_API_KEY", unset = "")
if (!nzchar(ncbi_api_key)) {
  ncbi_api_key <- NULL
}

system.time({
  trit <- batch_pubmed_download(
    pubmed_query_string = triticeae,
    dest_dir = "/nfs/pgsb/projects/comparative_triticeae/phenotype/flower_development/refpapers",
    format = "xml",
    api_key = ncbi_api_key,
    batch_size = 5000,
    dest_file_prefix = "pubmed_triticeae_articles",
    res_cn = 1,
    encoding = "UTF-8"
  )
})
print(trit)
# Recorded run: 761.258 s elapsed, i.e. about 12.7 min for all 17 batches.

[1] "PubMed data batch 1 / 17 downloaded..."
[1] "PubMed data batch 2 / 17 downloaded..."
[1] "PubMed data batch 3 / 17 downloaded..."
[1] "PubMed data batch 4 / 17 downloaded..."
[1] "PubMed data batch 5 / 17 downloaded..."
[1] "PubMed data batch 6 / 17 downloaded..."
[1] "PubMed data batch 7 / 17 downloaded..."
[1] "PubMed data batch 8 / 17 downloaded..."
[1] "PubMed data batch 9 / 17 downloaded..."
[1] "PubMed data batch 10 / 17 downloaded..."
[1] "PubMed data batch 11 / 17 downloaded..."
[1] "PubMed data batch 12 / 17 downloaded..."
[1] "PubMed data batch 13 / 17 downloaded..."
[1] "PubMed data batch 14 / 17 downloaded..."
[1] "PubMed data batch 15 / 17 downloaded..."
[1] "PubMed data batch 16 / 17 downloaded..."
[1] "PubMed data batch 17 / 17 downloaded..."


   user  system elapsed 
104.365  70.560 761.258 

 [1] "pubmed_triticeae_articles001.txt" "pubmed_triticeae_articles002.txt"
 [3] "pubmed_triticeae_articles003.txt" "pubmed_triticeae_articles004.txt"
 [5] "pubmed_triticeae_articles005.txt" "pubmed_triticeae_articles006.txt"
 [7] "pubmed_triticeae_articles007.txt" "pubmed_triticeae_articles008.txt"
 [9] "pubmed_triticeae_articles009.txt" "pubmed_triticeae_articles010.txt"
[11] "pubmed_triticeae_articles011.txt" "pubmed_triticeae_articles012.txt"
[13] "pubmed_triticeae_articles013.txt" "pubmed_triticeae_articles014.txt"
[15] "pubmed_triticeae_articles015.txt" "pubmed_triticeae_articles016.txt"
[17] "pubmed_triticeae_articles017.txt"


## Parallel parsing of the downloaded `xml` batches into a single `data.frame` using the `foreach` package

In [5]:
# Collect the downloaded batch files. `pattern` is a regular expression, not a
# shell glob, so it is anchored to the download prefix
# ("pubmed_triticeae_articles"), a numeric batch suffix and the ".txt"
# extension. Any other file in the directory that merely contains "txt" in its
# name is therefore not picked up and handed to the XML parser.
files <- list.files(
  path = "/nfs/pgsb/projects/comparative_triticeae/phenotype/flower_development/refpapers/",
  pattern = "^pubmed_triticeae_articles[0-9]+\\.txt$",
  full.names = TRUE
)
str(files)

 chr [1:17] "/nfs/pgsb/projects/comparative_triticeae/phenotype/flower_development/refpapers//pubmed_triticeae_articles001.txt" ...


In [6]:
# Parse every downloaded batch file into a table of article records (one
# worker task per file) and row-bind the per-file tables into `inter_merged`.

# Starting time: record
t.start <- Sys.time()

# Each file is one task, so more workers than files would only sit idle.
# Start one worker per file, capped at the 34 cores used originally, and
# never fewer than one so that makeCluster() gets a valid size.
n_workers <- max(1L, min(length(files), 34L))
cl <- makeCluster(n_workers)
registerDoParallel(cl)

# `finally` guarantees the worker processes are shut down even when parsing
# one of the files raises an error.
parse_timing <- tryCatch(
  system.time({
    inter_merged <- foreach(
      x = files,
      .packages = "easyPubMed",
      .combine = rbind
    ) %dopar% table_articles_byAuth(
      pubmed_data = x,
      included_authors = "first",
      max_chars = -1,
      autofill = TRUE,
      getKeywords = FALSE,
      encoding = "UTF-8"
    )
  }),
  finally = stopCluster(cl)
)
print(parse_timing)

str(inter_merged)

# Final time: record
t.stop <- Sys.time()

# How long did it take?
print(t.stop - t.start)
# Recorded run: about 28.6 min to parse the downloaded XML into 80 826 article
# rows, out of the 81 083 PMIDs found by the search (~99.7 %).
# NOTE(review): the reason for the 257 missing records is not visible here;
# confirm whether they are dropped by the parser or absent from the download.

    user   system  elapsed 
   3.796    0.584 1719.815 

'data.frame':	80826 obs. of  14 variables:
 $ pmid     : chr  "32231884" "32231040" "32230597" "32230394" ...
 $ doi      : chr  "10.7717/peerj.8788" "10.3390/molecules25071546" "10.11646/zootaxa.4722.6.1" "10.11646/zootaxa.4742.1.7" ...
 $ title    : chr  "Proximate composition, functional properties and quantitative analysis of benzoyl peroxide and benzoic acid in "| __truncated__ "Determination of Glyphosate in Dried Wheat by <sup>1</sup>H-NMR Spectroscopy." "Review of the genus Eurygaster (Hemiptera: Heteroptera: Scutelleridae) of Russia." "Description of Pratylenchoides ojcowensis sp. nov. (Nematoda: Merlinidae) from Polish Jurassic Highland." ...
 $ abstract : chr  "Extensive milling processes have deprived wheat flour from essential nutrients. The objective of the current st"| __truncated__ "A wheat field was sprayed with a dosage of 1.1 kg a.i./ha Roundup PowerMax 10 days before harvest. The <sup>1</"| __truncated__ "Six species of the genus Eurygaster Laporte, 1833 are current

In [7]:
# Sanity check: look up a PMID taken from the 7th downloaded batch of 5000
# articles and report the row(s) of the merged table in which it occurs.
# Exact string comparison is used instead of grepl(), which does substring
# matching and could also hit a longer PMID that merely contains these digits.
# NOTE(review): the row index is expected to fall in the range covered by the
# 7th batch, assuming the rows follow the order of the batch files; confirm.
which(inter_merged$pmid == "21951803")

In [8]:
# Keep only the identifier, title, abstract, publication year and journal
# columns of the parsed table, then preview the first rows and the structure.
merged <- inter_merged[c("pmid", "title", "abstract", "year", "journal")]
head(x = merged)
str(object = merged)

pmid,title,abstract,year,journal
32231884,"Proximate composition, functional properties and quantitative analysis of benzoyl peroxide and benzoic acid in wheat flour samples: effect on wheat flour quality.","Extensive milling processes have deprived wheat flour from essential nutrients. The objective of the current study was to assess the nutritive quality of commercial wheat flour (soft flour (SF)) through analyses of proximate composition and functional properties as well as quantification of benzoyl peroxide (BPO; added as bleaching agent in the SF) by comparing the results with whole wheat flour (WF; never received any additives). The samples included commercial SF purchased from the local supplier of different flour mills (who use BPO as additive) and a control sample without additives was prepared by grinding the seeds harvested from wheat (<i>Triticum aestivum</i> L.; Inqulab 91) crop grown in the experimental field of University of Agriculture, Faisalabad, under optimized field conditions without any fertilizers and insecticides. Functional properties (including bulk density, water absorption capacity, oil absorption capacity, emulsifying activity, foaming capacity, least gelatinization concentration and gelatinization temperature) and proximate composition (including moisture content, ash contents, crude protein, gluten and starch contents) were determined and compared for all the samples. Benzoyl peroxide (BPO) and Benzoic Acid (BA) quantification was performed through High Performance Liquid Chromatography. Finally dietary intake was estimated for BPO and BA. Results showed that SF had lesser fiber, protein and ash contents, whereas, higher damaged starch, fat, gluten and bulk density. A parallel experiment under selected conditions (temperature, time and solute concentration) showed dissociation of BPO into BA soon after the exposure. Observed BA range (13.77 mg/g after 16 h) in SF and exposure level assessment (44.3 ± 1.36 mg/kg/BW) showed higher intake of BA on the consumption of SF. 
The results revealed the superiority of WF over SF in nutritive qualities as well as free of toxicants such as BA.",2020,PeerJ
32231040,Determination of Glyphosate in Dried Wheat by <sup>1</sup>H-NMR Spectroscopy.,"A wheat field was sprayed with a dosage of 1.1 kg a.i./ha Roundup PowerMax 10 days before harvest. The <sup>1</sup>H Nuclear Magnetic Resonance (NMR) spectroscopy was used for the detection and quantification of the glyphosate (GLYP) in dried wheat spikelets, leaves, and stems. The quantification was done by the integration of the CH<sub>2</sub>-P groups doublet at 3.00 ppm with good linearity. The GLYP content varied between different samples and parts of the plant. On average, the largest content of herbicide was found in leaves (20.0 mg/kg), followed by stems (6.4 mg/kg) and spikelets (6.3 mg/kg). Our study shows that the <sup>1</sup>H-NMR spectroscopy can be a rapid and reliable tool for GLYP detection and quantification in the field studies.",2020,"Molecules (Basel, Switzerland)"
32230597,Review of the genus Eurygaster (Hemiptera: Heteroptera: Scutelleridae) of Russia.,"Six species of the genus Eurygaster Laporte, 1833 are currently recorded from the territory of the Russian Federation including Eurygaster integriceps Puton, 1881, the most important pest of wheat in Russia. Determination of Eurygaster spp. to the species level remains challenging especially in the case of closely related E. integriceps, E. maura (Linnaeus, 1758), and E. testudinaria (Geoffroy, 1785). This often leads to errors in identification of these species and provides incorrect information for plant protection services. Additional features for more precise identification are discussed and a key to all species of the genus Eurygaster of Russia is provided. The key is backed up with short diagnoses of every species, original color photographs and summarized data on the distribution. Taxonomic status of E. testudinaria sinica Walker, 1867 is discussed and E. testudinaria sinica Walker, 1867 is synonymized with the nominative subspecies. Also, a key to all seven genera of Scutelleridae known from Russia is given.",2020,Zootaxa
32230394,Description of Pratylenchoides ojcowensis sp. nov. (Nematoda: Merlinidae) from Polish Jurassic Highland.,"A new species of the genus Pratylenchoides has been described. It was found in Polish Jurassic Highland, in Ojców National Park. Pratylenchoides ojcowensis sp. nov. was isolated from the soil located around tangled roots of Elymus sp. and Trifolium sp. This species is marked by a conical head in both females and males which is not separated from the body contour and has with 4-5 annuli; a relatively short stylet (20.3-21.3 µm females, 17.7-20.9 µm males) with oval knobs directed posteriorly; the dorsal pharyngeal nucleus located anterior to the cardia (the subventral pharyngeal nuclei located posterior; a pharyngeal lobe of length about two body widths (1.8-2.6); a lateral field with 6 lines in the middle part of body and sometimes with partially areolated outer bands; intestinal fasciculi present; round sperm in the spermatheca in females; a female tail with a maximum of 29 annuli, and an annulated tail terminus. The status of the new species has been verifiied by DNA sequencing and phylogenetic analysis of the 28S rDNA region. The results obtained in the study indicated that P. ojcowensis sp. nov. is most related to P. alkani, P. ritteri and P. nevadensis from which is distinguished by the shape of the female head (conoid vs rounded), shorter stylet in females (20.3-21.3 µm vs 22.0-25.0 µm, 21.0-25.0 µm, 22.0-26.0 µm) and differences in 28S rDNA sequences. In addition (as per the original descriptions Yüksel 1977, Sher 1970, Talavera Tobar 1996) it is distinguished from P. alkani by smaller number of male's head annuli (4-5 vs 7-9), from P. ritteri it is distinguished by posteriorly directed stylet knobs (vs directed laterally), from P. nevadensis it is distinguished by oval and posteriorly directed stylet knobs (vs rounded and directed laterally).",2020,Zootaxa
32230099,"Prionotropis xausi, a new species of Thrinchinae (Orthoptera: Pamphagidae) from Catalonia (northeast of the Iberian Peninsula).","A new species of the genus Prionotropis Fieber, 1853 is described from Catalonia (Northeast of the Iberian Peninsula). Prionotropis xausi n. sp. was collected in a steppic calcareous grassland with low shrubs and scattered rocks, dominated by Buxus, Arctostaphylos uva-ursi, Thymus and Poa species. P. xausi n. sp. is characterized by the pronotum in dorsal view narrow and in lateral view with the principal transverse sulcus between prozona and metazona clearly incised. Females squamipterous with the tegmina extending at most to the end of second abdominal segment. Males sub-brachypterous with the tegmina reaching the 5th abdominal tergite and the epiproct visible. Inside hind femora at the base to the middle part and inside hind tibia red. Phallic complex with the epiphallus more long than wide with short posterior edge and the penis valves long. This new species is compared to the similar species P. rhodanica Uvarov, 1923 , P. azami Uvarov, 1923 and P. hystrix (Germar, 1817) and to the species presents in Catalonia P. flexuosa (Serville, 1838) and P. ancosae Olmo-Vidal, 2017. P. xausi is the species of the genus Prionotropis with the smallest males.",2020,Zootaxa
32229608,Differential effects of day-night cues and the circadian clock on the barley transcriptome.,"The circadian clock is a complex transcriptional network that regulates gene expression in anticipation of the day-night cycle and controls agronomic traits in plants. However, in crops, how the internal clock and day-night cues affect the transcriptome remains poorly understood. We analyzed the diel and circadian leaf transcriptomes in the barley (Hordeum vulgare) cultivar Bowman and derived introgression lines harboring mutations in EARLY FLOWERING 3 (ELF3), LUX ARRHYTHMO 1 (LUX1), and EARLY MATURITY 7 (EAM7). The elf3 and lux1 mutants exhibited abolished circadian transcriptome oscillations under constant conditions, whereas eam7 maintained oscillations of ≈30% of the circadian transcriptome. However, day-night cues fully restored transcript oscillations in all three mutants and thus compensated for a disrupted oscillator in the arrhythmic barley clock mutants elf3 and lux1. Nevertheless, elf3, but not lux1, affected the phase of the diel oscillating transcriptome and thus the integration of external cues into the clock. Using dynamical modeling, we predicted a structure of the barley circadian oscillator and interactions of its individual components with day-night cues. Our findings provide a valuable resource for exploring the function and output targets of the circadian clock and for further investigations into the diel and circadian control of the barley transcriptome.",2020,Plant physiology


'data.frame':	80826 obs. of  5 variables:
 $ pmid    : chr  "32231884" "32231040" "32230597" "32230394" ...
 $ title   : chr  "Proximate composition, functional properties and quantitative analysis of benzoyl peroxide and benzoic acid in "| __truncated__ "Determination of Glyphosate in Dried Wheat by <sup>1</sup>H-NMR Spectroscopy." "Review of the genus Eurygaster (Hemiptera: Heteroptera: Scutelleridae) of Russia." "Description of Pratylenchoides ojcowensis sp. nov. (Nematoda: Merlinidae) from Polish Jurassic Highland." ...
 $ abstract: chr  "Extensive milling processes have deprived wheat flour from essential nutrients. The objective of the current st"| __truncated__ "A wheat field was sprayed with a dosage of 1.1 kg a.i./ha Roundup PowerMax 10 days before harvest. The <sup>1</"| __truncated__ "Six species of the genus Eurygaster Laporte, 1833 are currently recorded from the territory of the Russian Fede"| __truncated__ "A new species of the genus Pratylenchoides has been described. I

## Save the final table as an R object (`.rds`) and as a tab-separated file (`.tsv`)

In [9]:
# Write the final table twice, relative to the current working directory:
# as a tab-separated text file and as a serialized R object.
# Character fields are quoted (the write.table default), which keeps the line
# breaks and tabs that occur inside abstracts within a single field.
# qmethod = "double" writes an embedded double quote as "" rather than \",
# which is the form read.delim()/read.table(sep = "\t") read back correctly.
write.table(
  merged,
  file = "final_triticeae.tsv",
  sep = "\t",
  row.names = FALSE,
  qmethod = "double"
)
saveRDS(merged, file = "final_triticeae.rds")

In [1]:
# Record the R version, platform, locale and loaded packages for
# reproducibility.
# NOTE(review): the captured output lists none of the packages loaded at the
# top of the notebook (easyPubMed, foreach, doParallel, dplyr), so this cell
# was presumably run in a fresh kernel; re-run it after the analysis cells to
# capture the package versions actually used.
sessionInfo()

R version 3.6.3 (2020-02-29)
Platform: x86_64-conda_cos6-linux-gnu (64-bit)
Running under: CentOS Linux 7 (Core)

Matrix products: default
BLAS/LAPACK: /home/vanda.marosi/anaconda3/envs/r/lib/libopenblasp-r0.3.9.so

locale:
 [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
 [3] LC_TIME=de_DE.UTF-8        LC_COLLATE=en_US.UTF-8    
 [5] LC_MONETARY=de_DE.UTF-8    LC_MESSAGES=en_US.UTF-8   
 [7] LC_PAPER=de_DE.UTF-8       LC_NAME=C                 
 [9] LC_ADDRESS=C               LC_TELEPHONE=C            
[11] LC_MEASUREMENT=de_DE.UTF-8 LC_IDENTIFICATION=C       

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

loaded via a namespace (and not attached):
 [1] compiler_3.6.3  IRdisplay_0.7.0 pbdZMQ_0.3-3    tools_3.6.3    
 [5] htmltools_0.4.0 base64enc_0.1-3 crayon_1.3.4    Rcpp_1.0.4     
 [9] uuid_0.1-4      IRkernel_0.8.15 jsonlite_1.6.1  digest_0.6.25  
[13] repr_1.1.0      evaluate_0.14  