From ff68d75c20b853c8c48497dab23c3a070ecff853 Mon Sep 17 00:00:00 2001
From: Felix Van der Jeugt
Date: Wed, 25 Nov 2015 20:13:50 +0100
Subject: [PATCH 1/2] add strain types to the proteomes

---
Review notes (free text between the "---" marker and the diffstat; it is
not recorded in the commit message):
  * What this patch does: strains_assembly_ids.sh no longer takes arguments
    and only prints the type strain assembly ids on stdout; the join against
    the unstrained assemblies moves into the makefile recipe, and
    proteomes.sh takes that id list as a new second argument.
  * This copy of the series had lost its line breaks and indentation. The
    line structure was rebuilt from the hunk line counts and the diffstat;
    leading indentation and blank context lines are a reconstruction.
    TODO: confirm against the repository before applying.
  * The separator arguments of join/sort/awk and one sed expression show up
    as a single space in this copy. Presumably they were literal TAB
    characters (the data files are .tsv and the neighbouring sed expressions
    use \t). TODO: confirm.
  * Both makefile recipes now use <(...) process substitution, which is a
    bash feature. No SHELL assignment is visible in these hunks; confirm the
    makefile selects bash for its recipes.
  * The exit status of strains_assembly_ids.sh is not observed when it runs
    inside <(...): if it fails, join reads an empty or partial id list and
    the recipe can still succeed.
  * The makefile calls zcat while proteomes.sh ends up calling gzcat (see
    PATCH 2/2); confirm both commands exist on the build host.

 backend/makefile                | 16 +++++++++++++---
 backend/proteomes.sh            | 27 +++++++++++++++++++++++----
 backend/strains_assembly_ids.sh | 17 +++--------------
 3 files changed, 39 insertions(+), 21 deletions(-)

diff --git a/backend/makefile b/backend/makefile
index 0ce41e2d2..17be27e4c 100644
--- a/backend/makefile
+++ b/backend/makefile
@@ -120,8 +120,11 @@ $(TABDIR)/sequences.tsv.gz: $(INTDIR)/sequences.tsv.gz $(INTDIR)/LCAs.tsv.gz $(I
 # }}}
 
 # Proteomes {{{ ----------------------------------------------------------------
-$(TABDIR)/proteomes.tsv.gz: $(INTDIR)/proteomes.tsv.gz proteomes.sh
-	./proteomes.sh $(INTDIR)/proteomes.tsv.gz $(TABDIR)/proteomes.tsv.gz
+$(TABDIR)/proteomes.tsv.gz: $(INTDIR)/proteomes.tsv.gz proteomes.sh strains_assembly_ids.sh
+	./proteomes.sh \
+		$(INTDIR)/proteomes.tsv.gz \
+		<(ENTREZ_URL=$(ENTREZ_URL) ENTREZ_BATCH_SIZE=$(ENTREZ_BATCH_SIZE) ./strains_assembly_ids.sh) \
+		$(TABDIR)/proteomes.tsv.gz
 # }}}
 
 # Assembly tables {{{ ----------------------------------------------------------
@@ -139,7 +142,14 @@ $(INTDIR)/unstrained_assemblies.tsv.gz $(TABDIR)/assembly_sequences.tsv.gz: pars
 
 $(TABDIR)/assemblies.tsv.gz: $(INTDIR)/unstrained_assemblies.tsv.gz strains_assembly_ids.sh
 	echo "Starting the straining of assemblies."
-	ENTREZ_URL=$(ENTREZ_URL) ENTREZ_BATCH_SIZE=$(ENTREZ_BATCH_SIZE) ./strains_assembly_ids.sh $(INTDIR)/unstrained_assemblies.tsv.gz $@
+	join -1 2 -2 1 -a 1 -t ' ' \
+		<(zcat "$(INTDIR)/unstrained_assemblies.tsv.gz") \
+		<(ENTREZ_URL=$(ENTREZ_URL) ENTREZ_BATCH_SIZE=$(ENTREZ_BATCH_SIZE) ./strains_assembly_ids.sh | sed "s/$$/\t\x01/" | sort) \
+		| sed \
+			-e '/\x01$$/!s/$$/\t\x00/' \
+			-e 's/^\([^\t]*\)\t\([^\t]*\)/\2\t\1/' \
+		| gzip - \
+		> "$@"
 	echo "Finished the straining of assemblies."
 # }}}
 
diff --git a/backend/proteomes.sh b/backend/proteomes.sh
index a8061b9b1..3dab4f602 100755
--- a/backend/proteomes.sh
+++ b/backend/proteomes.sh
@@ -4,8 +4,27 @@
 # - The gzipped proteomes file.
 # - The output file.
 proteome_gz_file="$1"
-outfile="$2"
+type_strains="$2"
+outfile="$3"
+
+tmp="$(mktemp)"
+
+zcat $proteome_gz_file | head | while read id accession; do
+    curl -s http://www.uniprot.org/proteomes/$accession \
+        | html2text -nobs -width 1000 \
+        | awk -f proteomes.awk -v id=$id -v accession=$accession
+done > "$tmp"
+
+echo done curling
+
+join -1 8 -2 1 -a 1 -t ' ' \
+    <(sort -t' ' -k 8 "$tmp") \
+    <(sort "$type_strains" | sed 's/$/\t\x01/') \
+    | sed "/\x01$/!s/$/ \x00/" \
+    | awk 'BEGIN { FS = OFS = " " }{ print $2,$3,$4,$5,$9,$7,$8,$1 }' \
+    | sort -n \
+    | gzip \
+    > $outfile
+
+rm "$tmp"
 
-gzcat $proteome_gz_file | while read id accession; do
-    curl -s http://www.uniprot.org/proteomes/$accession | html2text -width 1000 | awk -f proteomes.awk -v id=$id -v accession=$accession
-done | gzip > $outfile
diff --git a/backend/strains_assembly_ids.sh b/backend/strains_assembly_ids.sh
index cda453dfc..a8d3a73e8 100755
--- a/backend/strains_assembly_ids.sh
+++ b/backend/strains_assembly_ids.sh
@@ -1,10 +1,6 @@
 #!/bin/bash
 
-# Arguments:
-# - The gunzipped assembly file.
-# - The output file.
-assembly_gz_file="$1"
-outfile="$2"
+# Arguments: none
 
 # Please crash on first mistake.
 set -e
@@ -31,7 +27,6 @@ web_env="$(echo "$header" \
 returned="$BATCH_SIZE"
 retstart='1'
 while ((returned == BATCH_SIZE)); do
-    echo "$retstart"
     returned="$(curl -d 'db=assembly' \
         -d "query_key=$query_key" \
         -d "WebEnv=$web_env" \
@@ -43,17 +38,11 @@ while ((returned == BATCH_SIZE)); do
         | tee -a "$tempfile" \
         | wc -l \
         )"
-    echo "$returned"
     retstart="$((retstart + returned))"
 done
 
-join -1 2 -2 1 -a 1 -t ' ' \
-    <(zcat "$assembly_gz_file") \
-    <(sed "s/$/\t\x01/" "$tempfile" | sort) \
-    | sed -e "/\x01$/!s/$/ \x00/" \
-    -e 's/^\([^\t]*\)\t\([^\t]*\)/\2\t\1/' \
-    | gzip - \
-    > "$outfile"
+# write out the type strain assembly ids
+cat "$tempfile"
 
 rm "$tempfile"
 
From 5e7017fd90f5924fc6114c8d9f97c67d6acf18d0 Mon Sep 17 00:00:00 2001
From: Felix Van der Jeugt
Date: Wed, 25 Nov 2015 20:19:08 +0100
Subject: [PATCH 2/2] use gzcat

---
Review notes (free text between the "---" marker and the diffstat; it is
not recorded in the commit message):
  * Changed in review: the added gzcat line below no longer pipes through
    "head". With no arguments head passes only the first 10 lines, so the
    loop downloaded at most 10 proteomes; the script as it was before this
    series (removed lines in PATCH 1/2) had no such limit. This patch
    already removes the other debugging leftover ("echo done curling").
  * The commit id in the first header line and the post-image blob id on
    the index line were generated for the original content of this patch
    and no longer describe the edited line; regenerate the series from an
    amended commit if exact ids matter.
  * Indentation and blank context lines below are reconstructed, as in
    PATCH 1/2. TODO: confirm against the repository before applying.

 backend/proteomes.sh | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/backend/proteomes.sh b/backend/proteomes.sh
index 3dab4f602..444ca2333 100755
--- a/backend/proteomes.sh
+++ b/backend/proteomes.sh
@@ -9,14 +9,12 @@ outfile="$3"
 
 tmp="$(mktemp)"
 
-zcat $proteome_gz_file | head | while read id accession; do
+gzcat $proteome_gz_file | while read id accession; do
     curl -s http://www.uniprot.org/proteomes/$accession \
         | html2text -nobs -width 1000 \
         | awk -f proteomes.awk -v id=$id -v accession=$accession
 done > "$tmp"
 
-echo done curling
-
 join -1 8 -2 1 -a 1 -t ' ' \
     <(sort -t' ' -k 8 "$tmp") \
     <(sort "$type_strains" | sed 's/$/\t\x01/') \