Skip to content

Commit

Permalink
Merge 2cbdbbe into 156de78
Browse files Browse the repository at this point in the history
  • Loading branch information
Felix Van der Jeugt committed Nov 25, 2015
2 parents 156de78 + 2cbdbbe commit d4a346e
Show file tree
Hide file tree
Showing 3 changed files with 38 additions and 21 deletions.
17 changes: 14 additions & 3 deletions backend/makefile
Expand Up @@ -120,9 +120,13 @@ $(TABDIR)/sequences.tsv.gz: $(INTDIR)/sequences.tsv.gz $(INTDIR)/LCAs.tsv.gz $(I
# }}}

# Proteomes {{{ ----------------------------------------------------------------
# Build the final proteomes table from the intermediate one.
# The three-argument call to proteomes.sh passes, in order: the intermediate
# table, the output of strains_assembly_ids.sh (handed over as a file through
# process substitution), and the path of the table to produce.
# ENTREZ_URL and ENTREZ_BATCH_SIZE are forwarded to strains_assembly_ids.sh
# through its environment.
# NOTE(review): <( ... ) is a bash feature, while make runs recipes with
# /bin/sh unless SHELL is set -- confirm this makefile sets SHELL to bash.
$(TABDIR)/proteomes.tsv.gz: $(INTDIR)/proteomes.tsv.gz
$(TABDIR)/proteomes.tsv.gz: $(INTDIR)/proteomes.tsv.gz proteomes.sh strains_assembly_ids.sh
echo "Starting fetching proteome info."
./proteomes.sh $(INTDIR)/proteomes.tsv.gz $(TABDIR)/proteomes.tsv.gz
./proteomes.sh \
$(INTDIR)/proteomes.tsv.gz \
<(ENTREZ_URL=$(ENTREZ_URL) ENTREZ_BATCH_SIZE=$(ENTREZ_BATCH_SIZE) ./strains_assembly_ids.sh) \
$(TABDIR)/proteomes.tsv.gz
echo "Finished fetching proteome info."
# }}}

# Assembly tables {{{ ----------------------------------------------------------
Expand All @@ -140,7 +144,14 @@ $(INTDIR)/unstrained_assemblies.tsv.gz $(TABDIR)/assembly_sequences.tsv.gz: pars

# Mark every unstrained assembly with a type strain flag column and gzip the
# result into the target.
# - strains_assembly_ids.sh prints the type strain assembly ids; each line gets
#   an extra \x01 column appended and the list is sorted for join.
# - join matches column 2 of the unstrained assemblies against column 1 of that
#   list; -a 1 also keeps the assemblies that have no match.
# - The first sed expression appends a \x00 column to every line that does not
#   end in the \x01 marker; the second swaps the first two columns, undoing the
#   reordering join applies when it prints the join field first.
# NOTE(review): join needs both inputs sorted on the join field; only the id
# list is sorted here, so presumably unstrained_assemblies.tsv.gz is already
# ordered on its second column -- confirm.
# NOTE(review): <( ... ) is a bash feature -- confirm SHELL is set to bash.
$(TABDIR)/assemblies.tsv.gz: $(INTDIR)/unstrained_assemblies.tsv.gz strains_assembly_ids.sh
echo "Starting the straining of assemblies."
ENTREZ_URL=$(ENTREZ_URL) ENTREZ_BATCH_SIZE=$(ENTREZ_BATCH_SIZE) ./strains_assembly_ids.sh $(INTDIR)/unstrained_assemblies.tsv.gz $@
join -1 2 -2 1 -a 1 -t ' ' \
<(zcat "$(INTDIR)/unstrained_assemblies.tsv.gz") \
<(ENTREZ_URL=$(ENTREZ_URL) ENTREZ_BATCH_SIZE=$(ENTREZ_BATCH_SIZE) ./strains_assembly_ids.sh | sed "s/$$/\t\x01/" | sort) \
| sed \
-e '/\x01$$/!s/$$/\t\x00/' \
-e 's/^\([^\t]*\)\t\([^\t]*\)/\2\t\1/' \
| gzip - \
> "$@"
echo "Finished the straining of assemblies."
# }}}

Expand Down
25 changes: 21 additions & 4 deletions backend/proteomes.sh
Expand Up @@ -4,8 +4,25 @@
# - The gzipped proteomes file.
# - The file listing the type strain assembly ids, one per line.
# - The output file.
proteome_gz_file="$1"
outfile="$2"
type_strains="$2"
outfile="$3"

# Scratch file that collects the scraped proteome rows before the join.
tmp="$(mktemp)"

# Scrape the UniProt page of every proteome listed in the input: each input
# line holds an id and an accession, and proteomes.awk turns the text rendering
# of the page into one output row. All input lines are processed; nothing caps
# the number of proteomes fetched.
gzcat "$proteome_gz_file" | while read -r id accession; do
    curl -s "http://www.uniprot.org/proteomes/$accession" \
        | html2text -nobs -width 1000 \
        | awk -f proteomes.awk -v id="$id" -v accession="$accession"
done > "$tmp"

# Join column 8 of the scraped rows against the type strain ids, keeping rows
# without a match (-a 1). Ids coming from the type strain list carry a \x01
# column; rows that end up without it get a \x00 column instead. awk then
# restores the column order: the join key (printed first by join) moves back to
# the last position and the marker column takes the place of the fifth column.
# Finally the rows are sorted numerically and gzipped into the output file.
# NOTE(review): the separators given to join -t, sort -t, the second sed and
# awk's FS/OFS are presumably literal tabs (the other sed expressions use \t)
# -- confirm they were not turned into spaces.
join -1 8 -2 1 -a 1 -t ' ' \
    <(sort -t' ' -k 8 "$tmp") \
    <(sort "$type_strains" | sed 's/$/\t\x01/') \
    | sed "/\x01$/!s/$/ \x00/" \
    | awk 'BEGIN { FS = OFS = " " }{ print $2,$3,$4,$5,$9,$7,$8,$1 }' \
    | sort -n \
    | gzip \
    > "$outfile"

rm "$tmp"

gzcat $proteome_gz_file | while read id accession; do
curl -s http://www.uniprot.org/proteomes/$accession | html2text -width 1000 | awk -f proteomes.awk -v id=$id -v accession=$accession
done | gzip > $outfile
17 changes: 3 additions & 14 deletions backend/strains_assembly_ids.sh
@@ -1,10 +1,6 @@
#!/bin/bash

# Arguments:
# - The gunzipped assembly file.
# - The output file.
assembly_gz_file="$1"
outfile="$2"
# Arguments: none

# Please crash on first mistake.
set -e
Expand All @@ -31,7 +27,6 @@ web_env="$(echo "$header" \
returned="$BATCH_SIZE"
retstart='1'
while ((returned == BATCH_SIZE)); do
echo "$retstart"
returned="$(curl -d 'db=assembly' \
-d "query_key=$query_key" \
-d "WebEnv=$web_env" \
Expand All @@ -43,17 +38,11 @@ while ((returned == BATCH_SIZE)); do
| tee -a "$tempfile" \
| wc -l \
)"
echo "$returned"
retstart="$((retstart + returned))"
done

join -1 2 -2 1 -a 1 -t ' ' \
<(zcat "$assembly_gz_file") \
<(sed "s/$/\t\x01/" "$tempfile" | sort) \
| sed -e "/\x01$/!s/$/ \x00/" \
-e 's/^\([^\t]*\)\t\([^\t]*\)/\2\t\1/' \
| gzip - \
> "$outfile"
# write out the type strain assembly ids
cat "$tempfile"

# the collected ids have been printed, so the scratch file is no longer needed
rm "$tempfile"

0 comments on commit d4a346e

Please sign in to comment.