Skip to content

Commit

Permalink
Revise part 08 of the programmers' "workflows" tutorial.
Browse files Browse the repository at this point in the history
  • Loading branch information
riccardomurri committed Jan 26, 2017
1 parent 6ccdf93 commit 6c021fb
Show file tree
Hide file tree
Showing 5 changed files with 259 additions and 94 deletions.
40 changes: 40 additions & 0 deletions docs/programmers/tutorials/workflows/downloads/saplot.py
@@ -0,0 +1,40 @@
#! /usr/bin/env python
"""
Make a line plot of all price paths produced by `simAsset.R`,
together with their average value at any given time.

Reads the price paths from `results.csv` in the current directory
(each CSV row is plotted as one path, each column being one time step)
and saves the resulting figure to `saplot.pdf`.
"""
import csv

import matplotlib.pyplot as plt


plt.style.use('ggplot')

# Load every price path.  The `with` block guarantees the file is closed
# even if a row fails to parse.
ys = []
max_y = 0
with open('results.csv') as data:
    for row in csv.reader(data):
        if not row:
            # blank lines come back as empty rows; `max()` would fail on them
            continue
        y = [float(item) for item in row]
        max_y = max(max_y, max(y))
        ys.append(y)

if not ys:
    # nothing to plot: fail with a clear message instead of a `NameError`
    raise SystemExit("saplot.py: no price paths found in `results.csv`")

fig = plt.figure()

# plot "hairy": one faint line per price path
for y in ys:
    x = range(len(y))
    plt.plot(x, y, linestyle='solid', color='chartreuse', alpha=(1.0/8))

# Average across all paths at each time step; `zip` stops at the
# shortest path, so `avgs` never indexes past the end of any row.
N = len(ys)
avgs = [sum(t) / N for t in zip(*ys)]
# Build the x axis from `avgs` itself rather than reusing whatever `x`
# the loop above happened to leave behind.
plt.plot(range(len(avgs)), avgs,
         linestyle='solid', linewidth=2, color='darkred', alpha=1.0)

plt.ylim(0, max_y)
#plt.show()

fig.savefig("saplot.pdf")
Binary file modified docs/programmers/tutorials/workflows/part08.pdf
Binary file not shown.
197 changes: 103 additions & 94 deletions docs/programmers/tutorials/workflows/part08.tex
Expand Up @@ -18,7 +18,7 @@
\\[1ex]
University of Zurich
}
\date{November~14--17, 2016}
\date{January~23--27, 2017}


\begin{document}
Expand Down Expand Up @@ -88,66 +88,26 @@
\end{frame}


\begin{frame}[fragile]
\frametitle{Detour: BLAST, again}

Another use of the BLAST tool is to search for given ``query''
proteins in a data base. Large curated DBs are available, but one
may want to build a custom DB.

\+
Building a DB from a set of FASTA-format files \texttt{p1.faa}
\texttt{p2.faa} and \texttt{p3.faa}, and querying it is a 3-step
process:
\begin{sh}
cat p1.faa p2.faa p3.faa > db.faa
formatdb -i db.faa
blastpgp -i q.faa -d db.faa -e ...
\end{sh}

\+ The \texttt{formatdb} step produces output files
\texttt{db.faa.phr}, \texttt{db.faa.pin}, and \texttt{db.faa.psq}; all
these files are \emph{inputs} to the \texttt{blastpgp} program.
\end{frame}


\begin{frame}[fragile]
\begin{exercise*}[8.A]
Write a \texttt{blastdb.py} script to build a BLAST DB and query it.

\+
The \texttt{blastdb.py} script shall be invoked like this:
\begin{sh}
$ python blastdb.py query.faa p1.faa [p2.faa ...]
\end{sh}%$
where arguments \texttt{query.faa}, \texttt{p1.faa}, etc.\ are FASTA-format files.
\+
The script should build a BLAST DB out of the files \texttt{p$N$.faa}.
Then, it should query this database for occurrences of the
proteins in \texttt{query.faa} using \texttt{blastpgp}.
\end{exercise*}
\end{frame}
\begin{frame}
\begin{exercise*}[8.B]
Find out by running the \texttt{blastdb.py} script of Ex.~8.A:
\begin{exercise*}[8.A]

\+
Write a \texttt{priceplot.py} script that performs the following two steps:
\begin{enumerate}
\item What happens if an intermediate step fails and does not
produce complete output?
\item Run the
\href{https://github.com/uzh/gc3pie/blob/master/docs/programmers/tutorials/workflows/downloads/simAsset.R}{\texttt{simAsset.R}}
script (from Exercise~6.D) with the parameters given on the command line,
and
\item Feed the \texttt{results.csv} file it outputs into the
\href{https://github.com/uzh/gc3pie/blob/master/docs/programmers/tutorials/workflows/downloads/saplot.py}{\texttt{saplot.py}}
script and retrieve the produced \texttt{saplot.pdf} file.
\end{enumerate}

\+
\item After the whole sequence turns to TERMINATED state, what is
the value of its signal and exitcode?
\+
Run it like \texttt{simAsset.R}, for example:
\begin{semiverbatim}
\$ python priceplot.py 50 0.04 0.1 0.27 10 40
\end{semiverbatim}

\+
\item How could you implement a ``cleanup'' feature that removes
intermediate results (e.g., the ``\texttt{.phr}'' files) and
only keeps the output from \texttt{blastpgp} \textbf{if the
whole sequence was successfully executed?}
\end{enumerate}
\end{exercise*}
\end{frame}

Expand All @@ -169,36 +129,30 @@
\begin{frame}[fragile]
\begin{columns}[t]
\begin{column}{0.6\textwidth}
\begin{lstlisting}
\begin{lstlisting}[basicstyle=\footnotesize\ttfamily]
class Pipeline~\HL{(StagedTaskCollection)}~:
def __init__(self, image):
StagedTaskCollection.__init__(self)
super(Pipeline, self).__init__()
self.source = image

def stage0(self):
# run 1st step
return Application(...)
# ...

def stage1(self):
if self.tasks[0].execution.exitcode != 0:
# set collection signal and exit code,
# and state to TERMINATED
return (0, 1)
else:
# run 2nd step
return Application(...)
# ...

# ...
def stage~$N$~(self):
def stage~{\bfseries\itshape X}~(self):
# ...
\end{lstlisting}
\end{column}
\begin{column}{0.4\textwidth}
\raggedleft

\+\+
Example of~a \texttt{StagedTaskCollection}
subclass.
\+\+\small
Example: \\
subclassing a
\texttt{StagedTaskCollection}
\end{column}
\end{columns}
\end{frame}
Expand All @@ -207,10 +161,10 @@
\begin{frame}[fragile]
\begin{columns}[c]
\begin{column}{0.6\textwidth}
\begin{lstlisting}
\begin{lstlisting}[basicstyle=\footnotesize\ttfamily]
class Pipeline(StagedTaskCollection):
def __init__(self, image):
StagedTaskCollection.__init__(self)
super(Pipeline, self).__init__()
self.source = image

def ~\HL{stage0(self)}~:
Expand All @@ -220,7 +174,7 @@
# ...

# ...
def ~\HL{stage$\mathbf N$(self)}~:
def ~\HL{stage{\bfseries\itshape X}(self)}~:
# ...
\end{lstlisting}
\end{column}
Expand All @@ -239,7 +193,7 @@
\begin{frame}[fragile]
\begin{columns}[c]
\begin{column}{0.6\textwidth}
\begin{lstlisting}
\begin{lstlisting}[basicstyle=\footnotesize\ttfamily]
class Pipeline(StagedTaskCollection):
# ...

Expand All @@ -258,8 +212,8 @@
\begin{column}{0.4\textwidth}
\raggedleft

Each \texttt{stage$N$} method can return a \texttt{Task}
instance, that will run as step $N$ in the sequence.
Each \texttt{stage{\itshape X}} method can return a \texttt{Task}
instance, that will run as the $X$-th step in the sequence.
\end{column}
\end{columns}
\end{frame}
Expand All @@ -268,21 +222,20 @@
\begin{frame}[fragile]
\begin{columns}[c]
\begin{column}{0.6\textwidth}
\begin{lstlisting}
\begin{lstlisting}[basicstyle=\footnotesize\ttfamily]
class Pipeline(StagedTaskCollection):
# ...

def stage1(self):
~\HL{if self.tasks[0].execution.exitcode != 0:}~
# set collection signal and exit code,
# and state to TERMINATED
~\HL{\textbf{if} \textbf{\color{gray} self}.tasks[0].execution.exitcode != 0:}~
# bail out
return (0, 1)
else:
# run 2nd step
return Application(...)

# ...
def stage~$N$~(self):
def stage~{\bfseries\itshape X}~(self):
# ...
\end{lstlisting}
\end{column}
Expand All @@ -301,21 +254,20 @@
\begin{frame}[fragile]
\begin{columns}[c]
\begin{column}{0.6\textwidth}
\begin{lstlisting}
\begin{lstlisting}[basicstyle=\footnotesize\ttfamily]
class Pipeline(StagedTaskCollection):
# ...

def stage1(self):
if self.tasks[0].execution.exitcode != 0:
# set collection signal and exit code,
# and state to TERMINATED
if ...:
# bail out
~\HL{return (0, 1)}~
else:
# run 2nd step
return Application(...)

# ...
def stage~$N$~(self):
def stage~{\bfseries\itshape X}~(self):
# ...
\end{lstlisting}
\end{column}
Expand All @@ -326,22 +278,79 @@
To abort the sequence, return an integer (termination
status) or a pair \emph{(signal, exit code)}, instead of a
\texttt{Task} instance.

\+
This sets the collection's own signal and exit code, and also sets its
state to \texttt{TERMINATED}.
\end{column}
\end{columns}
\end{frame}


\begin{frame}[fragile]
\frametitle{Detour: BLAST, again}

Another use of the BLAST tool is to search for given ``query''
proteins in a database. Large curated DBs are available, but one
may want to build a custom DB.

\+
Building a DB from a set of FASTA-format files \texttt{p1.faa},
\texttt{p2.faa}, and \texttt{p3.faa}, and querying it, is a 3-step
process:
\begin{sh}
cat p1.faa p2.faa p3.faa > db.faa
formatdb -i db.faa
blastpgp -i q.faa -d db.faa -e ...
\end{sh}

\+ The \texttt{formatdb} step produces output files
\texttt{db.faa.phr}, \texttt{db.faa.pin}, and \texttt{db.faa.psq}; all
these files are \emph{inputs} to the \texttt{blastpgp} program.
\end{frame}


\begin{frame}[fragile]
\begin{exercise*}[8.B] \emph{(Difficult)}

Write a \texttt{blastdb.py} script to build a BLAST DB and query it.

\+
The \texttt{blastdb.py} script shall be invoked like this:
\begin{sh}
$ python blastdb.py query.faa p1.faa [p2.faa ...]
\end{sh}%$
where arguments \texttt{query.faa}, \texttt{p1.faa}, etc.\ are FASTA-format files.
\+
The script should build a BLAST DB out of the files \texttt{p$N$.faa}.
Then, it should query this database for occurrences of the
proteins in \texttt{query.faa} using \texttt{blastpgp}.
\end{exercise*}
\end{frame}
\begin{frame}
\begin{exercise*}[8.C]
Rewrite the \texttt{blastdb.py} script from Ex.~8.A to use a
\texttt{StagedTaskCollection} and be sure to check that a step is
successful before proceeding to the next one.
Find out by running the \texttt{blastdb.py} script of Ex.~8.B:
\+
Upon successful completion of the pipeline, move the
\texttt{blastpgp} output into directory
\texttt{/home/ubuntu/results} and then delete all intermediate
files and directories.
\begin{enumerate}
\item What happens if an intermediate step fails and does not
produce complete output?
\+
\item After the whole sequence reaches the \texttt{TERMINATED} state, what are
the values of its signal and exit code?
\end{enumerate}
\end{exercise*}
\+
\begin{exercise*}[8.D]
Implement (in \texttt{blastdb.py}) a ``cleanup'' feature that removes
intermediate results (e.g., the ``\texttt{.phr}'' files) and only keeps the
output from \texttt{blastpgp} \emph{if the whole sequence was successfully
executed}.
\end{exercise*}
\end{frame}
Expand Down

0 comments on commit 6c021fb

Please sign in to comment.