In [1]:
# Render matplotlib figures inline in the notebook, using the "retina" figure format
%matplotlib inline
%config InlineBackend.figure_format = "retina"
from IPython.display import display
# Initialise Plotly's offline notebook mode and make "plotly_white" the default template
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.io as pio
pio.templates.default = "plotly_white"
# Set logzero's log level to INFO
import logging
import logzero
logzero.loglevel(logging.INFO)

# Move to the data directory

In [1]:
# Data directory (relative path) that holds the DAZZ_DB file and the datander output
dir_fname = "work/"

In [2]:
import os
# Make the data directory the CWD; the DatrufRunner docstring below says its input
# files must be in the CWD.
# NOTE: `dir_fname` is a relative path, so re-running this cell without restarting
# the kernel resolves "work/" against the already-changed CWD.
os.chdir(dir_fname)

In [3]:
# List the CWD to confirm the inputs are present (the DAZZ_DB file and datander's .las output)
!(ls -l)

total 69320
drwxr-xr-x 2 yoshihiko_s users       38 Sep 11 17:45 datander
-rw-r--r-- 1 yoshihiko_s users      250 Sep 11 17:34 DMEL.db
-rw-r--r-- 1 yoshihiko_s users 70975912 Sep 11 17:46 TAN.DMEL.las


# Input

* The DAZZ_DB file containing all of the sequenced reads (here `DMEL.db`)
* The `.las` file output by datander (here `TAN.DMEL.las`)

In [4]:
# Both input file names are derived from one shared prefix
db_prefix = "DMEL"
db_fname = db_prefix + ".db"                # DAZZ_DB file
las_fname = "TAN." + db_prefix + ".las"     # datander output

# Output

* `tr_reads.pkl`: Reads with tandem repeat units detected by datruf

# How to run

In [5]:
from vca.datruf import DatrufRunner

## Docstring

In [7]:
DatrufRunner?
# The following docstring will be shown in your notebook

```
Init signature:
DatrufRunner(
    db_fname: str,
    las_fname: str,
    n_core: int = 1,
    n_distribute: int = 1,
    scheduler: BITS.util.scheduler.Scheduler = None,
) -> None
Docstring:     
Entry point of datruf, which detects units of TRs using the result of datander.

Positional arguments:
  - db_fname  <str> : DAZZ_DB file
  - las_fname <str> : Output of datander. These files must be in CWD

Optional arguments:
  - n_core       <int>       [1]               : Number of cores used in a single job of datruf
  - n_distribute <int>       [1]               : Number of jobs distributed in datruf
  - scheduler    <Scheduler> [None]            : Scheduler object
```

## Options

In [8]:
# n_core: number of cores used by a single datruf job
# n_distribute: number of jobs datruf is distributed over
n_core, n_distribute = 6, 10

In [9]:
from BITS.util.scheduler import Scheduler
# Optional: you do not have to use a job scheduler (the `scheduler` argument of
# DatrufRunner defaults to None).
# NOTE(review): the three arguments are presumably the scheduler type, the submit
# command and the queue name — confirm against the Scheduler docstring.
scheduler = Scheduler("sge", "qsub", "all.q")

## Execution

In [10]:
# Build the datruf entry point from the input files and options defined above
r = DatrufRunner(
    db_fname,
    las_fname,
    n_core=n_core,
    n_distribute=n_distribute,
    scheduler=scheduler,
)

In [11]:
# Start datruf. With a scheduler configured, each submitted job is logged together
# with its command line (see the output below).
r.run()

[I 190911 17:59:12 scheduler:28] Submitting a job: python -m vca.datruf.main DMEL.db TAN.DMEL.las datruf/tr_reads.pkl.01 1 10000 6
[I 190911 17:59:12 scheduler:28] Submitting a job: python -m vca.datruf.main DMEL.db TAN.DMEL.las datruf/tr_reads.pkl.02 10001 20000 6
[I 190911 17:59:12 scheduler:28] Submitting a job: python -m vca.datruf.main DMEL.db TAN.DMEL.las datruf/tr_reads.pkl.03 20001 30000 6
[I 190911 17:59:12 scheduler:28] Submitting a job: python -m vca.datruf.main DMEL.db TAN.DMEL.las datruf/tr_reads.pkl.04 30001 40000 6
[I 190911 17:59:12 scheduler:28] Submitting a job: python -m vca.datruf.main DMEL.db TAN.DMEL.las datruf/tr_reads.pkl.05 40001 50000 6
[I 190911 17:59:12 scheduler:28] Submitting a job: python -m vca.datruf.main DMEL.db TAN.DMEL.las datruf/tr_reads.pkl.06 50001 60000 6
[I 190911 17:59:12 scheduler:28] Submitting a job: python -m vca.datruf.main DMEL.db TAN.DMEL.las datruf/tr_reads.pkl.07 60001 70000 6
[I 190911 17:59:12 scheduler:28] Submitting a job: python -

After the run finishes, `tr_reads.pkl` is generated in the current directory:

In [12]:
# List the CWD again to confirm that the final output file exists
!(ls -l)

total 244512
drwxr-xr-x 2 yoshihiko_s users        38 Sep 11 17:45 datander
drwxr-xr-x 2 yoshihiko_s users      4096 Sep 11 17:59 datruf
-rw-r--r-- 1 yoshihiko_s users       250 Sep 11 17:34 DMEL.db
-rw-r--r-- 1 yoshihiko_s users  70975912 Sep 11 17:46 TAN.DMEL.las
-rw-r--r-- 1 yoshihiko_s users 179390870 Sep 11 18:00 tr_reads.pkl


The scripts that were run, the log file, and the per-job intermediate outputs are stored in the `datruf/` directory:

In [14]:
# Inspect the files left in the datruf/ subdirectory
!(ls -l datruf/)

total 176332
-rw-r--r-- 1 yoshihiko_s users      216 Sep 11 17:59 gather.sh
-rw-r--r-- 1 yoshihiko_s users   756794 Sep 11 17:59 log
-rw-r--r-- 1 yoshihiko_s users      205 Sep 11 17:59 scatter.sh.01
-rw-r--r-- 1 yoshihiko_s users      209 Sep 11 17:59 scatter.sh.02
-rw-r--r-- 1 yoshihiko_s users      209 Sep 11 17:59 scatter.sh.03
-rw-r--r-- 1 yoshihiko_s users      209 Sep 11 17:59 scatter.sh.04
-rw-r--r-- 1 yoshihiko_s users      209 Sep 11 17:59 scatter.sh.05
-rw-r--r-- 1 yoshihiko_s users      209 Sep 11 17:59 scatter.sh.06
-rw-r--r-- 1 yoshihiko_s users      209 Sep 11 17:59 scatter.sh.07
-rw-r--r-- 1 yoshihiko_s users      209 Sep 11 17:59 scatter.sh.08
-rw-r--r-- 1 yoshihiko_s users      209 Sep 11 17:59 scatter.sh.09
-rw-r--r-- 1 yoshihiko_s users      210 Sep 11 17:59 scatter.sh.10
-rw-r--r-- 1 yoshihiko_s users 18489471 Sep 11 17:59 tr_reads.pkl.01
-rw-r--r-- 1 yoshihiko_s users 17285696 Sep 11 17:59 tr_reads.pkl.02
-rw-r--r-- 1 yoshihiko_s users 18097750 Sep 

# About the output data: `tr_reads.pkl`

The pickle file `tr_reads.pkl` stores a `List[vca.types.TRRead]`:

In [15]:
from BITS.util.io import load_pickle

In [16]:
# Check the type of the top-level object stored in the output file.
# NOTE(review): `load_pickle` presumably unpickles the file — only load pickle
# files from sources you trust.
type(load_pickle("tr_reads.pkl"))

list

In [17]:
# Check the type of a single element of the list.
# NOTE(review): this calls `load_pickle` again, presumably re-reading the whole
# file; bind the list to a variable if you need it more than once.
type(load_pickle("tr_reads.pkl")[0])

vca.types.TRRead

In [22]:
# Display one element as an example; its repr begins with the full read sequence,
# so the output is long
load_pickle("tr_reads.pkl")[10]

TRRead(seq='cacaaagagagatatgtatcgactgaagtccgaaaaacaatagctaatatatggaaatagatagaaattcaattttaaattataaaaatagctaccagatttagtgaccgcaatttattattgatttagaccagatcagaatgggataacagaaatagattgtaaaaaaaaaatattttgttaaatattagtatctataatcaagaaaataataattaaatacatatcgttttctcccttttttttataattatttattaaataataaaatataaaatataaatattttaatttatttttttaattttcgtcgaacttattattattgtttttgtctcgataataagggtcttcatcgtcgctatgcgctggtttttttgtcaaggcaaatacgaacaagaggagacgaagaaattgcacttggcaagttctgtgacgtcgcgctgtgtaaattgtaatatcgtgtgatgtgaaatgggatatgtcgtattatagttatatattgcattgtataattgatgtgatagtgaaatttttggtgattatatgtagtttatatttaataattaattcaattttgtatattgtattgatgttattttagttgaatttaatttacattgtaattgtttgggtattgtatatttttattgaggcccagaaacaaaacagctttagccaggacgatagcagaattttggaagtgtgtatattatagtacaatggttggatattcagcggagcaccgtgagacgaattctggtctcgatgcaagtgctttgtttttgcacttaaaaaaagtaattcatgtggcttgggcgcatgtaattatatattttcaactttcaccttgattacgtggatggcggaaagaatagaactcccccgttggaaaaatcgaaatggcgacccacggacgaattaatttaaagccgtactattttgaattcgaaggagcgctggaggatcagagctcatgctcgaagga

In [23]:
from vca.types import TRRead

In [24]:
# Show the signature and docstring of TRRead
TRRead?

```
Init signature:
TRRead(
    seq: str,
    id: int = None,
    name: str = None,
    alignments: List[vca.types.SelfAlignment] = None,
    trs: List[vca.types.ReadInterval] = None,
    units: List[vca.types.TRUnit] = None,
    repr_units: Dict[int, str] = None,
) -> None
Docstring:      Class for a read with TRs. Multiple TRs in a read are not distinguished here.
```