In [2]:
%matplotlib inline
%config InlineBackend.figure_format = "retina"
from IPython.display import display
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.io as pio
pio.templates.default = "plotly_white"
import logging
import logzero
logzero.loglevel(logging.INFO)

# Move to the data directory

In [1]:
# Directory that holds the DAZZ_DB file. A later cell passes this to os.chdir(),
# so a relative path is resolved against the working directory at that moment.
dir_fname = "work/"

In [3]:
import os

# Remember the directory the kernel was in the first time this cell runs.
# Resolving `dir_fname` against that fixed base keeps the cell idempotent:
# a plain os.chdir(dir_fname) with a relative path would, on a second run,
# be resolved against the already-changed working directory and fail.
if "_notebook_start_dir" not in globals():
    _notebook_start_dir = os.getcwd()

# os.path.join() returns `dir_fname` unchanged when it is an absolute path.
os.chdir(os.path.join(_notebook_start_dir, dir_fname))

In [4]:
!(ls -l)

total 4
-rw-r--r-- 1 yoshihiko_s users 250 Sep 11 17:34 DMEL.db


# Input

* DAZZ_DB database file containing all of the sequenced reads (here `DMEL.db`)

In [5]:
# Prefix of the DAZZ_DB database; the DB file name is derived from it.
db_prefix = "DMEL"
db_fname = db_prefix + ".db"

# Output

* `TAN.{db_prefix}.las`: Alignment records by TANmask

# How to run

In [7]:
from vca.datander import DatanderRunner

## Docstring

In [8]:
# IPython help syntax: a trailing `?` displays the signature and docstring of the object.
DatanderRunner?
# The following text will be shown in your notebook:

```
Init signature:
DatanderRunner(
    db_prefix: str,
    read_type: str = 'CLR',
    n_core: int = 1,
    scheduler: BITS.util.scheduler.Scheduler = None,
) -> None
Docstring:     
Entry point of datander, a commandline tool for detecting tandem repeat regions from (noisy) reads.
In VCA, slightly customized datander is used (alignment will NOT be extended to the ends of a read).

Positional arguments:
  - db_prefix <str> : Prefix of the DB file created with DAZZ_DB. DB file must be in CWD

Optional arguments:
  - read_type    <str>       ["CLR"] : Input read type. Must be one of {"CLR", "CCS"}.
  - n_core       <int>       [1]     : Number of cores used in datander
  - scheduler    <Scheduler> [None]  : Scheduler object
```

## Options

In [11]:
# Input read type: "CLR" (default) or "CCS"
read_type = "CLR"
# Number of cores used by datander
n_core = 16

In [9]:
from BITS.util.scheduler import Scheduler
# NOTE(review): the three positional arguments look like the scheduler type, the
# submit command and the queue name — confirm against the Scheduler documentation.
scheduler = Scheduler("sge", "qsub", "all.q")   # Optional: DatanderRunner's `scheduler` argument defaults to None, so a job scheduler is not required

## Execution

In [12]:
# Bundle the DB prefix and the options chosen above into a runner object.
r = DatanderRunner(
    db_prefix,
    read_type=read_type,
    n_core=n_core,
    scheduler=scheduler,
)

In [13]:
# Run datander. With a scheduler configured, the log output below echoes the
# shell script that is submitted as a job.
r.run()

[I 190911 17:45:15 scheduler:28] Submitting a job: # Datander jobs (1)
    datander -T16 DMEL.1 DMEL.2 DMEL.3
    # Check all .las files jobs (1) (optional but recommended)
    LAcheck -vS DMEL TAN.DMEL.1 TAN.DMEL.2 TAN.DMEL.3
    # Merge jobs (3)
    LAmerge TAN.DMEL TAN.DMEL.1 TAN.DMEL.2 TAN.DMEL.3
    # Check merged .las file (optional but recommended)
    LAcheck -vS DMEL TAN.DMEL
    # TANmask jobs (1)
    TANmask DMEL TAN.DMEL.1 TAN.DMEL.2 TAN.DMEL.3
    # Merge all T.las files
    LAmerge TAN.DMEL.las TAN.DMEL.1.las TAN.DMEL.2.las TAN.DMEL.3.las
    # Cleanup all T.las files
    rm TAN.DMEL.1.las TAN.DMEL.2.las TAN.DMEL.3.las
    # Once all the .tan masks have been computed for every block
    #   you should call 'Catrack' to merge them, and then you should
    #   remove the individual block tracks, e.g.:
    #      Catrack -v DMEL tan
    #      rm .DMEL.*.tan.*
    Catrack -v DMEL tan
    rm .DMEL.*.tan.*


Now `TAN.DMEL.las` is generated:

In [14]:
!(ls -l)

total 69320
drwxr-xr-x 2 yoshihiko_s users       38 Sep 11 17:45 datander
-rw-r--r-- 1 yoshihiko_s users      250 Sep 11 17:34 DMEL.db
-rw-r--r-- 1 yoshihiko_s users 70975912 Sep 11 17:46 TAN.DMEL.las


The script that was run and its log file are stored in the `datander/` directory:

In [15]:
!(ls -l datander/)

total 8
-rw-r--r-- 1 yoshihiko_s users 209 Sep 11 17:46 log
-rw-r--r-- 1 yoshihiko_s users 920 Sep 11 17:45 run_datander.sh
