In [1]:
# Notebook display setup: inline matplotlib figures using the 'retina' figure format.
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
from IPython.display import display
# Plotly setup: initialise offline notebook mode and make 'plotly_white' the
# default template for subsequent figures.
# NOTE(review): connected=True presumably loads plotly.js from a CDN instead of
# embedding it in the notebook — confirm against the plotly documentation.
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.io as pio
pio.templates.default = 'plotly_white'
# Logging setup: set logzero's log level to INFO.
import logging
import logzero
logzero.loglevel(logging.INFO)

In [13]:
import numpy as np
import pandas as pd
from BITS.util.io import load_pickle
from BITS.plot.plotly import make_hist, make_scatter, make_layout, show_plot

In [3]:
import os

# Working directory, relative to the directory the notebook was launched from.
# Relative file names used after this cell are resolved against it.
dir_fname = "work"

# Record the launch directory the first time this cell runs. A plain
# os.chdir("work") is not idempotent: running the cell a second time would try
# to enter "work/work" and fail. Anchoring the target to the recorded launch
# directory makes re-running the cell a no-op.
_launch_dir = globals().setdefault("_launch_dir", os.getcwd())
os.chdir(os.path.join(_launch_dir, dir_fname))

In [4]:
# Common prefix shared by the database file name and the .las file name.
db_prefix = "DMEL_CSS"

# "<prefix>.db" and "TAN.<prefix>.las"; both names are later handed to the
# read viewer.
db_fname = db_prefix + ".db"
las_fname = "TAN." + db_prefix + ".las"

In [5]:
# Root of the Gepard installation; the jar and the substitution matrix are
# located relative to it.
gepard_root = "/work2/yoshihiko_s/software/gepard"
gepard_jar = gepard_root + "/dist/Gepard-1.40.jar"
gepard_mat = gepard_root + "/resources/matrices/edna.mat"

# Command-line prefix that runs Gepard's command-line client with the matrix
# above; the pieces are joined with single spaces.
gepard = " ".join([
    "java",
    "-cp", gepard_jar,
    "org.gepard.client.cmdline.CommandLine",
    "-matrix", gepard_mat,
])

# ReadViewer is provided by the `vca` module.
from vca import ReadViewer
# Build the viewer from the database file name, the .las file name and the
# Gepard command line.
# NOTE(review): how ReadViewer uses these three arguments is not visible in
# this notebook — confirm against vca.ReadViewer.
v = ReadViewer(db_fname, las_fname, gepard)

## Filter by read length

In [6]:
# Pickle file that holds the centromere reads (relative to the current working directory).
centromere_reads_fname = "centromere_reads.pkl"

In [7]:
# Load the centromere reads; later cells use each read's .length, .seq and .units.
# NOTE(review): load_pickle presumably unpickles the file — only load pickles
# from a trusted source, since unpickling can execute arbitrary code.
centromere_reads = load_pickle(centromere_reads_fname)

In [8]:
# Length distribution of all loaded centromere reads (bin size 200).
raw_read_lengths = [read.length for read in centromere_reads]
raw_length_hist = make_hist(raw_read_lengths, bin_size=200)
show_plot([raw_length_hist])

In [9]:
# Discard relatively short reads: keep only those whose length is strictly
# greater than the threshold (same units as read.length, presumably bases).
min_read_length = 10000
centromere_reads = [read for read in centromere_reads if read.length > min_read_length]

In [10]:
# Number of reads currently held in centromere_reads.
len(centromere_reads)

743

In [11]:
# Length distribution of the reads currently in centromere_reads (bin size 200).
filtered_read_lengths = [read.length for read in centromere_reads]
filtered_length_hist = make_hist(filtered_read_lengths, bin_size=200)
show_plot([filtered_length_hist])

In [14]:
# Rough coverage estimate: (number of reads x mean read length) divided by the
# assumed centromere size.
dmel_centromere_size = 1_600_000
retained_read_lengths = [read.length for read in centromere_reads]
total_read_length = len(centromere_reads) * np.mean(retained_read_lengths)
total_read_length / dmel_centromere_size

5.88228125

-> The average coverage is ~6x, assuming that the centromere size is 1.6 Mbp and that the centromere consists only of core-centromere sequence.

## Looking at some CCS reads

In [15]:
# Call the viewer's show() on each of the first ten reads in centromere_reads.
preview_reads = centromere_reads[:10]
for read in preview_reads:
    v.show(read=read)

## Classify the units 

In [16]:
from copy import copy
from logzero import logger
from collections import Counter, defaultdict
from dataclasses import dataclass, field
from typing import Any, Type, List, Dict
import numpy as np
import pandas as pd
from BITS.clustering.seq import ClusteringSeq
from BITS.clustering.numeric import ClusteringNumeric
from BITS.seq.align import EdlibRunner
from BITS.seq.consed import ConsedRunner
from BITS.util.io import save_pickle, load_pickle
from BITS.util.proc import NoDaemonPool

In [None]:
# Flatten the units of every read into a single list of sequences: each unit
# contributes the slice of its read's sequence from unit.start to unit.end.
all_units = [
    centromere_read.seq[unit.start:unit.end]
    for centromere_read in centromere_reads
    for unit in centromere_read.units
]

In [None]:
# Total number of unit sequences collected in all_units.
len(all_units)