In [1]:
%matplotlib inline
%config InlineBackend.figure_format = "retina"
from IPython.display import display, Image
import plotly.offline as py
import plotly.graph_objs as go
# Enable plotly's offline rendering inside the notebook.
# NOTE(review): connected=True is documented to load plotly.js from the CDN rather
# than embedding it in the notebook, so viewing the plots needs network access — confirm.
py.init_notebook_mode(connected=True)

In [2]:
import os

import numpy as np
import pandas as pd
from BITS.utils import load_pickle, save_pickle

In [3]:
# Root directory of the dataset; the database, .las, output and repr_units paths
# used below are all built relative to it.
dir_name = "develop/dmel_all"

In [4]:
# Input/output locations for this dataset, all relative to `dir_name`.
db_file = f"{dir_name}/DMEL.db"
las_file = f"{dir_name}/TAN.DMEL.las"
out_dir = f"{dir_name}/tmp"

# Gepard installation directory. It is machine-specific, so it can be overridden
# with the GEPARD_ROOT environment variable instead of editing this cell. When the
# variable is unset or empty, the "falcon" location is used, as before.
# Other known locations:
#   hx:    /work2/yoshihiko_s/software/gepard/
#   local: /Users/yoshihikosuzuki/software/gepard/
gepard_root = os.environ.get("GEPARD_ROOT") or "/home/ysuzuki/work/software/gepard/"   # falcon
# os.path.join accepts the root with or without a trailing slash.
gepard_jar = os.path.join(gepard_root, "dist/Gepard-1.40.jar")
gepard_mat = os.path.join(gepard_root, "resources/matrices/edna.mat")
# Common prefix of the Gepard command line (class path, entry point, matrix).
# NOTE(review): presumably further arguments are appended where it is used — confirm.
gepard_command = f"java -cp {gepard_jar} org.gepard.client.cmdline.CommandLine -matrix {gepard_mat}"

In [5]:
# Load the tab-separated table of representative units. Its first two columns form
# the (multi-level) index, which is unpacked later as (peak_id, repr_id).
repr_units = pd.read_csv(
    f"{dir_name}/repr_units",
    sep="\t",
    index_col=[0, 1],
)

## Unit classification by variants

The units are not actually clustered here. Instead, these plots are used to check whether the variant sites defined on each representative unit have enough discriminative power to separate the units belonging to that representative class into sub-classes (i.e., enough power to find unique overlaps).

In [6]:
from dacmaster.clustering import ClusteringVarMat

In [7]:
import random

In [12]:
# For every representative class, load its variant matrix, cap the number of units
# at `max_N` by random sub-sampling, then compute the distance matrix and draw the
# t-SNE plot.
max_N = 3000
subsample_seed = 0
# A dedicated, seeded generator makes the sub-samples identical every time this cell
# is run, without touching the state of the global `random` module.
subsample_rng = random.Random(subsample_seed)

# NOTE(review): the inputs are read from develop/dmel_all_hc, not from `dir_name`
# (develop/dmel_all) — confirm that this is intended.
# Only the (peak_id, repr_id) index labels are needed, so iterate over the index
# instead of materialising every row with iterrows().
for peak_id, repr_id in repr_units.index:
    c = ClusteringVarMat(f"develop/dmel_all_hc/consed_out/peak_{peak_id}_repr_{repr_id}.raw_units.t0.15.consed.V", None)
    c.names = np.arange(c.N)
    if c.N > max_N:   # sub-sample only <max_N> data for too large classes
        # Sample without replacement directly from the range; no list copy needed.
        sub = subsample_rng.sample(range(c.N), max_N)
        # Keep data, names and assignment aligned by applying the same selection.
        c.data = c.data[sub]
        c.names = c.names[sub]
        c.N = max_N
        c.assignment = c.assignment[sub]
    c.calc_dist_mat()
    c.plot_tsne()

In [8]:
# Number of variant sites detected for each representative class, reported as the
# line count (`wc -l`) of each class's .consed.V file.
# NOTE(review): this assumes one variant site per line of the .V file — confirm.
!(for DATA in develop/dmel_all_hc/consed_out/peak_*_repr_*.raw_units.t0.15.consed.V; do wc -l ${DATA}; done)

8 develop/dmel_all_hc/consed_out/peak_0_repr_0.raw_units.t0.15.consed.V
8 develop/dmel_all_hc/consed_out/peak_0_repr_1.raw_units.t0.15.consed.V
6 develop/dmel_all_hc/consed_out/peak_0_repr_2.raw_units.t0.15.consed.V
9 develop/dmel_all_hc/consed_out/peak_0_repr_3.raw_units.t0.15.consed.V
15 develop/dmel_all_hc/consed_out/peak_1_repr_0.raw_units.t0.15.consed.V
25 develop/dmel_all_hc/consed_out/peak_1_repr_1.raw_units.t0.15.consed.V
24 develop/dmel_all_hc/consed_out/peak_1_repr_2.raw_units.t0.15.consed.V
27 develop/dmel_all_hc/consed_out/peak_1_repr_3.raw_units.t0.15.consed.V
26 develop/dmel_all_hc/consed_out/peak_1_repr_4.raw_units.t0.15.consed.V
34 develop/dmel_all_hc/consed_out/peak_1_repr_5.raw_units.t0.15.consed.V
32 develop/dmel_all_hc/consed_out/peak_1_repr_6.raw_units.t0.15.consed.V
35 develop/dmel_all_hc/consed_out/peak_1_repr_7.raw_units.t0.15.consed.V
