In [1]:
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
# Notebook-wide display and logging configuration.
from IPython.display import display
import plotly.offline as py
# Switch plotly's offline module into notebook mode (called with connected=True).
py.init_notebook_mode(connected=True)
import plotly.io as pio
# Make 'plotly_white' the default template for plotly figures.
pio.templates.default = 'plotly_white'
import logging
import logzero
# Set logzero's log level to INFO.
logzero.loglevel(logging.INFO)

In [2]:
import os

# Directory (relative to where the kernel was started) that holds the data
# files used by the rest of the notebook.
dir_fname = 'work'

# Remember the kernel's starting directory the first time this cell runs.
# A plain relative os.chdir(dir_fname) is not idempotent: re-running the cell
# would try to enter 'work/work' and fail. Anchoring to the starting directory
# makes every run end up in the same place.
if '_notebook_root' not in globals():
    _notebook_root = os.getcwd()
os.chdir(os.path.join(_notebook_root, dir_fname))

In [3]:
from BITS.util.io import load_pickle, save_pickle

In [4]:
# Load the pickled reads and drop the relatively short ones: only reads whose
# `length` exceeds 10000 are kept.
centromere_reads_fname = "centromere_reads.pkl"
centromere_reads = [
    read
    for read in load_pickle(centromere_reads_fname)
    if read.length > 10000
]

## Per-read units

In [6]:
from copy import copy
from logzero import logger
from collections import Counter, defaultdict
from dataclasses import dataclass, field
from typing import Any, Type, List, Dict
import numpy as np
import pandas as pd
from BITS.clustering.seq import ClusteringSeq
from BITS.clustering.numeric import ClusteringNumeric
from BITS.seq.align import EdlibRunner
from BITS.seq.consed import ConsedRunner
from BITS.util.io import save_pickle, load_pickle
from BITS.util.proc import NoDaemonPool

In [19]:
# For each of the first 10 reads, compute and plot the distance matrix of the
# sequences of its units.
for read in centromere_reads[:10]:
    # Slice every unit's sequence out of the read.
    unit_seqs = [read.seq[u.start:u.end] for u in read.units]
    c = ClusteringSeq(unit_seqs, revcomp=False, cyclic=True)
    c.calc_dist_mat()
    c.plot_dist_mat(variable_scale=True)

[I 190910 15:24:22 log:17] Starting distance matrix calculation 
[I 190910 15:24:22 log:19] Finished distance matrix calculation


[I 190910 15:24:22 log:17] Starting distance matrix calculation 
[I 190910 15:24:23 log:19] Finished distance matrix calculation


[I 190910 15:24:23 log:17] Starting distance matrix calculation 
[I 190910 15:24:23 log:19] Finished distance matrix calculation


[I 190910 15:24:24 log:17] Starting distance matrix calculation 
[I 190910 15:24:24 log:19] Finished distance matrix calculation


[I 190910 15:24:24 log:17] Starting distance matrix calculation 
[I 190910 15:24:24 log:19] Finished distance matrix calculation


[I 190910 15:24:25 log:17] Starting distance matrix calculation 
[I 190910 15:24:25 log:19] Finished distance matrix calculation


[I 190910 15:24:25 log:17] Starting distance matrix calculation 
[I 190910 15:24:26 log:19] Finished distance matrix calculation


[I 190910 15:24:26 log:17] Starting distance matrix calculation 
[I 190910 15:24:26 log:19] Finished distance matrix calculation


[I 190910 15:24:27 log:17] Starting distance matrix calculation 
[I 190910 15:24:27 log:19] Finished distance matrix calculation


[I 190910 15:24:27 log:17] Starting distance matrix calculation 
[I 190910 15:24:27 log:19] Finished distance matrix calculation


## Representative units diversity

In [14]:
# For every read, cluster the sequences of its interior units, build consensus
# sequences, and store one consensus sequence per cluster in `read.repr_units`
# (keyed by cluster id).
for read in centromere_reads:
    units = read.units
    # Exclude units around boundaries: a unit is kept only if it is neither the
    # first nor the last unit of the read and it touches both neighbours
    # exactly (its start equals the previous unit's end and its end equals the
    # next unit's start).
    inner_seqs = [
        read.seq[units[i].start:units[i].end]
        for i in range(1, len(units) - 1)
        if (units[i].start == units[i - 1].end
            and units[i].end == units[i + 1].start)
    ]
    c = ClusteringSeq(inner_seqs, revcomp=False, cyclic=True)
    c.calc_dist_mat()
    c.cluster_hierarchical()
    c.generate_consensus()

    cons_seqs = c.cons_seqs
    # Cluster ids that have a consensus sequence; built once per read.
    cons_cluster_ids = set(cons_seqs["cluster_id"])
    # Only assigned clusters that also appear among the consensus sequences
    # get an entry; the first matching row supplies the sequence.
    read.repr_units = {
        cluster_id: cons_seqs.loc[cons_seqs["cluster_id"] == cluster_id,
                                  "sequence"].iloc[0]
        for cluster_id in set(c.assignment)
        if cluster_id in cons_cluster_ids
    }

[I 190910 15:04:26 log:17] Starting distance matrix calculation 
[I 190910 15:04:27 log:19] Finished distance matrix calculation
[I 190910 15:04:27 seq:152] Unsynchronized consensus sequences:
       cluster_id  cluster_size  length  \
    0           1            36     358   
    
                                                sequence  
    0  atgacccccctccttacaaaaaatgcgaaaattgatccaaaaatta...  
[I 190910 15:04:27 log:17] Starting distance matrix calculation 
[I 190910 15:04:27 log:19] Finished distance matrix calculation
[I 190910 15:04:27 seq:152] Unsynchronized consensus sequences:
       cluster_id  cluster_size  length  \
    0           1            30     359   
    
                                                sequence  
    0  gcactggtaattagctgctcaaaacagttattcttacatctatgtc...  
[I 190910 15:04:28 log:17] Starting distance matrix calculation 
[I 190910 15:04:28 log:19] Finished distance matrix calculation
[I 190910 15:04:28 seq:152] Unsynchronized consensus sequences:
   

[I 190910 15:04:39 log:17] Starting distance matrix calculation 
[I 190910 15:04:39 log:19] Finished distance matrix calculation
[I 190910 15:04:39 seq:152] Unsynchronized consensus sequences:
       cluster_id  cluster_size  length  \
    0           1            30     359   
    
                                                sequence  
    0  atatggccaaaaaatttaatttccattttttgaacacagtttgatt...  
[I 190910 15:04:39 log:17] Starting distance matrix calculation 
[I 190910 15:04:40 log:19] Finished distance matrix calculation
[I 190910 15:04:40 seq:152] Unsynchronized consensus sequences:
       cluster_id  cluster_size  length  \
    0           1            33     359   
    
                                                sequence  
    0  tttgatgaccgaaatttggaaaaacagactctgcaaaaatgtagat...  
[I 190910 15:04:40 log:17] Starting distance matrix calculation 
[I 190910 15:04:40 log:19] Finished distance matrix calculation
[I 190910 15:04:40 seq:152] Unsynchronized consensus sequences:
   

[I 190910 15:04:51 seq:152] Unsynchronized consensus sequences:
       cluster_id  cluster_size  length  \
    0           1            38     358   
    
                                                sequence  
    0  ttgatatttacaaacgaaattttcgtcataacttggctaaaaatag...  
[I 190910 15:04:51 log:17] Starting distance matrix calculation 
[I 190910 15:04:52 log:19] Finished distance matrix calculation
[I 190910 15:04:52 seq:152] Unsynchronized consensus sequences:
       cluster_id  cluster_size  length  \
    0           1            28     357   
    
                                                sequence  
    0  cactgagctcgtaataaaatttccaatcaaactgtgttcaaaaatg...  
[I 190910 15:04:52 log:17] Starting distance matrix calculation 
[I 190910 15:04:52 log:19] Finished distance matrix calculation
[I 190910 15:04:52 seq:152] Unsynchronized consensus sequences:
       cluster_id  cluster_size  length  \
    0           1            24     359   
    
                                         

[I 190910 15:05:03 log:17] Starting distance matrix calculation 
[I 190910 15:05:03 log:19] Finished distance matrix calculation
[I 190910 15:05:03 seq:152] Unsynchronized consensus sequences:
       cluster_id  cluster_size  length  \
    0           1            19     359   
    
                                                sequence  
    0  aaagtaatagcgatcgttagcactggtaattagctgctcaaaacag...  
[I 190910 15:05:03 log:17] Starting distance matrix calculation 
[I 190910 15:05:04 log:19] Finished distance matrix calculation
[I 190910 15:05:04 seq:152] Unsynchronized consensus sequences:
       cluster_id  cluster_size  length  \
    0           1            33     359   
    
                                                sequence  
    0  taatttccctaattccttcaaaaagtaatagggatcgttagcactg...  
[I 190910 15:05:04 log:17] Starting distance matrix calculation 
[I 190910 15:05:04 log:19] Finished distance matrix calculation
[I 190910 15:05:05 seq:152] Unsynchronized consensus sequences:
   

[I 190910 15:05:15 seq:152] Unsynchronized consensus sequences:
       cluster_id  cluster_size  length  \
    0           1            24     357   
    
                                                sequence  
    0  aatcatttattttgccacaacataaaaaataattgtctgaatatgg...  
[I 190910 15:05:15 log:17] Starting distance matrix calculation 
[I 190910 15:05:15 log:19] Finished distance matrix calculation
[I 190910 15:05:15 seq:152] Unsynchronized consensus sequences:
       cluster_id  cluster_size  length  \
    0           1            37     358   
    
                                                sequence  
    0  aaattttcgttataacttggctaaaaatggtcacatagatgtaaga...  
[I 190910 15:05:16 log:17] Starting distance matrix calculation 
[I 190910 15:05:16 log:19] Finished distance matrix calculation
[I 190910 15:05:16 seq:152] Unsynchronized consensus sequences:
       cluster_id  cluster_size  length  \
    0           1            32     358   
    
                                         

[I 190910 15:05:27 log:17] Starting distance matrix calculation 
[I 190910 15:05:27 log:19] Finished distance matrix calculation
[I 190910 15:05:27 seq:152] Unsynchronized consensus sequences:
       cluster_id  cluster_size  length  \
    0           1            25     359   
    
                                                sequence  
    0  aattagctgctcaaaacagttgttcttacatctatgtcaccatttt...  
[I 190910 15:05:27 log:17] Starting distance matrix calculation 
[I 190910 15:05:28 log:19] Finished distance matrix calculation
[I 190910 15:05:28 seq:152] Unsynchronized consensus sequences:
       cluster_id  cluster_size  length  \
    0           1            27     354   
    
                                                sequence  
    0  atcatttattttgccacaacatacgaaataattgtctgaatatgga...  
[I 190910 15:05:28 log:17] Starting distance matrix calculation 
[I 190910 15:05:28 log:19] Finished distance matrix calculation
[I 190910 15:05:28 seq:152] Unsynchronized consensus sequences:
   

[I 190910 15:05:39 log:17] Starting distance matrix calculation 
[I 190910 15:05:39 log:19] Finished distance matrix calculation
[I 190910 15:05:40 seq:152] Unsynchronized consensus sequences:
       cluster_id  cluster_size  length  \
    0           1            24     355   
    
                                                sequence  
    0  tttttgcagagtctgtttttccaaatttcggtcatcaaataatcat...  
[I 190910 15:05:40 log:17] Starting distance matrix calculation 
[I 190910 15:05:40 log:19] Finished distance matrix calculation
[I 190910 15:05:40 seq:152] Unsynchronized consensus sequences:
       cluster_id  cluster_size  length  \
    0           1            26     358   
    
                                                sequence  
    0  ggaaaaacagattctgccaaaatgttgatatttacaaaccaaattt...  
[I 190910 15:05:40 log:17] Starting distance matrix calculation 
[I 190910 15:05:41 log:19] Finished distance matrix calculation
[I 190910 15:05:41 seq:152] Unsynchronized consensus sequences:
   

[I 190910 15:05:51 seq:152] Unsynchronized consensus sequences:
       cluster_id  cluster_size  length  \
    0           1            30     358   
    
                                                sequence  
    0  tgccacaacataaaaaataattgtctgaatatggaatgtcatacct...  
[I 190910 15:05:51 log:17] Starting distance matrix calculation 
[I 190910 15:05:52 log:19] Finished distance matrix calculation
[I 190910 15:05:52 seq:152] Unsynchronized consensus sequences:
       cluster_id  cluster_size  length  \
    0           1            27     358   
    
                                                sequence  
    0  aaattttgatgacccccctccttacagaaaatgcgaaaattgatcc...  
[I 190910 15:05:52 log:17] Starting distance matrix calculation 
[I 190910 15:05:52 log:19] Finished distance matrix calculation
[I 190910 15:05:52 seq:152] Unsynchronized consensus sequences:
       cluster_id  cluster_size  length  \
    0           1            25     358   
    
                                         

[I 190910 15:06:03 seq:179] Synchronized consensus sequences:
       cluster_id  cluster_size  length  \
    0           1             4     358   
    1           2            28     357   
    
                                                sequence  
    0  ttttgccacaacataaaaaataattgtctgaatatggaatgtcata...  
    1  tctgcaaaaatgttgatatttacaaacgaaattttcgttataactc...  
[I 190910 15:06:03 log:17] Starting distance matrix calculation 
[I 190910 15:06:03 log:19] Finished distance matrix calculation
[I 190910 15:06:04 seq:152] Unsynchronized consensus sequences:
       cluster_id  cluster_size  length  \
    0           1            32     358   
    
                                                sequence  
    0  taaggaggggggtcatcaaaatttgcaaaatatggccaaaaaattt...  
[I 190910 15:06:04 log:17] Starting distance matrix calculation 
[I 190910 15:06:04 log:19] Finished distance matrix calculation
[I 190910 15:06:04 seq:152] Unsynchronized consensus sequences:
       cluster_id  cluster_size 

[I 190910 15:06:15 log:17] Starting distance matrix calculation 
[I 190910 15:06:15 log:19] Finished distance matrix calculation
[I 190910 15:06:16 seq:152] Unsynchronized consensus sequences:
       cluster_id  cluster_size  length  \
    0           1            29     358   
    
                                                sequence  
    0  aaaaataattgtctgaatatggaatgtcatacctcactgagctcgt...  
[I 190910 15:06:16 log:17] Starting distance matrix calculation 
[I 190910 15:06:16 log:19] Finished distance matrix calculation
[I 190910 15:06:16 seq:152] Unsynchronized consensus sequences:
       cluster_id  cluster_size  length  \
    0           1            26     359   
    
                                                sequence  
    0  atcaaataatcatttattttgccacaacattaaaaataattgtcag...  
[I 190910 15:06:16 log:17] Starting distance matrix calculation 
[I 190910 15:06:17 log:19] Finished distance matrix calculation
[I 190910 15:06:17 seq:152] Unsynchronized consensus sequences:
   

[I 190910 15:06:27 log:17] Starting distance matrix calculation 
[I 190910 15:06:28 log:19] Finished distance matrix calculation
[I 190910 15:06:28 seq:152] Unsynchronized consensus sequences:
       cluster_id  cluster_size  length  \
    0           1            26     358   
    
                                                sequence  
    0  cagctaattaccagtgctaacgatccctattactttttgaaggatt...  
[I 190910 15:06:28 log:17] Starting distance matrix calculation 
[I 190910 15:06:28 log:19] Finished distance matrix calculation
[I 190910 15:06:28 seq:152] Unsynchronized consensus sequences:
       cluster_id  cluster_size  length  \
    0           1            28     359   
    
                                                sequence  
    0  cttacaaaaaatgcgaaaattgatccaaaaattaatttccctaaat...  
[I 190910 15:06:28 log:17] Starting distance matrix calculation 
[I 190910 15:06:29 log:19] Finished distance matrix calculation
[I 190910 15:06:29 seq:152] Unsynchronized consensus sequences:
   

[I 190910 15:06:39 log:17] Starting distance matrix calculation 
[I 190910 15:06:39 log:19] Finished distance matrix calculation
[I 190910 15:06:39 seq:152] Unsynchronized consensus sequences:
       cluster_id  cluster_size  length  \
    0           1            21     358   
    
                                                sequence  
    0  ttttcgcattttttgtaaggagggggtcatcaaaatttgcaaaata...  
[I 190910 15:06:40 log:17] Starting distance matrix calculation 
[I 190910 15:06:40 log:19] Finished distance matrix calculation
[I 190910 15:06:40 seq:152] Unsynchronized consensus sequences:
       cluster_id  cluster_size  length  \
    0           1            31     358   
    
                                                sequence  
    0  agactctgcaaaaatgttgatatttacaaacgaaattttcgttata...  
[I 190910 15:06:40 log:17] Starting distance matrix calculation 
[I 190910 15:06:40 log:19] Finished distance matrix calculation
[I 190910 15:06:41 seq:152] Unsynchronized consensus sequences:
   

[I 190910 15:06:50 log:19] Finished distance matrix calculation
[I 190910 15:06:50 seq:152] Unsynchronized consensus sequences:
       cluster_id  cluster_size  length  \
    0           1            31     359   
    
                                                sequence  
    0  ctgtgttcaaaaatggaaattaaattttttggcattattttgcaaa...  
[I 190910 15:06:51 log:17] Starting distance matrix calculation 
[I 190910 15:06:51 log:19] Finished distance matrix calculation
[I 190910 15:06:51 seq:152] Unsynchronized consensus sequences:
       cluster_id  cluster_size  length  \
    0           1            24     358   
    
                                                sequence  
    0  aatttggaaaaacagactctgcaaaaatgttgatatttacaaacga...  
[I 190910 15:06:51 log:17] Starting distance matrix calculation 
[I 190910 15:06:51 log:19] Finished distance matrix calculation
[I 190910 15:06:51 seq:152] Unsynchronized consensus sequences:
       cluster_id  cluster_size  length  \
    0           1        

[I 190910 15:07:02 log:17] Starting distance matrix calculation 
[I 190910 15:07:02 log:19] Finished distance matrix calculation
[I 190910 15:07:03 seq:152] Unsynchronized consensus sequences:
       cluster_id  cluster_size  length  \
    0           1            30     358   
    
                                                sequence  
    0  tagctgcttaaaacagttattcttacatctatgtgaccatttttag...  
[I 190910 15:07:03 log:17] Starting distance matrix calculation 
[I 190910 15:07:03 log:19] Finished distance matrix calculation
[I 190910 15:07:03 seq:152] Unsynchronized consensus sequences:
       cluster_id  cluster_size  length  \
    0           1            21     359   
    
                                                sequence  
    0  gtcatcaaataatcatttattttgccacaacataaaaaataattgt...  
[I 190910 15:07:03 log:17] Starting distance matrix calculation 
[I 190910 15:07:03 log:19] Finished distance matrix calculation
[I 190910 15:07:04 seq:152] Unsynchronized consensus sequences:
   

[I 190910 15:07:15 log:17] Starting distance matrix calculation 
[I 190910 15:07:15 log:19] Finished distance matrix calculation
[I 190910 15:07:15 seq:152] Unsynchronized consensus sequences:
       cluster_id  cluster_size  length  \
    0           1            30     358   
    
                                                sequence  
    0  gtctgaatatggaatgtcatacctcactgagctcgtaataaaattt...  
[I 190910 15:07:15 log:17] Starting distance matrix calculation 
[I 190910 15:07:15 log:19] Finished distance matrix calculation
[I 190910 15:07:15 seq:152] Unsynchronized consensus sequences:
       cluster_id  cluster_size  length  \
    0           1            32     358   
    
                                                sequence  
    0  atctatgtgaccatttttagccaagttataacgaaaattttgtttg...  
[I 190910 15:07:16 log:17] Starting distance matrix calculation 
[I 190910 15:07:16 log:19] Finished distance matrix calculation
[I 190910 15:07:16 seq:152] Unsynchronized consensus sequences:
   

[I 190910 15:07:27 log:17] Starting distance matrix calculation 
[I 190910 15:07:27 log:19] Finished distance matrix calculation
[I 190910 15:07:28 seq:152] Unsynchronized consensus sequences:
       cluster_id  cluster_size  length  \
    0           1            33     358   
    
                                                sequence  
    0  tttatgttgtggcaaaataaatgattatttgatgaccgaaatttgg...  
[I 190910 15:07:28 log:17] Starting distance matrix calculation 
[I 190910 15:07:28 log:19] Finished distance matrix calculation
[I 190910 15:07:28 seq:152] Unsynchronized consensus sequences:
       cluster_id  cluster_size  length  \
    0           1            29     358   
    
                                                sequence  
    0  aaaattgatccaaaaattaatttcctaaatccttcaaaaagtaata...  
[I 190910 15:07:28 log:17] Starting distance matrix calculation 
[I 190910 15:07:29 log:19] Finished distance matrix calculation
[I 190910 15:07:29 seq:152] Unsynchronized consensus sequences:
   

[I 190910 15:07:40 log:17] Starting distance matrix calculation 
[I 190910 15:07:40 log:19] Finished distance matrix calculation
[I 190910 15:07:40 seq:152] Unsynchronized consensus sequences:
       cluster_id  cluster_size  length  \
    0           1            32     359   
    
                                                sequence  
    0  tatgtgaccatttttagccaagttatgacgaaaatttcgtttgtaa...  
[I 190910 15:07:40 log:17] Starting distance matrix calculation 
[I 190910 15:07:41 log:19] Finished distance matrix calculation
[I 190910 15:07:41 seq:152] Unsynchronized consensus sequences:
       cluster_id  cluster_size  length  \
    0           1            21     357   
    
                                                sequence  
    0  aaaaaatgcgaaaattgatccaaaaattaatttcctaaatccttca...  
[I 190910 15:07:41 log:17] Starting distance matrix calculation 
[I 190910 15:07:41 log:19] Finished distance matrix calculation
[I 190910 15:07:41 seq:152] Unsynchronized consensus sequences:
   

[I 190910 15:07:51 log:17] Starting distance matrix calculation 
[I 190910 15:07:51 log:19] Finished distance matrix calculation
[I 190910 15:07:52 seq:152] Unsynchronized consensus sequences:
       cluster_id  cluster_size  length  \
    0           1            25     359   
    
                                                sequence  
    0  gaccatttttagccaagttataacgaaaatttcgtttgtaaatatc...  
[I 190910 15:07:52 log:17] Starting distance matrix calculation 
[I 190910 15:07:52 log:19] Finished distance matrix calculation
[I 190910 15:07:52 seq:152] Unsynchronized consensus sequences:
       cluster_id  cluster_size  length  \
    0           1            34     359   
    
                                                sequence  
    0  ttattttgccacaacataaaaaataattgtctgaatatggaatgtc...  
[I 190910 15:07:52 log:17] Starting distance matrix calculation 
[I 190910 15:07:53 log:19] Finished distance matrix calculation
[I 190910 15:07:53 seq:152] Unsynchronized consensus sequences:
   

[I 190910 15:08:03 log:17] Starting distance matrix calculation 
[I 190910 15:08:04 log:19] Finished distance matrix calculation
[I 190910 15:08:04 seq:152] Unsynchronized consensus sequences:
       cluster_id  cluster_size  length  \
    0           1            25     359   
    
                                                sequence  
    0  ttttgaacacagtttgattggaaattttattacgagctcagtgagg...  
[I 190910 15:08:04 log:17] Starting distance matrix calculation 
[I 190910 15:08:04 log:19] Finished distance matrix calculation
[I 190910 15:08:04 seq:152] Unsynchronized consensus sequences:
       cluster_id  cluster_size  length  \
    0           1            28     359   
    
                                                sequence  
    0  tgaatatggaatgtcatacctcactgagcttgtaataaaatttcca...  
[I 190910 15:08:04 log:17] Starting distance matrix calculation 
[I 190910 15:08:05 log:19] Finished distance matrix calculation
[I 190910 15:08:05 seq:152] Unsynchronized consensus sequences:
   

[I 190910 15:08:16 log:17] Starting distance matrix calculation 
[I 190910 15:08:16 log:19] Finished distance matrix calculation
[I 190910 15:08:16 seq:152] Unsynchronized consensus sequences:
       cluster_id  cluster_size  length  \
    0           1            21     358   
    
                                                sequence  
    0  ctcgtaataaaatttccaatcaaactgtgttcaaaaatggaaatta...  
[I 190910 15:08:16 log:17] Starting distance matrix calculation 
[I 190910 15:08:16 log:19] Finished distance matrix calculation
[I 190910 15:08:17 seq:152] Unsynchronized consensus sequences:
       cluster_id  cluster_size  length  \
    0           1            31     358   
    
                                                sequence  
    0  gatccctattactttttgaaggatttaggaaactaatttttggatc...  
[I 190910 15:08:17 log:17] Starting distance matrix calculation 
[I 190910 15:08:17 log:19] Finished distance matrix calculation
[I 190910 15:08:17 seq:152] Unsynchronized consensus sequences:
   

[I 190910 15:08:28 log:17] Starting distance matrix calculation 
[I 190910 15:08:28 log:19] Finished distance matrix calculation
[I 190910 15:08:28 seq:152] Unsynchronized consensus sequences:
       cluster_id  cluster_size  length  \
    0           1            33     359   
    
                                                sequence  
    0  acgaaaatttcgtttgtaaatatcattactttggcagaatctgttt...  
[I 190910 15:08:28 log:17] Starting distance matrix calculation 
[I 190910 15:08:29 log:19] Finished distance matrix calculation
[I 190910 15:08:29 seq:152] Unsynchronized consensus sequences:
       cluster_id  cluster_size  length  \
    0           1            32     358   
    
                                                sequence  
    0  ttccatattcagacaattattttttatgttgtggcaaaataaatga...  
[I 190910 15:08:29 log:17] Starting distance matrix calculation 
[I 190910 15:08:29 log:19] Finished distance matrix calculation
[I 190910 15:08:29 seq:152] Unsynchronized consensus sequences:
   

[I 190910 15:08:40 log:17] Starting distance matrix calculation 
[I 190910 15:08:41 log:19] Finished distance matrix calculation
[I 190910 15:08:41 seq:152] Unsynchronized consensus sequences:
       cluster_id  cluster_size  length  \
    0           1            25     358   
    
                                                sequence  
    0  ttgattggaaattttattacgagctcagtgaaatataacattccat...  
[I 190910 15:08:41 log:17] Starting distance matrix calculation 
[I 190910 15:08:41 log:19] Finished distance matrix calculation
[I 190910 15:08:41 seq:152] Unsynchronized consensus sequences:
       cluster_id  cluster_size  length  \
    0           1            36     359   
    
                                                sequence  
    0  aatgattatttgatgaccgaaatttggaaaaacagactctgcaaaa...  
[I 190910 15:08:42 log:17] Starting distance matrix calculation 
[I 190910 15:08:42 log:19] Finished distance matrix calculation
[I 190910 15:08:42 seq:152] Unsynchronized consensus sequences:
   

[I 190910 15:08:53 log:17] Starting distance matrix calculation 
[I 190910 15:08:53 log:19] Finished distance matrix calculation
[I 190910 15:08:54 seq:152] Unsynchronized consensus sequences:
       cluster_id  cluster_size  length  \
    0           1            32     358   
    
                                                sequence  
    0  ccacaacataaaaaataattgtctgaatatggaatgtcatacctca...  
[I 190910 15:08:54 log:17] Starting distance matrix calculation 
[I 190910 15:08:54 log:19] Finished distance matrix calculation
[I 190910 15:08:54 seq:152] Unsynchronized consensus sequences:
       cluster_id  cluster_size  length  \
    0           1            34     358   
    
                                                sequence  
    0  aaaatgcgaaaattgatccaaaaattagtttcctaaatccttcaaa...  
[I 190910 15:08:54 log:17] Starting distance matrix calculation 
[I 190910 15:08:55 log:19] Finished distance matrix calculation
[I 190910 15:08:55 seq:152] Unsynchronized consensus sequences:
   

[I 190910 15:09:06 log:17] Starting distance matrix calculation 
[I 190910 15:09:06 log:19] Finished distance matrix calculation
[I 190910 15:09:06 seq:152] Unsynchronized consensus sequences:
       cluster_id  cluster_size  length  \
    0           1            31     359   
    
                                                sequence  
    0  gtcatacctcactgagctcgtaataaaatttccaatcaaactgtgt...  
[I 190910 15:09:06 log:17] Starting distance matrix calculation 
[I 190910 15:09:07 log:19] Finished distance matrix calculation
[I 190910 15:09:07 seq:152] Unsynchronized consensus sequences:
       cluster_id  cluster_size  length  \
    0           1            28     359   
    
                                                sequence  
    0  gaacacagtttgattggaaattttattacgagctcagtgaggtatg...  
[I 190910 15:09:07 log:17] Starting distance matrix calculation 
[I 190910 15:09:07 log:19] Finished distance matrix calculation
[I 190910 15:09:07 seq:152] Unsynchronized consensus sequences:
   

[I 190910 15:09:18 log:17] Starting distance matrix calculation 
[I 190910 15:09:18 log:19] Finished distance matrix calculation
[I 190910 15:09:19 seq:152] Unsynchronized consensus sequences:
       cluster_id  cluster_size  length  \
    0           1            20     358   
    
                                                sequence  
    0  tgacattccatattcagacaattattttttatgttgtggcaaaata...  
[I 190910 15:09:19 log:17] Starting distance matrix calculation 
[I 190910 15:09:19 log:19] Finished distance matrix calculation
[I 190910 15:09:19 seq:152] Unsynchronized consensus sequences:
       cluster_id  cluster_size  length  \
    0           1            21     358   
    
                                                sequence  
    0  aatagggatcgttagcactggtaattagctgctcaaaacagttatt...  
[I 190910 15:09:19 log:17] Starting distance matrix calculation 
[I 190910 15:09:20 log:19] Finished distance matrix calculation
[I 190910 15:09:20 seq:152] Unsynchronized consensus sequences:
   

[I 190910 15:09:31 log:17] Starting distance matrix calculation 
[I 190910 15:09:31 log:19] Finished distance matrix calculation
[I 190910 15:09:31 seq:152] Unsynchronized consensus sequences:
       cluster_id  cluster_size  length  \
    0           1            28     359   
    
                                                sequence  
    0  atgttgtggcaaaataaatgattatttgatgaccgaaatttggaaa...  
[I 190910 15:09:31 log:17] Starting distance matrix calculation 
[I 190910 15:09:32 log:19] Finished distance matrix calculation
[I 190910 15:09:32 seq:152] Unsynchronized consensus sequences:
       cluster_id  cluster_size  length  \
    0           1            24     359   
    
                                                sequence  
    0  gttagcactggtaattagctgctcaaaacagttattcttacatcta...  
[I 190910 15:09:32 log:17] Starting distance matrix calculation 
[I 190910 15:09:32 log:19] Finished distance matrix calculation
[I 190910 15:09:32 seq:152] Unsynchronized consensus sequences:
   

[I 190910 15:09:43 log:17] Starting distance matrix calculation 
[I 190910 15:09:44 log:19] Finished distance matrix calculation
[I 190910 15:09:44 seq:152] Unsynchronized consensus sequences:
       cluster_id  cluster_size  length  \
    0           1            21     358   
    
                                                sequence  
    0  tttccatttttgaacacagtttgattggaaattttattacgagctc...  
[I 190910 15:09:44 log:17] Starting distance matrix calculation 
[I 190910 15:09:44 log:19] Finished distance matrix calculation
[I 190910 15:09:44 seq:152] Unsynchronized consensus sequences:
       cluster_id  cluster_size  length  \
    0           1            35     358   
    
                                                sequence  
    0  caaaaaatttaatttccatttttgaacacagtttgattggaaattt...  
[I 190910 15:09:45 log:17] Starting distance matrix calculation 
[I 190910 15:09:45 log:19] Finished distance matrix calculation
[I 190910 15:09:45 seq:152] Unsynchronized consensus sequences:
   

[I 190910 15:09:55 log:17] Starting distance matrix calculation 
[I 190910 15:09:56 log:19] Finished distance matrix calculation
[I 190910 15:09:56 seq:152] Unsynchronized consensus sequences:
       cluster_id  cluster_size  length  \
    0           1            34     358   
    
                                                sequence  
    0  tgcgaaaattgatccaaaaattaatttcctaaatccttcaaaaagt...  
[I 190910 15:09:56 log:17] Starting distance matrix calculation 
[I 190910 15:09:56 log:19] Finished distance matrix calculation
[I 190910 15:09:57 seq:152] Unsynchronized consensus sequences:
       cluster_id  cluster_size  length  \
    0           1            15     358   
    1           2            17     359   
    
                                                sequence  
    0  atttattttgccacaacataaaaaataattgtctgaatatggaatg...  
    1  ttcagacaattattttttatgttgtggcaaaataaatgattatttg...  
[I 190910 15:09:57 seq:179] Synchronized consensus sequences:
       cluster_id  cluster_size 

[I 190910 15:10:07 seq:152] Unsynchronized consensus sequences:
       cluster_id  cluster_size  length  \
    0           1            30     359   
    
                                                sequence  
    0  ttttggatcaattttcgcattttttgtaaggaggggggtcatcaaa...  
[I 190910 15:10:08 log:17] Starting distance matrix calculation 
[I 190910 15:10:08 log:19] Finished distance matrix calculation
[I 190910 15:10:08 seq:152] Unsynchronized consensus sequences:
       cluster_id  cluster_size  length  \
    0           1            17     356   
    
                                                sequence  
    0  ataattgtctgaatatggaatgtcatacctcactgagctcgtaata...  
[I 190910 15:10:08 log:17] Starting distance matrix calculation 
[I 190910 15:10:08 log:19] Finished distance matrix calculation
[I 190910 15:10:09 seq:152] Unsynchronized consensus sequences:
       cluster_id  cluster_size  length  \
    0           1            33     358   
    
                                         

[I 190910 15:10:20 log:17] Starting distance matrix calculation 
[I 190910 15:10:20 log:19] Finished distance matrix calculation
[I 190910 15:10:20 seq:152] Unsynchronized consensus sequences:
       cluster_id  cluster_size  length  \
    0           1            32     359   
    
                                                sequence  
    0  gatcgttagcactggtaattagctgctcaaaacagttattcttaca...  
[I 190910 15:10:20 log:17] Starting distance matrix calculation 
[I 190910 15:10:20 log:19] Finished distance matrix calculation
[I 190910 15:10:21 seq:152] Unsynchronized consensus sequences:
       cluster_id  cluster_size  length  \
    0           1            31     358   
    
                                                sequence  
    0  gctcgtaataaaatttccaatcaaactgtgttcaaaaatggaaatt...  
[I 190910 15:10:21 log:17] Starting distance matrix calculation 
[I 190910 15:10:21 log:19] Finished distance matrix calculation
[I 190910 15:10:21 seq:152] Unsynchronized consensus sequences:
   

[I 190910 15:10:33 log:17] Starting distance matrix calculation 
[I 190910 15:10:33 log:19] Finished distance matrix calculation
[I 190910 15:10:33 seq:152] Unsynchronized consensus sequences:
       cluster_id  cluster_size  length  \
    0           1            28     342   
    
                                                sequence  
    0  ccatattttgcaaatttacaaaaaatgcgaaaattgatccaaaaat...  
[I 190910 15:10:33 log:17] Starting distance matrix calculation 
[I 190910 15:10:33 log:19] Finished distance matrix calculation
[I 190910 15:10:34 seq:152] Unsynchronized consensus sequences:
       cluster_id  cluster_size  length  \
    0           1            32     359   
    
                                                sequence  
    0  attccatattcagacaattattttttatgttgtggcaaaataaatg...  
[I 190910 15:10:34 log:17] Starting distance matrix calculation 
[I 190910 15:10:34 log:19] Finished distance matrix calculation
[I 190910 15:10:34 seq:152] Unsynchronized consensus sequences:
   

[I 190910 15:10:45 seq:152] Unsynchronized consensus sequences:
       cluster_id  cluster_size  length  \
    0           1            29     359   
    
                                                sequence  
    0  taaatatcaacatttttgcagagtctgtttttccaaatttcggtca...  
[I 190910 15:10:45 log:17] Starting distance matrix calculation 
[I 190910 15:10:45 log:19] Finished distance matrix calculation
[I 190910 15:10:45 seq:152] Unsynchronized consensus sequences:
       cluster_id  cluster_size  length  \
    0           1            29     357   
    
                                                sequence  
    0  atcaattttcgcattttttgtaaggagggggtcatcaaaatttgca...  
[I 190910 15:10:45 log:17] Starting distance matrix calculation 
[I 190910 15:10:46 log:19] Finished distance matrix calculation
[I 190910 15:10:46 seq:152] Unsynchronized consensus sequences:
       cluster_id  cluster_size  length  \
    0           1            28     359   
    
                                         

[I 190910 15:10:57 log:17] Starting distance matrix calculation 
[I 190910 15:10:57 log:19] Finished distance matrix calculation
[I 190910 15:10:57 seq:152] Unsynchronized consensus sequences:
       cluster_id  cluster_size  length  \
    0           1            30     358   
    
                                                sequence  
    0  atccaaaaattagtttcctaaatccttcaaaaagtaatagggatcg...  
[I 190910 15:10:58 log:17] Starting distance matrix calculation 
[I 190910 15:10:58 log:19] Finished distance matrix calculation
[I 190910 15:10:58 seq:152] Unsynchronized consensus sequences:
       cluster_id  cluster_size  length  \
    0           1            33     359   
    
                                                sequence  
    0  aatttccaatcaaactgtgttcaaaaatggaaattaaattttttgg...  
[I 190910 15:10:58 log:17] Starting distance matrix calculation 
[I 190910 15:10:58 log:19] Finished distance matrix calculation
[I 190910 15:10:59 seq:152] Unsynchronized consensus sequences:
   

[I 190910 15:11:09 log:17] Starting distance matrix calculation 
[I 190910 15:11:10 log:19] Finished distance matrix calculation
[I 190910 15:11:10 seq:152] Unsynchronized consensus sequences:
       cluster_id  cluster_size  length  \
    0           1            27     359   
    
                                                sequence  
    0  attttttggccatattttgcaaattttgatgacccccctccttaca...  
[I 190910 15:11:10 log:17] Starting distance matrix calculation 
[I 190910 15:11:10 log:19] Finished distance matrix calculation
[I 190910 15:11:11 seq:152] Unsynchronized consensus sequences:
       cluster_id  cluster_size  length  \
    0           1            27     358   
    
                                                sequence  
    0  atcaaaatttgcaaaatatggccaaaaaatttaatttccatttttg...  
[I 190910 15:11:11 log:17] Starting distance matrix calculation 
[I 190910 15:11:11 log:19] Finished distance matrix calculation
[I 190910 15:11:11 seq:152] Unsynchronized consensus sequences:
   

[I 190910 15:11:22 log:17] Starting distance matrix calculation 
[I 190910 15:11:23 log:19] Finished distance matrix calculation
[I 190910 15:11:23 seq:152] Unsynchronized consensus sequences:
       cluster_id  cluster_size  length  \
    0           1            27     359   
    
                                                sequence  
    0  tgagctcgtaataaaatttccaatcaaactgtgttcaaaaatggaa...  
[I 190910 15:11:23 log:17] Starting distance matrix calculation 
[I 190910 15:11:23 log:19] Finished distance matrix calculation
[I 190910 15:11:24 seq:152] Unsynchronized consensus sequences:
       cluster_id  cluster_size  length  \
    0           1            33     359   
    
                                                sequence  
    0  ctaaatccttcaaaaagtaatagggatcgttagcactggtaattag...  
[I 190910 15:11:24 log:17] Starting distance matrix calculation 
[I 190910 15:11:24 log:19] Finished distance matrix calculation
[I 190910 15:11:24 seq:152] Unsynchronized consensus sequences:
   

[I 190910 15:11:35 log:17] Starting distance matrix calculation 
[I 190910 15:11:36 log:19] Finished distance matrix calculation
[I 190910 15:11:36 seq:152] Unsynchronized consensus sequences:
       cluster_id  cluster_size  length  \
    0           1            33     358   
    
                                                sequence  
    0  tattacgagctcagtgaggtatgacattccatattcagacaattat...  
[I 190910 15:11:36 log:17] Starting distance matrix calculation 
[I 190910 15:11:36 log:19] Finished distance matrix calculation
[I 190910 15:11:36 seq:152] Unsynchronized consensus sequences:
       cluster_id  cluster_size  length  \
    0           1            26     354   
    
                                                sequence  
    0  tgtcagaatatggaatgtcatacttcagtgagctcgtaataaaatt...  
[I 190910 15:11:37 log:17] Starting distance matrix calculation 
[I 190910 15:11:37 log:19] Finished distance matrix calculation
[I 190910 15:11:37 seq:152] Unsynchronized consensus sequences:
   

In [15]:
# Gather every value stored in each read's `repr_units` mapping into one flat
# list and hand it to a single ClusteringSeq instance.
# NOTE(review): `revcomp=True` presumably makes the comparison strand-aware and
# `cyclic=True` presumably allows rotated units to match -- confirm against
# BITS.clustering.seq.ClusteringSeq.
c = ClusteringSeq(
    [
        repr_unit
        for read in centromere_reads
        for repr_unit in read.repr_units.values()
    ],
    revcomp=True,
    cyclic=True,
)

In [17]:
# Compute the distance matrix for the sequences held by `c`.
# In the recorded run this step took about 2 min 40 s (15:17:24 -> 15:20:04),
# so avoid re-running this cell unless `c` has changed.
c.calc_dist_mat()

[I 190910 15:17:24 log:17] Starting distance matrix calculation 
[I 190910 15:20:04 log:19] Finished distance matrix calculation


In [18]:
# Draw the t-SNE plot for the clustering object `c`.
# TODO: set the names as f"{read_id}, {repr_id}" (presumably the per-point
#       labels, so each point can be traced back to its read and its
#       representative unit -- confirm which argument controls this).
c.plot_tsne()