似た representative を持つリードに含まれるユニットを分類する。

In [1]:
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
from IPython.display import display
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.io as pio
pio.templates.default = 'plotly_white'
import logging
import logzero
logzero.loglevel(logging.INFO)

In [2]:
dir_fname = 'work'
import os
os.chdir(dir_fname)

In [3]:
from BITS.util.io import load_pickle, save_pickle
import numpy as np
import pandas as pd
from BITS.plot.plotly import make_hist, make_scatter, make_layout, show_plot
from BITS.clustering.seq import ClusteringSeq
import consed
from BITS.seq.align import EdlibRunner
from collections import Counter, defaultdict
from logzero import logger
from dataclasses import dataclass
from typing import List
import random

In [76]:
sync_reads = load_pickle("centromere_reads_sync.pkl")

In [5]:
centromere_phreds = load_pickle("centromere_phreds.pkl")   # Integer QV array for each centromere read extracted from fastq

In [6]:
db_prefix = "DMEL_CSS"
db_fname = f"{db_prefix}.db"
las_fname = f"TAN.{db_prefix}.las"
from vca import ReadViewer
v = ReadViewer(db_fname, las_fname)

In [77]:
read = sync_reads[0]

In [8]:
v.show(read=read)

[I 190917 11:10:39 log:17] Starting distance matrix calculation 
[I 190917 11:10:39 log:19] Finished distance matrix calculation


## Functions for counting variants

In [8]:
# Variants and sequencing errors

class PairwiseAlignment:
    def __init__(self, a_seq, b_seq):
        er = EdlibRunner("global", revcomp=False, cyclic=False)
        self.fcigar = er.align(b_seq.lower(), a_seq.lower()).cigar.flatten().string   # NOTE: b vs a; be careful!
        self.source, self.target = '', ''
        s_pos, t_pos = 0, 0
        for c in self.fcigar:
            if c == '=' or c == 'X':
                self.source += a_seq[s_pos]
                self.target += b_seq[t_pos]
                s_pos += 1
                t_pos += 1
            elif c == 'I':
                self.source += '-'
                self.target += b_seq[t_pos]
                t_pos += 1
            else:
                self.source += a_seq[s_pos]
                self.target += '-'
                s_pos += 1
        
    def show(self, by_cigar=False):
        if by_cigar:   # standard alignment like BLAST
            print(self.source)
            print(self.fcigar)
            print(self.target)
        else:
            print(''.join([' ' if c == '=' else self.source[i] for i, c in enumerate(self.fcigar)]))
            print(''.join([self.source[i] if c == '=' else ' ' for i, c in enumerate(self.fcigar)]))
            print(''.join([' ' if c == '=' else self.target[i] for i, c in enumerate(self.fcigar)]))

def count_variants(cluster_cons_unit, cluster_units):
    """Given a set of unit sequences <units> in a cluster, calculate the composition of
    nucleotides including '-' (= distribution of each )
    for each position on <cluster_cons_unit> as a seed.
    from which <units> are generated, compute the variations (= nucleotides inconsistent between
    <units> and <cluster_cons_unit> and their relative frequency).
    Since a cluster should be homogeneous (i.e., mono-source), the relative frequencies are
    expected to be not much larger than sequencing error.
    """
    assert cluster_cons_unit != "", "Empty strings are not allowed"
    # TODO: how to decide "same variant?" especially for multiple variations on same position (but slightly different among units)?
    variants = Counter()
    for unit in cluster_units:
        assert unit != "", "Empty strings are not allowed"
        alignment = PairwiseAlignment(cluster_cons_unit, unit)   # alignment.fcigar(cluster_cons_unit) = unit
        tpos = 0
        var_index = 0   # positive values for continuous insertions
        for i, c in enumerate(alignment.fcigar):
            if c == '=':
                var_index = 0
            elif c == 'I':
                var_index += 1
            if c != '=':
                variants[(tpos, var_index, c, alignment.target[i])] += 1   # TODO: multiple D on the same pos are aggregated
            if c != 'I':
                tpos += 1
        assert tpos == len(cluster_cons_unit)
    return variants

def list_variations(template_unit, cluster_cons_unit):
    """Single-vs-single version of count_variants().
    That is, list up the differences between the (imaginary) template unit and the consensus unit
    of a cluster (which should be a real instance).
    The return value is [(position_on_template_unit, variant_type, base_on_cluster_cons_unit)].
    """
    assert template_unit != "" and cluster_cons_unit != "", "Empty strings are not allowed"
    return list(count_variants(template_unit, [cluster_cons_unit]).keys())

## Functions for alignment probability

In [9]:
# Probability of alignment 1) between units and 2) between a unit and a representative unit

def phred_to_log_p_correct(phred):
    return np.log10(1 - np.power(10, -phred / 10))

def phred_to_log_p_error(phred):
    return -phred / 10

def log_prob_gen(cons_unit, obs_unit, obs_qual=None, p_non_match=0.01):
    """Log likelihood of generating <obs_unit> from <cons_unit>.
    <obs_qual> is positional QVs of <obs_unit> and if not given,
    <p_non_match> is used as average error rate for every position.
    """
    if cons_unit == "":   # input sequences for <cons_unit> were empty; or, Consed did not return
        return -np.inf

    # Compute alignment
    er = EdlibRunner("global", revcomp=False)
    fcigar = er.align(cons_unit, obs_unit).cigar.flatten().string
    logger.debug(fcigar)
    
    # Calculate the sum of log probabilities for each position in the alignment
    if obs_qual is None:
        n_match = Counter(fcigar)['=']
        n_non_match = len(fcigar) - n_match
        return n_match * np.log10(1 - p_non_match) + n_non_match * np.log10(p_non_match)
    else:
        p = 0.
        pos = 0
        for c in fcigar:
            p += (phred_to_log_p_correct(obs_qual[pos]) if c == '='
                  else phred_to_log_p_error(obs_qual[pos]))
            if c in ('=', 'X', 'D'):
                pos += 1
        assert pos == len(obs_unit) == len(obs_qual), "Invalid length"
        return p

def log_prob_align(unit_x, unit_y, qual_x=None, qual_y=None, p_error=0.01):
    """Log likelihood of alignment between <unit_x> from <unit_y>.
    <qual_*> is positional QVs of <unit_*> and if not given,
    <p_error> is used as average error rate for every position of each read.
    """
    # Compute alignment
    er = EdlibRunner("global", revcomp=False)
    fcigar = er.align(unit_x, unit_y).cigar.flatten().string
    logger.debug(fcigar)
    
    # Calculate the sum of log probabilities for each position in the alignment
    if qual_x is None and qual_y is None:
        p_match = (1 - p_error) * (1 - p_error)
        n_match = Counter(fcigar)['=']
        n_non_match = len(fcigar) - n_match
        return n_match * np.log10(p_match) + n_non_match * np.log10(1 - p_match)
    else:
        p = 0.
        pos_x = pos_y = 0
        for c in fcigar:   # fcigar(unit_y) = unit_x
            p_match = phred_to_log_p_correct(qual_x[pos_x]) + phred_to_log_p_correct(qual_y[pos_y])
            p += (p_match if c == '='
                  else np.log10(1 - np.power(10, p_match)))
            if c in ('=', 'X', 'I'):
                pos_x += 1
            if c in ('=', 'X', 'D'):
                pos_y += 1
        assert pos_x == len(unit_x) == len(qual_x) and pos_y == len(unit_y) == len(qual_y), "Invalid length"
        return p
    
def log_factorial(n):
    return np.sum([np.log10(i) for i in range(1, n + 1)])
    
def log_prob_composition(cons_unit, obs_units, p_error=0.001):   # TODO: use positional QVs
    """Log likelihood of composition of <obs_units> given <cons_unit> as a seed; i.e., probability of alignment pileup.
    <p_error> is used for average sequencing error rate (= non-match rate in a single alignment).
    Concretely, compute Multinomial(n_A, ..., n_-; p_A, ..., p_-) for each position, where p_X = 1 - p_error
    if X is the base of <cons_unit>, otherwise p_X = p_error.
    """
    var_counts = count_variants(cons_unit, obs_units)
    var_pos = [pos for pos, index, op, base in var_counts.keys()]
    
    # compute for matches
    n_matches = len(cons_unit) - len(set(var_pos))
    p_match = n_matches * len(obs_units) * np.log10(1 - p_error)
    
    # compute for variants
    var_freqs = defaultdict(Counter)   # {(pos, index): Counter('A': n_A, ..., '-': n_-)} for each variant column
    for (pos, index, op, base), count in var_counts.items():   # list up frequencies of each variant for each position
        var_freqs[(pos, index)][base] = count
    log_factorial_N = log_factorial(len(obs_units))
    p_var = 0.
    for key, counts in var_freqs.items():   # for each variant position
        p_var += log_factorial_N
        for base, count in counts.items():
            p_var -= log_factorial(count)
            p_var += count * np.log10(p_error)
        n_match = len(obs_units) - np.sum(list(counts.values()))   # number of units having base same as seed
        p_var -= log_factorial(n_match)
        p_var += n_match * np.log10(1 - p_error)

    return p_match + p_var

## Split-merge clustering

In [46]:
@dataclass(eq=False)
class SplitMergeClustering:
    units: List[str]
    quals: np.ndarray
    alpha: float
    
    def __post_init__(self):
        self.N = len(self.units)   # number of data
        self.assignments = np.zeros([self.N], dtype=np.int16)   # cluster assignments

        # Compute all-vs-all unit alignment likelihood
        self.log_p_mat = np.zeros([self.N, self.N], dtype=np.float32)
        for i in range(self.N):
            for j in range(i + 1, self.N):
                self.log_p_mat[i][j] = self.log_p_mat[j][i] = log_prob_align(self.units[i], self.units[j],
                                                                            self.quals[i], self.quals[j])
        
        # Compute consensus unit of the whole units so that comparing clusters can be easy
        self.template_unit = self.cluster_cons(0)
        
    def show_clustering(self):
        er = EdlibRunner("global", revcomp=False)
        for cluster_id in np.unique(self.assignments):
            print(f"Cluster {cluster_id} ({len(self.cluster_units(cluster_id))} units):\n"
                  f"{self.cluster_unit_ids(cluster_id)}\n"
                  f"{self.cluster_cons(cluster_id)}\n"
                  f"{er.align(self.cluster_cons(cluster_id), self.template_unit).cigar.flatten().string}")
            print("---")
            for unit in self.cluster_units(cluster_id):
                print(f"{er.align(unit, self.cluster_cons(cluster_id)).cigar.flatten().string}")
                
    def n_units(self, cluster_id, assignments=None, exclude_unit=None):
        """Return the number of units in the cluster <cluster_id> given a clustering state <assignments>,
        while excluding a unit <exclude_unit> if provided."""
        return len(self.cluster_unit_ids(cluster_id, assignments, exclude_unit))
            
    def cluster_unit_ids(self, cluster_id, assignments=None, exclude_unit=None):
        """Return indices of the units belonging to the cluster <cluster_id> given a clustering state <assignments>,
        while excluding a unit <exclude_unit> if provided."""
        return [i for i in range(self.N)
                if i != exclude_unit and (self.assignments if assignments is None else assignments)[i] == cluster_id]

    def cluster_units(self, cluster_id, assignments=None, exclude_unit=None):
        """Return unit sequences belonging to the cluster <cluster_id> given a clustering state <assignments>,
        while excluding a unit <exclude_unit> if provided."""
        return [self.units[i] for i in self.cluster_unit_ids(cluster_id, assignments, exclude_unit)]
    
    def n_clusters(self, assignments=None):
        """Return the number of clusters."""
        return len(self.cluster_ids(self.assignments if assignments is None else assignments))

    def cluster_ids(self, assignments=None):
        """Return a list of cluster indices."""
        return np.unique(self.assignments if assignments is None else assignments)

    def cluster_cons(self, cluster_id, assignments=None, exclude_unit=None):
        """Return the consensus sequence of the units belonging to the cluster <cluster_id> given a clustering state <assignments>,
        while excluding a unit <exclude_unit> if provided."""
        cluster_units = self.cluster_units(cluster_id, assignments, exclude_unit)   # units belonging to the cluster
        if len(cluster_units) == 0:   # cluster with single unit which is excluded
            return ""
        elif len(cluster_units) == 1:   # cluster with single unit
            return cluster_units[0]   # TODO: NOTE: single data cluster can be harmful!
        else:
            return consed.consensus(cluster_units)
        
    def log_prob_ewens(self, assignments=None):
        p = self.n_clusters() * np.log10(self.alpha)
        for cluster_id in self.cluster_ids():
            for i in range(1, self.n_units(cluster_id)):
                p += np.log10(i)
        for i in range(self.N):
            p -= np.log10(self.alpha + i)
        return p
    
    def log_prob_cluster_composition(self, cluster_id, assignments=None, p_error=0.001):
        """Return log probability of the composition of the cluster <cluster_id> given a clustering state <assignments>"""
        return log_prob_composition(self.cluster_cons(cluster_id, assignments),
                                    self.cluster_units(cluster_id, assignments))

    def log_prob_units_generation(self, cluster_id, assignments=None):
        """Return log probability of generating the units belonging to a cluster <cluster_id> from the cluster
        given a clustering state <assignments>."""
        cons = self.cluster_cons(cluster_id, assignments)
        return np.sum([log_prob_gen(cons, unit) for unit in self.cluster_units(cluster_id, assignments)])
        
    def log_prob_clustering(self, assignments=None):
        """Compute the joint probability of the current clustering state."""
        # First of all, check if consensus sequence exists for every cluster
        for cluster_id in self.cluster_ids(assignments):
            cons = self.cluster_cons(cluster_id, assignments)
            if cons == "":   # Consed did not return
                return -np.inf
            
        # Probability of partition (= Ewens)
        p_ewens = self.log_prob_ewens(assignments)

        # Probability of cluster compositions
        p_cluster_compositions = np.sum([self.log_prob_cluster_composition(cluster_id, assignments)
                                         for cluster_id in self.cluster_ids(assignments)])
        
        # Probability of unit generation from each cluster
        p_gen_units = np.sum([self.log_prob_units_generation(cluster_id, assignments)
                              for cluster_id in self.cluster_ids(assignments)])

        return p_gen_units + p_cluster_compositions + p_gen_units
        
    def gibbs_sampling_single(self, unit_id, cluster_ids, assignments):
        """Compute probability of the unit assignment for each cluster while excluding the unit."""
        weights = tuple(map(lambda log_p: np.power(10, log_p),
                            [(np.log10(self.n_units(cluster_id, assignments, exclude_unit=unit_id))
                              - np.log10(self.N - 1 + self.alpha)
                              + log_prob_gen(self.cluster_cons(cluster_id, assignments, exclude_unit=unit_id),
                                          self.units[unit_id]))
                             for cluster_id in cluster_ids]))   # TODO: possibility of create a new cluster
        new_assignment = random.choices(cluster_ids, weights=weights)[0]   # sample a new cluster assignment based on the probabilities
        logger.info(f"weights: {weights}, {assignments[unit_id]} -> {new_assignment}")
        return new_assignment
        
    def gibbs_sampling(self, unit_ids, cluster_ids, assignments, n_iter=2):
        """Re-assign each unit of <unit_ids> into one of the clusters <cluster_ids>,
        Given a clustering state <assignments>.
        """
        for t in range(n_iter):
            for unit_id in unit_ids:
                assignments[unit_id] = self.gibbs_sampling_single(unit_id, cluster_ids, assignments)
        return assignments
    
    def do_gibbs(self, n_iter=2):
        """Run a single iteration of Gibbs sampling with all units."""
        logger.info(f"State prob before Gibbs sampling: {self.log_prob_clustering()}")
        self.assignments = self.gibbs_sampling(list(range(self.N)), self.cluster_ids(), self.assignments, n_iter)
        logger.info(f"State prob after Gibbs sampling: {self.log_prob_clustering()}")
    
    def do_proposal(self, n_iter=10):
        """Propose a new state by choosing random two units."""
        for t in range(n_iter):
            x, y = random.sample(list(range(self.N)), 2)
            logger.info(f"Selected: {x}({self.assignments[x]}) and {y}({self.assignments[y]})")
            if self.assignments[x] == self.assignments[y]:
                logger.info("Split")
                self.propose_split(x, y)
            else:
                logger.info("Merge")
                self.propose_merge(x, y)

    def propose_split(self, x, y, n_gibbs_iter=2):
        assert self.assignments[x] == self.assignments[y], "Must belong to the same cluster"
        
        # Split cluster <old_cluster_id> into <old_cluster_id> and <new_cluster_id>
        old_cluster_id = self.assignments[x]
        new_cluster_id = np.max(self.assignments) + 1
        new_assignments = np.copy(self.assignments)
        
        # Assign each unit to one of x and y with higher alignment likelihood   # TODO: random init assignment?
        new_assignments[y] = new_cluster_id
        for i in range(self.N):
            if i != x and i != y and new_assignments[i] == old_cluster_id:
                if self.log_p_mat[i][x] < self.log_p_mat[i][y]:
                    new_assignments[i] = new_cluster_id
        logger.info(f"\nCurrent state:\n{self.assignments}\nProposed state (init):\n{new_assignments}")
        
        # Re-assign each unit to one of the new clusters (= Gibbs sampling)
        self.gibbs_sampling(self.cluster_unit_ids(old_cluster_id),
                            (old_cluster_id, new_cluster_id),
                            new_assignments, n_iter=n_gibbs_iter)
        logger.info(f"\nCurrent state:\n{self.assignments}\nProposed state (Gibbs):\n{new_assignments}")
        
        # Compare the probability of the current state and the proposed state
        p_current = self.log_prob_clustering()
        p_new = self.log_prob_clustering(new_assignments)
        logger.info(f"Current state prob: {p_current}, Proposed state prob: {p_new}")
        if p_current < p_new:
            logger.info("Accepted")
            self.assignments = new_assignments
        else:
            logger.info("Rejected")
            
    def propose_merge(self, x, y):
        assert self.assignments[x] != self.assignments[y], "Must belong to different clusters"
        
        # Merge cluster <old_cluster_id_x> and <old_cluster_id_y> into <old_cluster_id_x>
        old_cluster_id_x = self.assignments[x]
        old_cluster_id_y = self.assignments[y]
        new_assignments = np.copy(self.assignments)
        
        # Change cluster assignment of the units in the cluster which units[y] belongs to
        for i in range(self.N):
            if new_assignments[i] == old_cluster_id_y:
                new_assignments[i] = old_cluster_id_x
        logger.info(f"\nCurrent state:\n{self.assignments}\nProposed state:\n{new_assignments}")
        
        # Compare the probability of the current state and the proposed state
        p_current = self.log_prob_clustering()
        p_new = self.log_prob_clustering(new_assignments)
        logger.info(f"Current state prob: {p_current}, Proposed state prob: {p_new}")
        if p_current < p_new:
            logger.info("Accepted")
            self.assignments = new_assignments
        else:
            logger.info("Rejected")

In [47]:
smc = SplitMergeClustering([read.seq[unit.start:unit.end] for unit in read.units],
                           [centromere_phreds[read.name][unit.start:unit.end] for unit in read.units],
                           alpha=1)

In [48]:
smc.log_p_mat

array([[   0.      , -201.68346 , -119.009834, ..., -234.69757 ,
        -148.79634 , -189.88011 ],
       [-201.68346 ,    0.      , -124.35283 , ...,  -84.709526,
        -135.29305 , -176.1989  ],
       [-119.009834, -124.35283 ,    0.      , ..., -208.66441 ,
         -76.216515, -183.25117 ],
       ...,
       [-234.69757 ,  -84.709526, -208.66441 , ...,    0.      ,
        -182.74564 , -112.65529 ],
       [-148.79634 , -135.29305 ,  -76.216515, ..., -182.74564 ,
           0.      , -160.83798 ],
       [-189.88011 , -176.1989  , -183.25117 , ..., -112.65529 ,
        -160.83798 ,    0.      ]], dtype=float32)

In [49]:
smc.log_prob_clustering()

-2957.54199812112

In [63]:
smc.do_proposal()

[I 190918 00:25:43 <ipython-input-46-86e3fb84df62>:139] Selected: 4(7) and 20(7)
[I 190918 00:25:43 <ipython-input-46-86e3fb84df62>:141] Split
[I 190918 00:25:43 <ipython-input-46-86e3fb84df62>:161] 
    Current state:
    [3 7 4 4 7 5 2 7 7 5 4 6 6 3 7 3 4 7 5 4 7 5 4 4 0 5 6 4 2 2 0 5 6 4 2 7 5
     7]
    Proposed state (init):
    [3 8 4 4 7 5 2 8 8 5 4 6 6 3 8 3 4 8 5 4 8 5 4 4 0 5 6 4 2 2 0 5 6 4 2 8 5
     8]
[I 190918 00:25:43 <ipython-input-46-86e3fb84df62>:117] weights: (7.729825806965065e-20, 5.4108780648755465e-17), 8 -> 8

divide by zero encountered in log10

[I 190918 00:25:43 <ipython-input-46-86e3fb84df62>:117] weights: (0.0, 6.000193800537946e-11), 7 -> 8
[I 190918 00:25:43 <ipython-input-46-86e3fb84df62>:117] weights: (0.0, 6.246323884416258e-21), 8 -> 8
[I 190918 00:25:44 <ipython-input-46-86e3fb84df62>:117] weights: (0.0, 6.502550311880039e-27), 8 -> 8
[I 190918 00:25:44 <ipython-input-46-86e3fb84df62>:117] weights: (0.0, 6.183860645572054e-17), 8 -> 8
[I 190918 00:

[I 190918 00:26:01 <ipython-input-46-86e3fb84df62>:139] Selected: 28(2) and 32(6)
[I 190918 00:26:01 <ipython-input-46-86e3fb84df62>:144] Merge
[I 190918 00:26:01 <ipython-input-46-86e3fb84df62>:191] 
    Current state:
    [3 7 4 4 7 5 2 7 7 5 4 6 6 3 7 3 4 7 5 4 7 5 4 4 0 5 6 4 2 2 0 5 6 4 2 7 5
     7]
    Proposed state:
    [3 7 4 4 7 5 2 7 7 5 4 2 2 3 7 3 4 7 5 4 7 5 4 4 0 5 2 4 2 2 0 5 2 4 2 7 5
     7]
[I 190918 00:26:04 <ipython-input-46-86e3fb84df62>:196] Current state prob: -1371.2310606040128, Proposed state prob: -1730.0024788899407
[I 190918 00:26:04 <ipython-input-46-86e3fb84df62>:201] Rejected
[I 190918 00:26:04 <ipython-input-46-86e3fb84df62>:139] Selected: 33(4) and 1(7)
[I 190918 00:26:04 <ipython-input-46-86e3fb84df62>:144] Merge
[I 190918 00:26:04 <ipython-input-46-86e3fb84df62>:191] 
    Current state:
    [3 7 4 4 7 5 2 7 7 5 4 6 6 3 7 3 4 7 5 4 7 5 4 4 0 5 6 4 2 2 0 5 6 4 2 7 5
     7]
    Proposed state:
    [3 4 4 4 4 5 2 4 4 5 4 6 6 3 4 3 4 4 5 4 4 5 4 4 0 5 

In [57]:
smc.do_gibbs()

[I 190918 00:21:28 <ipython-input-46-86e3fb84df62>:131] State prob before Gibbs sampling: -1364.7453614922047
[I 190918 00:21:28 <ipython-input-46-86e3fb84df62>:117] weights: (1.815668626061288e-53, 4.8700142917513204e-73, 1.5773545162667259e-07, 6.837663877307398e-37, 8.547079846634383e-37, 1.7795368204026445e-47, 7.190047759202536e-51), 3 -> 3
[I 190918 00:21:29 <ipython-input-46-86e3fb84df62>:117] weights: (1.577354516266729e-21, 4.023453005475778e-33, 2.7510130697897592e-55, 6.634578422487466e-35, 8.376992957686249e-37, 1.6420581595656768e-31, 5.4108780648755465e-17), 7 -> 7
[I 190918 00:21:29 <ipython-input-46-86e3fb84df62>:117] weights: (1.726682797299859e-41, 4.44884205100661e-55, 2.5900241959497893e-37, 6.122022039116343e-15, 7.027114369968293e-19, 1.5615809711040641e-21, 6.9764961507064356e-43), 5 -> 4
[I 190918 00:21:30 <ipython-input-46-86e3fb84df62>:117] weights: (1.7975119398006334e-53, 4.539171565153218e-61, 2.6161860565149134e-43, 5.94019186253255e-11, 7.169793255757853e

[I 190918 00:21:44 <ipython-input-46-86e3fb84df62>:117] weights: (1.6093812021903117e-25, 3.3507971830745e-39, 2.8638606044072444e-63, 8.25303920936928e-55, 6.291291789302217e-51, 3.6313372521225165e-51, 6.183860645572054e-17), 7 -> 7
[I 190918 00:21:45 <ipython-input-46-86e3fb84df62>:117] weights: (1.6923218096336247e-39, 3.595023879601268e-55, 2.616186056514913e-41, 7.38926171804555e-29, 4.828143606570937e-25, 3.1865747803368223e-25, 7.692371861970826e-41), 5 -> 5
[I 190918 00:21:45 <ipython-input-46-86e3fb84df62>:117] weights: (1.726682797299859e-39, 3.780295997817599e-65, 2.751013069789759e-53, 8.088803729102856e-49, 6.166095082695122e-45, 3.523482904397213e-45, 6.837663877307398e-37), 7 -> 7
[I 190918 00:21:46 <ipython-input-46-86e3fb84df62>:117] weights: (1.815668626061288e-53, 3.935365084243455e-75, 1.5773545162667259e-07, 7.692371861970826e-37, 5.982955892643972e-35, 3.559073640805231e-47, 8.088803729102856e-51), 3 -> 3
[I 190918 00:21:46 <ipython-input-46-86e3fb84df62>:117] we

[I 190918 00:22:00 <ipython-input-46-86e3fb84df62>:117] weights: (1.709415969326877e-39, 3.5590736408052312e-53, 2.6693052306039234e-47, 6.956843226268563e-19, 5.632834207666091e-27, 2.2052962289652125e-09, 7.848558169544743e-43), 6 -> 6
[I 190918 00:22:01 <ipython-input-46-86e3fb84df62>:117] weights: (1.7975119398006334e-53, 3.5590736408052312e-53, 2.590024195949789e-41, 6.122022039116343e-17, 5.576505865589439e-25, 3.154709032533459e-25, 8.088803729102856e-53), 4 -> 4
[I 190918 00:22:01 <ipython-input-46-86e3fb84df62>:117] weights: (1.726682797299859e-47, 2.2728006820219456e-15, 2.951523813182591e-75, 8.170508817275666e-59, 6.228378871409154e-55, 3.5590736408052312e-53, 7.539293661917629e-39), 2 -> 2
[I 190918 00:22:02 <ipython-input-46-86e3fb84df62>:117] weights: (1.6093812021903117e-25, 3.3507971830745e-39, 2.8638606044072444e-63, 8.25303920936928e-55, 6.291291789302217e-51, 3.6313372521225165e-51, 6.183860645572054e-17), 7 -> 7
[I 190918 00:22:02 <ipython-input-46-86e3fb84df62>:11

In [58]:
smc.show_clustering()

Cluster 0 (2 units):
[24, 30]
atgacccccctccttacaaaaaatgcgaaaagtgattcaaaaattaatttccctaaatccttcaaaagtaatagagatcgttagcactggtaattagctgcttaaaacagttattgttacatctatgtgacaattttagccaagttataacgaaaatttggttgtaaatatctacatttttgcagagtctgtttttccaaatttcggtcatcaaataatcatttattttgccacaacataaaaaataattgtctgaacatggaatgtcatacctctctgagctcgtaataaaatttccaatcaaactgtgttcaaaaatggaaattaaatttttgggccatattttgcaaattttc
---
Cluster 2 (4 units):
[6, 28, 29, 34]
atgaccaccctccttacaaaaaatgcgaaaagtgattaaaaaattaatttccctaaatccttcaaaaagtaacagagatcgttagcactggtaattagctgcttaaaacagttattgttacatctatgtgacaatttttagccaagttatgacgaaaatttcgtttgtaaatatcatttctttggcagaatctgtttttccaaatttcggtcatcaaataatcatttattttgccacaacataaaaaataattgtctgaatatggaatgtcatacctcactgagctcgtaataaaatttccaatcaaactgtgttcaaaaatggaaacaaaattttttggcattattttgcaaattttg
---
Cluster 3 (3 units):
[0, 13, 15]
atgacccccctccttacaaaaaatgcgaaaattggtccaaaaattaatttcctaaatccttcaaaaagtaatagggatcgttagcactggtaattagctgctcaaaacagatattcgtacatctatgtgaccatttttagccaagtaataacgaaaattttgtttgtaaatatca

Cluster 5 (7 units):
[5, 9, 18, 21, 25, 31, 36]
atgacccccttcttacaaaaaatgcgaaaattgatccaaaaattaatttcctaaatccttcaaaaagtaatagggatcgttagcactggtaattagctgctcaaaacagatattcgtacatctatgtgaccatttttagccaagttataacgaaaatttcgtttgtaaatatcaacatttttgcagagtctgtttttccaattttcggtcatcaaaaatcatttattttgccacaacataaaaaataattgtctgaatatggaatgtcatacctcactgagctcgtaataaaatttccaatcaaactgtgttcaaaaatggaaattaaattttttggccatattttgcaaattttg
---
Cluster 6 (4 units):
[11, 12, 26, 32]
atgacccccctccttacaaaaaatgcgaaaattgatccaaaaattaatttccctaaatccttcaaaaagtaatagggatcgttagcaatggtaattagctgctcaaaacagttattcttacatctatgtgaccattgctagccaagttataacgaaaatttcgtttgtaaatatctacatttttgcagagtctgtttttccaaatttcggtcatcaaaaaatcatttattttgccacaacataaaaaataattgtctgaatatggaatgtcatacctcactgagctcgtaataaaatttccaatcaaactgtgttcaaaaatggaaattaaattttttggccatatttggcaaattttg
---
Cluster 7 (9 units):
[1, 4, 7, 8, 14, 17, 20, 35, 37]
atgacccccctccttacaaaaaatgcgaaaagtgattcaaaaattaatttccctaaatccttcaaaaagtaacagagatcgttagcactggtaattagctgcttaaaacagttattgttacatctatgtgacaat



In [59]:
for cluster_id in np.unique(smc.assignments):
    print(f"Cluster {cluster_id} ({len(smc.cluster_unit_ids(cluster_id))} units)")
    for variant in list_variations(smc.template_unit, smc.cluster_cons(cluster_id)):
        print(variant)
    print(count_variants(smc.cluster_cons(cluster_id), smc.cluster_units(cluster_id)))

Cluster 0 (2 units)
(31, 0, 'X', 'g')
(36, 0, 'X', 't')
(52, 1, 'I', 'c')
(66, 0, 'D', '-')
(74, 0, 'X', 'a')
(102, 0, 'X', 't')
(131, 0, 'X', 'a')
(137, 0, 'D', '-')
(160, 0, 'X', 'g')
(164, 0, 'D', '-')
(174, 0, 'X', 't')
(259, 0, 'X', 'c')
(277, 0, 'X', 't')
(336, 0, 'X', 'g')
(357, 0, 'X', 'c')
Counter({(67, 1, 'I', 'a'): 1, (137, 0, 'X', 't'): 1, (138, 1, 'I', 't'): 1})
Cluster 2 (4 units)
(6, 0, 'X', 'a')
(31, 0, 'X', 'g')
(36, 0, 'X', 't')
(37, 0, 'X', 'a')
(52, 1, 'I', 'c')
(71, 0, 'X', 'c')
(74, 0, 'X', 'a')
(102, 0, 'X', 't')
(131, 0, 'X', 'a')
(149, 0, 'X', 'g')
(175, 0, 'X', 't')
(176, 0, 'D', '-')
(177, 0, 'D', '-')
(180, 1, 'I', 'c')
(184, 1, 'I', 'g')
(188, 0, 'X', 'a')
(326, 0, 'X', 'c')
(327, 0, 'X', 'a')
(340, 0, 'D', '-')
(343, 1, 'I', 't')
Counter({(31, 0, 'X', 't'): 2, (72, 0, 'X', 't'): 2, (37, 0, 'X', 'c'): 2, (328, 0, 'X', 't'): 2, (304, 0, 'X', 't'): 1, (150, 0, 'X', 'a'): 1, (161, 0, 'X', 'g'): 1, (165, 0, 'D', '-'): 1, (176, 0, 'X', 'a'): 1, (177, 0, 'X', 'c'

In [60]:
c = ClusteringSeq([smc.cluster_cons(cluster_id) for cluster_id in np.unique(smc.assignments)], revcomp=False)
c.calc_dist_mat()
c.plot_dist_mat(variable_scale=True)

[I 190918 00:24:02 log:17] Starting distance matrix calculation 
[I 190918 00:24:02 log:19] Finished distance matrix calculation


- Gibbs sampling の確率
- split/merge の受容率

In [61]:
for i in range(smc.N):
    read.units[i].id = smc.assignments[i]

In [62]:
v.show(read=read)

[I 190918 00:24:16 log:17] Starting distance matrix calculation 
[I 190918 00:24:16 log:19] Finished distance matrix calculation


In [21]:
v.show(read=read)

[I 190917 11:17:57 log:17] Starting distance matrix calculation 
[I 190917 11:17:57 log:19] Finished distance matrix calculation
