In [14]:
from itertools import product
import pandas as pd
import numpy as np
from numpy.random import choice
from bs4 import BeautifulSoup
import re
from collections import Counter
import itertools

In [15]:
class MarkovChain:
    
    def __init__(self, zeroth, kth, k=2):
        self.k = k
        self.zeroth = zeroth
        self.kth = kth
        
    def generate(self, n, seed=None):
        np.random.seed(seed)
        
        seq = []
        i = 0

        while i < self.k and i < n:
                seq.append(random_event(self.zeroth))
                i += 1

        while i < n:
            ctx = "".join(seq[i - self.k:i])
            seq.append(random_event(self.kth[ctx]))
            
            i += 1

        return "".join(seq)
            


if __name__ == '__main__':
    zeroth = {'A': 0.2, 'C': 0.19, 'T': 0.31, 'G': 0.3}
    kth = {'GT': {'A': 1.0, 'C': 0.0, 'T': 0.0, 'G': 0.0},
           'CA': {'A': 0.0, 'C': 0.0, 'T': 1.0, 'G': 0.0},
           'TC': {'A': 0.5, 'C': 0.0, 'T': 0.0, 'G': 0.5},
           'GA': {'A': 0.0, 'C': 0.3333333333333333, 'T': 0.6666666666666666, 'G': 0.0},
           'TG': {'A': 0.5, 'C': 0.0, 'T': 0.5, 'G': 0.0},
           'AT': {'A': 0.2, 'C': 0.4, 'T': 0.0, 'G': 0.4},
           'TA': {'A': 0.0, 'C': 0.0, 'T': 0.5, 'G': 0.5},
           'AC': {'A': 0.0, 'C': 0.0, 'T': 0.0, 'G': 1.0},
           'CG': {'A': 1.0, 'C': 0.0, 'T': 0.0, 'G': 0.0}}
    n = 10
    seed = 0
    mc = MarkovChain(zeroth, kth)
    print(mc.generate(n, seed))

NameError: name 'random_event' is not defined

In [None]:
from itertools import product

def context_pseudo_probabilities(s, k):
    context_pseudo_probs = {}
    
    symbols = "ACGT"
    contexts = context_list(s, k)

    k_mers = ["".join(t) for t in product(symbols, repeat=k)]

    for k_mer in k_mers:
        if k_mer not in contexts:
            contexts[k_mer] = symbols
        else:
            contexts[k_mer] += symbols

    for k_mer in k_mers:
        ctx = contexts[k_mer]
        context_pseudo_probs[k_mer] = {ch: ctx.count(ch) / len(ctx) for ch in symbols}

    return context_pseudo_probs

if __name__ == '__main__':
    k = 2
    s = "ATGATATCATCGACGATGTAG"
    kth = context_pseudo_probabilities(s, k)
    zeroth = context_pseudo_probabilities(s, 0)[""]
    print(f"zeroth: {zeroth}")
    print("\n".join(f"{k}: {dict(v)}" for k, v in kth.items()))
    
    print("\n", MarkovChain(zeroth, kth, k).generate(20))

In [None]:
class MarkovProb:
    def __init__(self, k, zeroth, kth):
        self.k = k
        self.zeroth = zeroth
        self.kth = kth
        
    def probability(self, s):
        prob = 1.0
        i = 0

        while i < self.k and i < len(s):
            prob *= self.zeroth[s[i]]
            i += 1

        while i < len(s):
            context = s[i - self.k:i]
            prob *= self.kth[context][s[i]]
            i += 1

        return prob

    
if __name__ == '__main__':
    k = 2
    kth = context_pseudo_probabilities("ATGATATCATCGACGATGTAG", k)
    zeroth = context_pseudo_probabilities("ATGATATCATCGACGATGTAG", 0)[""]
    mc = MarkovProb(2, zeroth, kth)
    s="ATGATATCATCGACGATGTAG"
    print(f"Probability of sequence {s} is {mc.probability(s)}")

In [4]:
class MarkovLog(object):
    def __init__(self, k, zeroth, kth):
        self.k = k
        self.zeroth = zeroth
        self.kth = kth
        
    def log_probability(self, s):
        prob = 1.0
        i = 0

        while i < self.k and i < len(s):
            prob *= self.zeroth[s[i]]
            i += 1

        while i < len(s):
            context = s[i - self.k:i]
            prob *= self.kth[context][s[i]]
            i += 1

        return np.log2(prob)
    
if __name__ == '__main__':
    k = 2
    kth = context_pseudo_probabilities("ATGATATCATCGACGATGTAG", k)
    zeroth = context_pseudo_probabilities("ATGATATCATCGACGATGTAG", 0)[""]
    mc = MarkovLog(2, zeroth, kth)
    s="ATGATATCATCGACGATGTAG"
    print(f"Log probability of sequence {s} is {mc.log_probability(s)}")

NameError: name 'context_list' is not defined

In [5]:

from collections import Counter

def better_context_probabilities(s, k):
    better_context_probs = {}
    
    symbols = "ACGT"
    contexts = context_list(s, k)

    k_mers = ["".join(t) for t in product(symbols, repeat=k)]

    for k_mer in k_mers:
        ctx = contexts.get(k_mer, "")
        better_context_probs[k_mer] = {ch: (ctx + symbols).count(ch) / len(ctx + symbols) for ch in symbols}

    return better_context_probs

if __name__ == '__main__':
    k = 2
    s = "ATGATATCATCGACGATGTAG"
    d = better_context_probabilities(s, k)
    print("\n".join(f"{k}: {v}" for k, v in d.items()))

NameError: name 'context_list' is not defined

In [6]:
class SimpleMarkovChain(object):
    def __init__(self, s, k):
        self.s = s
        self.k = k

    def generate(self, n, seed=None):
        np.random.seed(seed)

        a = list(set(self.s))
        p = [f / len(self.s) for f in Counter(self.s).values()]

        return "".join(np.random.choice(a=a, size=n, p=p))
        
if __name__ == '__main__':
    k = 2
    s = "ATGATATCATCGACGATGTAG"
    n = 10
    seed = 7
    mc = SimpleMarkovChain(s, k)
    print(mc.generate(n, seed))

NameError: name 'np' is not defined

In [7]:
def kmer_index(s, k):
    kmer_ds = {}
    i = 0

    while i + k <= len(s):
        kmer = s[i:i + k]
        
        if kmer not in kmer_ds:
            kmer_ds[kmer] = []

        kmer_ds[kmer].append(i)

        i += 1                

    return kmer_ds

if __name__ == '__main__':
    k=2
    s = "ATGATATCATCGACGATGTAG"
    print("Using string:")
    print(s)
    print("".join([str(i%10) for i in range(len(s))]))
    print(f"\n{k}-mer index is:")
    d=kmer_index(s, k)
    print(dict(d))

Using string:
ATGATATCATCGACGATGTAG
012345678901234567890

2-mer index is:
{'AT': [0, 3, 5, 8, 15], 'TG': [1, 16], 'GA': [2, 11, 14], 'TA': [4, 18], 'TC': [6, 9], 'CA': [7], 'CG': [10, 13], 'AC': [12], 'GT': [17], 'AG': [19]}


In [8]:
def codon_probabilities(rna):
    """
    Given an RNA sequence, simply calculates the proability of
    all 3-mers empirically based on the sequence
    """
    c_probs = {}

    for codon in product(set(rna), repeat=3):
        c_seq = "".join(codon)
        if c_seq in rna[:-1]:
            kmer = kmer_index(rna, 3)
            c_probs[c_seq] = len(kmer[c_seq]) / len(rna)
        else: 
            c_probs[c_seq] = 0 

    print(c_probs)

    return c_probs
    
def kullback_leibler(p, q):
    """
    Computes Kullback-Leibler divergence between two distributions.
    Both p and q must be dictionaries from events to probabilities.
    The divergence is defined only when q[event] == 0 implies p[event] == 0.
    """
    div = 0.0

    for e in p:
        if p[e] == 0 or q[e] == 0:
            if p[e] == q[e]:
                pass 
            else:
                if q[e] == 0:
                    raise ZeroDivisionError
                else:
                    pass
        else:
            div += p[e] * np.log2(p[e] / q[e])

    return div

if __name__ == '__main__':
    aas = list("*ACDEFGHIKLMNPQRSTVWY") # List of amino acids
    n = 10000
    
    # generate a random protein and some associated rna
    protein = "".join(choice(aas, n))    
    rna = ProteinToRandomRNA().convert(protein)
    
    # Maybe check that converting back to protein results in the same sequence
    assert rna_to_prot(rna) == protein
    
    # Calculate codon probabilities of the rna sequence
    cp_predicted = codon_probabilities(rna) # placeholder call
    
    # Calculate codon probabilities based on the codon usage table
    cp_orig = {}
    i = 0
    table_probs = get_probabability_dict()

    while i + 3 < len(rna):
        kmer = rna[i:i + 3]
        cp_orig[kmer] = table_probs[kmer]

        i += 1
    
    # Create a completely random RNA sequence and get the codon probabilities
    rand_rna = ProteinToRandomRNA().convert(protein)
    cp_uniform = codon_probabilities(rand_rna) # placeholder call
    
    print("d(original || predicted) =", kullback_leibler(cp_orig, cp_predicted))
    print("d(predicted || original) =", kullback_leibler(cp_predicted, cp_orig))
    print()
    print("d(original || uniform) =", kullback_leibler(cp_orig, cp_uniform))
    print("d(uniform || original) =", kullback_leibler(cp_uniform, cp_orig))
    print()
    print("d(predicted || uniform) =", kullback_leibler(cp_predicted, cp_uniform))
    print("d(uniform || predicted) =", kullback_leibler(cp_uniform, cp_predicted))

    p = dict(zip("ACGT", [1.0, 0.0, 0.0, 0.0]))
    q = dict(zip("ACGT", [0.25]*4))
    print(kullback_leibler(p, q))

NameError: name 'choice' is not defined

In [9]:
def get_stationary_distributions(transition):
    """
    The function get a transition matrix of a degree one Markov chain as parameter.
    It returns a list of stationary distributions, in vector form, for that chain.
    """
    tt = transition.T

    w, v = np.linalg.eig(tt)
    i1 = np.isclose(w, 1)
    v1 = v[:, i1]

    v1 = v1 / v1.sum(axis=0)

    return np.abs(v1.T)
    
    
if __name__ == "__main__":
    transition=np.array([[0.3, 0, 0.7, 0],
                         [0, 0.4, 0, 0.6],
                         [0.35, 0, 0.65, 0],
                         [0, 0.2, 0, 0.8]])
    print("\n".join(
        ", ".join(
            f"{pv:+.3f}"
            for pv in p) 
        for p in get_stationary_distributions(transition)))

NameError: name 'np' is not defined

In [10]:
def kl_divergences(initial, transition):
    """
    Calculates the the Kullback-Leibler divergences between empirical distributions
    generated using a markov model seeded with an initial distributin and a transition 
    matrix, and the initial distribution.
    Sequences of length [1, 10, 100, 1000, 10000] are generated.
    """
    prefs = [1, 10, 100, 1000, 10000]
    divs = []
    zeroth = dict(zip("ACGT", initial))
    kth = {ch: dict(zip([k for k in "ACGT"], transition[i, :].tolist())) for i, ch in enumerate("ACGT")}
    mc = MarkovChain(zeroth, kth, k=1)
    seq = mc.generate(10000)

    for pref in prefs:
        probs = better_context_probabilities(seq[:pref], 0)[""]
        divs.append(kullback_leibler(zeroth, probs))

    return zip(prefs, divs)

if __name__ == "__main__":
    transition=np.array([[0.3, 0, 0.7, 0],
                         [0, 0.4, 0, 0.6],
                         [0.35, 0, 0.65, 0],
                         [0, 0.2, 0, 0.8]])
    print("Transition probabilities are:")
    print(transition)
    stationary_distributions = get_stationary_distributions(transition)
    print("Stationary distributions:")
    print(np.stack(stationary_distributions))
    initial = stationary_distributions[1]
    print("Using [{}] as initial distribution\n".format(", ".join(f"{v:.2f}" for v in initial)))
    results = kl_divergences(initial, transition)
    for prefix_length, divergence in results: # iterate on prefix lengths in order (1, 10, 100...)
        print("KL divergence of stationary distribution prefix " \
              "of length {:5d} is {:.8f}".format(prefix_length, divergence))

NameError: name 'np' is not defined

In [11]:
def kl_divergences_ed(initial, transition):
    """
    Calculates the the Kullback-Leibler divergences between empirical distributions
    generated using a markov model seeded with an initial distributin and a transition 
    matrix, and the initial distribution.
    Sequences of length [1, 10, 100, 1000, 10000] are generated.
    Returns the KL divergence and empirical distributions that generated them
    """
    prefs = [1, 10, 100, 1000, 10000]
    divs = []
    ed = []
    zeroth = dict(zip("ACGT", initial))
    kth = {ch: dict(zip([k for k in "ACGT"], transition[i, :].tolist())) for i, ch in enumerate("ACGT")}
    mc = MarkovChain(zeroth, kth, k=1)
    seq = mc.generate(10000)

    for pref in prefs:
        probs = better_context_probabilities(seq[:pref], 0)[""]
        ed.append(list(probs.values()))
        divs.append(kullback_leibler(zeroth, probs))

    return list(zip(divs, ed))

def main(transition, equilibrium_distribution):
    initial = []
    divs_and_emps = []

    samp_dists = np.random.dirichlet(equilibrium_distribution, 2)

    print(samp_dists)

    for samp in samp_dists:
        samp_dist = dict(zip("ACGT", samp))
        divs = kl_divergences_ed(samp, transition)
        initial.append(samp)
        divs_and_emps.append(divs)
    
    return zip(initial, divs_and_emps)

if __name__ == "__main__":
    transition = np.array([[0.3, 0.1, 0.5, 0.1],
                           [0.2, 0.3, 0.15, 0.35],
                           [0.25, 0.15, 0.2, 0.4],
                           [0.35, 0.2, 0.4, 0.05]])
    print("Transition probabilities are:", transition, sep="\n")
    stationary_distributions = get_stationary_distributions(transition)
    # Uncomment the below line to check that there actually is only one stationary distribution
    assert len(stationary_distributions) == 1
    equilibrium_distribution = stationary_distributions[0]
    print("Equilibrium distribution:")
    print(equilibrium_distribution)
    for initial_distribution, results in main(transition, equilibrium_distribution):
        print("\nUsing {} as initial distribution:".format(initial_distribution))
        print("kl-divergence   empirical distribution")
        print("\n".join("{:.11f}   {}".format(di, kl) for di, kl in results))

NameError: name 'np' is not defined