In [8]:
#https://www.upgrad.com/blog/markov-chain-in-python-tutorial/#Principle_of_Markov_Chain_Markov_Property

In [None]:
def make_markov_matrix(input_sequence: str) -> np.matrix:
    """
    Returns the Markov (transition) matrix based on the input sequence:

          A      T      G      C
        A P(AA)  P(AT)  P(AG)  P(AC)
        T P(TA)  P(TT)  P(TG)  P(TC)
        G P(GA)  P(GT)  P(GG)  P(GC)
        C P(CA)  P(CT)  P(CG)  P(CC)

        where P(AA) is the probability of A following an A.

    Note: this is a specification stub. The body below is only ``pass``, so
    the Returns/Raises sections describe the intended contract, not what the
    stub currently does.

    Args:
        input_sequence: Input sequence.

    Returns:
        Markov matrix as 4x4 np.matrix with probabilities of nucleotide pairs.

    Raises:
        ValueError: input_sequence contains characters other than ATGC.
    """
    # Placeholder body: nothing is computed yet.
    pass

In [3]:
import numpy as np

In [90]:
# Example input sequence. Note that it contains a "B", which is not one of
# the nucleotides A/C/G/T.
seq = "AAATGGCTAGBAGTA"

In [91]:
# Note: str.count() only counts non-overlapping matches ("AAA".count("AA") == 1),
# so adjacent pairs are counted with a sliding window over the sequence instead.

def make_markov_matrix(seq: str):
    """Estimate the first-order nucleotide transition matrix of ``seq``.

    Every adjacent pair ``seq[k], seq[k + 1]`` is counted (overlapping pairs
    included, so "AAA" contributes two "AA" transitions) and each row is then
    normalised to probabilities. Rows and columns are ordered A, C, G, T, i.e.
    ``result[i, j]`` is the probability that nucleotide ``j`` follows
    nucleotide ``i``.

    Args:
        seq: Input sequence. Matching is case-sensitive; pairs that contain
            any character other than A, C, G or T are ignored.

    Returns:
        4x4 ``np.ndarray`` of floats. A row sums to 1 when its nucleotide has
        at least one observed successor; otherwise the row is all zeros
        (instead of NaN from a 0/0 division).
    """
    nucl = ["A", "C", "G", "T"]
    position = {letter: k for k, letter in enumerate(nucl)}
    counts = np.zeros((4, 4))

    # Slide a window of width two over the sequence so overlapping pairs are
    # all counted; pairs with an unknown character are skipped.
    for first, second in zip(seq, seq[1:]):
        if first in position and second in position:
            counts[position[first], position[second]] += 1

    # Normalise row-wise. Rows without any observation stay at zero rather
    # than triggering a division by zero.
    row_totals = counts.sum(axis=1, keepdims=True)
    result = np.divide(
        counts, row_totals, out=np.zeros_like(counts), where=row_totals > 0
    )
    return result
        

   
    
    

In [93]:
def CountOccurrences(string, substring):
    """Count how often ``substring`` occurs in ``string``, overlaps included.

    Unlike ``str.count``, matches may overlap: "AAA" contains "AA" twice.

    Args:
        string: Text to search in.
        substring: Text to search for.

    Returns:
        Number of start offsets in ``string`` at which ``substring`` matches.
    """
    # Test every start offset on its own, so a match beginning inside a
    # previous match is still counted.
    return sum(
        1
        for offset in range(len(string))
        if string.startswith(substring, offset)
    )

In [107]:
# Uses the CountOccurrences helper (found via Google) to count overlapping occurrences.

def make_markov_matrix2(seq: str):
    """Estimate the nucleotide transition matrix of ``seq`` via CountOccurrences.

    Each of the 16 nucleotide pairs is counted with ``CountOccurrences`` (so
    overlapping pairs are included) and each row is then normalised to
    probabilities. Rows and columns are ordered A, C, G, T, i.e.
    ``result[i, j]`` is the probability that nucleotide ``j`` follows
    nucleotide ``i``.

    Args:
        seq: Input sequence. Pairs containing characters other than
            A, C, G or T never match and are therefore ignored.

    Returns:
        4x4 ``np.ndarray`` of floats. A row sums to 1 when its nucleotide has
        at least one observed successor; otherwise the row is all zeros
        (instead of NaN from a 0/0 division).
    """
    nucl = ["A", "C", "G", "T"]

    # counts[i, j] = number of times nucl[i] is directly followed by nucl[j].
    counts = np.array(
        [[CountOccurrences(seq, first + second) for second in nucl] for first in nucl],
        dtype=float,
    )

    # Normalise row-wise. Rows without any observation stay at zero rather
    # than triggering a division by zero.
    row_totals = counts.sum(axis=1, keepdims=True)
    result = np.divide(
        counts, row_totals, out=np.zeros_like(counts), where=row_totals > 0
    )
    return result
        

   
    
    

In [108]:
# Build the transition matrix for the example sequence and print it
# (rows/columns follow the A, C, G, T order used inside make_markov_matrix2).
b = make_markov_matrix2(seq)
print(b)

[[0.4        0.         0.4        0.2       ]
 [0.         0.         0.         1.        ]
 [0.         0.33333333 0.33333333 0.33333333]
 [0.66666667 0.         0.33333333 0.        ]]
