# In [44]:
"""
Adjusted functions: make markov matrix (now includes a section to define a base probability)
"""

def make_markov_matrix(seq: str) -> "np.ndarray":
    """
    Build a 4x4 nucleotide-pair transition matrix from a sequence.

    Rows are the first nucleotide of a pair and columns the second, both in
    the order A, C, G, T:

              A      C      G      T
        A P(AA)  P(AC)  P(AG)  P(AT)
        C P(CA)  P(CC)  P(CG)  P(CT)
        G P(GA)  P(GC)  P(GG)  P(GT)
        T P(TA)  P(TC)  P(TG)  P(TT)

    where P(AC) is the count of the pair "AC" divided by the summed counts
    of all pairs starting with A.

    Any entry that would be zero (including every entry of a row whose
    pairs never occur) is set to a base frequency of 0.01 so that no
    transition has probability zero. Rows are NOT renormalised afterwards,
    so a row may sum to slightly more than 1 (or to 0.04 for an empty row).

    Args:
        seq: Input sequence. Only the 16 pairs listed above are counted;
            this function does not validate the sequence and raises nothing
            itself for other characters.
    Returns:
        4x4 float np.ndarray of pair probabilities laid out as shown above.
    """
    nucleotides = ["A", "C", "G", "T"]
    base_frequency = 0.01  # stand-in probability for pairs that were never seen

    # Count every ordered pair. Counting is delegated to the file's
    # CountOccurrences(seq, pair) helper, so whether overlapping occurrences
    # are counted depends on that helper.
    matrix = np.array(
        [
            [CountOccurrences(seq, first + second) for second in nucleotides]
            for first in nucleotides
        ],
        dtype=float,
    )

    # Turn counts into per-row probabilities. Rows with no observations are
    # skipped on purpose: dividing would be 0/0 (NaN plus a RuntimeWarning);
    # they stay all-zero and receive the base frequency below.
    for row in matrix:
        row_total = row.sum()
        if row_total > 0:
            row /= row_total  # in-place on the view, updates `matrix`

    # Replace zero probabilities with the base frequency.
    matrix[matrix == 0] = base_frequency

    return matrix