In [5]:
import numpy as np
import pandas as pd

def build_transition_df(seq):
    """Build a table of base-to-base transition frequencies for a sequence.

    Every pair of adjacent symbols ``(seq[i], seq[i + 1])`` in which both
    members are one of ``'a'``, ``'c'``, ``'g'``, ``'t'`` is counted. Pairs
    containing any other symbol (upper-case letters included) are skipped.
    Each row of counts is then divided by that row's total.

    Parameters
    ----------
    seq : str
        Sequence to analyse. It is iterated and sliced (``seq[1:]``), so any
        sliceable sequence of single-character strings also works.

    Returns
    -------
    pandas.DataFrame
        4x4 float frame indexed and labelled by ``['a', 'c', 'g', 't']``.
        The row label is the first base of a pair, the column label the
        base that follows it. Rows with at least one counted pair sum to 1;
        rows with no counted pairs are all zeros.
    """
    bases = ['a', 'c', 'g', 't']
    idx = {b: i for i, b in enumerate(bases)}

    # Raw dinucleotide counts: count_mat[i, j] = occurrences of bases[i]
    # immediately followed by bases[j].
    count_mat = np.zeros((len(bases), len(bases)), dtype=int)
    for first, second in zip(seq, seq[1:]):
        if first in idx and second in idx:
            count_mat[idx[first], idx[second]] += 1

    # Row-normalise the counts. `where=` skips rows whose total is zero, and
    # NumPy leaves skipped positions of the output untouched, so the output
    # buffer must be pre-filled with zeros; without `out=` those rows would
    # contain uninitialized memory. No errstate guard is needed because the
    # masked positions are never divided.
    counts = count_mat.astype(float)
    row_totals = counts.sum(axis=1, keepdims=True)
    prob_mat = np.divide(
        counts,
        row_totals,
        out=np.zeros_like(counts),
        where=row_totals > 0,
    )

    return pd.DataFrame(prob_mat, index=bases, columns=bases)


The DNA sequence below is a random sequence taken from a website.

In [6]:
# Example usage: run build_transition_df on a sample sequence.
# The sequence is lower-case, matching the 'a'/'c'/'g'/'t' keys the function
# counts; adjacent string literals are concatenated into one long string.
dna_sequence = (
    "ttgaatccctgtacgttaagtatatcacgcacgacgagcagcgggcacagtgttgtatgtcgagttgggtcgtagcc"
    "aatacgtgacgtcctccgtacagaggtctatacgttttttaactagtaggctcatttacttgagggactaatgtc"
    "caactcatattagcgacttggattgggacgcgtaatggacggagccagcctaaggcgaaccgatggcatcaaata"
    "cggttgacgtccttatggggaagctcagggtagaagacagttttaacagatccctacggggcgccccttggcatt"
    "agccagacctcggtgcaacatcagacttgttgggtttcaaataagtaccccgcctgtaaactcccgcgagccatg"
    "ccgggtggagttactgcgttttgcgcgtgctcggagtataatgcctataaacgtctaccgcaaaatgaggatatgagg"
    "gatctcaacctcgactctattaagcccagacgacgtgaaacaggggctactctctgatagccccatcgacatatag"
    "ttcccgattaatattttaatttctatatagatcctcgggaagccgcctcgcgtcggttgcaggcattccaagagt"
    "atcccgctgtcagagatatgaggtggtgatatcattgacctatacttcgcaacggggatagacgttggcgggctc"
    "gcaccaagtcgattacatcaccgacccggatcagagcccgcgataacccataatatgcagagtcgactcacattc"
    "aggagccgtagcatcatcatcggctggcaatcgtacaaccccggggatcctaagccatccgttgctatcgagttat"
    "ttgcgttcgacaataattgctgcttagtacggaacgtggacccgtagctgggaataatcagagtttccctttacgc"
    "tcgatcgtgtgttcgacacgcgttgattgatgattactacaggaagagccaccgcgacaatcgcgtggctgcttct"
    "gcgcgcatcacgaggctaggataagcaaacatctacgcgatttttgcctgcgcgga"
)

# Rows of the printed table are the first base of each adjacent pair and
# columns are the base that follows it.
df_transitions = build_transition_df(dna_sequence)
print(df_transitions)

          a         c         g         t
a  0.198444  0.249027  0.249027  0.303502
c  0.234615  0.246154  0.315385  0.203846
g  0.273408  0.262172  0.247191  0.217228
t  0.286275  0.243137  0.215686  0.254902
