In [1]:
%pip install Bio 

Note: you may need to restart the kernel to use updated packages.


In [3]:
#  Read from a multi-record FASTA file and concatenate sequences
from Bio import SeqIO
fasta_path = "data/multiline_input.fasta"
sequence = []
for record in SeqIO.parse(fasta_path, "fasta"):
    sequence.append(str(record.seq))
sequence = ''.join(sequence)

# 2. Define nucleotide states
states = ['A', 'C', 'G', 'T']

# 3. Initialize transition count matrix
t_counts = {s: {t: 0 for t in states} for s in states}

# 4. Count observed transitions
for i in range(len(sequence) - 1):
    curr, nxt = sequence[i].upper(), sequence[i+1].upper()
    if curr in states and nxt in states:
        t_counts[curr][nxt] += 1

# 5. Convert counts to probabilities
t_probs = {s: {} for s in states}
for s in states:
    total = sum(t_counts[s].values())
    if total > 0:
        for t in states:
            t_probs[s][t] = t_counts[s][t] / total
    else:
        for t in states:
            t_probs[s][t] = 0.0

# 6. Display as a matrix using pandas
import pandas as pd
transition_df = pd.DataFrame(t_probs).T  # rows: current state, cols: next state

print("First-Order Markov Transition Probability Matrix:")
print(transition_df)


First-Order Markov Transition Probability Matrix:
          A         C         G         T
A  0.250870  0.242645  0.264790  0.241696
C  0.246548  0.254438  0.244905  0.254109
G  0.251826  0.247380  0.255637  0.245157
T  0.270936  0.237438  0.250246  0.241379
