## **Data Preprocessing**

In [1]:
from Bio import SeqIO
from Bio.Seq import Seq, MutableSeq
from BCBio import GFF
import pandas as pd
import re
from collections import Counter
import numpy as np
import pickle
import json
from tqdm import tqdm

In [2]:
fasta_file = "GCF_000001405.40_GRCh38.p14_genomic.fna"
sequences = {}
pattern = re.compile(r"chromosome (\d+)")
cant_seq = 0
with open(fasta_file, "r") as file:
    for record in SeqIO.parse(file, "fasta"):
        if record.description[:2] == "NC":
            try: 
                n_crom = pattern.search(record.description).group(1)
            except:
                print(f"Error en la secuencia {record.description}")
                n_crom = cant_seq
            print(f"\rNC{n_crom}", end="")
            sequences[f"NC{n_crom}"] = record.seq
        cant_seq += 1


sequences['NCX'] = sequences.pop('NC61') # Cromosoma X
sequences['NCY'] = sequences.pop('NC62') # Cromosoma Y
sequences['NCM'] = sequences.pop('NC704') # Mitocondria

NC22Error en la secuencia NC_000023.11 Homo sapiens chromosome X, GRCh38.p14 Primary Assembly
NC61Error en la secuencia NC_000024.10 Homo sapiens chromosome Y, GRCh38.p14 Primary Assembly
NC62Error en la secuencia NC_012920.1 Homo sapiens mitochondrion, complete genome
NC704

In [58]:
uniques = {}
for cromosome, seq in sequences.items():
    uniques[cromosome] = list(set(seq))

In [59]:
uniques_all = set()
for values in uniques.values():
    uniques_all.update(values)
uniques_all = list(uniques_all)
states = ['A', 'C', 'G', 'T']
to_delete = [x for x in uniques_all if x.upper() not in states]
to_delete

[]

In [None]:
for key, value in tqdm(sequences.items()):
    value = MutableSeq(value)
    for nucleotid in to_delete:
        value.replace(nucleotid, '', inplace=True)
    sequences[key] = Seq(value)

In [61]:
pickle.dump(sequences, open("sequences.pkl", "wb"))

## **Creation of the Transition and Emission Matrices**

In [3]:
sequences = pickle.load(open("sequences.pkl", "rb"))

In [4]:
hidden_states = ["Exon", "Intron"]
sequences

{'NC1': Seq('taaccctaaccctaaccctaaccctaaccctaaccctaaccctaaccctaaccc...ggg'),
 'NC2': Seq('CGTAtcccacacaccacacccacacaccacacccacacacacccacacccacac...tag'),
 'NC3': Seq('ctaaccctaaccctaaccctaaccctaaccctaaccctaaccctaaccctcacc...TTC'),
 'NC4': Seq('accctaaccctaaccctaaccctaaccctaaccctaccctaaccctaaccctta...tag'),
 'NC5': Seq('taaccctaaccctaaccctaaccctaaccctaaccctaaccctaaccctaaccc...ATC'),
 'NC6': Seq('GATCTTATATAACTGTGAGATTAATCTCAGATAATGACACAAAATATAGTGAAG...ATC'),
 'NC7': Seq('ctaaccctaaccctaaccctaaccctaaccctaaccctaaccctaaccctaacc...ggg'),
 'NC8': Seq('GCAATtatgacacaaaaaattaaacagtgcaGACTgatatataaatcaaaacaa...ata'),
 'NC9': Seq('taaccctaaccctaaccctaacccaaccccaccccaaccccaaccccaacccaa...TTC'),
 'NC10': Seq('ctaaccctaaccctaaccctaaccctaaccctaaccctaaccctaaccctaacc...ggg'),
 'NC11': Seq('GAATtctacattagaaaaataaaccataGCCTCATCACAGGCACTTAAATACAC...tag'),
 'NC12': Seq('ctaaccctaaccctaaccctaaccctaaccctaaccctaaccctaaccctaacc...tag'),
 'NC13': Seq('agcattctgagaaattactttgtgatgtgtgcattcatcacaaagagttgaacc...ta

## **Viterbi Algorithm**

In [None]:

def Viterbi(A, B, pi, observados, states):
    # A: matriz de transición de la cadena de Markov oculta
    # B: matriz de emisión
    # pi: distribución inicial de la cadena de Markov
    # observados: lista de valores observados
    # states: lista de estados ocultos
    # retorna la lista de valores no observados más probable

    num_obs = len(observados)
    num_states = len(A)

    # Matrices para almacenar las probabilidades y los caminos
    m = np.zeros((num_obs, num_states), dtype=np.float64)
    path = np.zeros((num_obs, num_states), dtype=int)

    # Inicialización
    for i in range(num_states):
        m[0][i] = pi[i] * B[i][observados[0]]
        path[0][i] = 0

    # Recursión
    for t in range(1, num_obs):
        for i in range(num_states):
            max_prob = -1
            for j in range(num_states):
                prob = m[t-1][j] * A[j][i] * B[i][observados[t]]
                if prob > max_prob:
                    max_prob = prob
                    path[t][i] = j
            m[t][i] = max_prob

    # Construcción del camino más probable
    max_seq = []
    last_state = np.argmax(m[num_obs-1])
    for t in range(num_obs-1, -1, -1):
        max_seq.insert(0, states[last_state])
        last_state = path[t][last_state]

    return max_seq

A = np.array([[0.7, 0.3], [0.4, 0.6]])
B = np.array([[0.1, 0.4, 0.5], [0.7, 0.2, 0.1]])
pi = np.array([0.6, 0.4])
obs = np.array([0, 1, 0, 1])
states = ["H", "C"]
print(Viterbi(A, B, pi, obs, states))