In [2]:
import pathlib
from more_itertools import batched

In [3]:
# Codon table as a text grid: sixteen rows of four "<codon> <amino acid>" pairs.
# The three terminating codons carry the label "Stop" instead of a letter.
_DNA_TO_AA = """TTT F      CTT L      ATT I      GTT V
TTC F      CTC L      ATC I      GTC V
TTA L      CTA L      ATA I      GTA V
TTG L      CTG L      ATG M      GTG V
TCT S      CCT P      ACT T      GCT A
TCC S      CCC P      ACC T      GCC A
TCA S      CCA P      ACA T      GCA A
TCG S      CCG P      ACG T      GCG A
TAT Y      CAT H      AAT N      GAT D
TAC Y      CAC H      AAC N      GAC D
TAA Stop   CAA Q      AAA K      GAA E
TAG Stop   CAG Q      AAG K      GAG E
TGT C      CGT R      AGT S      GGT G
TGC C      CGC R      AGC S      GGC G
TGA Stop   CGA R      AGA R      GGA G
TGG W      CGG R      AGG R      GGG G"""

# Single-character markers looked for in a translated reading frame.
START_CODON = "M"  # the letter ATG translates to
STOP_CODON = "0"  # stand-in symbol for the grid's "Stop" entries

# Codon -> one-letter amino acid. Whitespace-splitting the grid yields the tokens
# in row-major order; even positions are codons, odd positions their translation.
DNA_TO_AA = {
    codon: (STOP_CODON if amino == "Stop" else amino)
    for codon, amino in zip(_DNA_TO_AA.split()[::2], _DNA_TO_AA.split()[1::2])
}

In [12]:
def reverse_complement(s):
    """Return the reverse complement of the DNA sequence ``s``.

    The sequence is walked from its last symbol to its first and each base is
    replaced by its pairing partner (A<->T, C<->G).

    Raises:
        KeyError: if ``s`` contains a symbol other than A, C, G or T.
    """
    partner = {"A": "T", "T": "A", "C": "G", "G": "C"}
    return "".join(partner[base] for base in reversed(s))


def solve(s: str) -> str:
    """Return every distinct candidate protein found in the reading frames of ``s``.

    Both ``s`` and its reverse complement are translated in each of the three
    reading frames; trailing bases that do not fill a whole codon are ignored.
    Inside a translated frame, every start marker that is later followed by a
    stop marker yields one candidate: the amino acids from that start up to,
    but not including, the first stop after it.

    Args:
        s: DNA string made of the symbols A, C, G and T.

    Returns:
        The distinct candidates joined by newlines, in the order they are first
        found (``s`` before its reverse complement, frame offsets 0, 1, 2, left
        to right within a frame). An empty string when nothing is found.

    Raises:
        KeyError: if ``s`` contains a symbol other than A, C, G or T.
    """
    proteins = []
    for strand in (s, reverse_complement(s)):
        for offset in range(3):
            # The range bound stops before a trailing partial codon.
            translated = "".join(
                DNA_TO_AA[strand[i : i + 3]]
                for i in range(offset, len(strand) - 2, 3)
            )

            # Splitting on the stop marker gives one piece per stop plus a final
            # piece that no stop terminates; that last piece cannot hold a
            # complete protein, so it is dropped.
            for segment in translated.split(STOP_CODON)[:-1]:
                for i, amino in enumerate(segment):
                    if amino == START_CODON:
                        proteins.append(segment[i:])

    # dict.fromkeys drops duplicates but keeps first-seen order. A set would
    # emit the lines in an order that changes from one interpreter run to the
    # next because string hashing is randomised.
    return "\n".join(dict.fromkeys(proteins))


# Run solve on a short hard-coded DNA string and print the candidate proteins
# it reports, one per line.
print(
    solve(
        "AGCCATGTAGCTAACTCAGGTTACATGGGGATGACCCCGCGACTTGGATTAGAGTCTCTTTTGGAATAAGCCTGAATGATCCGAGTAGCATCTCAG"
    )
)

MGMTPRLGLESLLE
MTPRLGLESLLE
MLLGSFRLIPKETLIQVAGSSPCNLS
M


In [17]:
# Load the dataset: the first line of the file is skipped (presumably a
# FASTA-style header -- confirm against the data file) and the remaining lines
# are concatenated into a single DNA string.
# The first line is dropped with a slice rather than unpacked into `_`: in
# IPython/Jupyter `_` holds the previous cell's output, and assigning to it
# stops the shell from updating it.
lines = pathlib.Path("data/rosalind_orf.txt").read_text().strip().splitlines()[1:]

s = "".join(lines)
print(solve(s))

MQPERHALARCSRGRT
MLGE
MSDTIGCVERKTLKDCTLSKSLGAFNRASMRAEHFPSGDKTCL
MYFAP
MAAMAACKHLRNFAQLRSSAH
MGLRESPAPFL
MLSRVAVRLASVRNSMDFLS
MCESSARRKGAKVTSGQGTIAEPAVFRIQEDTTPVTQSRKLSNLCAMAAMAACKHLRNFAQLRSSAH
MHCKDVLSSGDKPADERNIP
MAHKFDNFLDCVTGVVSS
MFLSSAGLSPLDSTSLQCIYSPSMFKRQNVR
MRQDFCSKCNLLGSCVQRTRLCRTSAHSADLATWVRDSAERGMSANPNG
MAFTGQNTYRQRAGLWVCENHQPLFFNLLRIALQKRT
MFKRQNVR
MLRSHTGPVKCAKTFAQSAIF
MRAEHFPSGDKTCL
MAACKHLRNFAQLRSSAH
MELRAAIDTEPFMLSRVAVRLASVRNSMDFLS
MKGSVSIAARSSIRPSSAVY
MDFLS
MRST
MHLLPKHVQEAEREIAISRSAS
MSANPNG
MDNRA
