# Transcript Sequences Example
Here we use the cDNA, CDS and PEP FASTA files from [Ensembl](https://ftp.ensembl.org/pub/release-111/fasta/homo_sapiens/) to retrieve the RNA (cDNA), ORF (CDS) and AA (protein) sequences of a given transcript, and to derive its 5'UTR and 3'UTR sequences from them.

One can also use the gene annotation API (see examples in `gene_annotation_example.ipynb`) or the Ensembl REST API (see examples in `Explore_ensembl_REST\example_ensembl_rest.ipynb`) to get these transcript sequences.

To get the pre-RNA sequence, see either one of the two notebooks listed above.

In [None]:
import pathlib
from Bio import SeqIO
from requests.exceptions import HTTPError
import mysys_init as si
import ensembl_rest_utils as erut  # used only to get the ENSP ID of a given ENST ID

In [None]:
# Ensembl FASTA files (cDNA, CDS and peptide), located in the CDNA/, CDS/
# and PEP/ sub-folders of si.human_sequences_path
cdna_file: pathlib.Path = si.human_sequences_path.joinpath('CDNA', 'Homo_sapiens.GRCh38.cdna.all.fa')
cds_file: pathlib.Path = si.human_sequences_path.joinpath('CDS', 'Homo_sapiens.GRCh38.cds.all.fa')
protein_file: pathlib.Path = si.human_sequences_path.joinpath('PEP', 'Homo_sapiens.GRCh38.pep.all.fa')

# Example

In [None]:
#transcript_id: str = 'ENST00000300403'  # with or without version
transcript_id: str = 'ENST00000300403.11'  # with or without version
# ====================================================================


def _fetch_fasta_record(fasta_file: pathlib.Path, seq_id: str):
    """Return the record of `fasta_file` whose ID matches `seq_id`.

    A `seq_id` containing a '.' is treated as a versioned ID and looked up
    exactly. Otherwise the first record whose ID, stripped of its version
    suffix, equals `seq_id` is returned.

    Args:
        fasta_file: path of the FASTA file to search.
        seq_id: record ID, with or without a version suffix.

    Returns:
        The matching record, as produced by the `SeqIO.index` lookup.

    Raises:
        KeyError: if no record matches `seq_id`.
    """
    record_index = SeqIO.index(str(fasta_file), "fasta")
    try:
        if '.' in seq_id:
            key = seq_id
        else:
            # Compare the accession part only, so that an ID which is merely a
            # substring of another ID is not picked up by mistake.
            key = next((k for k in record_index if k.split('.', 1)[0] == seq_id), None)
            if key is None:
                raise KeyError(seq_id)
        return record_index[key]
    finally:
        # The index keeps the FASTA file open; release it once the record is read.
        record_index.close()


# cDNA (RNA sequence)
print("cDNA\n=====")
try:
    cdna_seq_record = _fetch_fasta_record(cdna_file, transcript_id)
except KeyError:
    # Keep the name defined, so the UTR step below can tell the cDNA is missing.
    cdna_seq_record = None
    print(f"{transcript_id=} not found in {cdna_file}.")
else:
    print(f"Record information:\n{cdna_seq_record}")
    print(f"\n{cdna_seq_record.id} cDNA seq:\n{cdna_seq_record.seq}")

# CDS (ORF sequence)
print("\nCDS\n====")
try:
    cds_seq_record = _fetch_fasta_record(cds_file, transcript_id)
except KeyError:
    cds_seq_record = None
    print(f"{transcript_id=} not found in {cds_file}.")
else:
    print(f"Record information:\n{cds_seq_record}")
    print(f"\n{cds_seq_record.id} CDS seq:\n{cds_seq_record.seq}")

    # UTRs
    print("\nUTRs\n====")
    # the 5'UTR and 3'UTR sequences can be retrieved from the cDNA and CDS sequences:
    # they are what precedes and what follows the CDS inside the cDNA
    if cdna_seq_record is None:
        print(f"Cannot derive the UTRs: no cDNA record for {transcript_id=}.")
    elif (index := cdna_seq_record.seq.find(cds_seq_record.seq)) == -1:
        print(f"Cannot derive the UTRs: the CDS of {transcript_id=} was not found in its cDNA.")
    else:
        utr5_seq, utr3_seq = cdna_seq_record.seq[:index], cdna_seq_record.seq[index+len(cds_seq_record.seq):]
        print(f"{transcript_id} 5UTR seq:\n{utr5_seq}\n{transcript_id} 3UTR seq:\n{utr3_seq}")

# Protein (AA sequence)
print("\nProtein\n=======")
try:
    # the helper is called with the ID stripped of its version suffix
    protein_id = erut.transcript_id2protein_id_with_version(transcript_id.split('.')[0])
except HTTPError:
    print(f"No protein ID found for {transcript_id=}")
else:
    print(f"{protein_id=} corresponding to {transcript_id=}")
    try:
        pp_seq_record = _fetch_fasta_record(protein_file, protein_id)
    except KeyError:
        print(f"{protein_id=} not found in {protein_file}.")
    else:
        print(f"Record information:\n{pp_seq_record}")
        print(f"\n{pp_seq_record.id} AA seq:\n{pp_seq_record.seq}")
