# Gene Annotation Usage Example
This notebook provides gene annotation examples, using the `Gene_cls` or `Gene_gff3_cls` classes from `Gene_annotation.py`.

The underlying data is the Ensembl GFF3 gene annotation file, available from [Ensembl](https://useast.ensembl.org/Homo_sapiens/Info/Index).

Requirements: Python >= 3.10, Pandas, and Numpy. In addition, add 

`export PYTHONPATH="$PYTHONPATH:/tamir2/yoramzar/Projects/Cancer_mut/Utils"` 

to your `.bashrc`.

In [1]:
'''
Global configuration.
'''
import pathlib
import pandas as pd
import numpy as np

# make sure PYTHONPATH is set, containing the Utils folder.
import mysys_init as si

# import API
import Gene_annotation as gaut

# Folder into which this notebook writes its reports (the exon-map HTML/Excel exports further below).
# NOTE(review): si.ROOT_path is presumably the project root defined in mysys_init - confirm.
report_path = pathlib.Path(si.ROOT_path) / 'Ensembl_gene_anot' / 'Reports'

# Example - by Gene Name

We use the gene name to instantiate a `Gene_cls` object, which provides the gene's annotation.


In [20]:
'''
Select a gene and a corresponding transcript.

`gene` is used to instantiate the annotation object, and `transcript_id` is passed to the
transcript-level calls in the following cells. To try a different gene, comment out the
active pair and uncomment one of the alternatives.
'''
#gene, transcript_id = 'EGFR', 'ENST00000275493'
#gene, transcript_id = 'IDH1', 'ENST00000345146'
gene, transcript_id = 'PIK3CA', 'ENST00000263967'

#gene, transcript_id = 'PIK3C2A', 'ENST00000533645'
#gene, transcript_id = 'KLF16', 'ENST00000250916'

# NOTE(review): this alternative passes an Ensembl gene ID instead of a gene name -
# presumably Gene_cls accepts both; confirm.
#gene, transcript_id = 'ENSG00000290149', 'ENST00000476620'
# ---------------------------------------------------------

# Instantiating the object loads the annotation file (a "Loading ..." message is printed).
gene_obj = gaut.Gene_cls(gene)
# NOTE(review): verbose=False presumably silences the loading message - confirm.
#gene_obj = gaut.Gene_cls(gene, verbose=False)

Loading /Users/yoramzarai/work/mystuff/Ramot/Projects/Cancer_mut/Data/Ensembl_gene_annotation/Homo_sapiens.GRCh38.109.gff3 to a dataframe...Done.


In [21]:
"""
General information about the gene and the selected transcript.

Prints the annotation source, the gene summary, the list of transcript IDs, the gene
boundaries, the boundaries of the selected transcript and, when available, the positions
of its start and stop codons.
"""
# get the annotation source
print(f"{gene} annotation source: {gene_obj.source}")

print(f"\n{gene=} information:")
gene_obj.info()

# plain string literal: there is nothing to interpolate here
print("\ntranscripts:")
gene_obj.show_all_transcript_IDs()

print(f"\nGene start = {gene_obj.gene_start:,}, gene end = {gene_obj.gene_end:,}")

(ts, te) = gene_obj.transcript_boundaries(transcript_id)
print(f"\n{transcript_id=}: start site (TSS) = {ts:,}, end site (TES) = {te:,}.")

# Start and stop codons positions.
# codons_info[0] describes the start codon and codons_info[1] the stop codon; in each, item 0 is
# the chromosome position of the codon's first bp and item 1 is its bp number in the RNA.
# NOTE(review): the call may return None (presumably for transcripts without an annotated
# start/stop codon - confirm); nothing is printed in that case.
if (codons_info := gene_obj.start_and_stop_codons_pos(transcript_id)) is not None:
    print(f"\nFirst bp of the Start codon located in chr{gene_obj.chrm}:{codons_info[0][0]:,}, and is bp #{codons_info[0][1]} in the RNA")
    print(f"First bp of the Stop codon located in chr{gene_obj.chrm}:{codons_info[1][0]:,}, and is bp #{codons_info[1][1]} in the RNA")


PIK3CA annotation source: /Users/yoramzarai/work/mystuff/Ramot/Projects/Cancer_mut/Data/Ensembl_gene_annotation/Homo_sapiens.GRCh38.109.gff3

gene='PIK3CA' information:
Gene name=PIK3CA, Gene ID=ENSG00000121879
Source=/Users/yoramzarai/work/mystuff/Ramot/Projects/Cancer_mut/Data/Ensembl_gene_annotation/Homo_sapiens.GRCh38.109.gff3
Positive strand=True.
Type=protein_coding, version=6.
Description=phosphatidylinositol-4%2C5-bisphosphate 3-kinase catalytic subunit alpha [Source:HGNC Symbol%3BAcc:HGNC:8975]
Gene region=chr3:179,148,114-179,240,093.
10 transcripts.

transcripts:
PIK3CA transcript IDs:
1. ENST00000477735
2. ENST00000263967
3. ENST00000643187
4. ENST00000675786
5. ENST00000468036
6. ENST00000675467
7. ENST00000674534
8. ENST00000674622
9. ENST00000462255
10. ENST00000675796

Gene start = 179,148,114, gene end = 179,240,093

transcript_id='ENST00000263967': start site (TSS) = 179,148,357, end site (TES) = 179,240,093.

First bp of the Start codon located in chr3:179,198,826, a

In [22]:
# Print detailed information about the selected transcript (identifiers, biotype, boundaries,
# ORF offsets/positions and the exon regions with their phases).
print(f"Detailed {transcript_id=} information:")
gene_obj.transcript_info(transcript_id, verbose=True)

Detailed transcript_id='ENST00000263967' information:
ENST00000263967 information:
Name=PIK3CA-201, ID=ENST00000263967.4, version=4
Type=mRNA, biotype=protein_coding
Start=179,148,357, end=179,240,093
protein ID = ENSP00000263967
ccdsid=CCDS43171.1
ORF start offset = 323, ORF end offset = 3,529 (0-based offsets from the beginning of the RNA)
ORF start chromosome position = 179,198,826, ORF end chromosome position = 179,234,364
Exon regions:
	1. 179,148,357 - 179,148,603, ID=ENSE00001493081, phase=-1, end phase=-1
	2. 179,198,750 - 179,199,177, ID=ENSE00001139995, phase=-1, end phase=1
	3. 179,199,690 - 179,199,899, ID=ENSE00000997375, phase=1, end phase=1
	4. 179,201,290 - 179,201,540, ID=ENSE00001077693, phase=1, end phase=0
	5. 179,203,544 - 179,203,789, ID=ENSE00001077692, phase=0, end phase=0
	6. 179,204,503 - 179,204,588, ID=ENSE00001077694, phase=0, end phase=2
	7. 179,209,595 - 179,209,700, ID=ENSE00001077691, phase=2, end phase=0
	8. 179,210,186 - 179,210,338, ID=ENSE0000112847

For background on exon phases, see [this post](https://blog.gene-test.com/coding-phases-frames/). In short:
1. The end phase of an exon is the phase of the next exon.
2. Conversion from exon phase to CDS phase: 0-->0, 1-->2, 2-->1.

In [23]:
# Exon-intron table: one row per exon/intron of the transcript, with its name,
# chromosome region and region size.
print(f'\nExon-intron map for {transcript_id=}:')
display(gene_obj.exon_intron_map(transcript_id))


Exon-intron map for transcript_id='ENST00000263967':


Unnamed: 0,name,region,region_size
0,Exon 1,"(179148357, 179148603)",247
1,Intron 1,"(179148604, 179198749)",50146
2,Exon 2,"(179198750, 179199177)",428
3,Intron 2,"(179199178, 179199689)",512
4,Exon 3,"(179199690, 179199899)",210
5,Intron 3,"(179199900, 179201289)",1390
6,Exon 4,"(179201290, 179201540)",251
7,Intron 4,"(179201541, 179203543)",2003
8,Exon 5,"(179203544, 179203789)",246
9,Intron 5,"(179203790, 179204502)",713


In [24]:
# Exon table: one row per exon, with its chromosome region, size, mRNA and ORF (NT/AA)
# regions, the frame alignment of the next exon, and the exon ID.
print(f'\nExon map for {transcript_id=}:')
display(gene_obj.exon_map(transcript_id))

# Alternative display that shows the exon_region coordinates with thousands separators:
# if (df := gene_obj.exon_map(transcript_id)) is not None:
#     print(f"Exon map for {gene=}, {transcript_id=}:")
#     display(df.style.format(formatter={"exon_region": lambda x: f'({x[0]:,}, {x[1]:,})'}))


Exon map for transcript_id='ENST00000263967':


Unnamed: 0,exon_number,exon_region,exon_size,mRNA_NT_region,ORF_AA_region,ORF_NT_region,next_exon_frame_alignment,exon_ID
0,1,"(179148357, 179148603)",247,"[1, 247]","[0, 0]","[0, 0]",0,ENSE00001493081
1,2,"(179198750, 179199177)",428,"[248, 675]","[1, 118]","[1, 352]",2,ENSE00001139995
2,3,"(179199690, 179199899)",210,"[676, 885]","[118, 188]","[353, 562]",2,ENSE00000997375
3,4,"(179201290, 179201540)",251,"[886, 1136]","[188, 271]","[563, 813]",0,ENSE00001077693
4,5,"(179203544, 179203789)",246,"[1137, 1382]","[272, 353]","[814, 1059]",0,ENSE00001077692
5,6,"(179204503, 179204588)",86,"[1383, 1468]","[354, 382]","[1060, 1145]",1,ENSE00001077694
6,7,"(179209595, 179209700)",106,"[1469, 1574]","[382, 417]","[1146, 1251]",0,ENSE00001077691
7,8,"(179210186, 179210338)",153,"[1575, 1727]","[418, 468]","[1252, 1404]",0,ENSE00001128470
8,9,"(179210431, 179210565)",135,"[1728, 1862]","[469, 513]","[1405, 1539]",0,ENSE00001128465
9,10,"(179218210, 179218334)",125,"[1863, 1987]","[514, 555]","[1540, 1664]",1,ENSE00001077674


In [None]:
'''
The exon map can be saved to Excel or HTML files.

Both files are written into `report_path` and are named <gene>_<transcript_id>_exon_map.
'''
# Create the report folder (and any missing parents) up front, so that the exports below do
# not depend on the folder already existing. This is a no-op when the folder is already there.
report_path.mkdir(parents=True, exist_ok=True)

gene_obj.exon_map_to_html(transcript_id, report_path / f"{gene}_{transcript_id}_exon_map.html")
gene_obj.exon_map_to_excel(
    transcript_id,
    report_path / f"{gene}_{transcript_id}_exon_map.xlsx",
    # NOTE(review): usr_desc is presumably a user description stored with the Excel report - confirm.
    usr_desc={"General": f"Exon table for {gene}, {transcript_id}", "Source": "gene_annotation_example.ipynb" },
)

In [25]:
"""
Get sequences.

Every sequence getter below takes the transcript ID and the path of the chromosome file.
"""
# Path of the sequence file of the gene's chromosome (reused by the following cells).
chrm_path = str(si.chromosome_path / si.get_chrm_file_name(gene_obj.chrm))

# Transcript sequence. For the transcript used here, the printed length equals its
# chromosome span (TES - TSS + 1), i.e. introns are included.
transcript_seq = gene_obj.seq(transcript_id, chrm_path)
print(f"{transcript_id=} contains {len(transcript_seq):,} bps.")

# RNA sequence
rna_seq = gene_obj.rna(transcript_id, chrm_path)
print(f"\nrna=\n{rna_seq}")

# ORF sequence
orf_seq = gene_obj.ORF(transcript_id, chrm_path)
print(f"\norf=\n{orf_seq}")

# Amino-acid sequence (compared against the Ensembl REST API in a later cell)
aa_seq = gene_obj.AA(transcript_id, chrm_path)
print(f"\nAA=\n{aa_seq}")

# 5'UTR sequence
utr5_seq = gene_obj.UTR5(transcript_id, chrm_path)
print(f"\nutr5=\n{utr5_seq}")

# 3'UTR sequence
utr3_seq = gene_obj.UTR3(transcript_id, chrm_path)
print(f"\nutr3=\n{utr3_seq}")

transcript_id='ENST00000263967' contains 91,737 bps.

rna=
AGTTCCGGTGCCGCCGCTGCGGCCGCTGAGGTGTCGGGCTGCTGCTGCCGCGGCCGCTGGGACTGGGGCTGGGGCCGCCGGCGAGGCAGGGCTCGGGCCCGGCCGGGCAGCTCCGGAGCGGCGGGGGAGAGGGGCCGGGAGGCGGGGGCCGTGCCGCCCGCTCTCCTCTCCCTCGGCGCCGCCGCCGCCGCCCGCGGGGCTGGGACCCGATGCGGTTAGAGCCGCGGAGCCTGGAAGAGCCCCGAGCGTTTCTGCTTTGGGACAACCATACATCTAATTCCTTAAAGTAGTTTTATATGTAAAACTTGCAAAGAATCAGAACAATGCCTCCACGACCATCATCAGGTGAACTGTGGGGCATCCACTTGATGCCCCCAAGAATCCTAGTAGAATGTTTACTACCAAATGGAATGATAGTGACTTTAGAATGCCTCCGTGAGGCTACATTAATAACCATAAAGCATGAACTATTTAAAGAAGCAAGAAAATACCCCCTCCATCAACTTCTTCAAGATGAATCTTCTTACATTTTCGTAAGTGTTACTCAAGAAGCAGAAAGGGAAGAATTTTTTGATGAAACAAGACGACTTTGTGACCTTCGGCTTTTTCAACCCTTTTTAAAAGTAATTGAACCAGTAGGCAACCGTGAAGAAAAGATCCTCAATCGAGAAATTGGTTTTGCTATCGGCATGCCAGTGTGTGAATTTGATATGGTTAAAGATCCAGAAGTACAGGACTTCCGAAGAAATATTCTGAACGTTTGTAAAGAAGCTGTGGATCTTAGGGACCTCAATTCACCTCATAGTAGAGCAATGTATGTCTATCCTCCAAATGTAGAATCTTCACCAGAATTGCCAAAGCACATATATAATAAATTAGATAAAGGGCAAATAATAGTGGTGATCTGGGTAATAGTTTCTCCAAATAATGACAAGCAGAAG

In [26]:
"""Sequences of a specific Exon and of a specific Intron."""
# Each entry is (name, number): name can be either 'Intron' or 'Exon', and number is the
# 1-based Intron/Exon number. The sequence and its label are printed for every entry.
for name, number in (('Exon', 5), ('Intron', 5)):
    n_seq, n_label = gene_obj.exon_intron_seq(name, number, transcript_id, chrm_path)
    print(f"{name}_{number} sequence ({n_label}):\n{n_seq}\n{len(n_seq)} bps.")

Exon_5 sequence (Exon_5:chr3:179203544:179203789):
TATATAAGAAGCTGTATAATGCTTGGGAGGATGCCCAATTTGATGTTGATGGCTAAAGAAAGCCTTTATTCTCAACTGCCAATGGACTGTTTTACAATGCCATCTTATTCCAGACGCATTTCCACAGCTACACCATATATGAATGGAGAAACATCTACAAAATCCCTTTGGGTTATAAATAGTGCACTCAGAATAAAAATTCTTTGTGCAACCTACGTGAATGTAAATATTCGAGACATTGATAAG
246 bps.
Intron_5 sequence (Intron_5:chr3:179203790:179204502):
GTAAAGTCAAATGCTGATGCTTATTATTTTATAGAAATTATTTTAGATAACCTTTTTCTTGCACTATACAGTAATCTGTTGACCTGTAGTATGTTTTCAGATGGTTAGGAGAACATCCAAATCTCCGAATGTAAAAATATATCAAGAATTTTACTTGAGCTTCCATCTACCTTAGCTATTATACAGCTCACAGTCCTTTGTTAATAATTCTAATATTCACAATTCTAGCTCTTAAAATCAAAAGTTTTACAGAATTCGTTTGGCAGAAAGACCTGGGCCAACCTTAAGTGAGGGTTTTTATAATCTTTATTAACCCCACTTAGTATAAAATTCCGGTATCTTATTAAAGAAATATTAATGTCTTTATGAGGTACTGCTTCACCAGCTAAGGAAGTAGTATTTAGTAAGTACGTGTACCAATTTAGCTTTCTAAAATATGGAAAAACTCTGAATTACATACCTCCCTTAAGGGGATTGTGGGCCTATATTTATGTTTTAGTAGTCTGATGTCTCCATTGTTATTAGTGGATGAAGGCAGCAACTAATTTTGGTGAAGACTCTACATCAGTATTAACGTGTTACATATGTGAAAAAAAGGAGAACCAAGCTATATCTGAACAAAAATTCCGTGGTTTTAT

In [27]:
# Optional: compare the AA sequence with the one returned by the Ensembl REST API.
# Note: if you receive a "HTTPError: 503 Server Error: Service Unavailable for url", try to run again.
import ensembl_rest_utils as erst
import translation as tran  # NOTE(review): not referenced in this cell - confirm whether it is still needed

# The protein ID from the local annotation must match the translation ID reported by Ensembl
# before the two AA sequences can be meaningfully compared.
p_id = gene_obj.transcripts_info[transcript_id]['protein_id']
ensb_trans = erst.lookup_id(transcript_id)
ensb_pid = ensb_trans['Translation']['id']
assert ensb_pid == p_id, f"{p_id=} and {ensb_pid=} mismatch !!"
ensb_aa = erst.sequence_endpoint_base(ensb_pid)

# aa_seq comes from the "Get sequences" cell; nothing is reported when it is None.
if aa_seq is not None:
    print("AA agrees with REST AA" if ensb_aa == aa_seq else "AA and REST mismatch !!!!!!")


AA agrees with REST AA


In [28]:
"""
Perform some basic queries.

Query a chromosome position.

The result is printed as a dictionary holding the region that contains the position
(e.g. 'Intron_1'), the position within that region, and the bp at that position.
"""
#chrm_pos = 55_157_663  # query a chromosome position
#chrm_pos = 208_243_604-10 # query a chromosome position
chrm_pos = 179_148_357+800 # query a chromosome position (800 bps downstream of the PIK3CA transcript start)
# -----------------------------------------------------
chrm_info = gene_obj.chrm_pos_info(transcript_id, chrm_pos, chrm_path)
print(chrm_info)

{'region': 'Intron_1', 'region_pos': 554, 'bp': 'C'}


In [29]:
"""
Map a chromosome position to the corresponding RNA position.
"""
#chrm_pos = 55_211_628  # for EGFR
#chrm_pos = 208_236_229  # for IDH1
chrm_pos = 179_199_690  # for PIK3CA
# -----------------------------------------
rna_pos = gene_obj.chrm_pos2rna_pos(transcript_id, chrm_pos)
if rna_pos is not None:
    print(f"{chrm_pos=:,} --> {rna_pos=:,}")
else:
    # None is reported as a position that is not covered by any exon of the transcript
    print(f"{chrm_pos=:,} is outside of the RNA (outside of {transcript_id} exons)")

chrm_pos=179,199,690 --> rna_pos=676


In [30]:
'''
Query a position within an exon.

The result is printed as a dictionary holding the segment that contains the nucleotide
(e.g. 'ORF'), its position in that segment, the nucleotide, and its codon number,
position in the codon, codon and amino-acid.
'''
exon_number = 7  # 1-based exon number
nt_number = 47   # nucleotide 1-based number in the exon 
# -----------------------------------------------------------------------
info = gene_obj.exon_nt_info(transcript_id, exon_number, nt_number, chrm_path)
print(info)

{'segment': 'ORF', 'pos_in_segment': 1192, 'NT': 'C', 'codon_number': 398, 'nt_in_codon': 1, 'codon': 'CGT', 'aa': 'R'}


In [31]:
'''
Query an amino-acid position.

The 'codon_exon_pos' is a list of 3 positions, each in the form <exon number>:<pos>.
pos is the 1-based position relative to the start of exon <exon number>. The corresponding
chromosome position is then exon_start_coordinate+m*(pos-1), where m=1 [m=-1] for genes encoded on
the positive [negative] DNA strand (in genes encoded on the negative strand, 
exon_start_coordinate>exon_end_coordinate).
'''
aa_number = 163  # 1-based AA position in the ORF (other values to try: 1028, 78, 397)
# --------------------------------------------------
aa_info = gene_obj.aa_exon_info(transcript_id, aa_number, chrm_path)
print(aa_info)

{'codon': 'GCA', 'AA': 'A', 'codon_exon_pos': ['Exon_3:135', 'Exon_3:136', 'Exon_3:137'], 'codon_chromosome_pos': [179199824, 179199825, 179199826]}


In [33]:
"""
Converts a DNA mutation to the corresponding AA mutation.

The mutation ID is a `mut_id_sep`-separated string whose fields are named, in order,
by `mut_id_cols`.
"""
#mut_id = 'EGFR:chr7:Missense_Mutation:55152609:G:C'
#mut_id = 'IDH1:chr2:Missense_Mutation:208248455:C:T'
mut_id = 'PIK3CA:chr3:Missense_Mutation:179218306:C:A' #'PIK3CA:chr3:Missense_Mutation:179234297:A:G'

mut_id_cols = ['Gene_name', 'Chromosome', 'Variant_Classification', 
               'Start_Position', 'Reference_Allele', 'Tumor_Seq_Allele2']
mut_id_sep = ':'
# ==================================================================================

# strict=True (Python >= 3.10) raises a ValueError when the number of fields in mut_id does not
# match mut_id_cols, instead of silently dropping the unmatched names/fields.
mut_info = dict(zip(mut_id_cols, mut_id.split(mut_id_sep), strict=True))
ref_a, mut_a, pos = mut_info['Reference_Allele'], mut_info['Tumor_Seq_Allele2'], int(mut_info['Start_Position'])

# reference allele, alternative allele and chromosome position --> AA mutation (e.g. 'Q546K')
aa_mut = gene_obj.DNA_SNP_mut_to_AA_mut(ref_a, mut_a, pos, transcript_id, chrm_path)
print(f"{mut_id=} --> {aa_mut=}")

mut_id='PIK3CA:chr3:Missense_Mutation:179218306:C:A' --> aa_mut='Q546K'


In [34]:
"""
Converts an AA mutation to (all possible) DNA mutations.

`aa_mut` is the AA mutation computed in the previous cell. The result is printed as a
dictionary keyed by the mutated codon, each entry holding the start position, the
reference allele and the alternative allele.
"""
dna_all_muts = gene_obj.AA_mut_to_DNA_SNP_mut(aa_mut, transcript_id, chrm_path) # type: ignore
print(f"{aa_mut=} corresponds to:\n{dna_all_muts}.")

aa_mut='Q546K' corresponds to:
{'AAA': {'start_pos': 179218306, 'reference_allele': 'CAG', 'alternative_allele': 'AAA'}, 'AAG': {'start_pos': 179218306, 'reference_allele': 'C', 'alternative_allele': 'A'}}.


# Example - Specifying GFF3 file or a Corresponding DataFrame
The `Gene_cls` class internally loads the latest GFF3 file into a dataframe. This load takes a few seconds to complete. 
It is possible to specify a GFF3 file, or instead to provide a dataframe of the GFF3 file, when instantiating the annotation class.

For example, in cases where annotation is needed for several genes, it is more efficient to load the GFF3 file into a dataframe once, and then instantiate multiple annotation classes with the dataframe. 

In [16]:
# Load the GFF3 file into a dataframe once, so that it can be shared by several annotation objects.
# NOTE(review): called without arguments, this presumably loads the default GFF3 file - confirm.
gff3_df = gaut.ensembl_gff3_df()

In [17]:
# Now instantiate using the dataframe (much faster): both objects share the already-loaded
# gff3_df, so the GFF3 file is not read again.
g_a1 = gaut.Gene_gff3_cls('EGFR', gff3_df)
g_a1.info()
g_a2 = gaut.Gene_gff3_cls('BRCA1', gff3_df)
g_a2.info()

Gene name=EGFR, Gene ID=ENSG00000146648
Source=Input DataFrame (ID=12364455696, size=(3414011, 9)).
Positive strand=True.
Type=protein_coding, version=21.
Description=epidermal growth factor receptor [Source:HGNC Symbol%3BAcc:HGNC:3236]
Gene region=chr7:55,019,017-55,211,628.
11 transcripts.
Gene name=BRCA1, Gene ID=ENSG00000012048
Source=Input DataFrame (ID=12364455696, size=(3414011, 9)).
Positive strand=False.
Type=protein_coding, version=24.
Description=BRCA1 DNA repair associated [Source:HGNC Symbol%3BAcc:HGNC:1100]
Gene region=chr17:43,044,295-43,170,245.
38 transcripts.


In [35]:
# Another example is to instantiate by specifying your own GFF3 file.
#mygff3_file: pathlib.Path = gaut.GFF3_file  # specify here your Ensembl GFF3 file
mygff3_file: pathlib.Path = pathlib.Path(si.ROOT_path) / 'Data' / 'Ensembl_gene_annotation' / 'Homo_sapiens.GRCh38.108.gff3'
# --------------------------------------------------------------------------------------------------------------------------
# Given a file path, the GFF3 file is loaded into a dataframe on instantiation
# (a "Loading ..." message is printed).
g_af = gaut.Gene_gff3_cls('EGFR', mygff3_file)
g_af.info()


Loading /Users/yoramzarai/work/mystuff/Ramot/Projects/Cancer_mut/Data/Ensembl_gene_annotation/Homo_sapiens.GRCh38.108.gff3 to a dataframe...Done.
Gene name=EGFR, Gene ID=ENSG00000146648
Source=/Users/yoramzarai/work/mystuff/Ramot/Projects/Cancer_mut/Data/Ensembl_gene_annotation/Homo_sapiens.GRCh38.108.gff3
Positive strand=True.
Type=protein_coding, version=21.
Description=epidermal growth factor receptor [Source:HGNC Symbol%3BAcc:HGNC:3236]
Gene region=chr7:55,019,017-55,211,628.
11 transcripts.


In [19]:
# Finally, you can load your own GFF3 file into a dataframe, and instantiate with that dataframe.
# mygff3_file is the GFF3 path defined in the previous example.
mygff3_df = gaut.ensembl_gff3_df(mygff3_file)
g_ad = gaut.Gene_gff3_cls('EGFR', mygff3_df)
g_ad.info()

Gene name=EGFR, Gene ID=ENSG00000146648
Source=Input DataFrame (ID=13471703248, size=(3414011, 9)).
Positive strand=True.
Type=protein_coding, version=21.
Description=epidermal growth factor receptor [Source:HGNC Symbol%3BAcc:HGNC:3236]
Gene region=chr7:55,019,017-55,211,628.
11 transcripts.


# Main Interfaces

In [None]:
# Exon and intron numbers passed to gene_obj.modified_transcript() in the interface summary below.
# NOTE(review): presumably 1-based exon/intron numbers, as elsewhere in this notebook - confirm.
exon_list = [1, 2, 3, 4]
intron_list = [5, 6, 7, 8]


In [None]:
# Summary of the main interfaces demonstrated in this notebook.
# NOTE(review): this cell relies on variables defined in the cells above (gene, transcript_id,
# chrm_path, name, number, exon_list, intron_list, chrm_pos, exon_number, nt_number,
# aa_number, ref_a, mut_a, pos, aa_mut), so it cannot be run on its own.

# import API
import Gene_annotation as gaut

# gene annotation object
gene_obj = gaut.Gene_cls(gene)

# info
gene_obj.info()
gene_obj.transcript_info(transcript_id, verbose=True)

# maps
gene_obj.exon_intron_map(transcript_id)
gene_obj.exon_map(transcript_id)

# sequences
gene_obj.rna(transcript_id, chrm_path)
gene_obj.ORF(transcript_id, chrm_path)
gene_obj.AA(transcript_id, chrm_path)
gene_obj.UTR5(transcript_id, chrm_path)
gene_obj.UTR3(transcript_id, chrm_path)
gene_obj.exon_intron_seq(name, number, transcript_id, chrm_path)
# NOTE(review): modified_transcript is not demonstrated elsewhere in this notebook - presumably
# builds a transcript from the given exon/intron numbers; confirm against Gene_annotation.py.
gene_obj.modified_transcript(exon_list, intron_list, transcript_id, chrm_path)

# query
gene_obj.chrm_pos_info(transcript_id, chrm_pos, chrm_path)
gene_obj.exon_nt_info(transcript_id, exon_number, nt_number, chrm_path)
gene_obj.aa_exon_info(transcript_id, aa_number, chrm_path)
gene_obj.DNA_SNP_mut_to_AA_mut(ref_a, mut_a, pos, transcript_id, chrm_path)
gene_obj.AA_mut_to_DNA_SNP_mut(aa_mut, transcript_id, chrm_path)                          # type: ignore