# Gene Annotation Usage Example
This notebook provides gene annotation examples, using the `Gene_cls` or `Gene_gff3_cls` classes from `Gene_annotation.py`.

The underlying data is the Ensembl GFF3 gene annotation file for Homo sapiens, available from [Ensembl](https://useast.ensembl.org/Homo_sapiens/Info/Index).

Requirements: Python >= 3.10, Pandas, and Numpy. In addition, add 

`export PYTHONPATH="$PYTHONPATH:/tamir2/yoramzar/Projects/Cancer_mut/Utils"` 

to your `.bashrc` (adjust the path to the location of the `Utils` folder on your system).

In [2]:
'''
Global configuration: imports and the folder in which reports are created.
'''
import pathlib
import pandas as pd

# make sure PYTHONPATH is set, containing the Utils folder.
import mysys_init as si

# import API
import Gene_annotation as gaut

# folder in which the reports are created
report_path = pathlib.Path(si.ROOT_path, 'Ensembl_gene_anot', 'Reports')

# Example - by Gene Name

We use the gene name to instantiate a `Gene_cls` object, which provides the gene's annotation.


In [8]:
'''
Select a gene and a corresponding transcript, and instantiate the gene annotation object.

Exactly one `gene, transcript_id = ...` line should be active; the commented-out lines
are alternative selections. Some of the alternatives pass an Ensembl gene ID instead of a gene name.
'''
# NOTE(review): the saved outputs of the cells that follow show gene='FLT3' and
# transcript_id='ENST00000241453', i.e. they were produced with a different selection
# than the active one below. Re-run the notebook (Restart Kernel -> Run All) to refresh them.
#gene, transcript_id = 'FLT3', 'ENST00000241453'

gene, transcript_id = 'EGFR', 'ENST00000275493'
#gene, transcript_id = 'IDH1', 'ENST00000345146'
#gene, transcript_id = 'PIK3CA', 'ENST00000263967'

#gene, transcript_id = 'ENSG00000290826', 'ENST00000642116'  # example of an ncRNA gene

#gene, transcript_id = 'PIK3C2A', 'ENST00000533645'
#gene, transcript_id = 'KLF16', 'ENST00000250916'

#gene, transcript_id = 'ENSG00000290149', 'ENST00000476620'
# ---------------------------------------------------------

# instantiate the annotation object for the selected gene (the saved output shows that this loads the GFF3 file)
gene_obj = gaut.Gene_cls(gene)
# alternative instantiation passing verbose=False:
#gene_obj = gaut.Gene_cls(gene, verbose=False)

Loading /Users/yoramzarai/work/mystuff/Ramot/Projects/Cancer_mut/Data/Ensembl_gene_annotation/Homo_sapiens.GRCh38.111.gff3.gz to a dataframe...Done.


In [4]:
"""
General information about the selected gene and transcript.
"""
print(f"\n{gene=} information:")
gene_obj.info()

# number of transcripts and their IDs
print(f"\n{gene} contains {len(gene_obj)} transcripts.")
gene_obj.show_all_transcript_IDs()

print(f"\nGene start = {gene_obj.gene_start:,}, gene end = {gene_obj.gene_end:,}")

# boundaries of the selected transcript
ts, te = gene_obj.transcript_boundaries(transcript_id)
print(f"\n{transcript_id=}: start site (TSS) = {ts:,}, end site (TES) = {te:,}.")

# start and stop codon positions (nothing is printed when they are not available)
codons_info = gene_obj.start_and_stop_codons_pos(transcript_id)
if codons_info is not None:
    print(f"\nFirst bp of the Start codon located in chr{gene_obj.chrm}:{codons_info[0][0]:,}, and is bp #{codons_info[0][1]} in the RNA.")
    print(f"First bp of the Stop codon located in chr{gene_obj.chrm}:{codons_info[1][0]:,}, and is bp #{codons_info[1][1]} in the RNA.")


gene='FLT3' information:
Gene name=FLT3, Gene ID=ENSG00000122025
Source=/Users/yoramzarai/work/mystuff/Ramot/Projects/Cancer_mut/Data/Ensembl_gene_annotation/Homo_sapiens.GRCh38.111.gff3.gz
Positive strand=False.
Type=protein_coding, version=15.
Description=fms related receptor tyrosine kinase 3 [Source:HGNC Symbol%3BAcc:HGNC:3765]
Gene region=chr13:28,003,274-28,100,592.
2 transcripts.

FLT3 contains 2 transcripts.
FLT3 transcript IDs:
1. ENST00000241453
2. ENST00000380987

Gene start = 28,003,274, gene end = 28,100,592

transcript_id='ENST00000241453': start site (TSS) = 28,100,576, end site (TES) = 28,003,274.

First bp of the Start codon located in chr13:28,100,510, and is bp #67 in the RNA.
First bp of the Stop codon located in chr13:28,004,054, and is bp #3046 in the RNA.


In [5]:
# Print detailed information about the selected transcript.
# In the saved output this includes the transcript name, ID, type, boundaries,
# ORF offsets/positions and the exon regions with their phases.
print(f"Detailed {transcript_id=} information:")
gene_obj.transcript_info(transcript_id, verbose=True)

Detailed transcript_id='ENST00000241453' information:
ENST00000241453 information:
Name=FLT3-201, ID=ENST00000241453.12, version=12
Type=mRNA, biotype=protein_coding
Start=28,100,576, end=28,003,274
protein ID = ENSP00000241453
ccdsid=CCDS31953.1
ORF start offset = 66, ORF end offset = 3,047 (0-based offsets from the beginning of the RNA)
ORF start chromosome position = 28,100,510, ORF end chromosome position = 28,004,052
Exon regions:
	1. 28,100,576 - 28,100,468, ID=ENSE00003896396, phase=-1, end phase=1
	2. 28,070,612 - 28,070,491, ID=ENSE00000906615, phase=1, end phase=0
	3. 28,062,069 - 28,061,867, ID=ENSE00000906614, phase=0, end phase=2
	4. 28,057,462 - 28,057,347, ID=ENSE00000906613, phase=2, end phase=1
	5. 28,052,674 - 28,052,545, ID=ENSE00000906612, phase=1, end phase=2
	6. 28,050,222 - 28,050,095, ID=ENSE00000906611, phase=2, end phase=1
	7. 28,049,774 - 28,049,635, ID=ENSE00000906610, phase=1, end phase=0
	8. 28,049,537 - 28,049,384, ID=ENSE00000906609, phase=0, end phase=1

For an explanation of exon phases, see [this post](https://blog.gene-test.com/coding-phases-frames/). In short:
1. The end phase of an exon is the phase of the next exon.
2. Conversion from exon phase to CDS phase: 0-->0, 1-->2, 2-->1.

In [6]:
# Exon-intron table of the selected transcript: in the saved output, one row per
# exon/intron with its name, region (start, end) and region size.
print(f'\nExon-intron map for {transcript_id=}:')
display(gene_obj.exon_intron_map(transcript_id))


Exon-intron map for transcript_id='ENST00000241453':


Unnamed: 0,name,region,region_size
0,Exon 1,"(28100576, 28100468)",109
1,Intron 1,"(28100467, 28070613)",29855
2,Exon 2,"(28070612, 28070491)",122
3,Intron 2,"(28070490, 28062070)",8421
4,Exon 3,"(28062069, 28061867)",203
5,Intron 3,"(28061866, 28057463)",4404
6,Exon 4,"(28057462, 28057347)",116
7,Intron 4,"(28057346, 28052675)",4672
8,Exon 5,"(28052674, 28052545)",130
9,Intron 5,"(28052544, 28050223)",2322


In [7]:
# Exon table of the selected transcript: in the saved output, one row per exon with its
# number, region, size, mRNA/ORF nucleotide regions, ORF amino-acid region and exon ID.
print(f'\nExon map for {transcript_id=}:')
display(gene_obj.exon_map(transcript_id))

# Alternative display that formats the exon_region coordinates with thousands separators
# and skips the display when no exon map is returned:
# if (df := gene_obj.exon_map(transcript_id)) is not None:
#     print(f"Exon map for {gene=}, {transcript_id=}:")
#     display(df.style.format(formatter={"exon_region": lambda x: f'({x[0]:,}, {x[1]:,})'}))


Exon map for transcript_id='ENST00000241453':


Unnamed: 0,exon_number,exon_region,exon_size,mRNA_NT_region,ORF_NT_region,ORF_AA_region,next_exon_frame_alignment,exon_ID
0,1,"(28100576, 28100468)",109,"[1, 109]","[1, 43]","[1, 15]",2,ENSE00003896396
1,2,"(28070612, 28070491)",122,"[110, 231]","[44, 165]","[15, 55]",0,ENSE00000906615
2,3,"(28062069, 28061867)",203,"[232, 434]","[166, 368]","[56, 123]",1,ENSE00000906614
3,4,"(28057462, 28057347)",116,"[435, 550]","[369, 484]","[123, 162]",2,ENSE00000906613
4,5,"(28052674, 28052545)",130,"[551, 680]","[485, 614]","[162, 205]",1,ENSE00000906612
5,6,"(28050222, 28050095)",128,"[681, 808]","[615, 742]","[205, 248]",2,ENSE00000906611
6,7,"(28049774, 28049635)",140,"[809, 948]","[743, 882]","[248, 294]",0,ENSE00000906610
7,8,"(28049537, 28049384)",154,"[949, 1102]","[883, 1036]","[295, 346]",2,ENSE00000906609
8,9,"(28048443, 28048275)",169,"[1103, 1271]","[1037, 1205]","[346, 402]",1,ENSE00000906608
9,10,"(28037288, 28037185)",104,"[1272, 1375]","[1206, 1309]","[402, 437]",2,ENSE00000827401


In [None]:
'''
The exon map can be saved to HTML, Excel, or CSV files.
'''
# Make sure the report folder exists before writing into it (no-op if it already exists).
report_path.mkdir(parents=True, exist_ok=True)

# common file-name stem of the three report files
exon_map_stem = f"{gene}_{transcript_id}_exon_map"

gene_obj.exon_map_to_html(transcript_id, report_path / f"{exon_map_stem}.html")
gene_obj.exon_map_to_excel(
    transcript_id,
    report_path / f"{exon_map_stem}.xlsx",
    usr_desc={"General": f"Exon table for {gene}, {transcript_id}", "Source": "gene_annotation_example.ipynb"},
)
gene_obj.exon_map_to_csv(transcript_id, report_path / f"{exon_map_stem}.csv")

In [None]:
"""
Get sequences of the selected transcript: the transcript sequence, RNA, ORF, AA, UTR5 and UTR3.
"""
# path of the chromosome file of the gene's chromosome; it is passed to every sequence method below
chrm_path = str(si.chromosome_path / si.get_chrm_file_name(gene_obj.chrm))

# whole transcript sequence (only its length is printed)
transcript_seq = gene_obj.seq(transcript_id, chrm_path).upper()
print(f"{transcript_id=} contains {len(transcript_seq):,} bps.")

rna_seq = gene_obj.rna(transcript_id, chrm_path).upper()
print(f"\nrna=\n{rna_seq}")

orf_seq = gene_obj.ORF(transcript_id, chrm_path).upper()
print(f"\norf=\n{orf_seq}")

# NOTE(review): a later cell checks `aa_seq is not None`, so AA() presumably may return None
# (e.g. for a transcript without an ORF) — TODO confirm.
aa_seq = gene_obj.AA(transcript_id, chrm_path)
print(f"\nAA=\n{aa_seq}")

utr5_seq = gene_obj.UTR5(transcript_id, chrm_path).upper()
print(f"\nutr5=\n{utr5_seq}")

utr3_seq = gene_obj.UTR3(transcript_id, chrm_path).upper()
print(f"\nutr3=\n{utr3_seq}")

In [None]:
"""Sequence of a specific Exon, followed by the sequence of a specific Intron."""
# Each entry is (name, number): name can be either 'Intron' or 'Exon', and number is the
# 1-based Intron/Exon number.
for name, number in (('Exon', 5), ('Intron', 5)):
    n_seq, n_label = gene_obj.exon_intron_seq(name, number, transcript_id, chrm_path)
    print(f"{name}_{number} sequence ({n_label}):\n{n_seq}\n{len(n_seq)} bps.")

In [None]:
# Optional: compare the AA sequence with the protein sequence returned by the Ensembl REST API.
# Note: if you receive a "HTTPError: 503 Server Error: Service Unavailable for url", try to run again.
import ensembl_rest_utils as erut
ensb_aa = erut.sequence_endpoint_base(transcript_id, seq_type='protein')
# the comparison is reported only when an AA sequence is available
if aa_seq is not None:
    print("AA agrees with REST AA" if ensb_aa == aa_seq else "AA and REST mismatch !!!!!!")

In [None]:
"""
Perform some basic queries.

Query a chromosome position: prints the information returned for `chrm_pos`
in the context of the selected transcript.
"""
chrm_pos = 55_157_663  # query a chromosome position
# alternative positions (presumably for other genes from the selection cell — TODO confirm):
#chrm_pos = 208_243_604-10 # query a chromosome position
#chrm_pos = 179_148_357+800 # query a chromosome position
# -----------------------------------------------------
chrm_info = gene_obj.chrm_pos_info(transcript_id, chrm_pos, chrm_path)
print(chrm_info)

In [None]:
"""
Map a chromosome position to the corresponding RNA position.
"""
chrm_pos = 55_211_628  # for EGFR
#chrm_pos = 208_236_229  # for IDH1
#chrm_pos = 179_199_690  # for PIK3CA
# -----------------------------------------
rna_pos = gene_obj.chrm_pos2rna_pos(transcript_id, chrm_pos)
if rna_pos is not None:
    print(f"{chrm_pos=:,} --> {rna_pos=:,}")
else:
    # no RNA position was returned for this chromosome position
    print(f"{chrm_pos=:,} is outside of the RNA (outside of {transcript_id} exons)")

In [None]:
"""
RNA position to chromosome position.
"""
# RNA position to query (this overrides the rna_pos computed in the previous cell)
rna_pos: int = 685
# ================
chrm_p = gene_obj.rna_pos2chrm_pos(transcript_id, rna_pos, chrm_path)
print(f"{rna_pos=} --> {chrm_p=}")

In [None]:
"""
RNA position to chromosome information.
"""
# RNA position to query
rna_pos: int = 685
# ================
chrm_info = gene_obj.rna_pos2chrm_info(transcript_id, rna_pos, chrm_path)
print(chrm_info)

In [None]:
'''
Query a position within an exon: prints the information returned for nucleotide
`nt_number` of exon `exon_number` in the selected transcript.
'''
exon_number = 7  # 1-based exon number
nt_number = 47   # nucleotide 1-based number in the exon 
# -----------------------------------------------------------------------
info = gene_obj.exon_nt_info(transcript_id, exon_number, nt_number, chrm_path)
print(info)

In [None]:
'''
Query an amino-acid position.

The 'codon_exon_pos' is a list of 3 positions, each in the form <exon number>:<pos>.
pos is the 1-based position relative to the start of exon <exon number>. The corresponding
chromosome position is then exon_start_coordinate+m*(pos-1), where m=1 [m=-1] for genes encoded on
the positive [negative] DNA strand (in genes encoded on the negative strand, 
exon_start_coordinate>exon_end_coordinate).
'''
aa_number = 163  # 1-based AA position in the ORF (other values tried: 1028, 78, 397)
# --------------------------------------------------
aa_info = gene_obj.aa_exon_info(transcript_id, aa_number, chrm_path)
print(aa_info)

In [None]:
"""
Converts a DNA mutation to the corresponding AA mutation.

The mutation is given as an ID string whose fields are separated by `mut_id_sep`
and named, in order, by `mut_id_cols`.
"""
mut_id = 'EGFR:chr7:Missense_Mutation:55152609:G:C'
#mut_id = 'IDH1:chr2:Missense_Mutation:208248455:C:T'
#mut_id = 'PIK3CA:chr3:Missense_Mutation:179218306:C:A' #'PIK3CA:chr3:Missense_Mutation:179234297:A:G'

mut_id_cols = ['Gene_name', 'Chromosome', 'Variant_Classification', 
               'Start_Position', 'Reference_Allele', 'Tumor_Seq_Allele2']
mut_id_sep = ':'
# ==================================================================================

# strict=True (Python >= 3.10): a malformed ID with a wrong number of fields raises a
# ValueError here instead of being silently truncated by zip.
mut_info = dict(zip(mut_id_cols, mut_id.split(mut_id_sep), strict=True))
ref_a, mut_a, pos = mut_info['Reference_Allele'], mut_info['Tumor_Seq_Allele2'], int(mut_info['Start_Position'])

aa_mut = gene_obj.DNA_SNP_mut_to_AA_mut(ref_a, mut_a, pos, transcript_id, chrm_path)
print(f"{mut_id=} --> {aa_mut=}")

In [None]:
"""
Converts an AA mutation to (all possible) DNA mutations.

Uses `aa_mut` computed in the previous cell.
"""
dna_all_muts = gene_obj.AA_mut_to_DNA_SNP_mut(aa_mut, transcript_id, chrm_path) # type: ignore
print(f"{aa_mut=} corresponds to:\n{dna_all_muts}.")

# Example - Specifying GFF3 file or a Corresponding DataFrame
The `Gene_cls` class internally loads the latest GFF3 file into a dataframe. This load takes a few seconds to complete.
It is possible to specify a GFF3 file, or instead to provide a dataframe of the GFF3 file, when instantiating the annotation class.

For example, in cases where annotation is needed for multiple genes, it is more efficient to load the GFF3 file into a dataframe once, and then instantiate multiple annotation classes with the dataframe. Below are some examples.

In [None]:
# Load the GFF3 file into a dataframe. The call returns a tuple containing that dataframe
# and a subset of it that holds only the rows whose Type value is a gene type.
gff3_dfs = gaut.ensembl_gff3_df()  # analyzing genes, ncRNA_gene and pseudogene
# gff3_dfs = gaut.ensembl_gff3_df(gene_type_values=['gene'])  # analyzing only genes (excluding ncRNA_gene and pseudogene). This slightly reduces processing times.

In [None]:
# Now instantiate using the already-loaded dataframes (much faster than reloading the
# GFF3 file for every gene); the same dataframes are reused for both genes.
g_a1 = gaut.Gene_cls('EGFR', gff3_dfs)
g_a1.info()
g_a2 = gaut.Gene_cls('BRCA1', gff3_dfs)
g_a2.info()

In [None]:
# Another option: the same instantiation from the loaded dataframes, using Gene_gff3_cls
g_a1 = gaut.Gene_gff3_cls('EGFR', gff3_dfs)
g_a1.info()
g_a2 = gaut.Gene_gff3_cls('BRCA1', gff3_dfs)
g_a2.info()

In [None]:
# Another example: instantiate by specifying your own GFF3 file
# (here a GRCh38.110 file located under the project's Data folder).
mygff3_file: pathlib.Path = pathlib.Path(si.ROOT_path) / 'Data' / 'Ensembl_gene_annotation' / 'Homo_sapiens.GRCh38.110.gff3.gz'
# -----------------------------------------------------------------------------------------------------------------------------
g_af = gaut.Gene_cls('EGFR', mygff3_file)
g_af.info()

In [None]:
# Another option: the same instantiation from a GFF3 file path, using Gene_gff3_cls
g_af = gaut.Gene_gff3_cls('EGFR', mygff3_file)
g_af.info()

In [None]:
# Finally, your own GFF3 file can be loaded into a dataframe once, and the annotation
# class instantiated from that dataframe (and its gene subset dataframe).
print("Loading GFF3 file to a dataframe...")
mygff3_dfs = gaut.ensembl_gff3_df(mygff3_file)
print("Instantiating...")
g_ad = gaut.Gene_cls('EGFR', mygff3_dfs)
g_ad.info()

In [None]:
# Another option: load your GFF3 file into dataframes and instantiate Gene_gff3_cls from them
mygff3_dfs = gaut.ensembl_gff3_df(mygff3_file)
g_ad = gaut.Gene_gff3_cls('EGFR', mygff3_dfs)
g_ad.info()

# Main Interfaces

In [None]:
# lists passed to modified_transcript() in the interface summary below
exon_list = [1, 2, 3, 4]
intron_list = [5, 6, 7, 8]

In [None]:
# Summary of the main Gene_cls interfaces.
# This cell relies on variables defined in earlier cells: gene, transcript_id, chrm_path,
# name, number, exon_list, intron_list, chrm_pos, rna_pos, exon_number, nt_number,
# aa_number, ref_a, mut_a, pos and aa_mut.

# import API
import Gene_annotation as gaut

# gene annotation object
gene_obj = gaut.Gene_cls(gene)

# info
gene_obj.info()
gene_obj.transcript_info(transcript_id, verbose=True)

# maps
gene_obj.exon_intron_map(transcript_id)
gene_obj.exon_map(transcript_id)

# sequences
gene_obj.seq(transcript_id, chrm_path)
gene_obj.rna(transcript_id, chrm_path)
gene_obj.ORF(transcript_id, chrm_path)
gene_obj.AA(transcript_id, chrm_path)
gene_obj.UTR5(transcript_id, chrm_path)
gene_obj.UTR3(transcript_id, chrm_path)
gene_obj.exon_intron_seq(name, number, transcript_id, chrm_path)
gene_obj.modified_transcript(exon_list, intron_list, transcript_id, chrm_path)

# query
gene_obj.chrm_pos_info(transcript_id, chrm_pos, chrm_path)
gene_obj.rna_pos2chrm_pos(transcript_id, rna_pos, chrm_path)
gene_obj.rna_pos2chrm_info(transcript_id, rna_pos, chrm_path)
gene_obj.exon_nt_info(transcript_id, exon_number, nt_number, chrm_path)
gene_obj.aa_exon_info(transcript_id, aa_number, chrm_path)
gene_obj.DNA_SNP_mut_to_AA_mut(ref_a, mut_a, pos, transcript_id, chrm_path)
gene_obj.AA_mut_to_DNA_SNP_mut(aa_mut, transcript_id, chrm_path)                          # type: ignore