# Gene Annotation Usage Example
This notebook provides gene annotation examples, using the `Gene_cls` or `Gene_gff3_cls` classes from `Gene_annotation.py`.

The underlying data is the Ensembl GFF3 gene annotation file for *Homo sapiens*, available from [Ensembl](https://useast.ensembl.org/Homo_sapiens/Info/Index).

Requirements: Python >= 3.10, Pandas, and Numpy. In addition, add 

`export PYTHONPATH="$PYTHONPATH:/tamir2/yoramzar/Projects/Cancer_mut/Utils"` 

to your `.bashrc`.

In [1]:
'''
Global configuration: imports and the report output folder.
'''
import pathlib

# Project-local module; importable only when PYTHONPATH contains the Utils folder.
import mysys_init as si

# Gene annotation API (Gene_cls, Gene_gff3_cls and ensembl_gff3_df are used below).
import Gene_annotation as gaut

# Folder into which the exon-map reports (HTML / Excel / CSV) are written.
report_path = pathlib.Path(si.ROOT_path) / 'Ensembl_gene_anot' / 'Reports'

# Example - by Gene Name

We use the gene name (or the Ensembl gene ID) to instantiate a `Gene_cls` object, which provides the gene's annotation.


In [2]:
'''
Select a gene and one of its transcripts.

`gene` can be either a gene name (e.g. 'PIK3CA') or a gene ID (e.g. 'ENSG00000138641').
Keep exactly one of the (gene, transcript_id) pairs below uncommented.
'''
#gene, transcript_id = 'FLT3', 'ENST00000241453'

#gene, transcript_id = 'EGFR', 'ENST00000275493'
#gene, transcript_id = 'ENSG00000146648', 'ENST00000275493'  # same gene (EGFR), selected by its gene ID
gene, transcript_id = 'IDH1', 'ENST00000345146'
#gene, transcript_id = 'PIK3CA', 'ENST00000263967'

#gene, transcript_id = 'ENSG00000290826', 'ENST00000642116'  # example of an ncRNA gene

#gene, transcript_id = 'PIK3C2A', 'ENST00000533645'
#gene, transcript_id = 'KLF16', 'ENST00000250916'

#gene, transcript_id = 'ENSG00000290149', 'ENST00000476620'
# ---------------------------------------------------------

# With no GFF3 argument, the annotation file is loaded internally (see the "Loading ..." message in the output).
gene_obj = gaut.Gene_cls(gene)
# NOTE(review): verbose=False presumably suppresses the loading message - confirm in Gene_annotation.py
#gene_obj = gaut.Gene_cls(gene, verbose=False)

Loading /Users/yoramzarai/work/mystuff/Ramot/Projects/Cancer_mut/Data/Ensembl_gene_annotation/Homo_sapiens.GRCh38.113.gff3.gz to a dataframe...Done.


In [3]:
"""
General information
"""
print(f"\n{gene=} information:")
gene_obj.info()

# len() of the annotation object is its number of transcripts
n_transcripts = len(gene_obj)
print(f"\n{gene} contains {n_transcripts} transcripts.")
gene_obj.show_all_transcript_IDs()

gene_start, gene_end = gene_obj.gene_start, gene_obj.gene_end
print(f"\nGene start = {gene_start:,}, gene end = {gene_end:,}")

tss, tes = gene_obj.transcript_boundaries(transcript_id)
print(f"\n{transcript_id=}: start site (TSS) = {tss:,}, end site (TES) = {tes:,}.")

# Start and stop codon positions. Each entry is (chromosome position, RNA position) of the
# first bp of the codon; when None is returned nothing is printed.
codons_info = gene_obj.start_and_stop_codons_pos(transcript_id)
if codons_info is not None:
    (start_chrm_pos, start_rna_pos), (stop_chrm_pos, stop_rna_pos) = codons_info
    chrm = gene_obj.chrm
    print(f"\nFirst bp of the Start codon located in chr{chrm}:{start_chrm_pos:,}, and is bp #{start_rna_pos} in the RNA.")
    print(f"First bp of the Stop codon located in chr{chrm}:{stop_chrm_pos:,}, and is bp #{stop_rna_pos} in the RNA.")


gene='IDH1' information:
Gene=IDH1, Gene name=IDH1, Gene ID=ENSG00000138413
Source=/Users/yoramzarai/work/mystuff/Ramot/Projects/Cancer_mut/Data/Ensembl_gene_annotation/Homo_sapiens.GRCh38.113.gff3.gz
Positive strand=False.
Type=protein_coding, version=14.
Description=isocitrate dehydrogenase (NADP(+)) 1 [Source:HGNC Symbol%3BAcc:HGNC:5382]
Gene region=chr2:208,236,229-208,266,074.
9 transcripts.

IDH1 contains 9 transcripts.
IDH1 transcript IDs:
1. ENST00000345146
2. ENST00000446179
3. ENST00000415913
4. ENST00000484575
5. ENST00000415282
6. ENST00000462386
7. ENST00000417583
8. ENST00000451391
9. ENST00000481557

Gene start = 208,236,229, gene end = 208,266,074

transcript_id='ENST00000345146': start site (TSS) = 208,255,071, end site (TES) = 208,236,229.

First bp of the Start codon located in chr2:208,251,551, and is bp #224 in the RNA.
First bp of the Stop codon located in chr2:208,237,081, and is bp #1466 in the RNA.


In [4]:
# Print detailed information about the selected transcript
# (in the recorded output: name, type, boundaries, ORF offsets and exon regions).
print(f"Detailed {transcript_id=} information:")
gene_obj.transcript_info(transcript_id, verbose=True)

Detailed transcript_id='ENST00000345146' information:
ENST00000345146 information:
Name=IDH1-201, ID=ENST00000345146.7, version=7
Type=mRNA, biotype=protein_coding
Start=208,255,071, end=208,236,229
protein ID = ENSP00000260985
ccdsid=CCDS2381.1
ORF start offset = 223, ORF end offset = 1,467 (0-based offsets from the beginning of the RNA)
ORF start chromosome position = 208,251,551, ORF end chromosome position = 208,237,079
Exon regions:
	1. 208,255,071 - 208,254,939, ID=ENSE00001908890, phase=-1, end phase=-1
	2. 208,253,959 - 208,253,886, ID=ENSE00001357891, phase=-1, end phase=-1
	3. 208,251,567 - 208,251,430, ID=ENSE00003564564, phase=-1, end phase=2
	4. 208,248,660 - 208,248,369, ID=ENSE00000934687, phase=2, end phase=0
	5. 208,245,424 - 208,245,319, ID=ENSE00002416886, phase=0, end phase=1
	6. 208,243,604 - 208,243,427, ID=ENSE00002424476, phase=1, end phase=2
	7. 208,242,145 - 208,241,994, ID=ENSE00000934684, phase=2, end phase=1
	8. 208,240,003 - 208,239,863, ID=ENSE00000934683

For background on exon phases, see [this post](https://blog.gene-test.com/coding-phases-frames/). In short:
1. The end phase of an exon is the phase of the next exon.
2. Conversion from exon phase to CDS phase: 0-->0, 1-->2, 2-->1.

In [5]:
# Exon-intron table: one row per exon / intron, with its name, region and size
print(f'\nExon-intron map for {transcript_id=}:')
exon_intron_df = gene_obj.exon_intron_map(transcript_id)
display(exon_intron_df)


Exon-intron map for transcript_id='ENST00000345146':


Unnamed: 0,name,region,region_size
0,Exon 1,"(208255071, 208254939)",133
1,Intron 1,"(208254938, 208253960)",979
2,Exon 2,"(208253959, 208253886)",74
3,Intron 2,"(208253885, 208251568)",2318
4,Exon 3,"(208251567, 208251430)",138
5,Intron 3,"(208251429, 208248661)",2769
6,Exon 4,"(208248660, 208248369)",292
7,Intron 4,"(208248368, 208245425)",2944
8,Exon 5,"(208245424, 208245319)",106
9,Intron 5,"(208245318, 208243605)",1714


In [6]:
# Exon table: one row per exon, with its region, size and its RNA / ORF / AA ranges
print(f'\nExon map for {transcript_id=}:')
exon_df = gene_obj.exon_map(transcript_id)
display(exon_df)

# Alternative display, with thousands separators in the exon regions:
# if (df := gene_obj.exon_map(transcript_id)) is not None:
#     print(f"Exon map for {gene=}, {transcript_id=}:")
#     display(df.style.format(formatter={"exon_region": lambda x: f'({x[0]:,}, {x[1]:,})'}))


Exon map for transcript_id='ENST00000345146':


Unnamed: 0,exon_number,exon_region,exon_size,mRNA_NT_region,ORF_NT_region,ORF_AA_region,next_exon_frame_alignment,exon_ID
0,1,"(208255071, 208254939)",133,"[1, 133]","[0, 0]","[0, 0]",0,ENSE00001908890
1,2,"(208253959, 208253886)",74,"[134, 207]","[0, 0]","[0, 0]",0,ENSE00001357891
2,3,"(208251567, 208251430)",138,"[208, 345]","[1, 122]","[1, 41]",1,ENSE00003564564
3,4,"(208248660, 208248369)",292,"[346, 637]","[123, 414]","[41, 138]",0,ENSE00000934687
4,5,"(208245424, 208245319)",106,"[638, 743]","[415, 520]","[139, 174]",2,ENSE00002416886
5,6,"(208243604, 208243427)",178,"[744, 921]","[521, 698]","[174, 233]",1,ENSE00002424476
6,7,"(208242145, 208241994)",152,"[922, 1073]","[699, 850]","[233, 284]",2,ENSE00000934684
7,8,"(208240003, 208239863)",141,"[1074, 1214]","[851, 991]","[284, 331]",2,ENSE00000934683
8,9,"(208239233, 208239071)",163,"[1215, 1377]","[992, 1154]","[331, 385]",1,ENSE00003511233
9,10,"(208237169, 208236229)",941,"[1378, 2318]","[1155, 1245]","[385, 414]",0,ENSE00000934681


In [None]:
'''
The exon map can be saved to HTML, Excel, or CSV files.
'''
# Create the report folder (and any missing parents) first: nothing else in this notebook
# creates it, so the writers below could otherwise fail on a fresh checkout.
report_path.mkdir(parents=True, exist_ok=True)

# All three reports share the same base file name and differ only in their extension.
exon_map_base = f"{gene}_{transcript_id}_exon_map"

gene_obj.exon_map_to_html(transcript_id, str(report_path / f"{exon_map_base}.html"))
gene_obj.exon_map_to_excel(
    transcript_id,
    str(report_path / f"{exon_map_base}.xlsx"),
    # NOTE(review): presumably written to a description sheet placed at desc_sheet_pos - confirm in Gene_annotation.py
    usr_desc={"General": f"Exon table for {gene}, {transcript_id}", "Source": "gene_annotation_example.ipynb"},
    desc_sheet_pos=0,
)
gene_obj.exon_map_to_csv(transcript_id, str(report_path / f"{exon_map_base}.csv"))

In [7]:
"""
Get sequences
"""
# Path of the sequence file of the chromosome that holds the gene
chrm_file_name = si.get_chrm_file_name(gene_obj.chrm)
chrm_path = str(si.chromosome_path / chrm_file_name)


def _show_seq(label, seq):
    """Print a blank line, then '<label>=', then `seq` on its own line."""
    print(f"\n{label}=\n{seq}")


# Full transcript sequence: only its length is reported
transcript_seq = gene_obj.seq(transcript_id, chrm_path).upper()
print(f"{transcript_id=} contains {len(transcript_seq):,} bps.")

rna_seq = gene_obj.rna(transcript_id, chrm_path).upper()
_show_seq("rna", rna_seq)

orf_seq = gene_obj.ORF(transcript_id, chrm_path).upper()
_show_seq("orf", orf_seq)

# The AA sequence is kept exactly as returned (no upper-casing); a later cell compares it with Ensembl's
aa_seq = gene_obj.AA(transcript_id, chrm_path)
_show_seq("AA", aa_seq)

utr5_seq = gene_obj.UTR5(transcript_id, chrm_path).upper()
_show_seq("utr5", utr5_seq)

utr3_seq = gene_obj.UTR3(transcript_id, chrm_path).upper()
_show_seq("utr3", utr3_seq)

transcript_id='ENST00000345146' contains 18,843 bps.

rna=
GGGTTTCTGCAGAGTCTACTTCAGAAGCGGAGGCACTGGGAGTCCGGTTTGGGATTGCCAGGCTGTGGTTGTGAGTCTGAGCTTGTGAGCGGCTGTGGCGCCCCAACTCTTCGCCAGCATATCATCCCGGCAGGCGATAAACTACATTCAGTTGAGTCTGCAAGACTGGGAGGAACTGGGGTGATAAGAAATCTATTCACTGTCAAGGTTTATTGAAGTCAAAATGTCCAAAAAAATCAGTGGCGGTTCTGTGGTAGAGATGCAAGGAGATGAAATGACACGAATCATTTGGGAATTGATTAAAGAGAAACTCATTTTTCCCTACGTGGAATTGGATCTACATAGCTATGATTTAGGCATAGAGAATCGTGATGCCACCAACGACCAAGTCACCAAGGATGCTGCAGAAGCTATAAAGAAGCATAATGTTGGCGTCAAATGTGCCACTATCACTCCTGATGAGAAGAGGGTTGAGGAGTTCAAGTTGAAACAAATGTGGAAATCACCAAATGGCACCATACGAAATATTCTGGGTGGCACGGTCTTCAGAGAAGCCATTATCTGCAAAAATATCCCCCGGCTTGTGAGTGGATGGGTAAAACCTATCATCATAGGTCGTCATGCTTATGGGGATCAATACAGAGCAACTGATTTTGTTGTTCCTGGGCCTGGAAAAGTAGAGATAACCTACACACCAAGTGACGGAACCCAAAAGGTGACATACCTGGTACATAACTTTGAAGAAGGTGGTGGTGTTGCCATGGGGATGTATAATCAAGATAAGTCAATTGAAGATTTTGCACACAGTTCCTTCCAAATGGCTCTGTCTAAGGGTTGGCCTTTGTATCTGAGCACCAAAAACACTATTCTGAAGAAATATGATGGGCGTTTTAAAGACATCTTTCAGGAGATATATGACAAGCAGTACAAGTCCCAGTTTG

In [8]:
"""A sequence of a specific Exon or Intron"""
# Regions to fetch, as (name, number) pairs: name is either 'Intron' or 'Exon',
# number is the 1-based Intron/Exon number.
regions = (('Exon', 5), ('Intron', 5))
# ------------------------------------------------------------------------------
for name, number in regions:
    n_seq, n_label = gene_obj.exon_intron_seq(name, number, transcript_id, chrm_path)
    print(f"{name}_{number} sequence ({n_label}):\n{n_seq}\n{len(n_seq)} bps.")

Exon_5 sequence (Exon_5:chr2:208245424:208245319):
TACAGAGCAACTGATTTTGTTGTTCCTGGGCCTGGAAAAGTAGAGATAACCTACACACCAAGTGACGGAACCCAAAAGGTGACATACCTGGTACATAACTTTGAAG
106 bps.
Intron_5 sequence (Intron_5:chr2:208245318:208243605):
GTATGTATGACTGTAGCATAAGCTTCttttttttttaaacaaatgatttagataatatttttattgctttttttgTTAGTTATGTGACTGCCTAAAAATATAATCTGAATCTACAGTGTTTTTGCAAAACCCGAAACTTGACATAAAGAAGTAATTTGCTTCCATACCTTCCTGTGGCATGAATAAAGGCACAAAAGGGTTACCCACTTAATTGGAAATGCCTTTAGGCAAAGACAAGGAGATGATTATGCATTTATCTTAAAGCACAGAGAGAAGGTTGGCAGAAATCTGCTGTGCATGAATTATATACATTAAATATTAATATTCTCAAATCACCCAGCATATAATCCTGCCCTTTGCTCTGGTAGGGCTCTTGTATCTGGAAGAGGTTTCCTTATATATTTGCTGAAGTTTTCTTCTTGCCCTTCTGGGTATTTATCCTGTGAAAAAGTAGAACATTTGCTTTTTGCCTTTTCTTTCCCAGCTACCCTGCTACCTTTGATGGCAAATCCCACCCAAAAGTTGTTGAAATAATTAGTGGGCCCTTTTCCTTCACTATTGAAGTTGTTGAAAGCACTCACACAAACAAAAATACCCAACACTCACCTACCTTTTGTTTTTCTTGCCTCTTCCAAATAATATCTGTTGGAATCTGGTTCTGCAGGCTTCAGCACCTTTCACCCTGAACCCAAGTTAACCCTGAGCATTGGCATGGATTCAAGTTCTATCTTTAAGTAATAACAAAGTTCAATAATAGTTGTAAAACACAGTGATTACA

In [9]:
# Optional: compare the AA sequence with the one returned by the Ensembl REST API.
# Note: if you receive a "HTTPError: 503 Server Error: Service Unavailable for url", try running the cell again.
import ensembl_rest_utils as erut

rest_api = erut.REST_API()
ensb_aa = rest_api.sequence_endpoint_base(transcript_id, seq_type='protein')

# When aa_seq is None there is nothing to compare, and nothing is printed.
if aa_seq is not None:
    print("AA agrees with REST AA" if ensb_aa == aa_seq else "AA and REST mismatch !!!!!!")

AA agrees with REST AA


In [10]:
""" 
Start and stop codon positions.
"""
codon_positions = gene_obj.start_and_stop_codons_pos(transcript_id)
if codon_positions is None:
    raise ValueError(f"{transcript_id=} not recognized !!")

# Each *_info is (chromosome position, RNA position) of the first bp of the codon
start_codon_info, stop_codon_info = codon_positions
print(f"{start_codon_info=}\n{stop_codon_info=}")

start_codon_info=(208251551, 224)
stop_codon_info=(208237081, 1466)


In [11]:
"""
Perform some basic queries.

Query a chromosome position.
"""
# Alternative query positions (for other genes):
#chrm_pos = 55_157_663
#chrm_pos = 179_148_357+800

# First bp of IDH1 Exon 6 (see the exon map above). IDH1 is on the negative strand,
# so subtracting 10 moves 10 bps into the exon.
exon_6_first_bp = 208_243_604
chrm_pos = exon_6_first_bp - 10
# -----------------------------------------------------
chrm_info = gene_obj.chrm_pos_info(transcript_id, chrm_pos, chrm_path)
print(chrm_info)

{'region': 'Exon_6', 'region_pos': 11, 'dist_from_region_boundary': (10, 167), 'segment': 'ORF', 'pos_in_segment': 531, 'NT': 'T', 'codon_number': 177, 'nt_in_codon': 3, 'codon': 'GGT', 'aa': 'G'}


In [12]:
"""
Chromosome position to RNA position
"""
#chrm_pos = 55_211_628  # for EGFR
chrm_pos = 208_236_229  # for IDH1
#chrm_pos = 179_199_690  # for PIK3CA
# -----------------------------------------
# None is returned when the position does not fall inside any exon of the transcript
rna_pos = gene_obj.chrm_pos2rna_pos(transcript_id, chrm_pos)
if rna_pos is not None:
    print(f"{chrm_pos=:,} --> {rna_pos=:,}")
else:
    print(f"{chrm_pos=:,} is outside of the RNA (outside of {transcript_id} exons)")

chrm_pos=208,236,229 --> rna_pos=2,318


In [13]:
"""
RNA position to chromosome position    
"""
rna_pos: int = 685  # position in the RNA; presumably 1-based - TODO confirm
# ================
# In the recorded output, chrm_p is a (chromosome position, nucleotide) pair.
chrm_p = gene_obj.rna_pos2chrm_pos(transcript_id, rna_pos, chrm_path)
print(f"{rna_pos=} --> {chrm_p=}")

rna_pos=685 --> chrm_p=(208245377, 'A')


In [14]:
"""
RNA position to chromosome information   
"""
rna_pos: int = 685  # position in the RNA; presumably 1-based - TODO confirm
# ================
# In the recorded output, the result is a dictionary with the chromosome position plus the
# same region / segment / codon fields that chrm_pos_info reports.
chrm_info = gene_obj.rna_pos2chrm_info(transcript_id, rna_pos, chrm_path)
print(chrm_info)

{'chrm_pos': 208245377, 'region': 'Exon_5', 'region_pos': 48, 'dist_from_region_boundary': (47, 58), 'segment': 'ORF', 'pos_in_segment': 462, 'NT': 'A', 'codon_number': 154, 'nt_in_codon': 3, 'codon': 'ATA', 'aa': 'I'}


In [15]:
'''
Look up a nucleotide by its position inside an exon.
'''
# 1-based exon number, and 1-based nucleotide number within that exon
exon_number, nt_number = 7, 47
# -----------------------------------------------------------------------
exon_nt = gene_obj.exon_nt_info(transcript_id, exon_number, nt_number, chrm_path)
print(exon_nt)

{'segment': 'ORF', 'pos_in_segment': 745, 'NT': 'A', 'codon_number': 249, 'nt_in_codon': 1, 'codon': 'AGG', 'aa': 'R'}


In [16]:
# Map a bp of an exon to its position within the segment that contains it
exon_number: int = 5
bp_index_in_exon: int = 74
# --------------------------
exon_segment_info = gene_obj.exon_nt_segment(transcript_id, exon_number, bp_index_in_exon, chrm_path)
if exon_segment_info is not None:
    # entry 0 is the segment name, entry 1 is the bp position within that segment
    segment_name, segment_pos = exon_segment_info[0], exon_segment_info[1]
    print(f"bp #{bp_index_in_exon} in exon #{exon_number} corresponds to bp #{segment_pos:,} in the {segment_name}.")

bp #74 in exon #5 corresponds to bp #488 in the ORF.


In [17]:
'''
Query an amino-acid position.

'codon_exon_pos' is a list of 3 positions, each in the form Exon_<exon number>:<pos>, where
pos is the 1-based position relative to the start of exon <exon number>. The corresponding
chromosome position is then exon_start_coordinate+m*(pos-1), where m=1 [m=-1] for genes encoded on
the positive [negative] DNA strand (in genes encoded on the negative strand,
exon_start_coordinate>exon_end_coordinate).
'''
aa_number = 49  # 1-based AA position in the ORF (other values tried: 1028, 163, 78, 397)
# --------------------------------------------------
aa_info = gene_obj.aa_exon_info(transcript_id, aa_number, chrm_path)
print(aa_info)

{'codon': 'CGT', 'AA': 'R', 'codon_exon_pos': ['Exon_4:23', 'Exon_4:24', 'Exon_4:25'], 'codon_chromosome_pos': [208248638, 208248637, 208248636], 'mrna_pos': [368, 369, 370]}


In [18]:
"""
Converts a DNA mutation to the corresponding AA mutation.
"""
#mut_id = 'EGFR:chr7:Missense_Mutation:55152609:G:C'
mut_id = 'IDH1:chr2:Missense_Mutation:208248455:C:T'
#mut_id = 'PIK3CA:chr3:Missense_Mutation:179218306:C:A' #'PIK3CA:chr3:Missense_Mutation:179234297:A:G'

# Names of the mut_id fields, in the order in which they appear in the ID
mut_id_cols = ['Gene_name', 'Chromosome', 'Variant_Classification', 
               'Start_Position', 'Reference_Allele', 'Tumor_Seq_Allele2']
mut_id_sep = ':'
# ==================================================================================

# strict=True (Python >= 3.10) raises a ValueError when mut_id does not hold exactly one
# field per column, instead of silently dropping the unmatched fields.
mut_info = dict(zip(mut_id_cols, mut_id.split(mut_id_sep), strict=True))
ref_a, mut_a, pos = mut_info['Reference_Allele'], mut_info['Tumor_Seq_Allele2'], int(mut_info['Start_Position'])

# The mutation must lie in the gene / transcript selected at the top of the notebook
aa_mut = gene_obj.DNA_SNP_mut_to_AA_mut(ref_a, mut_a, pos, transcript_id, chrm_path)
print(f"{mut_id=} --> {aa_mut=}")

mut_id='IDH1:chr2:Missense_Mutation:208248455:C:T' --> aa_mut='E110K'


In [19]:
"""
Converts an AA mutation to (all possible) DNA mutations.
"""
# aa_mut is the AA mutation computed by the DNA-to-AA conversion cell above.
# In the recorded output, the result maps each mutated codon to the start position,
# reference allele and alternative allele of the corresponding DNA change.
dna_all_muts = gene_obj.AA_mut_to_DNA_SNP_mut(aa_mut, transcript_id, chrm_path) # type: ignore
print(f"{aa_mut=} corresponds to:\n{dna_all_muts}.")

aa_mut='E110K' corresponds to:
{'AAA': {'start_pos': 208248455, 'reference_allele': 'C', 'alternative_allele': 'T'}, 'AAG': {'start_pos': 208248453, 'reference_allele': 'TTC', 'alternative_allele': 'CTT'}}.


# Example - Specifying GFF3 file or a Corresponding DataFrame
By default, `Gene_cls` internally loads the latest GFF3 file into a dataframe. This load takes a few seconds to complete. 
It is also possible to specify a GFF3 file, or to provide a dataframe of the GFF3 file, when instantiating the annotation class.

For example, in cases where annotation is needed for multiple genes, it is more efficient to load the GFF3 file into a dataframe once, and then instantiate multiple annotation classes with the dataframe. Below are some examples.

In [20]:
# Load the GFF3 file into a dataframe once. The call returns a tuple that holds this dataframe
# and a subset of it, which contains only the rows whose Type value is a gene.
gff3_dfs = gaut.ensembl_gff3_df()  # analyzing genes, ncRNA_gene and pseudogene
# gff3_dfs = gaut.ensembl_gff3_df(gene_type_values=['gene'])  # analyzing only genes (excluding ncRNA_gene and pseudogene). This slightly reduces processing times.

In [21]:
# Instantiate from the already-loaded dataframes (much faster than re-reading the GFF3 file);
# the same gff3_dfs object is shared by both genes.
g_a1 = gaut.Gene_cls('EGFR', gff3_dfs)
g_a1.info()
g_a2 = gaut.Gene_cls('BRCA1', gff3_dfs)
g_a2.info()

Gene=EGFR, Gene name=EGFR, Gene ID=ENSG00000146648
Source=Input DataFrame (ID=6066980048, size=(4105687, 9)). (Gene type DataFrame (ID=14761522384, size=(78932, 11)))
Positive strand=True.
Type=protein_coding, version=21.
Description=epidermal growth factor receptor [Source:HGNC Symbol%3BAcc:HGNC:3236]
Gene region=chr7:55,019,017-55,211,628.
13 transcripts.
Gene=BRCA1, Gene name=BRCA1, Gene ID=ENSG00000012048
Source=Input DataFrame (ID=6066980048, size=(4105687, 9)). (Gene type DataFrame (ID=14761522384, size=(78932, 11)))
Positive strand=False.
Type=protein_coding, version=26.
Description=BRCA1 DNA repair associated [Source:HGNC Symbol%3BAcc:HGNC:1100]
Gene region=chr17:43,044,295-43,170,245.
41 transcripts.


In [22]:
# Another option: the same instantiation through Gene_gff3_cls, again sharing gff3_dfs
g_a1 = gaut.Gene_gff3_cls('EGFR', gff3_dfs)
g_a1.info()
g_a2 = gaut.Gene_gff3_cls('BRCA1', gff3_dfs)
g_a2.info()

Gene=EGFR, Gene name=EGFR, Gene ID=ENSG00000146648
Source=Input DataFrame (ID=6066980048, size=(4105687, 9)). (Gene type DataFrame (ID=14761522384, size=(78932, 11)))
Positive strand=True.
Type=protein_coding, version=21.
Description=epidermal growth factor receptor [Source:HGNC Symbol%3BAcc:HGNC:3236]
Gene region=chr7:55,019,017-55,211,628.
13 transcripts.
Gene=BRCA1, Gene name=BRCA1, Gene ID=ENSG00000012048
Source=Input DataFrame (ID=6066980048, size=(4105687, 9)). (Gene type DataFrame (ID=14761522384, size=(78932, 11)))
Positive strand=False.
Type=protein_coding, version=26.
Description=BRCA1 DNA repair associated [Source:HGNC Symbol%3BAcc:HGNC:1100]
Gene region=chr17:43,044,295-43,170,245.
41 transcripts.


In [23]:
# Another example: instantiate by specifying your own GFF3 file
# (here Ensembl release 112 rather than the file loaded by default).
mygff3_file: pathlib.Path = pathlib.Path(si.ROOT_path) / 'Data' / 'Ensembl_gene_annotation' / 'Homo_sapiens.GRCh38.112.gff3.gz'
# -----------------------------------------------------------------------------------------------------------------------------
# The file is loaded into a dataframe during instantiation (see the "Loading ..." message in the output)
g_af = gaut.Gene_cls('EGFR', mygff3_file)
g_af.info()

Loading /Users/yoramzarai/work/mystuff/Ramot/Projects/Cancer_mut/Data/Ensembl_gene_annotation/Homo_sapiens.GRCh38.112.gff3.gz to a dataframe...Done.
Gene=EGFR, Gene name=EGFR, Gene ID=ENSG00000146648
Source=/Users/yoramzarai/work/mystuff/Ramot/Projects/Cancer_mut/Data/Ensembl_gene_annotation/Homo_sapiens.GRCh38.112.gff3.gz
Positive strand=True.
Type=protein_coding, version=21.
Description=epidermal growth factor receptor [Source:HGNC Symbol%3BAcc:HGNC:3236]
Gene region=chr7:55,019,017-55,211,628.
13 transcripts.


In [24]:
# Another option: the same file-based instantiation through Gene_gff3_cls
# (the file is loaded again, as the "Loading ..." message in the output shows).
g_af = gaut.Gene_gff3_cls('EGFR', mygff3_file)
g_af.info()

Loading /Users/yoramzarai/work/mystuff/Ramot/Projects/Cancer_mut/Data/Ensembl_gene_annotation/Homo_sapiens.GRCh38.112.gff3.gz to a dataframe...Done.
Gene=EGFR, Gene name=EGFR, Gene ID=ENSG00000146648
Source=/Users/yoramzarai/work/mystuff/Ramot/Projects/Cancer_mut/Data/Ensembl_gene_annotation/Homo_sapiens.GRCh38.112.gff3.gz
Positive strand=True.
Type=protein_coding, version=21.
Description=epidermal growth factor receptor [Source:HGNC Symbol%3BAcc:HGNC:3236]
Gene region=chr7:55,019,017-55,211,628.
13 transcripts.


In [25]:
# Finally, you can load your own GFF3 file into a dataframe, and instantiate with that
# dataframe (and its gene subset dataframe).
print("Loading GFF3 file to a dataframe...")
mygff3_dfs = gaut.ensembl_gff3_df(mygff3_file)
print("Instantiating...")
g_ad = gaut.Gene_cls('EGFR', mygff3_dfs)
g_ad.info()

Loading GFF3 file to a dataframe...
Instantiating...
Gene=EGFR, Gene name=EGFR, Gene ID=ENSG00000146648
Source=Input DataFrame (ID=13165298576, size=(3456533, 9)). (Gene type DataFrame (ID=13209463696, size=(63140, 11)))
Positive strand=True.
Type=protein_coding, version=21.
Description=epidermal growth factor receptor [Source:HGNC Symbol%3BAcc:HGNC:3236]
Gene region=chr7:55,019,017-55,211,628.
13 transcripts.


In [26]:
# Another option: the same dataframe-based instantiation through Gene_gff3_cls.
# NOTE(review): this re-reads mygff3_file; if mygff3_dfs from the previous cell is still in the
# kernel, the reload on the next line looks redundant - confirm before removing.
mygff3_dfs = gaut.ensembl_gff3_df(mygff3_file)
g_ad = gaut.Gene_gff3_cls('EGFR', mygff3_dfs)
g_ad.info()

Gene=EGFR, Gene name=EGFR, Gene ID=ENSG00000146648
Source=Input DataFrame (ID=13162947344, size=(3456533, 9)). (Gene type DataFrame (ID=13266962576, size=(63140, 11)))
Positive strand=True.
Type=protein_coding, version=21.
Description=epidermal growth factor receptor [Source:HGNC Symbol%3BAcc:HGNC:3236]
Gene region=chr7:55,019,017-55,211,628.
13 transcripts.


# Main Interfaces

In [None]:
# Example exon / intron number lists for the modified_transcript() call in the listing below
exon_list = [1, 2, 3, 4]
intron_list = [5, 6, 7, 8]

In [None]:
# Reference listing of the main Gene_cls interfaces.
# The calls below reuse variables defined in earlier cells (gene, transcript_id, chrm_path,
# name, number, chrm_pos, rna_pos, exon_number, nt_number, aa_number, ref_a, mut_a, pos, aa_mut,
# exon_list, intron_list). Their return values are not stored or displayed here.

# import API
import Gene_annotation as gaut

# gene annotation object (no GFF3 argument, so the annotation file is loaded again here)
gene_obj = gaut.Gene_cls(gene)

# info
gene_obj.info()
gene_obj.transcript_info(transcript_id, verbose=True)

# maps
gene_obj.exon_intron_map(transcript_id)
gene_obj.exon_map(transcript_id)

# sequences
gene_obj.seq(transcript_id, chrm_path)
gene_obj.rna(transcript_id, chrm_path)
gene_obj.ORF(transcript_id, chrm_path)
gene_obj.AA(transcript_id, chrm_path)
gene_obj.UTR5(transcript_id, chrm_path)
gene_obj.UTR3(transcript_id, chrm_path)
gene_obj.exon_intron_seq(name, number, transcript_id, chrm_path)
# NOTE(review): modified_transcript is not demonstrated elsewhere in this notebook - see Gene_annotation.py for its semantics
gene_obj.modified_transcript(exon_list, intron_list, transcript_id, chrm_path)

# query
gene_obj.chrm_pos_info(transcript_id, chrm_pos, chrm_path)
gene_obj.rna_pos2chrm_pos(transcript_id, rna_pos, chrm_path)
gene_obj.rna_pos2chrm_info(transcript_id, rna_pos, chrm_path)
gene_obj.exon_nt_info(transcript_id, exon_number, nt_number, chrm_path)
gene_obj.aa_exon_info(transcript_id, aa_number, chrm_path)
gene_obj.DNA_SNP_mut_to_AA_mut(ref_a, mut_a, pos, transcript_id, chrm_path)
gene_obj.AA_mut_to_DNA_SNP_mut(aa_mut, transcript_id, chrm_path)                          # type: ignore