# Debug Notebook

This notebook was used to debug the gene annotation that is built directly from the GFF3 file (without creating the individual JSON files). That implementation is now available in the file `Utils/Gene_annotation.py`.

In [1]:
'''
Global configuration (common to all projects).
'''
import pandas as pd
import numpy as np
import os, sys, re
import pathlib
import pickle
import matplotlib.pyplot as plt
import glob
# import subprocess

# make sure PYTHONPATH is set, containing the Utils folder.
import mysys_init as si

# Class

In [11]:
import ensembl_gene_annotations_utils as egna
from dataclasses import dataclass, field

Ensembl_release: str = "109"
# --------------------------


# Location of the Ensembl GFF3 annotation file for the release above.
# macOS ("darwin") uses the local project tree; every other platform uses the /tamir2 tree.
if sys.platform == "darwin":
    GFF3_file = pathlib.Path(f"/Users/yoramzarai/work/mystuff/Ramot/Projects/Cancer_mut/Data/Ensembl_gene_annotation/Homo_sapiens.GRCh38.{Ensembl_release}.gff3")
else:
    GFF3_file = pathlib.Path(f"/tamir2/yoramzar/Projects/Cancer_mut/Data/Ensembl_gene_annotation/Homo_sapiens.GRCh38.{Ensembl_release}.gff3")


def ensembl_gff3_df(file: pathlib.Path = GFF3_file) -> pd.DataFrame:
    """Load an Ensembl GFF3 annotation file (for users of gene annotation).

    Thin wrapper around ``egna.load_ensembl_human_gff3_annotation_file``.

    Args:
        file: Path of the GFF3 file to load. Defaults to the module-level ``GFF3_file``.

    Returns:
        Whatever the ``egna`` loader returns for ``file`` (annotated as a dataframe).
    """
    gff3_table = egna.load_ensembl_human_gff3_annotation_file(file)
    return gff3_table


@dataclass
class Transcript_gff3_cls:
    # dataframe (result of pd.read_csv(GFF3 file)) or a path to an Ensembl GFF3 file
    gene: str
    gff3_data: pathlib.Path | pd.DataFrame
    verbose: bool = True  # set to False at instantiation to suppress prints

    # these are set by the __post_init__ method.
    gene_ID: str = ''
    chrm: str = ''
    rev: bool = False
    gene_start: int = 0
    gene_end: int = 0
    gene_type: str = ''
    gene_desc: str = ''
    gene_ver: int = 0
    transcripts: list = None 

    source: str = ''

    transcripts_info: dict = None  # keys are transcripts, values are transcript details
    exon_intron_maps: dict = None  # keys are transcripts, values are maps in df format

    __protein_coding_labels_in_biotype: list = None

    def __post_init__(self) -> None:
        if type(self.gff3_data) is not pd.DataFrame:
            self.source = str(self.gff3_data)
            if self.verbose: print(f"Loading {self.gff3_data} to a dataframe...", end='')
            self.gff3_data = egna.load_ensembl_human_gff3_annotation_file(self.gff3_data)
            if self.verbose: print("Done.")
        else:
             self.source = f'Input DataFrame (ID={id(self.gff3_data)}, size={self.gff3_data.shape}).'
        
        # extract gene and generate the description dictionary
        if not (d := egna.extract_gene_dict(self.gff3_data, self.gene)):
             raise ValueError(f"Error in extracting the gene {self.gene} from GFF3 dataframe !! Exiting.")

        self.chrm = d['chrm']
        self.gene_start = d['gene_start']
        self.gene_end = d['gene_end']
        self.rev = d['rev']
        self.gene_ID = d['gene_ID']
        self.gene_type = d['gene_type']
        self.gene_desc = d['gene_desc']
        self.gene_ver = d['gene_ver']
        self.transcripts = d['transcripts']

        if self.__protein_coding_labels_in_biotype is None:
                # see https://www.ensembl.org/info/genome/genebuild/biotypes.html
                self.__protein_coding_labels_in_biotype = [
                    "nonsense_mediated_decay",
                    "protein_coding",
                ]

        # creates transcript information for each transcript. This populates self.transcripts_info
        #self.__gen_transcripts_info()

        # creates exon-intron map for each transcript. This populates self.exon_intron_maps
        #self.__create_exon_intron_map()

        
class Gene_cls(Transcript_gff3_cls):  # CHANGE THIS TO Gene_gff3_cls
    """Gene annotation keyed by gene name, backed by the default GFF3 file.

    Convenience subclass of ``Transcript_gff3_cls`` whose constructor takes
    the gene first and defaults the GFF3 source to the module-level
    ``GFF3_file``.

    Args:
        gene: Gene identifier to annotate.
        gff3_file: GFF3 source handed to the parent as ``gff3_data``.
        verbose: Set to False to suppress the parent's progress prints.
    """

    def __init__(self, gene: str, gff3_file: pathlib.Path = GFF3_file, verbose: bool = True) -> None:
        # Delegate to the dataclass-generated initializer: it assigns every
        # field (including those left at their defaults) on the instance and
        # then runs __post_init__, instead of relying on class-attribute
        # fallback for the fields that were never set.
        super().__init__(gene=gene, gff3_data=gff3_file, verbose=verbose)

    def __repr__(self) -> str:
        return f"Gene_cls('{self.gene}')"

    def __str__(self) -> str:
        return "Gene annotation class based on gene name.\n"

In [12]:
# Load the GFF3 annotation from the default GFF3_file path (for testing), so that it
# can be handed to the annotation classes as an already-loaded dataframe.
gff3_df = ensembl_gff3_df()

In [None]:
# test: build the notebook-local annotation class for one gene and print its gene-level fields
gene: str = 'IDH1'  # other genes tried: 'ENSG00000267318', 'PIK3CA'
# ================
# Alternative constructions:
#   a = Transcript_gff3_cls(gene, gff3_df)
#   a = Transcript_gff3_cls(gene, GFF3_file, verbose=False)
a = Gene_cls(gene)

# One value per line, in the order of the attribute names below, followed by the number of transcripts.
print(
    *(getattr(a, attr) for attr in (
        'source', 'gene', 'gene_ID', 'chrm', 'gene_start', 'gene_end',
        'rev', 'gene_type', 'gene_desc', 'gene_ver',
    )),
    len(a.transcripts),
    sep='\n',
)


# Test the (new) API

In [4]:
import Gene_annotation as gaut

In [5]:
# Load the GFF3 annotation through the Gene_annotation module's loader (for testing).
# Note: this rebinds `gff3_df`, replacing any dataframe previously stored under that name.
gff3_df = gaut.ensembl_gff3_df()

In [9]:
# Show the loaded annotation table, then only the rows whose Type is 'gene'.
display(gff3_df)
d = gff3_df[gff3_df['Type'] == 'gene']
display(d)
# Raw Attributes string of the first gene row.
print(d['Attributes'].iloc[0])

Unnamed: 0,Chrm,Source,Type,Start,End,Score,Strand,Phase,Attributes
0,1,GRCh38,chromosome,1,248956422,.,.,.,"ID=chromosome:1;Alias=CM000663.2,chr1,NC_00000..."
1,1,.,biological_region,10469,11240,1.3e+03,.,.,external_name=oe %3D 0.79;logic_name=cpg
2,1,.,biological_region,10650,10657,0.999,+,.,logic_name=eponine
3,1,.,biological_region,10655,10657,0.999,-,.,logic_name=eponine
4,1,.,biological_region,10678,10687,0.999,+,.,logic_name=eponine
...,...,...,...,...,...,...,...,...,...
3414006,Y,.,biological_region,26626966,26627137,0.994,-,.,external_name=rank %3D 1;logic_name=firstef
3414007,Y,.,biological_region,26627457,26628186,0.997,+,.,external_name=rank %3D 1;logic_name=firstef
3414008,Y,havana,pseudogene,56855244,56855488,.,+,.,ID=gene:ENSG00000235857;Name=CTBP2P1;biotype=p...
3414009,Y,havana,pseudogenic_transcript,56855244,56855488,.,+,.,ID=transcript:ENST00000431853;Parent=gene:ENSG...


Unnamed: 0,Chrm,Source,Type,Start,End,Score,Strand,Phase,Attributes
84,1,ensembl_havana,gene,65419,71585,.,+,.,ID=gene:ENSG00000186092;Name=OR4F5;biotype=pro...
370,1,ensembl_havana,gene,450740,451678,.,-,.,ID=gene:ENSG00000284733;Name=OR4F29;biotype=pr...
502,1,ensembl_havana,gene,685716,686654,.,-,.,ID=gene:ENSG00000284662;Name=OR4F16;biotype=pr...
1097,1,ensembl_havana,gene,923923,944575,.,+,.,ID=gene:ENSG00000187634;Name=SAMD11;biotype=pr...
1459,1,ensembl_havana,gene,944203,959309,.,-,.,ID=gene:ENSG00000188976;Name=NOC2L;biotype=pro...
...,...,...,...,...,...,...,...,...,...
3412977,Y,ensembl_havana,gene,24607560,24639207,.,+,.,ID=gene:ENSG00000183795;Name=BPY2B;biotype=pro...
3413043,Y,ensembl_havana,gene,24763069,24813492,.,-,.,ID=gene:ENSG00000187191;Name=DAZ3;biotype=prot...
3413169,Y,ensembl_havana,gene,24833843,24907040,.,+,.,ID=gene:ENSG00000205916;Name=DAZ4;biotype=prot...
3413567,Y,ensembl_havana,gene,25030901,25062548,.,-,.,ID=gene:ENSG00000185894;Name=BPY2C;biotype=pro...


ID=gene:ENSG00000186092;Name=OR4F5;biotype=protein_coding;description=olfactory receptor family 4 subfamily F member 5 [Source:HGNC Symbol%3BAcc:HGNC:14825];gene_id=ENSG00000186092;logic_name=ensembl_havana_gene_homo_sapiens;version=7


In [None]:
# Same check as for the notebook-local class, now through the Gene_annotation module API.
gene: str = 'IDH1'  # other genes tried: 'ENSG00000267318', 'PIK3CA'
# ================
# Alternative constructions:
#   a = gaut.Transcript_gff3_cls(gene, gff3_df)
#   a = gaut.Transcript_gff3_cls(gene, gaut.GFF3_file, verbose=False)
a = gaut.Gene_cls(gene)

# One value per line, in the order of the attribute names below, followed by the number of transcripts.
print(
    *(getattr(a, attr) for attr in (
        'source', 'gene', 'gene_ID', 'chrm', 'gene_start', 'gene_end',
        'rev', 'gene_type', 'gene_desc', 'gene_ver',
    )),
    len(a.transcripts),
    sep='\n',
)
