# Identify hidden genes using the orthology table

Distinguish hidden gene from a missing gene: a hidden gene is present in at least one organism of a class, but not in all of them. Prioritize (sort) them based on the fullness of their orthology group.

**Input:** Orthology table (.xsl, .csv, .tsv), version of Ensembl compara used, taxa of hidden genes, output filename

**Output:** A JSON-formatted file with prioritized hidden genes (species with hidden gene, species with ortholog, ortholog ensembl id)

In [6]:
import pandas as pd
import numpy as np
import requests
import sys
import json
import seaborn as sns
import gget
from Bio.SeqUtils import GC
import pathlib

In [7]:
ens_api = "https://rest.ensembl.org"

In [8]:
# enter desired ensembl release
ensemble_release = 109

In [9]:
# enter path to orthology table
orthology_table_file = pathlib.Path("/home/vecerkok/orthology_table/orthology_table-68species_ensembl_ids.csv")

In [10]:
# enter taxa of hidden genes
taxon = "Aves"

In [11]:
# enter output filename
hidden_genes_file = pathlib.Path("100_avian_genes_to_search")

In [12]:
# enter the fraction of genes to be output
fraction_output_genes = 0.00185

## Orthology table preprocessing

In [13]:
def get_orthology_table(table_file: str):
    file_type = table_file.suffix
    
    if file_type == ".csv":
        return pd.read_csv(table_file, sep=",", header=0, index_col=0, low_memory=False)
    elif file_type == ".tsv":
        return pd.read_csv(table_file, sep="\t", header=0, index_col=0, low_memory=False)
    elif (file_type == ".xsls") or (file_type == ".xsl"):
        return pd.read_excel(table_file, header=0, index_col=0, low_memory=False)
    else:
        print("Accepted file formats: .csv, .tsv, .xsls, .xsl!")

In [14]:
# upload the orthology table
orthology_df = get_orthology_table(orthology_table_file)
orthology_df

Unnamed: 0,Homo sapiens,Cricetulus griseus,Mesocricetus auratus,Rattus norvegicus,Cavia porcellus,Octodon degus,Heterocephalus glaber,Maylandia zebra,Pteropus vampyrus,Monodelphis domestica,...,Felis catus,Tursiops truncatus,Loxodonta africana,Equus caballus,Bos taurus,Capra hircus,Ovis aries,Ochotona princeps,Oryctolagus cuniculus,Tetraodon nigroviridis
0,ENSG00000167103,ENSCGRG00001009175,,ENSRNOG00000048676,ENSCPOG00000005283,ENSODEG00000003703,ENSHGLG00000018036,ENSMZEG00005000701,ENSPVAG00000000967,,...,ENSFCAG00000001492,ENSTTRG00000016470,ENSLAFG00000000585,,ENSBTAG00000003387,ENSCHIG00000020199,ENSOARG00020015289,ENSOPRG00000016354,ENSOCUG00000026071,
1,ENSG00000258873,,,,,,,,ENSPVAG00000003020,,...,,ENSTTRG00000014200,ENSLAFG00000030167,ENSECAG00000043636,,,,ENSOPRG00000000440,,
2,ENSG00000213088,ENSCGRG00001021594,ENSMAUG00000015131,ENSRNOG00000069330,ENSCPOG00000024213,ENSODEG00000009370,ENSHGLG00000006136,,,ENSMODG00000042514,...,ENSFCAG00000018619,,ENSLAFG00000031828,ENSECAG00000000874,ENSBTAG00000003220,ENSCHIG00000003089,ENSOARG00020009415,ENSOPRG00000018458,,
3,ENSG00000185245,,,ENSRNOG00000025959,,,,,ENSPVAG00000003789,ENSMODG00000005365,...,ENSFCAG00000008042,ENSTTRG00000001296,ENSLAFG00000032715,ENSECAG00000040351,ENSBTAG00000004909,,ENSOARG00020025177,,ENSOCUG00000001099,
4,ENSG00000198870,ENSCGRG00001004554,ENSMAUG00000013976,ENSRNOG00000027911,ENSCPOG00000021170,ENSODEG00000015481,ENSHGLG00000016873,,ENSPVAG00000001428,ENSMODG00000012800,...,ENSFCAG00000003766,ENSTTRG00000007671,ENSLAFG00000008238,ENSECAG00000022240,ENSBTAG00000004140,,ENSOARG00020002366,ENSOPRG00000001115,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
54115,,,,,,,,,,,...,,,,,,,,,,
54116,,,,,,,,,,,...,,,,,,,,,,ENSTNIG00000020644
54117,,,,,,,,,,,...,,,,,,,,,,
54118,,,,,,,,,,,...,,,,,,,,,,ENSTNIG00000020257


In [15]:
def get_list_organisms_orthology_table(orth_df: pd.DataFrame):
    # get a list column labels
    column_names = orth_df.columns.tolist()
    # convert species name to lowercase (useful for ensemble rest api)
    organism_names = [x.lower().replace(" ", "_") for x in column_names]
    organism_names = list(map(lambda x: x.replace("heterocephalus_glaber", "heterocephalus_glaber_female"), organism_names))
    organism_names = list(map(lambda x: x.replace("gorilla_gorilla_gorilla", "gorilla_gorilla"), organism_names))
    organism_names = list(map(lambda x: x.replace("cricetulus_griseus", "cricetulus_griseus_chok1gshd"), organism_names))
    organism_names = list(map(lambda x: x.replace("ovis_aries", "ovis_aries_rambouillet"), organism_names))
    orth_df.columns = organism_names
    return orth_df, organism_names

In [16]:
def transpose_orthology_table(orth_df: pd.DataFrame):
    return orth_df.transpose()

In [17]:
def get_organism_class(organism: str):
    # get taxonomy classification information for organism from Ensembl rest api
    server = "https://rest.ensembl.org"
    ext = f"/taxonomy/classification/{organism}?"

    response = requests.get(server+ext, headers={"Content-Type": "application/json"})
    response = json.dumps(response.json())
    
    # get class from taxonomy info
    classes = ["Mammalia", "Aves", "Reptilia", "Actinopteri", "Amphibia"]
    for cl in classes:
        if (cl in response):
            return cl

In [18]:
# add class info to the orthology table
def add_classes_to_orthology_database(orth_df: pd.DataFrame):
    # get a list of species
    orth_df, species = get_list_organisms_orthology_table(orth_df)
    # transpose the dataframe
    orth_df = transpose_orthology_table(orth_df)
    
    # assign species classes
    for specie in species:
        orth_df.loc[specie, "class"] = get_organism_class(specie)
    return orth_df

In [19]:
orthology_df = add_classes_to_orthology_database(orthology_df)
orthology_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,54111,54112,54113,54114,54115,54116,54117,54118,54119,class
homo_sapiens,ENSG00000167103,ENSG00000258873,ENSG00000213088,ENSG00000185245,ENSG00000198870,ENSG00000256553,ENSG00000255569,ENSG00000120594,ENSG00000161381,ENSG00000100557,...,,,,,,,,,,Mammalia
cricetulus_griseus_chok1gshd,ENSCGRG00001009175,,ENSCGRG00001021594,,ENSCGRG00001004554,,,ENSCGRG00001024180,ENSCGRG00001018272,ENSCGRG00001010407,...,,,,,,,,,,
mesocricetus_auratus,,,ENSMAUG00000015131,,ENSMAUG00000013976,,,,ENSMAUG00000015907,ENSMAUG00000013689,...,,,,,,,,,,Mammalia
rattus_norvegicus,ENSRNOG00000048676,,ENSRNOG00000069330,ENSRNOG00000025959,ENSRNOG00000027911,,,ENSRNOG00000000142,ENSRNOG00000021536,ENSRNOG00000062766,...,,,,,,,,,,Mammalia
cavia_porcellus,ENSCPOG00000005283,,ENSCPOG00000024213,,ENSCPOG00000021170,,,ENSCPOG00000007300,ENSCPOG00000020274,ENSCPOG00000003345,...,,,,,,,,,,Mammalia
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
capra_hircus,ENSCHIG00000020199,,ENSCHIG00000003089,,,,,ENSCHIG00000023670,ENSCHIG00000024239,ENSCHIG00000027108,...,,,,,,,,,,Mammalia
ovis_aries_rambouillet,ENSOARG00020015289,,ENSOARG00020009415,ENSOARG00020025177,ENSOARG00020002366,,,ENSOARG00020023004,ENSOARG00020016631,ENSOARG00020020200,...,,,,,,,,,,
ochotona_princeps,ENSOPRG00000016354,ENSOPRG00000000440,ENSOPRG00000018458,,ENSOPRG00000001115,,,ENSOPRG00000009876,ENSOPRG00000007050,ENSOPRG00000016009,...,,,,,,,,,,Mammalia
oryctolagus_cuniculus,ENSOCUG00000026071,,,ENSOCUG00000001099,,,,ENSOCUG00000014482,ENSOCUG00000011309,ENSOCUG00000015402,...,,,,,,,,,,Mammalia


## Identifying hidden genes

In [20]:
def get_gene_occurence(orth_df):
    # find percentage of occurence per gene
    genes_occurence = pd.DataFrame(orth_df.notna().sum() / len(orth_df), columns=["percentage"])
    return genes_occurence

In [21]:
# scoring function: the fuller the orthology group, the higher the gene score (the more likely it is hidden rather than missing)
def inititate_score_table(orth_df: pd.DataFrame):
    genes = orth_df.drop(columns=["class"]).columns.values.tolist()
    starting_scores = np.zeros(len(genes))
    score_table = pd.DataFrame({"scores": starting_scores}, index=genes)
    return score_table

In [22]:
# add taxa and non taxa occurence percentage to the score sum
# argument factor can be used to increase the significance of gene occurance in taxa
def add_occurence_score(scores_df: pd.DataFrame, occur_df: pd.DataFrame, factor: float = 1.):
    scores_df["scores"] = scores_df["scores"] + occur_df["percentage"]
    return scores_df

In [27]:
# prioritize gene to search for based on score
def get_genes_scores(ortho_df: pd.DataFrame):
    scores_df = inititate_score_table(ortho_df)
    
    # get gene occurence percentage in taxon
    taxon_df = ortho_df.loc[(ortho_df["class"] == taxon)].drop(columns="class")
                            
    # get gene occurence outside of taxon
    taxon_excluded_df = ortho_df.loc[(ortho_df["class"] != taxon)].drop(columns="class")
    
    gene_taxon_occurence = get_gene_occurence(taxon_df)
    gene_non_taxon_occurence = get_gene_occurence(taxon_excluded_df)
    
    scores_df = add_occurence_score(scores_df, gene_taxon_occurence)
    scores_df = add_occurence_score(scores_df, gene_non_taxon_occurence)

    return scores_df

In [28]:
score_df = get_genes_scores(orthology_df)
score_df

Unnamed: 0,scores
0,0.841270
1,0.253968
2,0.987302
3,1.228571
4,1.234921
...,...
54115,0.031746
54116,0.031746
54117,0.031746
54118,0.031746


In [29]:
def filter_genes_with_all_orthologs_present(gene_scores: pd.DataFrame, ortho_df: pd.DataFrame):
    is_perfect_score_all_present = False
    max_score = gene_scores.values.max()
    index_highest_scoring_gene = gene_scores.loc[gene_scores["scores"] == max_score].index[0]
    if orthology_df[index_highest_scoring_gene].isna().sum() == 0:
        is_perfect_score_all_present = True
    # drop all rows with perfect score
    gene_scores = gene_scores[gene_scores["scores"] < max_score].copy(deep=True)
    return gene_scores

filtered_score_df = filter_genes_with_all_orthologs_present(score_df, orthology_df)
filtered_score_df

Unnamed: 0,scores
0,0.841270
1,0.253968
2,0.987302
3,1.228571
4,1.234921
...,...
54115,0.031746
54116,0.031746
54117,0.031746
54118,0.031746


In [30]:
# return a list of prioritized genes
# percentage: 0.1 -> returns 10% best scoring genes
# gene index id corresponds
def prirotize_gene_search(gene_scores: pd.DataFrame, percentage: float):
    max_score = gene_scores.values.max()
    gene_scores.sort_values("scores", ascending=False, inplace=True)
    number_of_genes = int(score_df.shape[0] * percentage)
    
    return gene_scores.iloc[:number_of_genes, :]

prioritized_score_df = prirotize_gene_search(filtered_score_df, fraction_output_genes)
prioritized_score_df

Unnamed: 0,scores
6758,1.984127
10545,1.984127
2150,1.984127
6981,1.984127
6983,1.984127
...,...
3438,1.984127
8839,1.984127
3435,1.984127
9054,1.984127


In [44]:
def get_list_of_hidden_genes_with_ortholog_id(sorted_gene_scores: list, ortho_df: pd.DataFrame):
    missing_present_ensid_tuples = list()
    sorted_gene_ids = sorted_gene_scores.index.tolist()
    
    for gene in sorted_gene_ids:
        species_with_hidden = ortho_df.loc[ortho_df[gene].isna(), gene].index.tolist()
        for species in species_with_hidden:
            # preferably human ortholog (first in table -> index 0)
            species_with_ortholog = ortho_df.loc[ortho_df[gene].notna(), gene].index.tolist()
            if len(species_with_ortholog) > 0:
                specie_with_ortholog = species_with_ortholog[0]
            else:
                print("Gene is not present in any organism! No orthologs!")
                continue
            ensembl_id_ortholog = ortho_df.loc[specie_with_ortholog, gene]

        missing_present_ensid_tuples.append((species, specie_with_ortholog, ensembl_id_ortholog))
        
    return missing_present_ensid_tuples
    
hidden_genes_tuples = get_list_of_hidden_genes_with_ortholog_id(prioritized_score_df, orthology_df)
hidden_genes_tuples

[('vicugna_pacos', 'homo_sapiens', 'ENSG00000132768'),
 ('sorex_araneus', 'homo_sapiens', 'ENSG00000104213'),
 ('petromyzon_marinus', 'homo_sapiens', 'ENSG00000163710'),
 ('ochotona_princeps', 'homo_sapiens', 'ENSG00000104447'),
 ('petromyzon_marinus', 'homo_sapiens', 'ENSG00000103091'),
 ('astyanax_mexicanus', 'homo_sapiens', 'ENSG00000139324'),
 ('petromyzon_marinus', 'homo_sapiens', 'ENSG00000181722'),
 ('jaculus_jaculus', 'homo_sapiens', 'ENSG00000125863'),
 ('dasypus_novemcinctus', 'homo_sapiens', 'ENSG00000170615'),
 ('tetraodon_nigroviridis', 'homo_sapiens', 'ENSG00000157350'),
 ('vicugna_pacos', 'homo_sapiens', 'ENSG00000198400'),
 ('petromyzon_marinus', 'homo_sapiens', 'ENSG00000122299'),
 ('petromyzon_marinus', 'homo_sapiens', 'ENSG00000164532'),
 ('heterocephalus_glaber_female', 'homo_sapiens', 'ENSG00000144747'),
 ('astyanax_mexicanus', 'homo_sapiens', 'ENSG00000136936'),
 ('petromyzon_marinus', 'homo_sapiens', 'ENSG00000142784'),
 ('petromyzon_marinus', 'homo_sapiens', 'EN

In [46]:
def save_genes_to_search_to_json(genes_to_search: list, output_file: pathlib.Path):
    with open(output_file, "w") as f:
        json.dump(genes_to_search, f)
        
save_genes_to_search_to_json(missing_present_ensid_tuples, hidden_genes_file)