In [1]:
import os
# Select the species before the project config helper is imported and called.
# NOTE(review): presumably utils.utils / get_config() reads SPECIES from the
# environment to pick the project folder -- confirm before reordering these lines.
os.environ['SPECIES'] = "Mouse"
from utils.utils import get_config
m_config = get_config()
# Bare last expression: shows the resolved project path as the cell output.
m_config.proj_path

Using mouse project folder: /g/data/yr31/zs2131/tasks/2023/RNA_expr_net/GeneRAIN_Mouse


'/g/data/yr31/zs2131/tasks/2023/RNA_expr_net/GeneRAIN_Mouse/'

In [7]:
def read_homolog_file(file_path):
    """
    Read a homology report and map each human gene to its mouse homolog genes.

    The file is read as tab-separated text with one header line. The first four
    columns of every data row are taken as DB Class Key, organism name, taxon ID
    and gene symbol. A human symbol (taxon ID 9606) and a mouse symbol (taxon ID
    10090) are homologs when they share at least one DB Class Key. A symbol of
    either species may be listed under several DB Class Keys; all of them are
    taken into account. Blank lines are ignored and an empty file yields an
    empty dictionary.

    Parameters:
    file_path (str): The path to the text file containing the homologous gene data.

    Returns:
    dict: A dictionary with human genes as keys and lists of mouse genes as values.
          Human genes without a mouse homolog are omitted. Keys appear in the
          order the human symbols are first seen in the file; every list is
          duplicate-free and sorted so repeated runs give identical results.

    Raises:
    ValueError: If a non-blank data row has fewer than four tab-separated columns.
    """
    human_taxon_id = '9606'
    mouse_taxon_id = '10090'

    # human symbol -> set of DB Class Keys it is listed under
    human_keys_by_symbol = {}
    # DB Class Key -> set of mouse symbols listed under it. Indexing by key (not by
    # symbol) keeps every key of a mouse gene and makes the lookup below direct.
    mouse_symbols_by_key = {}

    with open(file_path, 'r') as file:
        # Skip the header line; the default avoids StopIteration on an empty file
        next(file, None)

        for line in file:
            line = line.strip()
            if not line:
                continue

            db_class_key, _organism, taxon_id, symbol = line.split('\t')[:4]

            if taxon_id == human_taxon_id:
                human_keys_by_symbol.setdefault(symbol, set()).add(db_class_key)
            elif taxon_id == mouse_taxon_id:
                mouse_symbols_by_key.setdefault(db_class_key, set()).add(symbol)

    homolog_dict = {}

    # Collect, for every human gene, the mouse genes found under any of its keys
    for human_gene, db_keys in human_keys_by_symbol.items():
        matching_mouse_genes = set()
        for db_key in db_keys:
            matching_mouse_genes.update(mouse_symbols_by_key.get(db_key, ()))

        if matching_mouse_genes:
            # Sorted: set iteration order for strings varies between interpreter runs
            homolog_dict[human_gene] = sorted(matching_mouse_genes)

    return homolog_dict

# Build the report path with os.path.join so exactly one separator sits between the
# project folder and the relative path, whether or not proj_path ends with a slash.
mg_homolog_file = os.path.join(
    m_config.proj_path,
    "data", "external", "Homolo", "MouseGenomeInfo", "HOM_MouseHumanSequence.rpt",
)
# Human gene -> list of mouse homolog genes
h2m_genes = read_homolog_file(mg_homolog_file)

In [8]:
def reverse_homolog_mapping(homolog_dict):
    """
    Invert a human -> mouse homolog table into a mouse -> human one.

    Parameters:
    homolog_dict (dict): A dictionary with human genes as keys and lists of mouse genes as values.

    Returns:
    dict: A dictionary with mouse genes as keys and lists of human genes as values.
          Each human gene is listed at most once per mouse gene, in the order the
          human genes are encountered in homolog_dict.
    """
    inverted = {}

    for h_symbol, m_symbols in homolog_dict.items():
        for m_symbol in m_symbols:
            # Fetch the partner list for this mouse gene, creating it on first sight
            partners = inverted.setdefault(m_symbol, [])
            # Guard against the same mouse gene being repeated in one input list
            if h_symbol not in partners:
                partners.append(h_symbol)

    return inverted

# Invert the human -> mouse table so homologs can also be looked up by mouse gene
m2h_genes = reverse_homolog_mapping(homolog_dict=h2m_genes)

In [14]:
def _keep_single_partner(partner_lists):
    """Return {gene: partner} for every gene whose partner list has exactly one entry."""
    return {
        gene: partners[0]
        for gene, partners in partner_lists.items()
        if len(partners) == 1
    }


# Human genes that have exactly one mouse homolog, mapped to that homolog.
# Only the human side is filtered here, so the same mouse gene can still
# appear as the value of several human keys.
one2one_human_genes = _keep_single_partner(h2m_genes)

# Mouse genes that have exactly one human homolog, mapped to that homolog.
# Only the mouse side is filtered here, so the same human gene can still
# appear as the value of several mouse keys.
one2one_mouse_genes = _keep_single_partner(m2h_genes)

# Keep a human -> mouse pair only when the mouse gene also has a single human homolog,
# and a mouse -> human pair only when the human gene also has a single mouse homolog.
h2m_genes_one2one = dict(
    (human, mouse)
    for human, mouse in one2one_human_genes.items()
    if mouse in one2one_mouse_genes
)
m2h_genes_one2one = dict(
    (mouse, human)
    for mouse, human in one2one_mouse_genes.items()
    if human in one2one_human_genes
)

16983

16983

{'Gdnf': 'GDNF',
 'Hoxa4': 'HOXA4',
 'Cry1': 'CRY1',
 'Alcam': 'ALCAM',
 'Daxx': 'DAXX',
 'Arx': 'ARX',
 'Cpeb1': 'CPEB1',
 'Or12d17': 'OR12D3',
 'Or4e2': 'OR4E2',
 'Or10k2': 'OR10K2',
 'Neurog1': 'NEUROG1',
 'Msx2': 'MSX2',
 'Pax6': 'PAX6',
 'Ddx3y': 'DDX3Y',
 'Adar': 'ADAR',
 'Or2a12': 'OR2A12',
 'Pklr': 'PKLR',
 'Gng3': 'GNG3',
 'Bcl2l1': 'BCL2L1',
 'Atp1a1': 'ATP1A1',
 'Cldn11': 'CLDN11',
 'Gch1': 'GCH1',
 'mt-Nd4l': 'ND4L',
 'Cacna1c': 'CACNA1C',
 'Fth1': 'FTH1',
 'Ftmt': 'FTMT',
 'Ran': 'RAN',
 'Ucp2': 'UCP2',
 'Rho': 'RHO',
 'Adgra2': 'ADGRA2',
 'Myog': 'MYOG',
 'Or2at4': 'OR2AT4',
 'Or4d1': 'OR4D1',
 'Appl1': 'APPL1',
 'Hesx1': 'HESX1',
 'Nkx2-6': 'NKX2-6',
 'Opn1sw': 'OPN1SW',
 'Men1': 'MEN1',
 'Mef2d': 'MEF2D',
 'Hoxd12': 'HOXD12',
 'Tcp1': 'TCP1',
 'Foxn4': 'FOXN4',
 'Six3': 'SIX3',
 'Des': 'DES',
 'Isl2': 'ISL2',
 'Ctbp2': 'CTBP2',
 'Frat2': 'FRAT2',
 'Hoxb5': 'HOXB5',
 'Gdf6': 'GDF6',
 'Notch3': 'NOTCH3',
 'Pax3': 'PAX3',
 'Epb41l4a': 'EPB41L4A',
 'Bax': 'BAX',
 'Hoxd4': '