#  ENSEMBLE gene annotations to gene symbols

In [1]:
import numpy as np
import biomart                                                                                      

In [2]:
Path='/home/icb/manuel.gander/moscotTime_Reproducibility/Data/Miscellaneous'

In [3]:
!pip list

Package                       Version
----------------------------- -------------------------
absl-py                       1.1.0
aiohttp                       3.8.1
aiosignal                     1.2.0
altair                        4.2.0
anndata                       0.8.0
anyio                         3.6.1
argon2-cffi                   21.3.0
argon2-cffi-bindings          21.2.0
asttokens                     2.0.5
async-timeout                 4.0.2
attrs                         21.4.0
Babel                         2.10.3
backcall                      0.2.0
backports.functools-lru-cache 1.6.4
beautifulsoup4                4.11.1
biomart                       0.9.2
bleach                        5.0.1
blinker                       1.5
brotlipy                      0.7.0
cachetools                    5.2.0
cellrank                      1.5.1
certifi                       2022.6.15
cffi                          1.15.1
charset-normalizer            2.1.0
chex                          0.1.

## Gene annotation

Up to now, genes are denoted by the ENSEMBL ID, but we will need gene symbols as well. This is done using Biomart (Smedley et al., BMC Genomics, 2009)

In [4]:
### This code is from https://gist.github.com/ben-heil/cffbebf8865795fe2efbbfec041da969,
### I only hat to change in 'http://useast.ensembl.org/biomart' from 'uswest' to 'useast', since uswest is retired

def get_ensembl_mappings():                                                                                                                                                                                 
    # Set up connection to server                                                                                                                                                                           
    server = biomart.BiomartServer('http://useast.ensembl.org/biomart')                                                                                                                                     
    mart = server.datasets['mmusculus_gene_ensembl']                                                                                                                                                        

    # List the types of data we want                                                                                                                                                                        
    attributes = ['ensembl_transcript_id', 'mgi_symbol',                                                                                                                                                    
                  'ensembl_gene_id', 'ensembl_peptide_id']                                                                                                                                                  

    # Get the mapping between the attributes                                                                                                                                                                
    response = mart.search({'attributes': attributes})                                                                                                                                                      
    data = response.raw.data.decode('ascii')                                                                                                                                                                

    ensembl_to_genesymbol = {}                                                                                                                                                                              
    # Store the data in a dict                                                                                                                                                                              
    for line in data.splitlines():                                                                                                                                                                          
        line = line.split('\t')                                                                                                                                                                             
        # The entries are in the same order as in the `attributes` variable                                                                                                                                 
        transcript_id = line[0]                                                                                                                                                                             
        gene_symbol = line[1]                                                                                                                                                                               
        ensembl_gene = line[2]                                                                                                                                                                              
        ensembl_peptide = line[3]                                                                                                                                                                           

        # Some of these keys may be an empty string. If you want, you can
        # avoid having a '' key in your dict by ensuring the
        # transcript/gene/peptide ids have a nonzero length before
        # adding them to the dict
        ensembl_to_genesymbol[transcript_id] = gene_symbol
        ensembl_to_genesymbol[ensembl_gene] = gene_symbol
        ensembl_to_genesymbol[ensembl_peptide] = gene_symbol

    return ensembl_to_genesymbol

In [5]:
# Takes about 1 min
Gene_Dict=get_ensembl_mappings()

In [6]:
Gene_Dict

{'ENSMUST00000082387': 'mt-Tf',
 'ENSMUSG00000064336': 'mt-Tf',
 '': 'Gm14148',
 'ENSMUST00000082388': 'mt-Rnr1',
 'ENSMUSG00000064337': 'mt-Rnr1',
 'ENSMUST00000082389': 'mt-Tv',
 'ENSMUSG00000064338': 'mt-Tv',
 'ENSMUST00000082390': 'mt-Rnr2',
 'ENSMUSG00000064339': 'mt-Rnr2',
 'ENSMUST00000082391': 'mt-Tl1',
 'ENSMUSG00000064340': 'mt-Tl1',
 'ENSMUST00000082392': 'mt-Nd1',
 'ENSMUSG00000064341': 'mt-Nd1',
 'ENSMUSP00000080991': 'mt-Nd1',
 'ENSMUST00000082393': 'mt-Ti',
 'ENSMUSG00000064342': 'mt-Ti',
 'ENSMUST00000082394': 'mt-Tq',
 'ENSMUSG00000064343': 'mt-Tq',
 'ENSMUST00000082395': 'mt-Tm',
 'ENSMUSG00000064344': 'mt-Tm',
 'ENSMUST00000082396': 'mt-Nd2',
 'ENSMUSG00000064345': 'mt-Nd2',
 'ENSMUSP00000080992': 'mt-Nd2',
 'ENSMUST00000082397': 'mt-Tw',
 'ENSMUSG00000064346': 'mt-Tw',
 'ENSMUST00000082398': 'mt-Ta',
 'ENSMUSG00000064347': 'mt-Ta',
 'ENSMUST00000082399': 'mt-Tn',
 'ENSMUSG00000064348': 'mt-Tn',
 'ENSMUST00000082400': 'mt-Tc',
 'ENSMUSG00000064349': 'mt-Tc',
 'ENSMUS

In [7]:
np.save(f"{Path}/ENS_to_genes.npy", Gene_Dict, allow_pickle=True)