In [4]:
import pandas as pd
import numpy as np
import os
import pickle

# set infinite display
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)

## This notebook takes the transcript IDs in our TPM dataframe and uses pyensembl's gene_name_of_transcript_id function to create a dictionary that maps transcript_ids to gene_names. This dictionary is then processed to create a dictionary that maps gene_names to transcript_ids. There is a lot of information loss as many transcript IDs don't seem to match to a gene_name. There might be a better way to do this.

In [8]:
os.environ['PYENSEMBL_CACHE_DIR'] = '/home/waqaas/CLL/data/'
# check if pyensembl is installed, if not, run 'pyensembl install --release 110 --species human'
try:
    data = EnsemblRelease(102)
except:
    print('pyensembl not installed, installing now...')
    os.system('pip install pyensembl')
    os.system('pyensembl install --release 102 --species mouse')
    data = EnsemblRelease(102)

In [9]:
from pyensembl import EnsemblRelease
from pyensembl import genome

In [10]:
# import the TPM.tsv file
TPM = pd.read_csv('../../results/TPM.tsv', sep='\t')
#preview the dataframe only showing the first 10 columns
TPM.iloc[:, :10].head()

Unnamed: 0,Name,SRR16522870_GSM5639190_RS24_RNA-seq_Mus_musculus_RNA-Seq_,SRR16522866_GSM5639186_RS11_RNA-seq_Mus_musculus_RNA-Seq_,SRR16522874_GSM5639194_RS5_RNA-seq_Mus_musculus_RNA-Seq_,SRR16522873_GSM5639193_RS4_T_RNA-seq_Mus_musculus_RNA-Seq_,SRR16522868_GSM5639188_RS18_RNA-seq_Mus_musculus_RNA-Seq_,SRR16522867_GSM5639187_RS13_RNA-seq_Mus_musculus_RNA-Seq_,SRR16522872_GSM5639192_RS4_RNA-seq_Mus_musculus_RNA-Seq_,SRR16522871_GSM5639191_RS3_RNA-seq_Mus_musculus_RNA-Seq_,SRR16522869_GSM5639189_RS20_RNA-seq_Mus_musculus_RNA-Seq_
0,ENSMUST00000196221.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,ENSMUST00000179664.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,ENSMUST00000177564.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,ENSMUST00000178537.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,ENSMUST00000178862.2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [11]:
# create a list called transcript_id_list out of the transcript_ttest_df['transcript_id'] column
transcript_id_list = TPM['Name'].tolist()
# unload TPM dataframe from memory
del(TPM)

In [12]:
# Remove the . and the number after the . from the transcript_id_list
transcript_id_list = [i.split('.')[0] for i in transcript_id_list]

In [13]:
# Create a dictionary to store the mapping of transcript IDs to gene IDs
transcript_to_gene_mapping = {}

# call relevant classes from the pyensembl object
reference_name = data.reference_name
gtf_path = '/home/waqaas/CLL/data/pyensembl/GRCm38/ensembl102/Mus_musculus.GRCm38.102.gtf.db'

# Create the Genome object outside the loop
ref = genome.Genome(
    reference_name=reference_name,
    annotation_name='ensembl',
    gtf_path_or_url=gtf_path
)

# Retrieve gene IDs from transcript IDs
for transcript_id in transcript_id_list:
    try:
        gene_name = ref.gene_name_of_transcript_id(transcript_id)
        transcript_to_gene_mapping[transcript_id] = gene_name
    except ValueError:
        print(f"No results found for transcript ID: {transcript_id}")


No results found for transcript ID: ENSMUST00000249191
No results found for transcript ID: ENSMUST00000249208
No results found for transcript ID: ENSMUST00000249207
No results found for transcript ID: ENSMUST00000249272
No results found for transcript ID: ENSMUST00000249273
No results found for transcript ID: ENSMUST00000249274
No results found for transcript ID: ENSMUST00000249275
No results found for transcript ID: ENSMUST00000249276
No results found for transcript ID: ENSMUST00000249277
No results found for transcript ID: ENSMUST00000249278
No results found for transcript ID: ENSMUST00000249279
No results found for transcript ID: ENSMUST00000249281
No results found for transcript ID: ENSMUST00000249282
No results found for transcript ID: ENSMUST00000249283
No results found for transcript ID: ENSMUST00000249285
No results found for transcript ID: ENSMUST00000249286
No results found for transcript ID: ENSMUST00000249288
No results found for transcript ID: ENSMUST00000249289
No results

In [14]:
# Create a gene to transcript map
gene_to_transcript_mapping = {}

# Iterate through the transcript to gene map
for transcript, gene in transcript_to_gene_mapping.items():
    # Skip empty genes
    if gene:
        # If the gene is not in the gene to transcript map, create an empty list
        if gene not in gene_to_transcript_mapping:
            gene_to_transcript_mapping[gene] = []
        # Append the transcript to the gene in the gene to transcript map
        gene_to_transcript_mapping[gene].append(transcript)

In [17]:
# save these dictionaries as .pkl files
with open('../../results/transcript_to_gene_mapping.pkl', 'wb') as file:
    pickle.dump(transcript_to_gene_mapping, file)
with open('../../results/gene_to_transcript_mapping.pkl', 'wb') as file:
    pickle.dump(gene_to_transcript_mapping, file)