Generate a gene-level transcriptome.

In [167]:
# Download the transcriptome sequence and single-cell sequencing data
!mkdir temporary_data
!mkdir temporary_data/gene_level_transcriptome
!wget http://ftp.ensembl.org/pub/release-104/fasta/mus_musculus/cdna/Mus_musculus.GRCm39.cdna.all.fa.gz -O temporary_data/gene_level_transcriptome/Mus_musculus.GRCm39.cdna.all.fa.gz
!gzip -d temporary_data/gene_level_transcriptome/Mus_musculus.GRCm39.cdna.all.fa.gz -f
!wget http://ftp.ensembl.org/pub/release-104/fasta/mus_musculus/ncrna/Mus_musculus.GRCm39.ncrna.fa.gz -O temporary_data/gene_level_transcriptome/Mus_musculus.GRCm39.ncrna.fa.gz
!gzip -d temporary_data/gene_level_transcriptome/Mus_musculus.GRCm39.ncrna.fa.gz -f

mkdir: cannot create directory ‘temporary_data’: File exists
--2021-08-15 13:26:12--  http://ftp.ensembl.org/pub/release-104/fasta/mus_musculus/cdna/Mus_musculus.GRCm39.cdna.all.fa.gz
Resolving ftp.ensembl.org (ftp.ensembl.org)... 193.62.193.139
Connecting to ftp.ensembl.org (ftp.ensembl.org)|193.62.193.139|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 51166836 (49M) [application/octet-stream]
Saving to: ‘temporary_data/gene_level_transcriptome/Mus_musculus.GRCm39.cdna.all.fa.gz’


2021-08-15 13:28:08 (432 KB/s) - ‘temporary_data/gene_level_transcriptome/Mus_musculus.GRCm39.cdna.all.fa.gz’ saved [51166836/51166836]

--2021-08-15 13:28:09--  http://ftp.ensembl.org/pub/release-104/fasta/mus_musculus/ncrna/Mus_musculus.GRCm39.ncrna.fa.gz
Resolving ftp.ensembl.org (ftp.ensembl.org)... 193.62.197.76
Connecting to ftp.ensembl.org (ftp.ensembl.org)|193.62.197.76|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 6160484 (5.9M) [application/octe

In [185]:
# --- Output -----------------------------------------------------------------
# CSV that receives the one-transcript-per-gene table
gene_level_transcriptome_file = 'temporary_data/gene_level_transcriptome/gene_level_transcriptome.csv'

# --- Inputs -----------------------------------------------------------------
# Decompressed Ensembl FASTA files (cDNA and non-coding RNA)
cdna_fasta_file = 'temporary_data/gene_level_transcriptome/Mus_musculus.GRCm39.cdna.all.fa'
ncrna_fasta_file = 'temporary_data/gene_level_transcriptome/Mus_musculus.GRCm39.ncrna.fa'
# Tab-separated APPRIS principal-isoform annotation
APPRIS_file = 'data/isoforms/mouse_appris_data.principal.txt'

In [186]:
import numpy as np
import pandas as pd
from Bio import SeqIO

In [187]:
def load_transcriptome_fasta_into_dataframe(fasta_file):
    """Parse a transcript FASTA file into a DataFrame, one row per record.

    Each record description is split on whitespace and read as::

        <transcript_id> <seq_type> <location> gene:<gene_id>
        gene_biotype:<...> transcript_biotype:<...>
        [gene_symbol:<...>] [description:<free text>]

    The first six tokens are read by position. The two optional trailing
    fields are recognised by their ``gene_symbol:`` / ``description:`` tag, so
    a record that carries a description but no gene symbol is still parsed
    correctly. The part after the first ``.`` (the version suffix) is removed
    from the transcript ID and the gene ID.

    Parameters
    ----------
    fasta_file
        Path (or handle) passed straight to ``SeqIO.parse(..., 'fasta')``.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``transcript_id`` with the columns ``seq_type``,
        ``location``, ``gene_id``, ``gene_biotype``, ``transcript_biotype``,
        ``gene_symbol``, ``description``, ``seq_length`` and ``sequence``.
        ``gene_symbol`` and ``description`` are ``pd.NA`` when the tag is
        absent from the record.

    Raises
    ------
    Exception
        Any error raised while reading or parsing is re-raised unchanged. If
        at least one record had been read, the tokens of the most recently
        read record are printed first to help locate the offending entry.
    """
    symbol_tag = 'gene_symbol:'
    description_tag = 'description:'

    d = {'transcript_id':[], 'seq_type':[], 'location':[], 'gene_id':[],
        'gene_biotype':[], 'transcript_biotype':[], 'gene_symbol':[],
        'description':[], 'seq_length':[], 'sequence':[]}

    # Pre-bind so the error handler never hits an unbound name when the
    # failure happens before the first record is read (e.g. a missing file).
    split_rd = None
    try:
        for record in SeqIO.parse(fasta_file, 'fasta'):
            split_rd = record.description.split()
            sequence = str(record.seq)

            # Mandatory, positional fields; tag prefixes are sliced off.
            d['transcript_id'].append(split_rd[0].split('.')[0])
            d['seq_type'].append(split_rd[1])
            d['location'].append(split_rd[2])
            d['gene_id'].append(split_rd[3][len('gene:'):].split('.')[0])
            d['gene_biotype'].append(split_rd[4][len('gene_biotype:'):])
            d['transcript_biotype'].append(
                split_rd[5][len('transcript_biotype:'):])

            # Optional fields, identified by tag rather than by position.
            optional = split_rd[6:]
            gene_symbol = pd.NA
            description = pd.NA
            if optional and optional[0].startswith(symbol_tag):
                gene_symbol = optional[0][len(symbol_tag):]
                optional = optional[1:]
            if optional and optional[0].startswith(description_tag):
                # The description is free text: re-join every remaining token.
                description = ' '.join(optional)[len(description_tag):]
            d['gene_symbol'].append(gene_symbol)
            d['description'].append(description)

            d['seq_length'].append(len(sequence))
            d['sequence'].append(sequence)

    except Exception:
        # Show the header being processed, then propagate the original error.
        if split_rd is not None:
            print(split_rd)
        raise

    df = pd.DataFrame.from_dict(d)
    df = df.set_index('transcript_id')
    return df

In [188]:
# Parse the cDNA and ncRNA FASTA files, then stack the two tables row-wise
# into a single transcript table
transcriptome_cdna, transcriptome_ncrna = (
    load_transcriptome_fasta_into_dataframe(fasta_path)
    for fasta_path in (cdna_fasta_file, ncrna_fasta_file)
)
transcriptome = pd.concat([transcriptome_cdna, transcriptome_ncrna], axis=0)

In [189]:
# Load the APPRIS (annotation of principal splice isoforms) table. The file is
# read without a header row; column 2 is used as the transcript ID and
# column 4 as the APPRIS label.
APPRIS = pd.read_csv(APPRIS_file, sep='\t', header=None)
appris_labels = APPRIS.set_index(2).rename(columns={4:'APPRIS'})['APPRIS']

# Attach the APPRIS label to every transcript (missing where the transcript is
# not in the APPRIS table). An 'APPRIS' column left over from a previous run of
# this cell is dropped first; otherwise the merge would produce
# APPRIS_x / APPRIS_y and the lookup below would raise a KeyError.
transcriptome = transcriptome.drop(columns='APPRIS', errors='ignore').merge(
                   appris_labels,
                   how='left', left_index=True, right_index=True)

# Find a unique transcript for each gene (genes are visited in sorted order)
transcriptome_grouped = transcriptome.groupby('gene_id')
unique_transcripts = []
for gene_id, df in transcriptome_grouped:
    # Only consider the main isoforms if the annotation exists
    df_main_isoforms = df.loc[df['APPRIS'] == 'PRINCIPAL:1']
    if df_main_isoforms.shape[0] == 0:
        df_main_isoforms = df
    # Keep the shortest candidate isoform. Series.idxmin returns the index
    # label directly, avoiding positional [0] access on a label-indexed Series.
    unique_transcripts.append(df_main_isoforms['seq_length'].idxmin())

transcriptome_unique = transcriptome.loc[unique_transcripts]

In [190]:
# Display the table of transcripts selected per gene
transcriptome_unique

Unnamed: 0_level_0,seq_type,location,gene_id,gene_biotype,transcript_biotype,gene_symbol,description,seq_length,sequence,APPRIS
transcript_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
ENSMUST00000000001,cdna,chromosome:GRCm39:3:108014596:108053462:-1,ENSMUSG00000000001,protein_coding,protein_coding,Gnai3,guanine nucleotide binding protein (G protein)...,3262,CACACATCCGGTTCTTCCGGGAGCTAGGGGAGCTGACGGAGAAGGC...,PRINCIPAL:1
ENSMUST00000000003,cdna,chromosome:GRCm39:X:76881507:76897229:-1,ENSMUSG00000000003,protein_coding,protein_coding,Pbsn,probasin [Source:MGI Symbol;Acc:MGI:1860484],902,GTCAGTGCACAACTGCCAACTGGGATGCAGAACACTGCTCACGCCA...,PRINCIPAL:1
ENSMUST00000000028,cdna,chromosome:GRCm39:16:18599197:18630722:-1,ENSMUSG00000000028,protein_coding,protein_coding,Cdc45,cell division cycle 45 [Source:MGI Symbol;Acc:...,2143,TGGAAACACATTCAAATAATGTGTGACTGAATTTACTTTATGTCTA...,PRINCIPAL:1
ENSMUST00000228259,ncrna,chromosome:GRCm39:7:142130719:142131670:-1,ENSMUSG00000000031,lncRNA,lncRNA,H19,"H19, imprinted maternally expressed transcript...",761,GGTTGGAGAGGAATGGGGAGCCAGACATTCATCCCGGTTACTTTTG...,
ENSMUST00000124775,cdna,chromosome:GRCm39:X:160017569:160022860:1,ENSMUSG00000000037,protein_coding,processed_transcript,Scml2,Scm polycomb group protein like 2 [Source:MGI ...,517,ATGTACACAATTTTTGTTTTCATTTCATAACTCAACAAAGCTGAAA...,
...,...,...,...,...,...,...,...,...,...,...
ENSMUST00020182589,ncrna,chromosome:GRCm39:11:11090514:11090643:1,ENSMUSG00002076988,rRNA,rRNA,5S_rRNA,5S ribosomal RNA [Source:RFAM;Acc:RF00001],130,GTCTATGGCCATACCACCCTGAAAGCATGTGATCTCAGAAACTAAA...,
ENSMUST00000083836,ncrna,chromosome:GRCm39:6:97549037:97549199:-1,ENSMUSG00002076989,snRNA,snRNA,U1,U1 spliceosomal RNA [Source:RFAM;Acc:RF00003],163,ATACTTACTTGGCTGCGGAGGTACCATGATCACAAAGGTGGTTTTC...,
ENSMUST00020183326,ncrna,chromosome:GRCm39:11:106392069:106392203:1,ENSMUSG00002076990,snoRNA,snoRNA,,,135,ACGCGCTGCCTTTGAGCCCCCGGCCCACCTTCTCGTGGTGCCGGGG...,
ENSMUST00020182837,ncrna,chromosome:GRCm39:7:74924837:74925159:1,ENSMUSG00002076991,misc_RNA,misc_RNA,7SK,7SK RNA [Source:RFAM;Acc:RF00100],323,GCATCTGAGGGCAGTTTGGCTGAAACTGCATCCCTGTCCATCCATA...,


In [191]:
# Write the per-gene transcript table (index included) to the output CSV
transcriptome_unique.to_csv(path_or_buf=gene_level_transcriptome_file)