Generate a gene-level transcriptome: one representative transcript per gene, selected from the Ensembl mouse cDNA and ncRNA sequences.

In [1]:
# Download the transcriptome sequence and single-cell sequencing data
!mkdir temporary_data
!mkdir temporary_data/gene_level_transcriptome
!wget http://ftp.ensembl.org/pub/release-104/fasta/mus_musculus/cdna/Mus_musculus.GRCm39.cdna.all.fa.gz -O temporary_data/gene_level_transcriptome/Mus_musculus.GRCm39.cdna.all.fa.gz
!gzip -d temporary_data/gene_level_transcriptome/Mus_musculus.GRCm39.cdna.all.fa.gz -f
!wget http://ftp.ensembl.org/pub/release-104/fasta/mus_musculus/ncrna/Mus_musculus.GRCm39.ncrna.fa.gz -O temporary_data/gene_level_transcriptome/Mus_musculus.GRCm39.ncrna.fa.gz
!gzip -d temporary_data/gene_level_transcriptome/Mus_musculus.GRCm39.ncrna.fa.gz -f

mkdir: cannot create directory ‘temporary_data’: File exists
--2021-08-16 12:09:42--  http://ftp.ensembl.org/pub/release-104/fasta/mus_musculus/cdna/Mus_musculus.GRCm39.cdna.all.fa.gz
Resolving ftp.ensembl.org (ftp.ensembl.org)... 193.62.197.76
Connecting to ftp.ensembl.org (ftp.ensembl.org)|193.62.197.76|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 51166836 (49M) [application/x-gzip]
Saving to: ‘temporary_data/gene_level_transcriptome/Mus_musculus.GRCm39.cdna.all.fa.gz’


2021-08-16 12:10:46 (783 KB/s) - ‘temporary_data/gene_level_transcriptome/Mus_musculus.GRCm39.cdna.all.fa.gz’ saved [51166836/51166836]

--2021-08-16 12:10:48--  http://ftp.ensembl.org/pub/release-104/fasta/mus_musculus/ncrna/Mus_musculus.GRCm39.ncrna.fa.gz
Resolving ftp.ensembl.org (ftp.ensembl.org)... 193.62.197.76
Connecting to ftp.ensembl.org (ftp.ensembl.org)|193.62.197.76|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 6160484 (5.9M) [application/x-gzip]
Savi

In [2]:
# --- Result --------------------------------------------------------------
# CSV that receives the final one-transcript-per-gene table
gene_level_transcriptome_file = 'temporary_data/gene_level_transcriptome/gene_level_transcriptome.csv'

# --- Sources -------------------------------------------------------------
# Transcript sequences in FASTA format (cDNA and ncRNA)
cdna_fasta_file = 'temporary_data/gene_level_transcriptome/Mus_musculus.GRCm39.cdna.all.fa'
ncrna_fasta_file = 'temporary_data/gene_level_transcriptome/Mus_musculus.GRCm39.ncrna.fa'
# APPRIS principal-isoform annotation table
APPRIS_file = 'data/isoforms/mouse_appris_data.principal.txt'

In [3]:
import numpy as np
import pandas as pd
from Bio import SeqIO

In [4]:
def _split_optional_header_fields(tokens):
    """Parse the optional ``gene_symbol:`` / ``description:`` header fields.

    Parameters
    ----------
    tokens : list of str
        Whitespace-separated header tokens that follow the six mandatory
        fields.

    Returns
    -------
    tuple
        ``(gene_symbol, description)``; each is ``pd.NA`` when the
        corresponding ``key:`` prefix is not present.
    """
    gene_symbol = pd.NA
    description = pd.NA
    # Identify each field by its prefix rather than by position, so a header
    # that carries a description but no gene symbol is not mis-assigned.
    if tokens and tokens[0].startswith('gene_symbol:'):
        gene_symbol = tokens[0][len('gene_symbol:'):]
        tokens = tokens[1:]
    if tokens and tokens[0].startswith('description:'):
        # The description is free text: it runs to the end of the header.
        description = ' '.join(tokens)[len('description:'):]
    return gene_symbol, description


def load_transcriptome_fasta_into_dataframe(fasta_file):
    """Load a transcript FASTA file into a DataFrame, one row per record.

    Each header is split on whitespace and read as::

        <transcript_id[.version]> <seq_type> <location> gene:<gene_id[.version]>
        gene_biotype:<...> transcript_biotype:<...>
        [gene_symbol:<...>] [description:<free text>]

    Version suffixes (everything after the first ``.``) are removed from the
    transcript and gene IDs.

    Parameters
    ----------
    fasta_file : str or file handle
        Passed unchanged to ``SeqIO.parse(fasta_file, 'fasta')``.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``transcript_id`` with columns ``seq_type``, ``location``,
        ``gene_id``, ``gene_biotype``, ``transcript_biotype``, ``gene_symbol``,
        ``description``, ``seq_length`` and ``sequence``. Missing optional
        fields are ``pd.NA``.

    Raises
    ------
    IndexError
        If a header has fewer than the six mandatory fields; the offending
        header tokens are printed before the error is re-raised.
    """
    d = {'transcript_id':[], 'seq_type':[], 'location':[], 'gene_id':[],
        'gene_biotype':[], 'transcript_biotype':[], 'gene_symbol':[],
        'description':[], 'seq_length':[], 'sequence':[]}

    for record in SeqIO.parse(fasta_file, 'fasta'):
        split_rd = record.description.split()
        # Only header parsing is guarded, so the tokens printed on failure
        # always belong to the record that actually failed. Errors raised by
        # the FASTA reader itself propagate unmasked.
        try:
            transcript_id = split_rd[0].split('.')[0]
            seq_type = split_rd[1]
            location = split_rd[2]
            gene_id = split_rd[3][len('gene:'):].split('.')[0]
            gene_biotype = split_rd[4][len('gene_biotype:'):]
            transcript_biotype = split_rd[5][len('transcript_biotype:'):]
            gene_symbol, description = _split_optional_header_fields(split_rd[6:])
        except Exception:
            print(split_rd)
            raise

        sequence = str(record.seq)

        # Append only after the whole record parsed, keeping columns aligned.
        d['transcript_id'].append(transcript_id)
        d['seq_type'].append(seq_type)
        d['location'].append(location)
        d['gene_id'].append(gene_id)
        d['gene_biotype'].append(gene_biotype)
        d['transcript_biotype'].append(transcript_biotype)
        d['gene_symbol'].append(gene_symbol)
        d['description'].append(description)
        d['seq_length'].append(len(sequence))
        d['sequence'].append(sequence)

    df = pd.DataFrame.from_dict(d)
    df = df.set_index('transcript_id')
    return df

In [5]:
# Parse the cDNA and ncRNA FASTA files with the same loader (cDNA first),
# then stack the two tables row-wise into a single transcriptome table
transcriptome_cdna, transcriptome_ncrna = (
    load_transcriptome_fasta_into_dataframe(path)
    for path in (cdna_fasta_file, ncrna_fasta_file)
)
transcriptome = pd.concat([transcriptome_cdna, transcriptome_ncrna], axis=0)

In [6]:
# Load the annotating principal splice isoforms (APPRIS) data
APPRIS = pd.read_csv(APPRIS_file, sep='\t', header=None)
# The file is read without a header, so columns are positional: column 2 is
# used as the transcript ID key and column 4 as the APPRIS annotation.
appris_by_transcript = APPRIS.set_index(2)[4].rename('APPRIS')
# A transcript ID repeated in the annotation would multiply rows in the left
# merge below, so keep only its first occurrence.
appris_by_transcript = appris_by_transcript[
    ~appris_by_transcript.index.duplicated(keep='first')]

# Annotate the main (principal) isoforms of each protein-coding gene.
# Transcripts absent from the APPRIS table get a null 'APPRIS' value.
# Dropping any existing 'APPRIS' column first keeps this cell re-runnable:
# merging twice would otherwise create 'APPRIS_x'/'APPRIS_y' columns.
transcriptome = transcriptome.drop(columns='APPRIS', errors='ignore').merge(
    appris_by_transcript, how='left', left_index=True, right_index=True)

# Find a unique transcript for each gene
has_principal = transcriptome['APPRIS'].notnull()
gene_has_principal = has_principal.groupby(transcriptome['gene_id']).transform('any')
# Only consider the main isoforms if the annotation exists for the gene;
# otherwise every isoform of that gene remains a candidate.
candidates = transcriptome.loc[has_principal | ~gene_has_principal]
# Pick the shortest candidate per gene (first one on ties). Genes come out in
# sorted gene_id order. Using the label returned by idxmin directly avoids
# integer-positional indexing into a label-indexed Series.
unique_transcripts = candidates.groupby('gene_id')['seq_length'].idxmin().tolist()

transcriptome_unique = transcriptome.loc[unique_transcripts]

In [7]:
# Write the one-transcript-per-gene table (index included) to the output CSV
transcriptome_unique.to_csv(path_or_buf=gene_level_transcriptome_file)