# Process GTF file from Karri et al. for mm10 to mm39 Liftover

## <br> 1. Import Required Packages

In [24]:
import numpy as np
import pandas as pd


## <br> 2. Lift Over Karri et al. Annotations from mm10 to mm39

In [25]:
# Column names for the nine tab-separated GTF fields (the file has no header row)
columns = ['seqname', 'source', 'feature', 'start', 'end', 'score', 'strand', 'frame', 'attributes']

# Read the GTF file into a DataFrame.
# 'seqname' is forced to str and low_memory=False parses each column in one
# pass, so column dtypes do not depend on how pandas chunks the file
# (the saved output shows a warning from this call, presumably a mixed-type
# DtypeWarning -- TODO confirm it is gone after this change).
RAW_Master = pd.read_csv('./RAW_Data/GTFB_FullGeneBody_MouseLiver_snRNAseq.gtf',
                         delimiter='\t',
                         header=None,  # No header in the file
                         names=columns,
                         dtype={'seqname': str},
                         low_memory=False)


  RAW_Master = pd.read_csv('./RAW_Data/GTFB_FullGeneBody_MouseLiver_snRNAseq.gtf',


In [43]:
# Work on a copy so RAW_Master itself is left unmodified
bed_df = RAW_Master.copy()

# Use the (0-based) row index as a unique ID; it is carried through the BED
# file so lifted intervals can be joined back to their original annotation
bed_df['ID'] = bed_df.index

# Add 'chr' to every seqname value
bed_df['seqname'] = 'chr' + bed_df['seqname'].astype(str)

# Standard chromosomes to keep: chr1-chr19, chrX, chrY and chrMT
standard_chromosomes = {f'chr{i}' for i in range(1, 20)} | {'chrX', 'chrY', 'chrMT'}

# Keep only standard chromosomes and the six BED columns, with 'ID' placed
# between 'end' and 'score'. The explicit .copy() makes this an independent
# frame, so the column assignment below cannot trigger SettingWithCopyWarning.
bed_df_filtered = bed_df.loc[
    bed_df['seqname'].isin(standard_chromosomes),
    ['seqname', 'start', 'end', 'ID', 'score', 'strand'],
].copy()

# Rename the mitochondrial contig from 'chrMT' to 'chrM'
# (presumably to match the naming used in the UCSC chain file -- verify)
bed_df_filtered['seqname'] = bed_df_filtered['seqname'].replace({'chrMT': 'chrM'})

# NOTE(review): GTF coordinates are 1-based inclusive while BED is 0-based
# half-open. 'start' is written to the BED file unchanged here and read back
# unchanged after liftOver -- confirm whether a -1 before / +1 after
# conversion is needed for exact coordinates.

bed_df_filtered.head()

Unnamed: 0,seqname,start,end,ID,score,strand
0,chr1,4470732,4471325,0,1000,-
1,chr1,4744316,4745480,1,1000,-
2,chr1,21176511,21180041,2,1000,+
3,chr1,191420199,191424644,3,1000,+
4,chr11,94731060,94731724,4,1000,-


In [45]:
# Write the filtered frame as a tab-separated BED file for liftOver:
# data rows only, with neither a header line nor the DataFrame index
bed_df_filtered.to_csv(
    './RAW_Data/mm10_Karri_Annotations_Restructured_For_Liftovers.bed',
    sep='\t',
    index=False,
    header=False,
)

In [46]:
# Download the UCSC LiftOver executable (Linux x86_64 build) into the working directory
!wget -O liftOver https://hgdownload.soe.ucsc.edu/admin/exe/linux.x86_64/liftOver

# Make the file executable
!chmod +x liftOver

# Download the mm10 to mm39 liftover chain
# (saved locally as 'mm10Tomm39.over.chain.gz', the name the later cells refer to)
!wget -O mm10Tomm39.over.chain.gz https://hgdownload.soe.ucsc.edu/goldenPath/mm10/liftOver/mm10ToMm39.over.chain.gz

--2025-02-25 11:11:00--  https://hgdownload.soe.ucsc.edu/admin/exe/linux.x86_64/liftOver
Resolving hgdownload.soe.ucsc.edu (hgdownload.soe.ucsc.edu)... 128.114.119.163
Connecting to hgdownload.soe.ucsc.edu (hgdownload.soe.ucsc.edu)|128.114.119.163|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 24387728 (23M)
Saving to: ‘liftOver’


2025-02-25 11:11:03 (9.87 MB/s) - ‘liftOver’ saved [24387728/24387728]

--2025-02-25 11:11:04--  https://hgdownload.soe.ucsc.edu/goldenPath/mm10/liftOver/mm10ToMm39.over.chain.gz
Resolving hgdownload.soe.ucsc.edu (hgdownload.soe.ucsc.edu)... 128.114.119.163
Connecting to hgdownload.soe.ucsc.edu (hgdownload.soe.ucsc.edu)|128.114.119.163|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 25048 (24K) [application/x-gzip]
Saving to: ‘mm10Tomm39.over.chain.gz’


2025-02-25 11:11:04 (384 KB/s) - ‘mm10Tomm39.over.chain.gz’ saved [25048/25048]



In [47]:
# Run UCSC LiftOver to map the coordinates from mm10 to mm39
# Define variables
# min_match is passed to liftOver as -minMatch and is also embedded in the
# output file names, so runs with different thresholds do not overwrite each other
min_match = 0.7
input_bed = "./RAW_Data/mm10_Karri_Annotations_Restructured_For_Liftovers.bed"
chain_file = "mm10Tomm39.over.chain.gz"
output_bed = f"./RAW_Data/Karri_et_al_mm39_liftover_annotations_ratio_{min_match}.bed"
unmapped_bed = f"./RAW_Data/unmapped_coordinates_ratio_{min_match}.bed"

# Construct and run the command
# Argument order: input BED, chain file, output BED for mapped entries,
# output file for entries that could not be mapped
!./liftOver -minMatch={min_match} {input_bed} {chain_file} {output_bed} {unmapped_bed}

Reading liftover chains
Mapping coordinates


In [48]:
# cleanup directory
!rm -r liftOver
!rm -r mm10Tomm39.over.chain.gz

In [49]:
# Read LiftOver output (only successfully mapped entries).
# Reuse output_bed -- the exact path handed to liftOver -- instead of
# rebuilding the file name by hand, so the two can never drift apart.
# The file has no header row; columns follow the BED layout written earlier.
liftover_df = pd.read_csv(output_bed, sep='\t', header=None,
                          names=['seqname', 'start', 'end', 'ID', 'score', 'strand'])

In [50]:
# Annotation fields that were not written to the BED file, plus the 'ID'
# key needed to join them back onto the lifted coordinates
annotation_columns = ['ID', 'source', 'feature', 'attributes', 'frame']
original_annot = bed_df.loc[:, annotation_columns]

In [51]:
# Merge liftover_df with original_annot using 'ID', keeping all rows from liftover_df.
# validate='one_to_one' makes pandas raise a MergeError if 'ID' is duplicated
# on either side, instead of silently multiplying rows in the merged result.
merged_df = liftover_df.merge(original_annot, on='ID', how='left',
                              validate='one_to_one')

In [54]:
# Put the merged columns back into the nine-field GTF order, dropping the
# helper 'ID' column that was only needed for the join
gtf_column_order = ['seqname', 'source', 'feature', 'start', 'end',
                    'score', 'strand', 'frame', 'attributes']
merged_df2 = merged_df.loc[:, gtf_column_order]

Unnamed: 0,seqname,source,feature,start,end,score,strand,frame,attributes
0,chr1,Genebody_mm10_lnc48261,exon,4540955,4541548,1000,-,.,"gene_id ""lnc_inter_chr1_1""; transcript_id ""lnc..."
1,chr1,Genebody_mm10_lnc48261,exon,4814539,4815703,1000,-,.,"gene_id ""lnc_inter_chr1_10""; transcript_id ""ln..."
2,chr1,Genebody_mm10_lnc48261,exon,21246735,21250265,1000,+,.,"gene_id ""lnc_inter_chr1_100""; transcript_id ""l..."
3,chr1,Genebody_mm10_lnc48261,exon,191152396,191156841,1000,+,.,"gene_id ""lnc_inter_chr1_1000""; transcript_id ""..."
4,chr11,Genebody_mm10_lnc48261,exon,94621886,94622550,1000,-,.,"gene_id ""lnc_inter_chr11_10000""; transcript_id..."
...,...,...,...,...,...,...,...,...,...
82653,chrM,insdc,exon,9877,10173,.,+,.,"gene_id ""ENSMUSG00000065947""; transcript_id ""E..."
82654,chrM,insdc,exon,10167,11544,.,+,.,"gene_id ""ENSMUSG00000064363""; transcript_id ""E..."
82655,chrM,insdc,exon,11742,13565,.,+,.,"gene_id ""ENSMUSG00000064367""; transcript_id ""E..."
82656,chrM,insdc,exon,13552,14070,.,-,.,"gene_id ""ENSMUSG00000064368""; transcript_id ""E..."


## 3. Save restructured mm39 annotation file (GTF)

In [58]:
# Export merged_df2 to a GTF file: tab-separated, no header row, no index.
# QUOTE_NONE writes the 'attributes' field verbatim, so its embedded double
# quotes are not wrapped or doubled.
# The previous escapechar='' argument is dropped: the csv module rejects an
# empty escapechar on Python 3.11+, and no escaping is needed because the
# fields were read from a tab-delimited file and so contain no tabs.
import csv
merged_df2.to_csv('./Karri_Annotations_Restructured_mm39.gtf', sep='\t',
                  index=False, header=False, quoting=csv.QUOTE_NONE)