# Merge GTF file from Karri et al. with MGI Annotations

## <br> 1. Import Required Packages

In [53]:
import numpy as np
import pandas as pd
import re

## <br> 2. Import and Process Data

In [54]:
# Convert the Karri et al. annotation from GTF to BED format.
# gff2bed reads the annotation on stdin and writes BED records to stdout, which is
# redirected into a .bed file next to the input in ./RAW_Data/.
# NOTE(review): the input carries a .gtf extension but is passed to the GFF converter —
# confirm gff2bed (rather than a GTF-specific converter) is the intended tool here.
!gff2bed < ./RAW_Data/Karri_Annotations_Restructured_mm39.gtf > ./RAW_Data/Karri_Annotations_Restructured_mm39.bed




In [55]:
# Sort BED files by chromosome and start coordinate
!sort -k1,1 -k2,2n ./RAW_Data/Karri_Annotations_Restructured_mm39.bed > ./RAW_Data/sorted_Karri_Annotations_Restructured_mm39.bed
!sort -k1,1 -k2,2n ./RAW_Data/MGI_lncRNAs_mm39_Processed.txt > ./RAW_Data/sorted_MGI_lncRNAs_mm39_Processed.bed


In [56]:
# Perform strand-specific intersection using bedtools
!bedtools intersect -sorted -loj -a ./RAW_Data/sorted_Karri_Annotations_Restructured_mm39.bed -b ./RAW_Data/sorted_MGI_lncRNAs_mm39_Processed.bed -s > ./RAW_Data/Karri_vs_MGI_lncRNA_Intersection.bed


In [57]:
# Create a temporary file with headers
!echo -e "chrom\tstart\tend\tblank\tscore\tstrand\tsource\tfeature\tframe\tattributes\tmgi_chr\tmgi_start\tmgi_end\tmgi_symbol\tmgi_score\tmgi_strand" > ./RAW_Data/Karri_vs_MGI_lncRNA_Intersection_temp.bed

# Append the original intersection data to the temporary file
!tail -n +2 ./RAW_Data/Karri_vs_MGI_lncRNA_Intersection.bed >> ./RAW_Data/Karri_vs_MGI_lncRNA_Intersection_temp.bed

# Replace the original file with the updated one
!mv ./RAW_Data/Karri_vs_MGI_lncRNA_Intersection_temp.bed ./RAW_Data/Karri_vs_MGI_lncRNA_Intersection.txt



In [58]:
# Load the headed, tab-separated Karri-vs-MGI intersection table into a DataFrame.
# low_memory=False makes pandas infer each column's dtype from the whole file rather
# than chunk by chunk. Columns such as 'score' hold both numbers and '.' placeholders,
# and chunked inference can leave such a column with mixed Python types (and emit a
# warning pointing at this call).
RAW_Master = pd.read_csv('./RAW_Data/Karri_vs_MGI_lncRNA_Intersection.txt',
                         sep='\t', header=0, low_memory=False)


RAW_Master

  RAW_Master = pd.read_csv('./RAW_Data/Karri_vs_MGI_lncRNA_Intersection.txt',


Unnamed: 0,chrom,start,end,blank,score,strand,source,feature,frame,attributes,mgi_chr,mgi_start,mgi_end,mgi_symbol,mgi_score,mgi_strand
0,chr1,3224554,3224816,.,1000,-,Genebody_mm10_lnc48261,exon,.,"gene_id ""lnc_inter_chr1_15560""; transcript_id ...",.,-1,-1,.,-1,.
1,chr1,3273138,3274559,.,1000,-,Genebody_mm10_lnc48261,exon,.,"gene_id ""lnc_inter_chr1_15561""; transcript_id ...",.,-1,-1,.,-1,.
2,chr1,3275710,3276991,.,1000,-,Genebody_mm10_lnc48261,exon,.,"gene_id ""lnc_inter_chr1_15562""; transcript_id ...",.,-1,-1,.,-1,.
3,chr1,3284704,3741721,.,1000,-,GB_RefseqNM19801,exon,.,"gene_id ""Xkr4""; transcript_id ""Xkr4""; gene_nam...",chr1,3717532,3729127,Gm19938,1000,-
4,chr1,3536809,3583776,.,1000,+,EnsmblNR4698,exon,.,"gene_id ""Gm1992""; transcript_id ""Gm1992""; gene...",chr1,3536810,3583776,Gm1992,1000,+
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
84244,chrY,90614769,90617133,.,1000,-,EnsmblNR4698,exon,.,"gene_id ""Gm28300""; transcript_id ""Gm28300""; ge...",chrY,90614770,90617133,Gm28300,1000,-
84245,chrY,90676614,90678894,.,1000,-,EnsmblNR4698,exon,.,"gene_id ""Gm28301""; transcript_id ""Gm28301""; ge...",chrY,90676615,90678894,Gm28301,1000,-
84246,chrY,90762408,90766319,.,1000,-,GB_RefseqNR2077,exon,.,"gene_id ""G530011O06Rik@chrY(-)""; transcript_id...",chrY,90763696,90766736,G530011O06Riky,1000,-
84247,chrY,90796710,90827734,.,1000,+,GB_RefseqNM#GB_RefseqNR1159,exon,.,"gene_id ""Erdr1""; transcript_id ""Erdr1""; gene_n...",chrY,90796007,90827734,Erdr1y,1000,+


In [59]:
# Break the 'attributes' string into one column per ';'-separated field
attributes = RAW_Master['attributes'].str.split(';', expand=True)

# New column name -> position of the corresponding field inside the attribute string.
# Each field is expected to look like: key "value"
# NOTE(review): fields are selected purely by position (position 3 is skipped); confirm
# that every record lists its attributes in this order.
field_positions = {
    'gene_id': 0,
    'transcript_id': 1,
    'gene_name': 2,
    'gene_biotype': 4,
    'NR_annotation': 5,
}

for column_name, position in field_positions.items():
    # Trim surrounding whitespace, split on spaces, keep the second token (the value)
    # and remove its enclosing double quotes.
    field_tokens = attributes[position].str.strip().str.split(' ', expand=True)
    RAW_Master[column_name] = field_tokens[1].str.strip('"')

# Second '_'-separated piece of NR_annotation (missing when there is no '_' in the value)
RAW_Master['NR_annotation2'] = RAW_Master['NR_annotation'].str.split('_').str[1]

# The raw attribute string is no longer needed once its fields have been extracted
RAW_Master.drop(columns=['attributes'], inplace=True)

RAW_Master

Unnamed: 0,chrom,start,end,blank,score,strand,source,feature,frame,mgi_chr,...,mgi_end,mgi_symbol,mgi_score,mgi_strand,gene_id,transcript_id,gene_name,gene_biotype,NR_annotation,NR_annotation2
0,chr1,3224554,3224816,.,1000,-,Genebody_mm10_lnc48261,exon,.,.,...,-1,.,-1,.,lnc_inter_chr1_15560,lnc_inter_chr1_15560,lnc15560,lncRNA,lnc15560,
1,chr1,3273138,3274559,.,1000,-,Genebody_mm10_lnc48261,exon,.,.,...,-1,.,-1,.,lnc_inter_chr1_15561,lnc_inter_chr1_15561,lnc15561,lncRNA,lnc15561,
2,chr1,3275710,3276991,.,1000,-,Genebody_mm10_lnc48261,exon,.,.,...,-1,.,-1,.,lnc_inter_chr1_15562,lnc_inter_chr1_15562,lnc15562,lncRNA,lnc15562,
3,chr1,3284704,3741721,.,1000,-,GB_RefseqNM19801,exon,.,chr1,...,3729127,Gm19938,1000,-,Xkr4,Xkr4,Xkr4,NM,,
4,chr1,3536809,3583776,.,1000,+,EnsmblNR4698,exon,.,chr1,...,3583776,Gm1992,1000,+,Gm1992,Gm1992,Gm1992,antisense,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
84244,chrY,90614769,90617133,.,1000,-,EnsmblNR4698,exon,.,chrY,...,90617133,Gm28300,1000,-,Gm28300,Gm28300,Gm28300,lincRNA,,
84245,chrY,90676614,90678894,.,1000,-,EnsmblNR4698,exon,.,chrY,...,90678894,Gm28301,1000,-,Gm28301,Gm28301,Gm28301,lincRNA,,
84246,chrY,90762408,90766319,.,1000,-,GB_RefseqNR2077,exon,.,chrY,...,90766736,G530011O06Riky,1000,-,G530011O06Rik@chrY(-),G530011O06Rik@chrY(-),G530011O06Rik@chrY(-),NR,,
84247,chrY,90796710,90827734,.,1000,+,GB_RefseqNM#GB_RefseqNR1159,exon,.,chrY,...,90827734,Erdr1y,1000,+,Erdr1,Erdr1,Erdr1,NM#NR,,


In [60]:
# List the distinct gene_biotype labels present in RAW_Master (missing values included)
biotype_column = RAW_Master['gene_biotype']
biotype_column.unique()

array(['lncRNA', 'NM', 'antisense', 'lincRNA', 'NM#NR', 'NR',
       'lncOfInterest', None], dtype=object)

In [61]:
# Show the rows whose gene_biotype is missing.
# isna() catches both None and NaN, whereas comparing the stringified value with 'None'
# would miss NaN; it is also the same missing-value test the relabelling step relies on.
RAW_Master[RAW_Master['gene_biotype'].isna()]

Unnamed: 0,chrom,start,end,blank,score,strand,source,feature,frame,mgi_chr,...,mgi_end,mgi_symbol,mgi_score,mgi_strand,gene_id,transcript_id,gene_name,gene_biotype,NR_annotation,NR_annotation2
80642,chrM,2750,3707,.,.,+,insdc,exon,.,.,...,-1,.,-1,.,ENSMUSG00000064341,ENSMUSG00000064341,mt-Nd1,,,
80643,chrM,3913,4951,.,.,+,insdc,exon,.,.,...,-1,.,-1,.,ENSMUSG00000064345,ENSMUSG00000064345,mt-Nd2,,,
80644,chrM,5327,6872,.,.,+,insdc,exon,.,.,...,-1,.,-1,.,ENSMUSG00000064351,ENSMUSG00000064351,mt-Co1,,,
80645,chrM,7012,7696,.,.,+,insdc,exon,.,.,...,-1,.,-1,.,ENSMUSG00000064354,ENSMUSG00000064354,mt-Co2,,,
80646,chrM,7765,7969,.,.,+,insdc,exon,.,.,...,-1,.,-1,.,ENSMUSG00000064356,ENSMUSG00000064356,mt-Atp8,,,
80647,chrM,7926,8607,.,.,+,insdc,exon,.,.,...,-1,.,-1,.,ENSMUSG00000064357,ENSMUSG00000064357,mt-Atp6,,,
80648,chrM,8606,9390,.,.,+,insdc,exon,.,.,...,-1,.,-1,.,ENSMUSG00000064358,ENSMUSG00000064358,mt-Co3,,,
80649,chrM,9458,9806,.,.,+,insdc,exon,.,.,...,-1,.,-1,.,ENSMUSG00000064360,ENSMUSG00000064360,mt-Nd3,,,
80650,chrM,9876,10173,.,.,+,insdc,exon,.,.,...,-1,.,-1,.,ENSMUSG00000065947,ENSMUSG00000065947,mt-Nd4l,,,
80651,chrM,10166,11544,.,.,+,insdc,exon,.,.,...,-1,.,-1,.,ENSMUSG00000064363,ENSMUSG00000064363,mt-Nd4,,,


In [62]:
# Label records with a missing gene_biotype as 'mitochondrial protein-coding gene'.
# The replacement is limited to chrM so that a missing biotype on any other chromosome
# is not silently given a mitochondrial label; such rows stay missing and remain
# visible in the unique values displayed below.
missing_on_chrM = RAW_Master['gene_biotype'].isna() & RAW_Master['chrom'].eq('chrM')
RAW_Master.loc[missing_on_chrM, 'gene_biotype'] = 'mitochondrial protein-coding gene'
RAW_Master['gene_biotype'].unique()

array(['lncRNA', 'NM', 'antisense', 'lincRNA', 'NM#NR', 'NR',
       'lncOfInterest', 'mitochondrial protein-coding gene'], dtype=object)

## <br> 3. Export Processed DataFrame

In [63]:
# Write the merged annotation table to the working directory as a tab-separated file,
# keeping the column header and leaving out the DataFrame index.
output_path = 'MGI_and_Karri_Annotations_Restructured_mm39.txt'
RAW_Master.to_csv(output_path, sep='\t', header=True, index=False)