# Process GTF file from Karri et al. for mm10 to rn7 Liftover

## <br> 1. Import Required Packages

In [1]:
import numpy as np
import pandas as pd


## <br> 2. Liftover Karri et al. from mm10 to mm39

In [2]:
# Define the column names
columns = ['seqname', 'source', 'feature', 'start', 'end', 'score', 'strand', 'frame', 'attributes']

# Read the GTF file into a DataFrame
RAW_Master = pd.read_csv('./RAW_Data/GTFB_FullGeneBody_MouseLiver_snRNAseq.gtf', 
                         delimiter='\t',
                         header=None,  # No header in the file
                         names=columns)


  RAW_Master = pd.read_csv('./RAW_Data/GTFB_FullGeneBody_MouseLiver_snRNAseq.gtf',


In [3]:
# Extract relevant columns
bed_df = RAW_Master.copy()

# Add the 'ID' column as the row index + 1 to start from 1
bed_df['ID'] = bed_df.index

# Add 'chr' to every seqname value
bed_df['seqname'] = 'chr' + bed_df['seqname'].astype(str)

# Reorder columns to place 'ID' between 'end' and 'score'
bed_df_filtered = bed_df[['seqname', 'start', 'end', 'ID', 'score', 'strand']]

# Define the list of standard chromosomes to keep
standard_chromosomes = {f'chr{i}' for i in range(1, 20)} | {'chrX', 'chrY', 'chrMT'}

# Filter the DataFrame
bed_df_filtered = bed_df_filtered[bed_df_filtered['seqname'].isin(standard_chromosomes)]
bed_df_filtered['seqname'] = bed_df_filtered['seqname'].replace({'chrMT': 'chrM'})


bed_df_filtered.head()

Unnamed: 0,seqname,start,end,ID,score,strand
0,chr1,4470732,4471325,0,1000,-
1,chr1,4744316,4745480,1,1000,-
2,chr1,21176511,21180041,2,1000,+
3,chr1,191420199,191424644,3,1000,+
4,chr11,94731060,94731724,4,1000,-


In [4]:
# Export to BED format (tab-delimited)
bed_df_filtered.to_csv('./RAW_Data/mm10_Karri_Annotations_Restructured_For_Liftovers.bed', sep='\t', 
              header=False, index=False)

In [5]:
# Download the UCSC LiftOver executable
!wget -O liftOver https://hgdownload.soe.ucsc.edu/admin/exe/linux.x86_64/liftOver

# Make the file executable
!chmod +x liftOver

# Download mm10 to rn6 liftover chain
!wget -O mm10ToRn7.over.chain.gz https://hgdownload.soe.ucsc.edu/goldenPath/mm10/liftOver/mm10ToRn7.over.chain.gz

--2025-03-25 14:20:04--  https://hgdownload.soe.ucsc.edu/admin/exe/linux.x86_64/liftOver
Resolving hgdownload.soe.ucsc.edu (hgdownload.soe.ucsc.edu)... 128.114.119.163
Connecting to hgdownload.soe.ucsc.edu (hgdownload.soe.ucsc.edu)|128.114.119.163|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 24388592 (23M)
Saving to: ‘liftOver’


2025-03-25 14:20:07 (10.1 MB/s) - ‘liftOver’ saved [24388592/24388592]

--2025-03-25 14:20:07--  https://hgdownload.soe.ucsc.edu/goldenPath/mm10/liftOver/mm10ToRn7.over.chain.gz
Resolving hgdownload.soe.ucsc.edu (hgdownload.soe.ucsc.edu)... 128.114.119.163
Connecting to hgdownload.soe.ucsc.edu (hgdownload.soe.ucsc.edu)|128.114.119.163|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 76330380 (73M) [application/x-gzip]
Saving to: ‘mm10ToRn7.over.chain.gz’


2025-03-25 14:20:11 (19.2 MB/s) - ‘mm10ToRn7.over.chain.gz’ saved [76330380/76330380]



In [6]:
# Run UCSC LiftOver to map the coordinates
# Define variables
min_match = 0.7
input_bed = "./RAW_Data/mm10_Karri_Annotations_Restructured_For_Liftovers.bed"
chain_file = "mm10ToRn7.over.chain.gz"
output_bed = f"./RAW_Data/Karri_et_al_rn7_liftover_annotations_ratio_{min_match}.bed"
unmapped_bed = f"./RAW_Data/unmapped_coordinates_ratio_{min_match}.bed"

# Construct and run the command
!./liftOver -minMatch={min_match} {input_bed} {chain_file} {output_bed} {unmapped_bed}

Reading liftover chains
Mapping coordinates


In [7]:
# cleanup directory
!rm -r liftOver
!rm -r mm10ToRn7.over.chain.gz

In [8]:
# Read LiftOver output (only successfully mapped entries)
columns = ['rat_chr', 'rat_start', 'rat_end', 'mouse_ID', 'rat_score', 'rat_strand']

liftover_df = pd.read_csv('./RAW_Data/Karri_et_al_rn7_liftover_annotations_ratio_'+ str(min_match) +'.bed', sep='\t', header=None,
                          names=columns)

In [9]:
# Define the column names
columns = ['mouse_chr', 'mouse_source', 'mouse_feature', 'mouse_start', 'mouse_end', 'mouse_score', 'mouse_strand', 'mouse_frame', 'mouse_attributes']

# Read the GTF file into a DataFrame
Karri_Master = pd.read_csv('../00_Karri_et_al_GTF/RAW_Data/GTFB_FullGeneBody_MouseLiver_snRNAseq.gtf', 
                         delimiter='\t',
                         header=None,  # No header in the file
                         names=columns)

# Add the 'ID' column as the row index + 1 to start from 1
Karri_Master['mouse_ID'] = Karri_Master.index 


  Karri_Master = pd.read_csv('../00_Karri_et_al_GTF/RAW_Data/GTFB_FullGeneBody_MouseLiver_snRNAseq.gtf',


In [10]:
# Merge Rat_Master and Karri_Master on 'ID' (right) and 'rat_ID' (left)
merged_df = pd.merge(liftover_df, Karri_Master, left_on='mouse_ID', right_on='mouse_ID', how='left')
merged_df_trimmed = merged_df[['rat_chr', 'rat_start', 'rat_end', 'mouse_ID', 'rat_score','rat_strand','mouse_attributes']]
merged_df_trimmed = merged_df_trimmed.drop_duplicates(keep='first')

In [11]:
# Define a function to parse attributes and extract key-value pairs
def parse_attributes(attribute_string):
    attr_dict = {}
    if pd.notnull(attribute_string):  # Check if the string is not NaN
        attributes = attribute_string.split(';')
        for attribute in attributes:
            if attribute.strip():  # Ensure it's not an empty string
                key, value = attribute.strip().split(' ', 1)
                attr_dict[key] = value.strip('"')  # Strip quotes from value
    return attr_dict

# Apply the function to extract the information into a DataFrame
attributes_df = merged_df_trimmed['mouse_attributes'].apply(parse_attributes).apply(pd.Series)

# Drop the 'gene_id' column
attributes_df = attributes_df.drop(columns=['gene_id', 'transcript_id', 'gene_name2', 'NR_annotation'])

# Rename 'gene_name' to 'gene_id'
attributes_df = attributes_df.rename(columns={'gene_name': 'gene_id'})
attributes_df = attributes_df.fillna('')

In [12]:
# Concatenate the two DataFrames side by side
merged_combined = pd.concat([merged_df_trimmed, attributes_df], axis=1)

# Define the values to keep
biotypes_to_keep = ['lncRNA', 'NR', 'lincRNA', 'antisense', 'lncOfInterest']

# Filter the DataFrame
merged_combined_trimmed = merged_combined[merged_combined['gene_biotype'].isin(biotypes_to_keep)]
#merged_combined_trimmed[['gene_biotype']] = 'lncRNA'


In [13]:
#Build GTF-style 'attribute' column with mouse and gene IDs
merged_combined_trimmed['attribute'] = (
    'Mouse_ID "' + merged_combined_trimmed['mouse_ID'].astype(str) + 
    '"; gene_id "' + merged_combined_trimmed['gene_id'] + '";'
)


#Set GTF-required columns
merged_combined_trimmed['source'] = 'liftover'
merged_combined_trimmed['feature'] = 'exon'
merged_combined_trimmed['frame'] = '.'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged_combined_trimmed['attribute'] = (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged_combined_trimmed['source'] = 'liftover'
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  merged_combined_trimmed['feature'] = 'exon'
A value is trying to be set on a copy of a slice from a DataFrame.
Try us

In [14]:
# Select and order columns for final LiftOver GTF output
liftover_GTF = merged_combined_trimmed[['rat_chr', 'source', 'feature', 
                          'rat_start', 'rat_end', 'rat_score', 
                          'rat_strand', 'frame', 'attribute']]

# Define the list of standard chromosomes to keep
standard_chromosomes = {f'chr{i}' for i in range(1, 20)} | {'chrX', 'chrY', 'chrM'}

# Filter the DataFrame
liftover_GTF = liftover_GTF[liftover_GTF['rat_chr'].isin(standard_chromosomes)]
liftover_GTF[['transcript_type']] = 'lncRNA'

In [15]:
liftover_GTF

Unnamed: 0,rat_chr,source,feature,rat_start,rat_end,rat_score,rat_strand,frame,attribute,transcript_type
0,chr5,liftover,exon,15038462,15039082,1000,+,.,"Mouse_ID ""0""; gene_id ""lnc1"";",lncRNA
1,chr5,liftover,exon,14766774,14767820,1000,+,.,"Mouse_ID ""1""; gene_id ""lnc10"";",lncRNA
2,chr13,liftover,exon,103001424,103005924,1000,+,.,"Mouse_ID ""3""; gene_id ""lnc1000"";",lncRNA
3,chr10,liftover,exon,79676628,79677257,1000,-,.,"Mouse_ID ""4""; gene_id ""lnc10000"";",lncRNA
4,chr10,liftover,exon,79677716,79678362,1000,-,.,"Mouse_ID ""5""; gene_id ""lnc10001"";",lncRNA
...,...,...,...,...,...,...,...,...,...,...
65892,chr3,liftover,exon,155794311,155794889,1000,+,.,"Mouse_ID ""82486""; gene_id ""Zfas1"";",lncRNA
66085,chr5,liftover,exon,158072676,158072721,1000,-,.,"Mouse_ID ""82711""; gene_id ""Znf41-ps"";",lncRNA
66086,chr5,liftover,exon,157249657,157249708,1000,+,.,"Mouse_ID ""82712""; gene_id ""Znf41-ps"";",lncRNA
66087,chr5,liftover,exon,157912765,157912821,1000,-,.,"Mouse_ID ""82713""; gene_id ""Znf41-ps"";",lncRNA


In [16]:
#Rename columns to match standard GTF field names
liftover_GTF = liftover_GTF.rename(columns={
    'rat_chr': 'seqname',
    'rat_start': 'start',
    'rat_end': 'end',
    'rat_score': 'score',
    'rat_strand': 'strand'
})

In [17]:
liftover_GTF

Unnamed: 0,seqname,source,feature,start,end,score,strand,frame,attribute,transcript_type
0,chr5,liftover,exon,15038462,15039082,1000,+,.,"Mouse_ID ""0""; gene_id ""lnc1"";",lncRNA
1,chr5,liftover,exon,14766774,14767820,1000,+,.,"Mouse_ID ""1""; gene_id ""lnc10"";",lncRNA
2,chr13,liftover,exon,103001424,103005924,1000,+,.,"Mouse_ID ""3""; gene_id ""lnc1000"";",lncRNA
3,chr10,liftover,exon,79676628,79677257,1000,-,.,"Mouse_ID ""4""; gene_id ""lnc10000"";",lncRNA
4,chr10,liftover,exon,79677716,79678362,1000,-,.,"Mouse_ID ""5""; gene_id ""lnc10001"";",lncRNA
...,...,...,...,...,...,...,...,...,...,...
65892,chr3,liftover,exon,155794311,155794889,1000,+,.,"Mouse_ID ""82486""; gene_id ""Zfas1"";",lncRNA
66085,chr5,liftover,exon,158072676,158072721,1000,-,.,"Mouse_ID ""82711""; gene_id ""Znf41-ps"";",lncRNA
66086,chr5,liftover,exon,157249657,157249708,1000,+,.,"Mouse_ID ""82712""; gene_id ""Znf41-ps"";",lncRNA
66087,chr5,liftover,exon,157912765,157912821,1000,-,.,"Mouse_ID ""82713""; gene_id ""Znf41-ps"";",lncRNA


In [2]:
# URL to the file
url = "http://hgdownload.soe.ucsc.edu/goldenPath/rn7/bigZips/genes/ncbiRefSeq.gtf.gz"

# Download and read the .gtf.gz file
rn7_GTF = pd.read_csv(url, 
                 compression='gzip', 
                 sep='\t', 
                 comment='#', 
                 header=None, 
                 names=['seqname', 'source', 'feature', 'start', 'end', 'score', 'strand', 'frame', 'attribute'])

#Filter transcript entries from GTF
rn7_GTF_transcripts = rn7_GTF[rn7_GTF['feature'] == 'transcript']
rn7_GTF_transcripts

#Filter exon entries from GTF
rn7_GTF_exons = rn7_GTF[rn7_GTF['feature'] == 'exon']
rn7_GTF_exons

Unnamed: 0,seqname,source,feature,start,end,score,strand,frame,attribute
1,chr1,ncbiRefSeq,exon,76909,77114,.,+,.,"gene_id ""Vom2r3""; transcript_id ""XM_039088200...."
3,chr1,ncbiRefSeq,exon,79753,80035,.,+,.,"gene_id ""Vom2r3""; transcript_id ""XM_039088200...."
5,chr1,ncbiRefSeq,exon,80483,81286,.,+,.,"gene_id ""Vom2r3""; transcript_id ""XM_039088200...."
7,chr1,ncbiRefSeq,exon,83590,83713,.,+,.,"gene_id ""Vom2r3""; transcript_id ""XM_039088200...."
9,chr1,ncbiRefSeq,exon,84879,85762,.,+,.,"gene_id ""Vom2r3""; transcript_id ""XM_039088200...."
...,...,...,...,...,...,...,...,...,...
2529504,chrY_NW_023637722v1_random,ncbiRefSeq,exon,293338,293506,.,+,.,"gene_id ""LOC103694730""; transcript_id ""XR_0054..."
2529505,chrY_NW_023637722v1_random,ncbiRefSeq,exon,300877,300988,.,+,.,"gene_id ""LOC103694730""; transcript_id ""XR_0054..."
2529507,chrY_NW_023637722v1_random,ncbiRefSeq,exon,317993,319851,.,-,.,"gene_id ""LOC120099645""; transcript_id ""XR_0054..."
2529508,chrY_NW_023637722v1_random,ncbiRefSeq,exon,320383,320541,.,-,.,"gene_id ""LOC120099645""; transcript_id ""XR_0054..."


In [3]:
# Define a function to parse attributes and extract key-value pairs
def parse_attributes(attribute_string):
    attr_dict = {}
    if pd.notnull(attribute_string):  # Check if the string is not NaN
        attributes = attribute_string.split(';')
        for attribute in attributes:
            if attribute.strip():  # Ensure it's not an empty string
                key, value = attribute.strip().split(' ', 1)
                attr_dict[key] = value.strip('"')  # Strip quotes from value
    return attr_dict

# Apply the function to extract the information into a DataFrame
attributes_df = rn7_GTF_exons['attribute'].apply(parse_attributes).apply(pd.Series)

# Drop the 'gene_id' column
attributes_df = attributes_df.drop(columns=['gene_id'])
attributes_df = attributes_df.drop(columns=['transcript_id'])

# Rename 'gene_name' to 'gene_id'
attributes_df = attributes_df.rename(columns={'gene_name': 'gene_id'})
attributes_df = attributes_df.fillna('')
attributes_df

Unnamed: 0,exon_number,exon_id,gene_id
1,1,XM_039088200.1.1,Vom2r3
3,2,XM_039088200.1.2,Vom2r3
5,3,XM_039088200.1.3,Vom2r3
7,4,XM_039088200.1.4,Vom2r3
9,5,XM_039088200.1.5,Vom2r3
...,...,...,...
2529504,3,XR_005498756.1.3,LOC103694730
2529505,4,XR_005498756.1.4,LOC103694730
2529507,3,XR_005498757.1.3,LOC120099645
2529508,2,XR_005498757.1.2,LOC120099645


In [4]:
attributes_df['gene_id'].nunique()

33294

In [5]:
# Concatenate the two DataFrames side by side
merged_combined = pd.concat([rn7_GTF_exons, attributes_df], axis=1)



merged_combined

Unnamed: 0,seqname,source,feature,start,end,score,strand,frame,attribute,exon_number,exon_id,gene_id
1,chr1,ncbiRefSeq,exon,76909,77114,.,+,.,"gene_id ""Vom2r3""; transcript_id ""XM_039088200....",1,XM_039088200.1.1,Vom2r3
3,chr1,ncbiRefSeq,exon,79753,80035,.,+,.,"gene_id ""Vom2r3""; transcript_id ""XM_039088200....",2,XM_039088200.1.2,Vom2r3
5,chr1,ncbiRefSeq,exon,80483,81286,.,+,.,"gene_id ""Vom2r3""; transcript_id ""XM_039088200....",3,XM_039088200.1.3,Vom2r3
7,chr1,ncbiRefSeq,exon,83590,83713,.,+,.,"gene_id ""Vom2r3""; transcript_id ""XM_039088200....",4,XM_039088200.1.4,Vom2r3
9,chr1,ncbiRefSeq,exon,84879,85762,.,+,.,"gene_id ""Vom2r3""; transcript_id ""XM_039088200....",5,XM_039088200.1.5,Vom2r3
...,...,...,...,...,...,...,...,...,...,...,...,...
2529504,chrY_NW_023637722v1_random,ncbiRefSeq,exon,293338,293506,.,+,.,"gene_id ""LOC103694730""; transcript_id ""XR_0054...",3,XR_005498756.1.3,LOC103694730
2529505,chrY_NW_023637722v1_random,ncbiRefSeq,exon,300877,300988,.,+,.,"gene_id ""LOC103694730""; transcript_id ""XR_0054...",4,XR_005498756.1.4,LOC103694730
2529507,chrY_NW_023637722v1_random,ncbiRefSeq,exon,317993,319851,.,-,.,"gene_id ""LOC120099645""; transcript_id ""XR_0054...",3,XR_005498757.1.3,LOC120099645
2529508,chrY_NW_023637722v1_random,ncbiRefSeq,exon,320383,320541,.,-,.,"gene_id ""LOC120099645""; transcript_id ""XR_0054...",2,XR_005498757.1.2,LOC120099645


In [6]:
merged_combined[merged_combined['gene_id'] == "Cyp1a1"]

Unnamed: 0,seqname,source,feature,start,end,score,strand,frame,attribute,exon_number,exon_id,gene_id
2229150,chr8,ncbiRefSeq,exon,58096021,58096146,.,+,.,"gene_id ""Cyp1a1""; transcript_id ""XM_006243150....",1,XM_006243150.3.1,Cyp1a1
2229152,chr8,ncbiRefSeq,exon,58098636,58099483,.,+,.,"gene_id ""Cyp1a1""; transcript_id ""XM_006243150....",2,XM_006243150.3.2,Cyp1a1
2229155,chr8,ncbiRefSeq,exon,58099948,58100074,.,+,.,"gene_id ""Cyp1a1""; transcript_id ""XM_006243150....",3,XM_006243150.3.3,Cyp1a1
2229157,chr8,ncbiRefSeq,exon,58100159,58100248,.,+,.,"gene_id ""Cyp1a1""; transcript_id ""XM_006243150....",4,XM_006243150.3.4,Cyp1a1
2229159,chr8,ncbiRefSeq,exon,58100342,58100465,.,+,.,"gene_id ""Cyp1a1""; transcript_id ""XM_006243150....",5,XM_006243150.3.5,Cyp1a1
2229161,chr8,ncbiRefSeq,exon,58100614,58100700,.,+,.,"gene_id ""Cyp1a1""; transcript_id ""XM_006243150....",6,XM_006243150.3.6,Cyp1a1
2229163,chr8,ncbiRefSeq,exon,58100847,58102130,.,+,.,"gene_id ""Cyp1a1""; transcript_id ""XM_006243150....",7,XM_006243150.3.7,Cyp1a1
2229169,chr8,ncbiRefSeq,exon,58096089,58096146,.,+,.,"gene_id ""Cyp1a1""; transcript_id ""NM_012540.3"";...",1,NM_012540.3.1,Cyp1a1
2229171,chr8,ncbiRefSeq,exon,58098633,58099483,.,+,.,"gene_id ""Cyp1a1""; transcript_id ""NM_012540.3"";...",2,NM_012540.3.2,Cyp1a1
2229174,chr8,ncbiRefSeq,exon,58099948,58100074,.,+,.,"gene_id ""Cyp1a1""; transcript_id ""NM_012540.3"";...",3,NM_012540.3.3,Cyp1a1


In [7]:
# Create a new column 'transcript_type' based on the conditions
merged_combined['transcript_type'] = np.where(
    merged_combined['attribute'].str.contains('NR_', na=False), 'lncRNA',
    np.where(
        merged_combined['attribute'].str.contains('XR_', na=False), 'predicted_lncRNA',
        np.where(
            merged_combined['attribute'].str.contains('XM_', na=False), 'predicted_mRNA',
            np.where(
                merged_combined['attribute'].str.contains('NM_', na=False), 'mRNA',
                'unknown'
            )
        )
    )
)

#Drop old 'attribute' column and rename 'gene_id' as the new 'attribute'
merged_combined = merged_combined.drop(columns=['attribute'])
merged_combined = merged_combined.rename(columns={'gene_id': 'attribute'})

merged_combined


Unnamed: 0,seqname,source,feature,start,end,score,strand,frame,exon_number,exon_id,attribute,transcript_type
1,chr1,ncbiRefSeq,exon,76909,77114,.,+,.,1,XM_039088200.1.1,Vom2r3,predicted_mRNA
3,chr1,ncbiRefSeq,exon,79753,80035,.,+,.,2,XM_039088200.1.2,Vom2r3,predicted_mRNA
5,chr1,ncbiRefSeq,exon,80483,81286,.,+,.,3,XM_039088200.1.3,Vom2r3,predicted_mRNA
7,chr1,ncbiRefSeq,exon,83590,83713,.,+,.,4,XM_039088200.1.4,Vom2r3,predicted_mRNA
9,chr1,ncbiRefSeq,exon,84879,85762,.,+,.,5,XM_039088200.1.5,Vom2r3,predicted_mRNA
...,...,...,...,...,...,...,...,...,...,...,...,...
2529504,chrY_NW_023637722v1_random,ncbiRefSeq,exon,293338,293506,.,+,.,3,XR_005498756.1.3,LOC103694730,predicted_lncRNA
2529505,chrY_NW_023637722v1_random,ncbiRefSeq,exon,300877,300988,.,+,.,4,XR_005498756.1.4,LOC103694730,predicted_lncRNA
2529507,chrY_NW_023637722v1_random,ncbiRefSeq,exon,317993,319851,.,-,.,3,XR_005498757.1.3,LOC120099645,predicted_lncRNA
2529508,chrY_NW_023637722v1_random,ncbiRefSeq,exon,320383,320541,.,-,.,2,XR_005498757.1.2,LOC120099645,predicted_lncRNA


In [8]:
#Count unique values per column for rows where 'transcript_type' is 'mRNA'
merged_combined[merged_combined['transcript_type'] == 'mRNA'].nunique()

seqname                26
source                  1
feature                 1
start              162274
end                162283
score                   1
strand                  2
frame                   1
exon_number           118
exon_id            172276
attribute           16765
transcript_type         1
dtype: int64

In [9]:
#Calculate transcript length
merged_combined['length'] = merged_combined['end'] - merged_combined['start']

#Filter out rows where transcript_type is 'lncRNA' and length is less than 200
rn7_GTF_transcripts_filtered = merged_combined[
    ~((merged_combined['transcript_type'] == 'lncRNA') & (merged_combined['length'] < 200))
]

rn7_GTF_transcripts_filtered = rn7_GTF_transcripts_filtered#.drop_duplicates(subset=['attribute'], keep='first')

#Assign unique Rat_IDs to each remaining transcript
rn7_GTF_transcripts_filtered['Rat_ID'] = range(len(rn7_GTF_transcripts_filtered))

#Create a copy for further processing
rn7_GTF_transcripts_filtered2 = rn7_GTF_transcripts_filtered.copy()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rn7_GTF_transcripts_filtered['Rat_ID'] = range(len(rn7_GTF_transcripts_filtered))


In [10]:
# Create a new DataFrame without the 'length' column
rn7_GTF_transcripts_filtered['attribute'] = (
    'Rat_ID "' + rn7_GTF_transcripts_filtered['Rat_ID'].astype(str) + 
    '"; gene_id "' + rn7_GTF_transcripts_filtered['attribute'] + '";'
)

rn7_GTF_transcripts_filtered = rn7_GTF_transcripts_filtered.drop(columns=['length', 'Rat_ID'])


# Display the new DataFrame
rn7_GTF_transcripts_filtered

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rn7_GTF_transcripts_filtered['attribute'] = (


Unnamed: 0,seqname,source,feature,start,end,score,strand,frame,exon_number,exon_id,attribute,transcript_type
1,chr1,ncbiRefSeq,exon,76909,77114,.,+,.,1,XM_039088200.1.1,"Rat_ID ""0""; gene_id ""Vom2r3"";",predicted_mRNA
3,chr1,ncbiRefSeq,exon,79753,80035,.,+,.,2,XM_039088200.1.2,"Rat_ID ""1""; gene_id ""Vom2r3"";",predicted_mRNA
5,chr1,ncbiRefSeq,exon,80483,81286,.,+,.,3,XM_039088200.1.3,"Rat_ID ""2""; gene_id ""Vom2r3"";",predicted_mRNA
7,chr1,ncbiRefSeq,exon,83590,83713,.,+,.,4,XM_039088200.1.4,"Rat_ID ""3""; gene_id ""Vom2r3"";",predicted_mRNA
9,chr1,ncbiRefSeq,exon,84879,85762,.,+,.,5,XM_039088200.1.5,"Rat_ID ""4""; gene_id ""Vom2r3"";",predicted_mRNA
...,...,...,...,...,...,...,...,...,...,...,...,...
2529504,chrY_NW_023637722v1_random,ncbiRefSeq,exon,293338,293506,.,+,.,3,XR_005498756.1.3,"Rat_ID ""1141644""; gene_id ""LOC103694730"";",predicted_lncRNA
2529505,chrY_NW_023637722v1_random,ncbiRefSeq,exon,300877,300988,.,+,.,4,XR_005498756.1.4,"Rat_ID ""1141645""; gene_id ""LOC103694730"";",predicted_lncRNA
2529507,chrY_NW_023637722v1_random,ncbiRefSeq,exon,317993,319851,.,-,.,3,XR_005498757.1.3,"Rat_ID ""1141646""; gene_id ""LOC120099645"";",predicted_lncRNA
2529508,chrY_NW_023637722v1_random,ncbiRefSeq,exon,320383,320541,.,-,.,2,XR_005498757.1.2,"Rat_ID ""1141647""; gene_id ""LOC120099645"";",predicted_lncRNA


In [11]:
# Concatenate the DataFrames row-wise
stacked_df = pd.concat([liftover_GTF, rn7_GTF_transcripts_filtered], axis=0, ignore_index=True)

stacked_df['new'] = stacked_df['attribute'] + ' transcript_type "' + stacked_df['transcript_type'] + '"'

# Drop the old 'attribute' and 'type' columns
stacked_df = stacked_df.drop(columns=['attribute', 'transcript_type'])

# Rename the 'new' column to 'attribute'
stacked_df = stacked_df.rename(columns={'new': 'attribute'})

stacked_df['score'] = 1000

# Display the resulting DataFrame
stacked_df

NameError: name 'liftover_GTF' is not defined

In [None]:
rn7_GTF_transcripts_filtered2

In [None]:
liftover_GTF

In [None]:
# Define a function to parse attributes and extract key-value pairs
def parse_attributes(attribute_string):
    attr_dict = {}
    if pd.notnull(attribute_string):  # Check if the string is not NaN
        attributes = attribute_string.split(';')
        for attribute in attributes:
            if attribute.strip():  # Ensure it's not an empty string
                key, value = attribute.strip().split(' ', 1)
                attr_dict[key] = value.strip('"')  # Strip quotes from value
    return attr_dict

# Apply the function to extract the information into a DataFrame
attributes_df = liftover_GTF['attribute'].apply(parse_attributes).apply(pd.Series)

liftover_GTF_temp = pd.concat([liftover_GTF, attributes_df], axis=1)
liftover_GTF_temp = liftover_GTF_temp.rename(columns={'gene_id': 'Gene'})


In [None]:
#Extract unique gene and transcript type pairs from LiftOver GTF
liftover_genes = liftover_GTF_temp[['Gene', 'transcript_type']].drop_duplicates(keep='first')

In [12]:
#Rename 'attribute' to 'Gene' and filter for lncRNA or mRNA transcript types
rn7_genes = rn7_GTF_transcripts_filtered2[['attribute','transcript_type']].rename(columns={'attribute': 'Gene'})
rn7_genes = rn7_genes[(rn7_genes['transcript_type'] == 'lncRNA') |
                      (rn7_genes['transcript_type'] == 'mRNA')].drop_duplicates(keep='first')
rn7_genes

Unnamed: 0,Gene,transcript_type
29,Vom2r3,mRNA
175,Vom2r4,mRNA
356,Vom2r1,mRNA
616,Vom2r5,mRNA
704,Vom2r6,mRNA
...,...,...
2528626,Ddx3,mRNA
2528665,Usp9y,mRNA
2529213,Dkc1,mRNA
2529349,Tspy1,mRNA


In [13]:
#Get unique lncRNA gene entries
rn7_genes[(rn7_genes['transcript_type'] == 'lncRNA')].drop_duplicates(keep='first')

Unnamed: 0,Gene,transcript_type
14762,LOC100910237,lncRNA
29268,Oprm1,lncRNA
34373,Cahm,lncRNA
43580,Rps9l1,lncRNA
45919,Apeg3,lncRNA
...,...,...
2469326,Kantr,lncRNA
2482025,RGD1564534,lncRNA
2488552,LOC100911498,lncRNA
2488601,Ftx,lncRNA


In [14]:
# Get unique mRNA gene entries
rn7_genes[(rn7_genes['transcript_type'] == 'mRNA')].drop_duplicates(keep='first')

Unnamed: 0,Gene,transcript_type
29,Vom2r3,mRNA
175,Vom2r4,mRNA
356,Vom2r1,mRNA
616,Vom2r5,mRNA
704,Vom2r6,mRNA
...,...,...
2528626,Ddx3,mRNA
2528665,Usp9y,mRNA
2529213,Dkc1,mRNA
2529349,Tspy1,mRNA


In [58]:
#Merge LiftOver and rn7 gene lists, removing duplicates
gene_lists_merged = pd.concat([liftover_genes, rn7_genes], axis=0).drop_duplicates(keep='first')
gene_lists_merged

Unnamed: 0,Gene,transcript_type
0,lnc1,lncRNA
1,lnc10,lncRNA
2,lnc1000,lncRNA
3,lnc10000,lncRNA
4,lnc10001,lncRNA
...,...,...
2528626,Ddx3,mRNA
2528665,Usp9y,mRNA
2529213,Dkc1,mRNA
2529349,Tspy1,mRNA


In [59]:
#Save merged gene list to a tab-separated text file
gene_lists_merged.to_csv('RN_and_Karri_Annotations_Restructured_rn7.txt', sep='\t', index=False)

In [44]:
#Export stacked_df as GTF for gene expression analysis without headers or quotes
import csv
stacked_df.to_csv('./Karri_Annotations_Restructured_rn7_for_Gene_Expression.gtf', sep='\t', 
                  index=False, header=False, quoting=csv.QUOTE_NONE, escapechar='')