# Processing of rn7 DREs

### 1. Import Packages

In [40]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

### 2. Import raw data

In [42]:
# Define the file path
file_path = "../01_Intersect_DRE_and_GTF/DREs_in_lncRNAs_MSS_0.825.bed"

# Read the file into a DataFrame
df = pd.read_csv(file_path, sep='\t', header=0, index_col=None)

# Display the DataFrame
df

Unnamed: 0,Chromosome,DRE_Start,DRE_End,DRE_ID,DRE_Strand,DRE_MSS,GTF_Chromosome,Gene_Start,Gene_End,Gene_Strand,Gene_ORIG_Start,Metadata
0,chr1,35835,35854,1,+,0.896882,.,-1,-1,.,.,.
1,chr1,43577,43596,2,-,0.907811,.,-1,-1,.,.,.
2,chr1,106312,106331,3,-,0.887080,.,-1,-1,.,.,.
3,chr1,106695,106714,4,-,0.866600,.,-1,-1,.,.,.
4,chr1,140907,140926,5,-,0.887080,.,-1,-1,.,.,.
...,...,...,...,...,...,...,...,...,...,...,...,...
296989,chrY,18104559,18104578,223934,-,0.842006,.,-1,-1,.,.,.
296990,chrY,18258652,18258671,223935,-,0.843007,.,-1,-1,.,.,.
296991,chrY,18264228,18264247,223936,-,0.872179,chrY,18260839,18276569,-,18266569,"Rat_ID""32809"";gene_id""LOC120099597"";transcript_type""predicted_lncRNA"""
296992,chrY,18277330,18277349,223937,+,0.913325,.,-1,-1,.,.,.


In [43]:
# Extract the "gene_id" information and create a new column
df['gene_id'] = df['Metadata'].str.extract(r'gene_id"([^"]+)"', expand=False)
# Fill rows without "gene_id" with NA
df['gene_id'].fillna('NA', inplace=True)

# Extract the "gene_name" information and create a new column
df['Rat_ID'] = df['Metadata'].str.extract(r'Rat_ID"([^"]+)"', expand=False)
# Fill rows without "gene_name" with NA
df['Rat_ID'].fillna('NA', inplace=True)

# Extract the "gene_name2" information and create a new column
df['Mouse_ID'] = df['Metadata'].str.extract(r'Mouse_ID"([^"]+)"', expand=False)
# Fill rows without "gene_name2" with NA
df['Mouse_ID'].fillna('NA', inplace=True)

# Extract the "gene_biotype" information and create a new column
df['transcript_type'] = df['Metadata'].str.extract(r'transcript_type"([^"]+)"', expand=False)
# Fill rows without "gene_biotype" with NA
df['transcript_type'].fillna('NA', inplace=True)

# Remove the "Metadata" column
df.drop(columns=['Metadata'], inplace=True)

# Display the DataFrame
df

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['gene_id'].fillna('NA', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Rat_ID'].fillna('NA', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behav

Unnamed: 0,Chromosome,DRE_Start,DRE_End,DRE_ID,DRE_Strand,DRE_MSS,GTF_Chromosome,Gene_Start,Gene_End,Gene_Strand,Gene_ORIG_Start,gene_id,Rat_ID,Mouse_ID,transcript_type
0,chr1,35835,35854,1,+,0.896882,.,-1,-1,.,.,,,,
1,chr1,43577,43596,2,-,0.907811,.,-1,-1,.,.,,,,
2,chr1,106312,106331,3,-,0.887080,.,-1,-1,.,.,,,,
3,chr1,106695,106714,4,-,0.866600,.,-1,-1,.,.,,,,
4,chr1,140907,140926,5,-,0.887080,.,-1,-1,.,.,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
296989,chrY,18104559,18104578,223934,-,0.842006,.,-1,-1,.,.,,,,
296990,chrY,18258652,18258671,223935,-,0.843007,.,-1,-1,.,.,,,,
296991,chrY,18264228,18264247,223936,-,0.872179,chrY,18260839,18276569,-,18266569,LOC120099597,32809,,predicted_lncRNA
296992,chrY,18277330,18277349,223937,+,0.913325,.,-1,-1,.,.,,,,


In [45]:
# Drop duplicate rows and keep the first occurrence
df = df.drop_duplicates(keep='first')

# Display the DataFrame
df

Unnamed: 0,Chromosome,DRE_Start,DRE_End,DRE_ID,DRE_Strand,DRE_MSS,GTF_Chromosome,Gene_Start,Gene_End,Gene_Strand,Gene_ORIG_Start,gene_id,Rat_ID,Mouse_ID,transcript_type
0,chr1,35835,35854,1,+,0.896882,.,-1,-1,.,.,,,,
1,chr1,43577,43596,2,-,0.907811,.,-1,-1,.,.,,,,
2,chr1,106312,106331,3,-,0.887080,.,-1,-1,.,.,,,,
3,chr1,106695,106714,4,-,0.866600,.,-1,-1,.,.,,,,
4,chr1,140907,140926,5,-,0.887080,.,-1,-1,.,.,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
296989,chrY,18104559,18104578,223934,-,0.842006,.,-1,-1,.,.,,,,
296990,chrY,18258652,18258671,223935,-,0.843007,.,-1,-1,.,.,,,,
296991,chrY,18264228,18264247,223936,-,0.872179,chrY,18260839,18276569,-,18266569,LOC120099597,32809,,predicted_lncRNA
296992,chrY,18277330,18277349,223937,+,0.913325,.,-1,-1,.,.,,,,


In [47]:
# Count unique values and their counts in the 'gene_name' column
gene_name_counts = df['gene_id'].value_counts().reset_index()
gene_name_counts.columns = ['gene_id', 'count']

# Create a new DataFrame 'df2' with the unique values and their counts
df2 = gene_name_counts

df2

Unnamed: 0,gene_id,count
0,,74872
1,lnc10851,273
2,Tenm3,249
3,Auts2l1,193
4,Camta1,179
...,...,...
52829,lnc5039,1
52830,lnc42061,1
52831,lnc41572,1
52832,LOC102551736,1


In [48]:
# Save the DataFrame
df2.to_csv('DRE_Counts_by_Gene.txt', sep='\t', index=False)


In [50]:
df3 = df.copy()

# Create a new 'DRE_Avg' column by calculating the average of 'DRE_End' and 'DRE_Start'
df3['DRE_Avg'] = np.ceil((df3['DRE_End'] + df3['DRE_Start']) / 2)


# Convert 'Gene_ORIG_Start' column to a numeric data type, handling the "." values by converting them to NaN
df3['Gene_ORIG_Start_Temp'] = pd.to_numeric(df3['Gene_ORIG_Start'], errors='coerce')


# Initialize the 'Dist_to_Start' column with NaN values
df3['Dist_to_Start'] = None

# Calculate 'Dist_to_Start' based on 'Gene_Strand'
df3.loc[df3['Gene_Strand'] == '-', 'Dist_to_Start'] = df3['Gene_ORIG_Start_Temp'] - df3['DRE_Avg']
df3.loc[df3['Gene_Strand'] == '+', 'Dist_to_Start'] = df3['DRE_Avg'] - df3['Gene_ORIG_Start_Temp']


df3

Unnamed: 0,Chromosome,DRE_Start,DRE_End,DRE_ID,DRE_Strand,DRE_MSS,GTF_Chromosome,Gene_Start,Gene_End,Gene_Strand,Gene_ORIG_Start,gene_id,Rat_ID,Mouse_ID,transcript_type,DRE_Avg,Gene_ORIG_Start_Temp,Dist_to_Start
0,chr1,35835,35854,1,+,0.896882,.,-1,-1,.,.,,,,,35845.0,,
1,chr1,43577,43596,2,-,0.907811,.,-1,-1,.,.,,,,,43587.0,,
2,chr1,106312,106331,3,-,0.887080,.,-1,-1,.,.,,,,,106322.0,,
3,chr1,106695,106714,4,-,0.866600,.,-1,-1,.,.,,,,,106705.0,,
4,chr1,140907,140926,5,-,0.887080,.,-1,-1,.,.,,,,,140917.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
296989,chrY,18104559,18104578,223934,-,0.842006,.,-1,-1,.,.,,,,,18104569.0,,
296990,chrY,18258652,18258671,223935,-,0.843007,.,-1,-1,.,.,,,,,18258662.0,,
296991,chrY,18264228,18264247,223936,-,0.872179,chrY,18260839,18276569,-,18266569,LOC120099597,32809,,predicted_lncRNA,18264238.0,18266569.0,2331.0
296992,chrY,18277330,18277349,223937,+,0.913325,.,-1,-1,.,.,,,,,18277340.0,,


In [51]:
# Rename the columns
df3 = df3.rename(columns={
    'Chromosome': 'Chr',
    'DRE_Start': 'Start',
    'DRE_End': 'End',
    'DRE_ID': 'DRE_ID',
    'DRE_MSS': 'MSS',
    'GTF_Chromosome': 'GTF_Chrom',
    'Gene_Start': 'Gene_Start',
    'Gene_End': 'Gene_End',
    'Gene_Strand': 'Strand',
    'Gene_ORIG_Start': 'Gene_ORIG_Start',
    'gene_id': 'Gene_ID',
    'transcript_type': 'Gene_Biotype',
    'Dist_to_Start': 'Dist_to_Start'
})

# Select the desired columns to create a new DataFrame
df_trimmed = df3[['DRE_ID', 'Chr', 'Start', 'End', 'DRE_Avg',
                  'Dist_to_Start',  'Gene_Biotype',
                  'Gene_ID', 'MSS', 'Rat_ID', 'Mouse_ID']]

df_trimmed

Unnamed: 0,DRE_ID,Chr,Start,End,DRE_Avg,Dist_to_Start,Gene_Biotype,Gene_ID,MSS,Rat_ID,Mouse_ID
0,1,chr1,35835,35854,35845.0,,,,0.896882,,
1,2,chr1,43577,43596,43587.0,,,,0.907811,,
2,3,chr1,106312,106331,106322.0,,,,0.887080,,
3,4,chr1,106695,106714,106705.0,,,,0.866600,,
4,5,chr1,140907,140926,140917.0,,,,0.887080,,
...,...,...,...,...,...,...,...,...,...,...,...
296989,223934,chrY,18104559,18104578,18104569.0,,,,0.842006,,
296990,223935,chrY,18258652,18258671,18258662.0,,,,0.843007,,
296991,223936,chrY,18264228,18264247,18264238.0,2331.0,predicted_lncRNA,LOC120099597,0.872179,32809,
296992,223937,chrY,18277330,18277349,18277340.0,,,,0.913325,,


In [52]:
df_trimmed.to_csv('RN_Chromosomal_Location_of_DREs.txt', sep='\t', index=False)
