# Visualization of mm39 DREs (histograms)

### 1. Import packages

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

### 2. Import DRE genomic coordinates

In [3]:
# Define the file path
file_path = "../01_Intersect_DRE_and_GTF/DREs_in_lncRNAs_MSS_0.861.bed"

# Read the file into a DataFrame
df = pd.read_csv(file_path, sep='\t', header=0, index_col=None)

# Display the DataFrame
df

Unnamed: 0,Chromosome,DRE_Start,DRE_End,DRE_ID,DRE_Strand,DRE_MSS,GTF_Chromosome,Gene_Start,Gene_End,Gene_Strand,Gene_ORIG_Start,Metadata
0,chr1,3493457,3493476,1,+,0.870102,chr1,3284705,3751721,-,3741721,"gene_id""Xkr4"";transcript_id""Xkr4"";gene_name""Xkr4"";gene_name2""NM_001011874(Non-Dups)"";gene_biotyp..."
1,chr1,4240913,4240932,2,-,0.877970,chr1,4190089,4489464,-,4479464,"gene_id""Rp1"";transcript_id""Rp1"";gene_name""Rp1"";gene_name2""NM_001195662#NM_001370921#NM_011283(No..."
2,chr1,4427961,4427980,3,+,0.872268,chr1,4190089,4489464,-,4479464,"gene_id""Rp1"";transcript_id""Rp1"";gene_name""Rp1"";gene_name2""NM_001195662#NM_001370921#NM_011283(No..."
3,chr1,4476897,4476916,4,-,0.901051,chr1,4190089,4489464,-,4479464,"gene_id""Rp1"";transcript_id""Rp1"";gene_name""Rp1"";gene_name2""NM_001195662#NM_001370921#NM_011283(No..."
4,chr1,4491129,4491148,5,-,0.867355,.,-1,-1,.,.,.
...,...,...,...,...,...,...,...,...,...,...,...,...
60905,chrY,90644962,90644981,44321,+,0.871128,.,-1,-1,.,.,.
60906,chrY,90644966,90644985,44322,+,0.861266,.,-1,-1,.,.,.
60907,chrY,90776986,90777005,44323,-,0.869231,.,-1,-1,.,.,.
60908,chrY,90782870,90782889,44324,+,0.887751,.,-1,-1,.,.,.


In [4]:
# Extract the "gene_id" information and create a new column
df['gene_id'] = df['Metadata'].str.extract(r'gene_id"([^"]+)"', expand=False)
# Fill rows without "gene_id" with NA
df['gene_id'].fillna('NA', inplace=True)

# Extract the "transcript_id" information and create a new column
df['transcript_id'] = df['Metadata'].str.extract(r'transcript_id"([^"]+)"', expand=False)
# Fill rows without "transcript_id" with NA
df['transcript_id'].fillna('NA', inplace=True)

# Extract the "gene_name" information and create a new column
df['gene_name'] = df['Metadata'].str.extract(r'gene_name"([^"]+)"', expand=False)
# Fill rows without "gene_name" with NA
df['gene_name'].fillna('NA', inplace=True)

# Extract the "gene_name2" information and create a new column
df['gene_name2'] = df['Metadata'].str.extract(r'gene_name2"([^"]+)"', expand=False)
# Fill rows without "gene_name2" with NA
df['gene_name2'].fillna('NA', inplace=True)

# Extract the "gene_biotype" information and create a new column
df['gene_biotype'] = df['Metadata'].str.extract(r'gene_biotype"([^"]+)"', expand=False)
# Fill rows without "gene_biotype" with NA
df['gene_biotype'].fillna('NA', inplace=True)

# Extract the "NR_annotation" information and create a new column
df['NR_annotation'] = df['Metadata'].str.extract(r'NR_annotation"([^"]+)"', expand=False)
# Fill rows without "NR_annotation" with NA
df['NR_annotation'].fillna('NA', inplace=True)

# Remove the "Metadata" column
df.drop(columns=['Metadata'], inplace=True)

# Drop duplicate rows and keep the first occurrence
df = df.drop_duplicates(keep='first')

# Display the DataFrame
df

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['gene_id'].fillna('NA', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['transcript_id'].fillna('NA', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values alway

Unnamed: 0,Chromosome,DRE_Start,DRE_End,DRE_ID,DRE_Strand,DRE_MSS,GTF_Chromosome,Gene_Start,Gene_End,Gene_Strand,Gene_ORIG_Start,gene_id,transcript_id,gene_name,gene_name2,gene_biotype,NR_annotation
0,chr1,3493457,3493476,1,+,0.870102,chr1,3284705,3751721,-,3741721,Xkr4,Xkr4,Xkr4,NM_001011874(Non-Dups),NM,
1,chr1,4240913,4240932,2,-,0.877970,chr1,4190089,4489464,-,4479464,Rp1,Rp1,Rp1,NM_001195662#NM_001370921#NM_011283(Non-Dups),NM,
2,chr1,4427961,4427980,3,+,0.872268,chr1,4190089,4489464,-,4479464,Rp1,Rp1,Rp1,NM_001195662#NM_001370921#NM_011283(Non-Dups),NM,
3,chr1,4476897,4476916,4,-,0.901051,chr1,4190089,4489464,-,4479464,Rp1,Rp1,Rp1,NM_001195662#NM_001370921#NM_011283(Non-Dups),NM,
4,chr1,4491129,4491148,5,-,0.867355,.,-1,-1,.,.,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60905,chrY,90644962,90644981,44321,+,0.871128,.,-1,-1,.,.,,,,,,
60906,chrY,90644966,90644985,44322,+,0.861266,.,-1,-1,.,.,,,,,,
60907,chrY,90776986,90777005,44323,-,0.869231,.,-1,-1,.,.,,,,,,
60908,chrY,90782870,90782889,44324,+,0.887751,.,-1,-1,.,.,,,,,,


In [5]:
df[df['gene_name'] == 'Cyp1a1']

Unnamed: 0,Chromosome,DRE_Start,DRE_End,DRE_ID,DRE_Strand,DRE_MSS,GTF_Chromosome,Gene_Start,Gene_End,Gene_Strand,Gene_ORIG_Start,gene_id,transcript_id,gene_name,gene_name2,gene_biotype,NR_annotation
56655,chr9,57601285,57601304,40865,-,0.920791,chr9,57585211,57611107,+,57595211,Cyp1a1,Cyp1a1,Cyp1a1,NM_009992#NM_001136059(Non-Dups),NM,
56656,chr9,57603680,57603699,40866,-,0.980536,chr9,57585211,57611107,+,57595211,Cyp1a1,Cyp1a1,Cyp1a1,NM_009992#NM_001136059(Non-Dups),NM,
56657,chr9,57603902,57603921,40867,-,0.914418,chr9,57585211,57611107,+,57595211,Cyp1a1,Cyp1a1,Cyp1a1,NM_009992#NM_001136059(Non-Dups),NM,
56658,chr9,57604015,57604034,40868,+,0.934834,chr9,57585211,57611107,+,57595211,Cyp1a1,Cyp1a1,Cyp1a1,NM_009992#NM_001136059(Non-Dups),NM,
56659,chr9,57604395,57604414,40869,-,0.905623,chr9,57585211,57611107,+,57595211,Cyp1a1,Cyp1a1,Cyp1a1,NM_009992#NM_001136059(Non-Dups),NM,


In [6]:
# Count unique values and their counts in the 'gene_name' column
gene_name_counts = df['gene_name'].value_counts().reset_index()
gene_name_counts.columns = ['gene_name', 'count']

# Create a new DataFrame 'df2' with the unique values and their counts
df2 = gene_name_counts

df2

Unnamed: 0,gene_name,count
0,,13328
1,Spire1,75
2,Wwox,47
3,Apbb2,43
4,Sdk1,38
...,...,...
23597,lnc14044,1
23598,lnc30181,1
23599,lnc30180,1
23600,Akain1,1


In [7]:
# Save the DataFrame
df2.to_csv('DRE_Counts_by_Gene.txt', sep='\t', index=False)


In [8]:
df3 = df.copy()

In [9]:
# Create a new 'DRE_Avg' column by calculating the average of 'DRE_End' and 'DRE_Start'
df3['DRE_Avg'] = np.ceil((df3['DRE_End'] + df3['DRE_Start']) / 2)


# Convert 'Gene_ORIG_Start' column to a numeric data type, handling the "." values by converting them to NaN
df3['Gene_ORIG_Start_Temp'] = pd.to_numeric(df3['Gene_ORIG_Start'], errors='coerce')


# Initialize the 'Dist_to_Start' column with NaN values
df3['Dist_to_Start'] = None

# Calculate 'Dist_to_Start' based on 'Gene_Strand'
df3.loc[df3['Gene_Strand'] == '-', 'Dist_to_Start'] = df3['Gene_ORIG_Start_Temp'] - df3['DRE_Avg']
df3.loc[df3['Gene_Strand'] == '+', 'Dist_to_Start'] = df3['DRE_Avg'] - df3['Gene_ORIG_Start_Temp']


df3

Unnamed: 0,Chromosome,DRE_Start,DRE_End,DRE_ID,DRE_Strand,DRE_MSS,GTF_Chromosome,Gene_Start,Gene_End,Gene_Strand,Gene_ORIG_Start,gene_id,transcript_id,gene_name,gene_name2,gene_biotype,NR_annotation,DRE_Avg,Gene_ORIG_Start_Temp,Dist_to_Start
0,chr1,3493457,3493476,1,+,0.870102,chr1,3284705,3751721,-,3741721,Xkr4,Xkr4,Xkr4,NM_001011874(Non-Dups),NM,,3493467.0,3741721.0,248254.0
1,chr1,4240913,4240932,2,-,0.877970,chr1,4190089,4489464,-,4479464,Rp1,Rp1,Rp1,NM_001195662#NM_001370921#NM_011283(Non-Dups),NM,,4240923.0,4479464.0,238541.0
2,chr1,4427961,4427980,3,+,0.872268,chr1,4190089,4489464,-,4479464,Rp1,Rp1,Rp1,NM_001195662#NM_001370921#NM_011283(Non-Dups),NM,,4427971.0,4479464.0,51493.0
3,chr1,4476897,4476916,4,-,0.901051,chr1,4190089,4489464,-,4479464,Rp1,Rp1,Rp1,NM_001195662#NM_001370921#NM_011283(Non-Dups),NM,,4476907.0,4479464.0,2557.0
4,chr1,4491129,4491148,5,-,0.867355,.,-1,-1,.,.,,,,,,,4491139.0,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60905,chrY,90644962,90644981,44321,+,0.871128,.,-1,-1,.,.,,,,,,,90644972.0,,
60906,chrY,90644966,90644985,44322,+,0.861266,.,-1,-1,.,.,,,,,,,90644976.0,,
60907,chrY,90776986,90777005,44323,-,0.869231,.,-1,-1,.,.,,,,,,,90776996.0,,
60908,chrY,90782870,90782889,44324,+,0.887751,.,-1,-1,.,.,,,,,,,90782880.0,,


In [10]:
df3[df3['gene_name'] == 'Cyp1a1']

Unnamed: 0,Chromosome,DRE_Start,DRE_End,DRE_ID,DRE_Strand,DRE_MSS,GTF_Chromosome,Gene_Start,Gene_End,Gene_Strand,Gene_ORIG_Start,gene_id,transcript_id,gene_name,gene_name2,gene_biotype,NR_annotation,DRE_Avg,Gene_ORIG_Start_Temp,Dist_to_Start
56655,chr9,57601285,57601304,40865,-,0.920791,chr9,57585211,57611107,+,57595211,Cyp1a1,Cyp1a1,Cyp1a1,NM_009992#NM_001136059(Non-Dups),NM,,57601295.0,57595211.0,6084.0
56656,chr9,57603680,57603699,40866,-,0.980536,chr9,57585211,57611107,+,57595211,Cyp1a1,Cyp1a1,Cyp1a1,NM_009992#NM_001136059(Non-Dups),NM,,57603690.0,57595211.0,8479.0
56657,chr9,57603902,57603921,40867,-,0.914418,chr9,57585211,57611107,+,57595211,Cyp1a1,Cyp1a1,Cyp1a1,NM_009992#NM_001136059(Non-Dups),NM,,57603912.0,57595211.0,8701.0
56658,chr9,57604015,57604034,40868,+,0.934834,chr9,57585211,57611107,+,57595211,Cyp1a1,Cyp1a1,Cyp1a1,NM_009992#NM_001136059(Non-Dups),NM,,57604025.0,57595211.0,8814.0
56659,chr9,57604395,57604414,40869,-,0.905623,chr9,57585211,57611107,+,57595211,Cyp1a1,Cyp1a1,Cyp1a1,NM_009992#NM_001136059(Non-Dups),NM,,57604405.0,57595211.0,9194.0


In [11]:
# Rename the columns
df3 = df3.rename(columns={
    'Chromosome': 'Chr',
    'DRE_Start': 'Start',
    'DRE_End': 'End',
    'DRE_ID': 'DRE_ID',
    'DRE_MSS': 'MSS',
    'GTF_Chromosome': 'GTF_Chrom',
    'Gene_Start': 'Gene_Start',
    'Gene_End': 'Gene_End',
    'Gene_Strand': 'Strand',
    'Gene_ORIG_Start': 'Gene_ORIG_Start',
    'gene_id': 'Gene_ID',
    'transcript_id': 'Transcript_ID',
    'gene_name': 'Gene_Name',
    'gene_name2': 'Gene_Name2',
    'gene_biotype': 'Gene_Biotype',
    'NR_annotation': 'NR_Annotation',
    'Dist_to_Start': 'Dist_to_Start'
})

# Select the desired columns to create a new DataFrame
df_trimmed = df3[['DRE_ID', 'Chr', 'Start', 'End', 
                  'Dist_to_Start',  'Gene_Biotype',
                  'Gene_ID', 'Transcript_ID', 'Gene_Name', 'Gene_Name2','NR_Annotation', 'MSS']]



In [12]:
df_trimmed.to_csv('MMU_Chromosomal_Location_of_DREs.txt', sep='\t', index=False)
