# Process AHR Enrichment sites

### 1. Import Packages

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

### 2. Import raw data

In [3]:
# Define the file path
file_path = "../01_Intersect_AHR-CHIP_and_GTF/rn7_AHR_Binding_Regions.bed"

# Read the file into a DataFrame
df = pd.read_csv(file_path, sep='\t', header=0, index_col=None)

# Display the DataFrame
df

Unnamed: 0,Chromosome,AHR_Bound_Start,AHR_Bound_End,AHR_Site_Unique_ID,AHR_Binding_FC,AHR_Binding_FDR,GTF_Chromosome,Gene_Start,Gene_End,Gene_Strand,Gene_ORIG_Start,Metadata
0,chr2,149883636,149884785,1,33.877817,0.000000,chr2,149874112,149884250,+,149884112,"Mouse_ID""66905"";gene_id""Mir8120"";transcript_type""lncRNA"""
1,chr2,149883636,149884785,1,33.877817,0.000000,chr2,149879615,150004079,+,149889615,"Rat_ID""16451"";gene_id""Lekr1"";transcript_type""predicted_mRNA"""
2,chr8,55557059,55557654,2,14.597577,0.000000,chr8,55508402,55578342,+,55518402,"Rat_ID""29149"";gene_id""Ube2q2"";transcript_type""predicted_mRNA"""
3,chr8,58093927,58095676,3,16.320779,0.000000,chr8,58086021,58102130,+,58096021,"Rat_ID""29202"";gene_id""Cyp1a1"";transcript_type""predicted_mRNA"""
4,chr1,29032241,29033115,4,19.422729,0.000000,chr1,28983634,29088673,+,28993634,"Rat_ID""289"";gene_id""Ahrr"";transcript_type""mRNA"""
...,...,...,...,...,...,...,...,...,...,...,...,...
22752,chr1,227814734,227814833,13867,1.820856,0.926112,chr1,227804639,227868351,+,227814639,"Rat_ID""3776"";gene_id""Uhrf2"";transcript_type""predicted_mRNA"""
22753,chr1,227814734,227814833,13867,1.820856,0.926112,chr1,227808283,227824889,-,227814889,"Mouse_ID""4539"";gene_id""lnc14931"";transcript_type""lncRNA"""
22754,chr1,227814734,227814833,13867,1.820856,0.926112,chr1,227798283,227814820,+,227808283,"Mouse_ID""22893"";gene_id""lnc32178"";transcript_type""lncRNA"""
22755,chr1,47011019,47011118,13868,1.782129,0.926112,chr1,46967961,47021505,-,47011505,"Rat_ID""440"";gene_id""Ezr"";transcript_type""mRNA"""


In [4]:
# Extract the "gene_id" information and create a new column
df['gene_id'] = df['Metadata'].str.extract(r'gene_id"([^"]+)"', expand=False)
# Fill rows without "gene_id" with NA
df['gene_id'].fillna('NA', inplace=True)

# Extract the "gene_name" information and create a new column
df['Rat_ID'] = df['Metadata'].str.extract(r'Rat_ID"([^"]+)"', expand=False)
# Fill rows without "gene_name" with NA
df['Rat_ID'].fillna('NA', inplace=True)

# Extract the "gene_name2" information and create a new column
df['Mouse_ID'] = df['Metadata'].str.extract(r'Mouse_ID"([^"]+)"', expand=False)
# Fill rows without "gene_name2" with NA
df['Mouse_ID'].fillna('NA', inplace=True)

# Extract the "gene_biotype" information and create a new column
df['transcript_type'] = df['Metadata'].str.extract(r'transcript_type"([^"]+)"', expand=False)
# Fill rows without "gene_biotype" with NA
df['transcript_type'].fillna('NA', inplace=True)

# Remove the "Metadata" column
df.drop(columns=['Metadata'], inplace=True)

# Display the DataFrame
df

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['gene_id'].fillna('NA', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Rat_ID'].fillna('NA', inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behav

Unnamed: 0,Chromosome,AHR_Bound_Start,AHR_Bound_End,AHR_Site_Unique_ID,AHR_Binding_FC,AHR_Binding_FDR,GTF_Chromosome,Gene_Start,Gene_End,Gene_Strand,Gene_ORIG_Start,gene_id,Rat_ID,Mouse_ID,transcript_type
0,chr2,149883636,149884785,1,33.877817,0.000000,chr2,149874112,149884250,+,149884112,Mir8120,,66905,lncRNA
1,chr2,149883636,149884785,1,33.877817,0.000000,chr2,149879615,150004079,+,149889615,Lekr1,16451,,predicted_mRNA
2,chr8,55557059,55557654,2,14.597577,0.000000,chr8,55508402,55578342,+,55518402,Ube2q2,29149,,predicted_mRNA
3,chr8,58093927,58095676,3,16.320779,0.000000,chr8,58086021,58102130,+,58096021,Cyp1a1,29202,,predicted_mRNA
4,chr1,29032241,29033115,4,19.422729,0.000000,chr1,28983634,29088673,+,28993634,Ahrr,289,,mRNA
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22752,chr1,227814734,227814833,13867,1.820856,0.926112,chr1,227804639,227868351,+,227814639,Uhrf2,3776,,predicted_mRNA
22753,chr1,227814734,227814833,13867,1.820856,0.926112,chr1,227808283,227824889,-,227814889,lnc14931,,4539,lncRNA
22754,chr1,227814734,227814833,13867,1.820856,0.926112,chr1,227798283,227814820,+,227808283,lnc32178,,22893,lncRNA
22755,chr1,47011019,47011118,13868,1.782129,0.926112,chr1,46967961,47021505,-,47011505,Ezr,440,,mRNA


In [5]:
# Drop duplicate rows and keep the first occurrence
df = df.drop_duplicates(keep='first')

# Display the DataFrame
df

Unnamed: 0,Chromosome,AHR_Bound_Start,AHR_Bound_End,AHR_Site_Unique_ID,AHR_Binding_FC,AHR_Binding_FDR,GTF_Chromosome,Gene_Start,Gene_End,Gene_Strand,Gene_ORIG_Start,gene_id,Rat_ID,Mouse_ID,transcript_type
0,chr2,149883636,149884785,1,33.877817,0.000000,chr2,149874112,149884250,+,149884112,Mir8120,,66905,lncRNA
1,chr2,149883636,149884785,1,33.877817,0.000000,chr2,149879615,150004079,+,149889615,Lekr1,16451,,predicted_mRNA
2,chr8,55557059,55557654,2,14.597577,0.000000,chr8,55508402,55578342,+,55518402,Ube2q2,29149,,predicted_mRNA
3,chr8,58093927,58095676,3,16.320779,0.000000,chr8,58086021,58102130,+,58096021,Cyp1a1,29202,,predicted_mRNA
4,chr1,29032241,29033115,4,19.422729,0.000000,chr1,28983634,29088673,+,28993634,Ahrr,289,,mRNA
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22752,chr1,227814734,227814833,13867,1.820856,0.926112,chr1,227804639,227868351,+,227814639,Uhrf2,3776,,predicted_mRNA
22753,chr1,227814734,227814833,13867,1.820856,0.926112,chr1,227808283,227824889,-,227814889,lnc14931,,4539,lncRNA
22754,chr1,227814734,227814833,13867,1.820856,0.926112,chr1,227798283,227814820,+,227808283,lnc32178,,22893,lncRNA
22755,chr1,47011019,47011118,13868,1.782129,0.926112,chr1,46967961,47021505,-,47011505,Ezr,440,,mRNA


In [6]:
# Count unique values and their counts in the 'gene_name' column
gene_name_counts = df['gene_id'].value_counts().reset_index()
gene_name_counts.columns = ['gene_id', 'count']

# Create a new DataFrame 'df2' with the unique values and their counts
df2 = gene_name_counts

df2

Unnamed: 0,gene_id,count
0,,3089
1,lnc10851,144
2,lnc16454,41
3,lnc4726,23
4,Prkce,19
...,...,...
12380,lnc30283,1
12381,Rbm17,1
12382,Rgs2,1
12383,lnc17084,1


In [8]:
# Save the DataFrame
df2.to_csv('rn7_AHR_Binding_Counts.txt', sep='\t', index=False)


In [9]:
df3 = df.copy()

In [11]:
# Create a new 'length' column by subtracting 'AHR_Bound_Start' from 'AHR_Bound_End'
df3['Length'] = df3['AHR_Bound_End'] - df3['AHR_Bound_Start']

# Create a new 'AHR_Bound_Avg' column by calculating the average of 'AHR_Bound_End' and 'AHR_Bound_Start'
df3['AHR_Bound_Avg'] = (df3['AHR_Bound_End'] + df3['AHR_Bound_Start']) / 2


# Convert 'Gene_ORIG_Start' column to a numeric data type, handling the "." values by converting them to NaN
df3['Gene_ORIG_Start_Temp'] = pd.to_numeric(df3['Gene_ORIG_Start'], errors='coerce')


# Initialize the 'Dist_to_Start' column with NaN values
df3['Dist_to_Start'] = None

# Calculate 'Dist_to_Start' based on 'Gene_Strand'
df3.loc[df3['Gene_Strand'] == '-', 'Dist_to_Start'] = df3['Gene_ORIG_Start_Temp'] - df3['AHR_Bound_Avg']
df3.loc[df3['Gene_Strand'] == '+', 'Dist_to_Start'] = df3['AHR_Bound_Avg'] - df3['Gene_ORIG_Start_Temp']

df3

Unnamed: 0,Chromosome,AHR_Bound_Start,AHR_Bound_End,AHR_Site_Unique_ID,AHR_Binding_FC,AHR_Binding_FDR,GTF_Chromosome,Gene_Start,Gene_End,Gene_Strand,Gene_ORIG_Start,gene_id,Rat_ID,Mouse_ID,transcript_type,Length,AHR_Bound_Avg,Gene_ORIG_Start_Temp,Dist_to_Start
0,chr2,149883636,149884785,1,33.877817,0.000000,chr2,149874112,149884250,+,149884112,Mir8120,,66905,lncRNA,1149,149884210.5,149884112.0,98.5
1,chr2,149883636,149884785,1,33.877817,0.000000,chr2,149879615,150004079,+,149889615,Lekr1,16451,,predicted_mRNA,1149,149884210.5,149889615.0,-5404.5
2,chr8,55557059,55557654,2,14.597577,0.000000,chr8,55508402,55578342,+,55518402,Ube2q2,29149,,predicted_mRNA,595,55557356.5,55518402.0,38954.5
3,chr8,58093927,58095676,3,16.320779,0.000000,chr8,58086021,58102130,+,58096021,Cyp1a1,29202,,predicted_mRNA,1749,58094801.5,58096021.0,-1219.5
4,chr1,29032241,29033115,4,19.422729,0.000000,chr1,28983634,29088673,+,28993634,Ahrr,289,,mRNA,874,29032678.0,28993634.0,39044.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
22752,chr1,227814734,227814833,13867,1.820856,0.926112,chr1,227804639,227868351,+,227814639,Uhrf2,3776,,predicted_mRNA,99,227814783.5,227814639.0,144.5
22753,chr1,227814734,227814833,13867,1.820856,0.926112,chr1,227808283,227824889,-,227814889,lnc14931,,4539,lncRNA,99,227814783.5,227814889.0,105.5
22754,chr1,227814734,227814833,13867,1.820856,0.926112,chr1,227798283,227814820,+,227808283,lnc32178,,22893,lncRNA,99,227814783.5,227808283.0,6500.5
22755,chr1,47011019,47011118,13868,1.782129,0.926112,chr1,46967961,47021505,-,47011505,Ezr,440,,mRNA,99,47011068.5,47011505.0,436.5


In [12]:
# Rename the columns
df3 = df3.rename(columns={
    'Chromosome': 'Chr',
    'AHR_Bound_Start': 'Start',
    'AHR_Bound_End': 'End',
    'AHR_Binding_FC': 'Fold-Change',
    'AHR_Binding_FDR': 'FDR',
    'AHR_Site_Unique_ID': 'Active_Region',
    'GTF_Chromosome': 'GTF_Chrom',
    'Gene_Start': 'Gene_Start',
    'Gene_End': 'Gene_End',
    'Gene_Strand': 'Strand',
    'Gene_ORIG_Start': 'Gene_ORIG_Start',
    'gene_id': 'Gene_ID',
    'transcript_type': 'Transcript_Type',
    'Dist_to_Start': 'Dist_to_Start'
})

# Select the desired columns to create a new DataFrame
df_trimmed = df3[['Active_Region', 'Chr', 'Start', 'End', 'Length', 
                  'Dist_to_Start', 'Fold-Change', 'FDR', 'Transcript_Type',
                  'Gene_ID', 'Rat_ID', 'Mouse_ID']]

df_trimmed

Unnamed: 0,Active_Region,Chr,Start,End,Length,Dist_to_Start,Fold-Change,FDR,Transcript_Type,Gene_ID,Rat_ID,Mouse_ID
0,1,chr2,149883636,149884785,1149,98.5,33.877817,0.000000,lncRNA,Mir8120,,66905
1,1,chr2,149883636,149884785,1149,-5404.5,33.877817,0.000000,predicted_mRNA,Lekr1,16451,
2,2,chr8,55557059,55557654,595,38954.5,14.597577,0.000000,predicted_mRNA,Ube2q2,29149,
3,3,chr8,58093927,58095676,1749,-1219.5,16.320779,0.000000,predicted_mRNA,Cyp1a1,29202,
4,4,chr1,29032241,29033115,874,39044.0,19.422729,0.000000,mRNA,Ahrr,289,
...,...,...,...,...,...,...,...,...,...,...,...,...
22752,13867,chr1,227814734,227814833,99,144.5,1.820856,0.926112,predicted_mRNA,Uhrf2,3776,
22753,13867,chr1,227814734,227814833,99,105.5,1.820856,0.926112,lncRNA,lnc14931,,4539
22754,13867,chr1,227814734,227814833,99,6500.5,1.820856,0.926112,lncRNA,lnc32178,,22893
22755,13868,chr1,47011019,47011118,99,436.5,1.782129,0.926112,mRNA,Ezr,440,


In [13]:
df_trimmed.to_csv('rn7_Chromosomal_Location_of_AHR_Binding.txt', sep='\t', index=False)
