# Concatonate All Deseq2 Data with Karri RefGenome Metadata

## <br> 1. Import Required Packages

In [1]:
import numpy as np
import pandas as pd

## <br> 2. Import Deseq2 Data and All Other Metadata

In [3]:
pDREs_Master = pd.read_csv('../../99_rn7_DREs/02_CleanUp_DRE_BED_File/DRE_Counts_by_Gene.txt', 
                            delimiter = '\t')

pDREs_Master = pDREs_Master.dropna()

pDREs_Master = pDREs_Master.rename(columns={'gene_id': 'Gene', 'count': 'pDRE'})

pDREs_Master.head(5)

Unnamed: 0,Gene,pDRE
1,lnc10851,273
2,Tenm3,249
3,Auts2l1,193
4,Camta1,179
5,Cdh13,141


In [5]:
AHR_Master = pd.read_csv('../../99_rn7_AHR_Enrichment/02_CleanUp_BED_File/rn7_AHR_Binding_Counts.txt', 
                            delimiter = '\t')

AHR_Master = AHR_Master.rename(columns={'gene_id': 'Gene', 'count': 'AHR'})

AHR_Master.head(5)

Unnamed: 0,Gene,AHR
0,,3089
1,lnc10851,144
2,lnc16454,41
3,lnc4726,23
4,Prkce,19


In [9]:
# Define column names based on GTF format
gtf_columns = [
    "chrom", "source", "feature", "start", "end", "score", "strand", "frame", "space1", "space2", "attribute"
]

Annotations_Master = pd.read_csv('../../00_Karri_et_al_GTF/Karri_Annotations_Restructured_rn7_for_Gene_Expression.gtf', 
                                delimiter = '\t',
                                names=gtf_columns)  # Assign column names)

# Drop unwanted columns
Annotations_Master = Annotations_Master.drop(columns=["space1", "space2"])

Annotations_Master

Unnamed: 0,chrom,source,feature,start,end,score,strand,frame,attribute
0,chr5,liftover,exon,15038462,15039082,1000,+,.,"Mouse_ID ""0""; gene_id ""lnc1""; transcript_type ..."
1,chr5,liftover,exon,14766774,14767820,1000,+,.,"Mouse_ID ""1""; gene_id ""lnc10""; transcript_type..."
2,chr13,liftover,exon,103001424,103005924,1000,+,.,"Mouse_ID ""3""; gene_id ""lnc1000""; transcript_ty..."
3,chr10,liftover,exon,79676628,79677257,1000,-,.,"Mouse_ID ""4""; gene_id ""lnc10000""; transcript_t..."
4,chr10,liftover,exon,79677716,79678362,1000,-,.,"Mouse_ID ""5""; gene_id ""lnc10001""; transcript_t..."
...,...,...,...,...,...,...,...,...,...
1182956,chrY_NW_023637722v1_random,ncbiRefSeq,exon,293338,293506,1000,+,.,"Rat_ID ""1141644""; gene_id ""LOC103694730""; tran..."
1182957,chrY_NW_023637722v1_random,ncbiRefSeq,exon,300877,300988,1000,+,.,"Rat_ID ""1141645""; gene_id ""LOC103694730""; tran..."
1182958,chrY_NW_023637722v1_random,ncbiRefSeq,exon,317993,319851,1000,-,.,"Rat_ID ""1141646""; gene_id ""LOC120099645""; tran..."
1182959,chrY_NW_023637722v1_random,ncbiRefSeq,exon,320383,320541,1000,-,.,"Rat_ID ""1141647""; gene_id ""LOC120099645""; tran..."


In [10]:
# Define a function to parse attributes and extract key-value pairs
def parse_attributes(attribute_string):
    attr_dict = {}
    if pd.notnull(attribute_string):  # Check if the string is not NaN
        attributes = attribute_string.split(';')
        for attribute in attributes:
            if attribute.strip():  # Ensure it's not an empty string
                key, value = attribute.strip().split(' ', 1)
                attr_dict[key] = value.strip('"')  # Strip quotes from value
    return attr_dict

# Apply the function to extract the information into a DataFrame
attributes_df = Annotations_Master['attribute'].apply(parse_attributes).apply(pd.Series)

attributes_df

Unnamed: 0,Mouse_ID,gene_id,transcript_type,Rat_ID
0,0,lnc1,lncRNA,
1,1,lnc10,lncRNA,
2,3,lnc1000,lncRNA,
3,4,lnc10000,lncRNA,
4,5,lnc10001,lncRNA,
...,...,...,...,...
1182956,,LOC103694730,predicted_lncRNA,1141644
1182957,,LOC103694730,predicted_lncRNA,1141645
1182958,,LOC120099645,predicted_lncRNA,1141646
1182959,,LOC120099645,predicted_lncRNA,1141647


In [11]:
# Concatenate the two DataFrames side by side
merged_combined = pd.concat([Annotations_Master, attributes_df], axis=1)
merged_combined

Unnamed: 0,chrom,source,feature,start,end,score,strand,frame,attribute,Mouse_ID,gene_id,transcript_type,Rat_ID
0,chr5,liftover,exon,15038462,15039082,1000,+,.,"Mouse_ID ""0""; gene_id ""lnc1""; transcript_type ...",0,lnc1,lncRNA,
1,chr5,liftover,exon,14766774,14767820,1000,+,.,"Mouse_ID ""1""; gene_id ""lnc10""; transcript_type...",1,lnc10,lncRNA,
2,chr13,liftover,exon,103001424,103005924,1000,+,.,"Mouse_ID ""3""; gene_id ""lnc1000""; transcript_ty...",3,lnc1000,lncRNA,
3,chr10,liftover,exon,79676628,79677257,1000,-,.,"Mouse_ID ""4""; gene_id ""lnc10000""; transcript_t...",4,lnc10000,lncRNA,
4,chr10,liftover,exon,79677716,79678362,1000,-,.,"Mouse_ID ""5""; gene_id ""lnc10001""; transcript_t...",5,lnc10001,lncRNA,
...,...,...,...,...,...,...,...,...,...,...,...,...,...
1182956,chrY_NW_023637722v1_random,ncbiRefSeq,exon,293338,293506,1000,+,.,"Rat_ID ""1141644""; gene_id ""LOC103694730""; tran...",,LOC103694730,predicted_lncRNA,1141644
1182957,chrY_NW_023637722v1_random,ncbiRefSeq,exon,300877,300988,1000,+,.,"Rat_ID ""1141645""; gene_id ""LOC103694730""; tran...",,LOC103694730,predicted_lncRNA,1141645
1182958,chrY_NW_023637722v1_random,ncbiRefSeq,exon,317993,319851,1000,-,.,"Rat_ID ""1141646""; gene_id ""LOC120099645""; tran...",,LOC120099645,predicted_lncRNA,1141646
1182959,chrY_NW_023637722v1_random,ncbiRefSeq,exon,320383,320541,1000,-,.,"Rat_ID ""1141647""; gene_id ""LOC120099645""; tran...",,LOC120099645,predicted_lncRNA,1141647


In [16]:
merged_combined.columns

Index(['chrom', 'source', 'feature', 'start', 'end', 'score', 'strand',
       'frame', 'attribute', 'Mouse_ID', 'gene_id', 'transcript_type',
       'Rat_ID'],
      dtype='object')

In [None]:
merged_combined2 = merged_combined[[]]

In [17]:
directory = './Deseq2_Results/'

Deseq2_0_01_v_0 = pd.read_csv(directory + '0.01_vs_0.txt', delimiter = '\t').reset_index()
Deseq2_0_03_v_0 = pd.read_csv(directory + '0.03_vs_0.txt', delimiter = '\t').reset_index()
Deseq2_0_1_v_0 = pd.read_csv(directory + '0.1_vs_0.txt', delimiter = '\t').reset_index()
Deseq2_0_3_v_0 = pd.read_csv(directory + '0.3_vs_0.txt', delimiter = '\t').reset_index()
Deseq2_1_v_0 = pd.read_csv(directory + '1_vs_0.txt', delimiter = '\t').reset_index()
Deseq2_3_v_0 = pd.read_csv(directory + '3_vs_0.txt', delimiter = '\t').reset_index()
Deseq2_10_v_0 = pd.read_csv(directory + '10_vs_0.txt', delimiter = '\t').reset_index()


In [18]:
Deseq2_0_01_v_0['Dose'] = '0.01'
Deseq2_0_03_v_0['Dose'] = '0.03'
Deseq2_0_1_v_0['Dose'] = '0.1'
Deseq2_0_3_v_0['Dose'] = '0.3'
Deseq2_1_v_0['Dose'] = '1'
Deseq2_3_v_0['Dose'] = '3'
Deseq2_10_v_0['Dose'] = '10'


In [19]:
Deseq2_10_v_0.head(5)

Unnamed: 0,index,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj,Dose
0,lnc1000,1.021823,1.224408,1.866052,0.656149,0.511728,,10
1,lnc10000,0.046372,-1.398125,6.005823,-0.232795,0.815921,,10
2,lnc10001,0.146825,0.0,6.005823,0.0,1.0,,10
3,lnc10003,0.177874,-1.398177,6.005823,-0.232804,0.815914,,10
4,lnc10004,0.740087,3.883753,3.422047,1.134921,0.256409,,10


## <br> 3. Concat All Data Into One Dataframe

In [20]:
Deseq2_Master = pd.concat([Deseq2_0_01_v_0,
                          Deseq2_0_03_v_0,
                          Deseq2_0_1_v_0,
                          Deseq2_0_3_v_0,
                          Deseq2_1_v_0,
                          Deseq2_3_v_0,
                          Deseq2_10_v_0],
                          ignore_index=True)


In [21]:
Deseq2_Master = Deseq2_Master.rename(columns={'index': 'Gene'})
Deseq2_Master['FoldChange'] = pow(2, Deseq2_Master['log2FoldChange'])

Deseq2_Master.head(5)

Unnamed: 0,Gene,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj,Dose,FoldChange
0,lnc1000,1.021823,-0.020692,1.991121,-0.010392,0.991708,,0.01,0.98576
1,lnc10000,0.046372,-0.929325,6.005823,-0.154737,0.877028,,0.01,0.525104
2,lnc10001,0.146825,0.0,6.005823,0.0,1.0,,0.01,1.0
3,lnc10003,0.177874,0.032402,6.005823,0.005395,0.995695,,0.01,1.022714
4,lnc10004,0.740087,0.0,3.59288,0.0,1.0,,0.01,1.0


In [22]:
merged_combined.columns

Index(['chrom', 'source', 'feature', 'start', 'end', 'score', 'strand',
       'frame', 'attribute', 'Mouse_ID', 'gene_id', 'transcript_type',
       'Rat_ID'],
      dtype='object')

In [23]:
merged_combined2 = merged_combined[['chrom', 'start', 'end','frame', 'score','strand', 'source', 'gene_id', 'transcript_type', 'Mouse_ID', 'Rat_ID']]

In [24]:
# Merge Deseq2_Master with all metadata tables
Deseq2_Master2 = pd.merge(Deseq2_Master, merged_combined2, left_on='Gene', right_on='gene_id', how='left').fillna("NA")

Deseq2_Master2

Unnamed: 0,Gene,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj,Dose,FoldChange,chrom,start,end,frame,score,strand,source,gene_id,transcript_type,Mouse_ID,Rat_ID
0,lnc1000,1.021823,-0.020692,1.991121,-0.010392,0.991708,,0.01,0.985760,chr13,103001424,103005924,.,1000,+,liftover,lnc1000,lncRNA,3,
1,lnc10000,0.046372,-0.929325,6.005823,-0.154737,0.877028,,0.01,0.525104,chr10,79676628,79677257,.,1000,-,liftover,lnc10000,lncRNA,4,
2,lnc10001,0.146825,0.000000,6.005823,0.000000,1.0,,0.01,1.000000,chr10,79677716,79678362,.,1000,-,liftover,lnc10001,lncRNA,5,
3,lnc10003,0.177874,0.032402,6.005823,0.005395,0.995695,,0.01,1.022714,chr10,79954144,79959715,.,1000,-,liftover,lnc10003,lncRNA,7,
4,lnc10004,0.740087,0.000000,3.592880,0.000000,1.0,,0.01,1.000000,chr10,80089972,80105010,.,1000,+,liftover,lnc10004,lncRNA,8,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7722262,LOC103694537,17.353741,0.172073,0.466195,0.369101,0.712052,0.850128,10,1.126676,chrY_NW_023637718v1_random,142349,142592,.,1000,-,ncbiRefSeq,LOC103694537,predicted_mRNA,,1141569
7722263,LOC103694537,17.353741,0.172073,0.466195,0.369101,0.712052,0.850128,10,1.126676,chrY_NW_023637718v1_random,142792,143078,.,1000,-,ncbiRefSeq,LOC103694537,predicted_mRNA,,1141570
7722264,LOC120099632,0.048942,0.000000,6.005823,0.000000,1.0,,10,1.000000,chrY_NW_023637718v1_random,187188,191437,.,1000,-,ncbiRefSeq,LOC120099632,predicted_mRNA,,1141571
7722265,LOC120099632,0.048942,0.000000,6.005823,0.000000,1.0,,10,1.000000,chrY_NW_023637718v1_random,191439,192518,.,1000,-,ncbiRefSeq,LOC120099632,predicted_mRNA,,1141572


In [25]:
# Drop the 'Gene' column
#Deseq2_Master2 = Deseq2_Master2.drop(columns=['Gene'])

#Deseq2_Master2 = Deseq2_Master2.rename(columns={'gene_name': 'Gene'})
Deseq2_Master2 = Deseq2_Master2.rename(columns={'FoldChange': 'Fold-Change'})
Deseq2_Master2 = Deseq2_Master2.rename(columns={'log2FoldChange': 'Log2FC'})

Deseq2_Master2 = pd.merge(Deseq2_Master2, pDREs_Master, left_on='Gene', right_on='Gene', how='left').fillna(0)
Deseq2_Master2 = pd.merge(Deseq2_Master2, AHR_Master, left_on='Gene', right_on='Gene', how='left').fillna(0)

Deseq2_Master2[['pDRE', 'AHR']] = Deseq2_Master2[['pDRE', 'AHR']].astype(int)

Deseq2_Master_Working = Deseq2_Master2[['Gene', 'Dose', 'pDRE', 'AHR', 'transcript_type',  'baseMean',
                                      'Log2FC', 'Fold-Change', 'lfcSE', 'stat', 'pvalue', 'padj',
                                       'chrom', 'start', 'end', 'frame', 'score','strand', 'source',
                                        'gene_id', 'Mouse_ID', 'Rat_ID']].drop_duplicates(keep='first')

Deseq2_Master_Working

Unnamed: 0,Gene,Dose,pDRE,AHR,transcript_type,baseMean,Log2FC,Fold-Change,lfcSE,stat,...,chrom,start,end,frame,score,strand,source,gene_id,Mouse_ID,Rat_ID
0,lnc1000,0.01,0,1,lncRNA,1.021823,-0.020692,0.985760,1.991121,-0.010392,...,chr13,103001424,103005924,.,1000,+,liftover,lnc1000,3,
1,lnc10000,0.01,1,0,lncRNA,0.046372,-0.929325,0.525104,6.005823,-0.154737,...,chr10,79676628,79677257,.,1000,-,liftover,lnc10000,4,
2,lnc10001,0.01,0,0,lncRNA,0.146825,0.000000,1.000000,6.005823,0.000000,...,chr10,79677716,79678362,.,1000,-,liftover,lnc10001,5,
3,lnc10003,0.01,2,0,lncRNA,0.177874,0.032402,1.022714,6.005823,0.005395,...,chr10,79954144,79959715,.,1000,-,liftover,lnc10003,7,
4,lnc10004,0.01,7,1,lncRNA,0.740087,0.000000,1.000000,3.592880,0.000000,...,chr10,80089972,80105010,.,1000,+,liftover,lnc10004,8,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7722262,LOC103694537,10,0,0,predicted_mRNA,17.353741,0.172073,1.126676,0.466195,0.369101,...,chrY_NW_023637718v1_random,142349,142592,.,1000,-,ncbiRefSeq,LOC103694537,,1141569
7722263,LOC103694537,10,0,0,predicted_mRNA,17.353741,0.172073,1.126676,0.466195,0.369101,...,chrY_NW_023637718v1_random,142792,143078,.,1000,-,ncbiRefSeq,LOC103694537,,1141570
7722264,LOC120099632,10,0,0,predicted_mRNA,0.048942,0.000000,1.000000,6.005823,0.000000,...,chrY_NW_023637718v1_random,187188,191437,.,1000,-,ncbiRefSeq,LOC120099632,,1141571
7722265,LOC120099632,10,0,0,predicted_mRNA,0.048942,0.000000,1.000000,6.005823,0.000000,...,chrY_NW_023637718v1_random,191439,192518,.,1000,-,ncbiRefSeq,LOC120099632,,1141572


In [27]:
Deseq2_Master_Working2 = Deseq2_Master_Working[['Gene', 'Dose', 'pDRE', 'AHR', 'transcript_type',  'baseMean',
                                      'Log2FC', 'Fold-Change', 'lfcSE', 'stat', 'pvalue', 'padj','source' ]].drop_duplicates(keep='first')
Deseq2_Master_Working2

Unnamed: 0,Gene,Dose,pDRE,AHR,transcript_type,baseMean,Log2FC,Fold-Change,lfcSE,stat,pvalue,padj,source
0,lnc1000,0.01,0,1,lncRNA,1.021823,-0.020692,0.985760,1.991121,-0.010392,0.991708,,liftover
1,lnc10000,0.01,1,0,lncRNA,0.046372,-0.929325,0.525104,6.005823,-0.154737,0.877028,,liftover
2,lnc10001,0.01,0,0,lncRNA,0.146825,0.000000,1.000000,6.005823,0.000000,1.0,,liftover
3,lnc10003,0.01,2,0,lncRNA,0.177874,0.032402,1.022714,6.005823,0.005395,0.995695,,liftover
4,lnc10004,0.01,7,1,lncRNA,0.740087,0.000000,1.000000,3.592880,0.000000,1.0,,liftover
...,...,...,...,...,...,...,...,...,...,...,...,...,...
7722234,LOC120099597,10,1,0,predicted_lncRNA,0.092333,-1.398142,0.379417,6.005823,-0.232798,0.815918,,ncbiRefSeq
7722237,Dkc1,10,0,2,mRNA,277.395024,1.901728,3.736606,0.248099,7.665213,0.0,0.0,ncbiRefSeq
7722252,LOC103694537,10,0,0,predicted_mRNA,17.353741,0.172073,1.126676,0.466195,0.369101,0.712052,0.850128,ncbiRefSeq
7722264,LOC120099632,10,0,0,predicted_mRNA,0.048942,0.000000,1.000000,6.005823,0.000000,1.0,,ncbiRefSeq


In [28]:
Deseq2_Master_Working2.to_csv('./RDDR_Deseq2_Master_Wald_Rat_with_Mouse_MGI.txt', sep='\t')