In [1]:
library(tidyverse)
library(parallel)

# Read the single "*_overview.txt" table located directly inside `dir_path`.
#
# Args:
#   dir_path: path of the directory to look in.
#
# Returns:
#   The tab-delimited table with an extra `source` column holding the base
#   name of `dir_path`, or NULL when the directory does not contain exactly
#   one matching file.
read_overview_file <- function(dir_path) {
  candidates <- list.files(
    dir_path,
    pattern = "_overview\\.txt$",
    full.names = TRUE
  )

  # Zero or several matches: there is no unambiguous file to read.
  if (length(candidates) != 1) {
    return(NULL)
  }

  overview_tbl <- read_delim(candidates, delim = "\t")
  mutate(overview_tbl, source = basename(dir_path))
}

# Read every per-directory overview table below `base_dir` in parallel and
# stack the results into one table.
#
# Args:
#   base_dir: directory whose sub-directories (searched recursively) are each
#     passed to read_overview_file().
#
# Returns:
#   The row-bound combination of all tables that were read, or NULL when no
#   table was read.
read_all_overviews_parallel <- function(base_dir) {
  # list.dirs() returns base_dir itself as the first element; drop it.
  dirs <- list.dirs(base_dir, full.names = TRUE, recursive = TRUE)[-1]

  # detectCores() may return NA, and "cores - 1" is 0 on a single-core
  # machine; mclapply() requires mc.cores >= 1.
  n_cores <- detectCores() - 1
  if (is.na(n_cores) || n_cores < 1) {
    n_cores <- 1L
  }

  results <- mclapply(dirs, read_overview_file, mc.cores = n_cores)

  # A failed worker yields a "try-error" object instead of a table. Report
  # and drop those entries rather than feeding them to rbind().
  failed <- vapply(results, inherits, logical(1), what = "try-error")
  if (any(failed)) {
    warning(
      "Failed to read overview file(s) in: ",
      paste(dirs[failed], collapse = ", "),
      call. = FALSE
    )
    results <- results[!failed]
  }

  do.call(rbind, results)
}

# Usage: load every overview table found below the BINDetect output directory.
# NOTE(review): the absolute path is machine-specific; adjust it when running elsewhere.
base_dir <- "/media/HDD2/donghui/bulk_ATAC_DM1_DM2d/TOBIAS/BINDetect_output_v3"
all_overviews <- read_all_overviews_parallel(base_dir)


── [1mAttaching core tidyverse packages[22m ──────────────────────── tidyverse 2.0.0 ──
[32m✔[39m [34mdplyr    [39m 1.1.3     [32m✔[39m [34mreadr    [39m 2.1.4
[32m✔[39m [34mforcats  [39m 1.0.0     [32m✔[39m [34mstringr  [39m 1.5.0
[32m✔[39m [34mggplot2  [39m 3.4.4     [32m✔[39m [34mtibble   [39m 3.2.1
[32m✔[39m [34mlubridate[39m 1.9.3     [32m✔[39m [34mtidyr    [39m 1.3.0
[32m✔[39m [34mpurrr    [39m 1.0.2     
── [1mConflicts[22m ────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()
[36mℹ[39m Use the conflicted package ([3m[34m<http://conflicted.r-lib.org/>[39m[23m) to force all conflicts to become errors


In [2]:
# Load the TF gene -> family table and keep unique (Gene_ID, Family) pairs.
Ath_TF_list <- read.csv('./scan_motif/Ath_TF_list.txt', sep = '\t') %>%
  select(Gene_ID, Family) %>%
  distinct()

# Rows of genes that are annotated with more than one family. After
# distinct(), a repeated Gene_ID means the gene has several families.
Ath_TF_list_duplicated <- Ath_TF_list %>%
  filter(Gene_ID %in% Gene_ID[duplicated(Gene_ID)]) %>%
  arrange(Gene_ID)
#Ath_TF_list_duplicated

# For multi-family genes, discard the 'MYB_related', 'CO-like' and 'AP2'
# annotations and keep the remaining ones; single-family genes are untouched.
# This step is idempotent, so it is applied once (it used to be repeated).
# NOTE(review): a gene that still has more than one family left after this
# exclusion keeps several rows.
Ath_TF_list <- Ath_TF_list %>%
  filter(!Gene_ID %in% Ath_TF_list_duplicated$Gene_ID) %>%
  rbind(
    Ath_TF_list_duplicated %>%
      filter(!Family %in% c('MYB_related', 'CO-like', 'AP2'))
  )
Ath_TF_list

Gene_ID,Family
<chr>,<chr>
AT3G25730,RAV
AT1G68840,RAV
AT1G13260,RAV
AT1G25560,RAV
AT1G50680,RAV
AT1G51120,RAV
AT1G01010,NAC
AT1G01260,bHLH
AT1G01720,NAC
AT1G02065,SBP


In [3]:
library(dplyr)

In [4]:
all_overviews %>% str

tibble [11,565,072 × 20] (S3: tbl_df/tbl/data.frame)
 $ TFBS_chr     : chr [1:11565072] "chr1" "chr1" "chr1" "chr1" ...
 $ TFBS_start   : num [1:11565072] 9758 26397 26906 37935 52623 ...
 $ TFBS_end     : num [1:11565072] 9777 26416 26925 37954 52642 ...
 $ TFBS_name    : chr [1:11565072] "ABF1_AT1G49720" "ABF1_AT1G49720" "ABF1_AT1G49720" "ABF1_AT1G49720" ...
 $ TFBS_score   : num [1:11565072] 4.48 5.08 4.55 3.66 10.09 ...
 $ TFBS_strand  : chr [1:11565072] "-" "+" "-" "-" ...
 $ peak_chr     : chr [1:11565072] "chr1" "chr1" "chr1" "chr1" ...
 $ peak_start   : num [1:11565072] 9700 26307 26307 37804 52567 ...
 $ peak_end     : num [1:11565072] 11334 29900 29900 39749 54462 ...
 $ peak_id      : chr [1:11565072] "peak_6" "peak_13" "peak_13" "peak_17" ...
 $ peak_score   : chr [1:11565072] "." "." "." "." ...
 $ peak_strand  : chr [1:11565072] "." "." "." "." ...
 $ gene_id      : chr [1:11565072] "AT1G03987" "AT1G01046" "AT1G01046" "AT1G01060" ...
 $ gene_name    : chr [1:11565072] "AT

In [5]:
all_overviews %>% sample_n(10)

TFBS_chr,TFBS_start,TFBS_end,TFBS_name,TFBS_score,TFBS_strand,peak_chr,peak_start,peak_end,peak_id,peak_score,peak_strand,gene_id,gene_name,COL_score,DM_score,COL_bound,DM_bound,COL_DM_log2fc,source
<chr>,<dbl>,<dbl>,<chr>,<dbl>,<chr>,<chr>,<dbl>,<dbl>,<chr>,<chr>,<chr>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>
chr4,8250800,8250821,MYB15_AT3G23250,8.38079,+,chr4,8248076,8250967,peak_25834,.,.,AT4G14340,CKI1,0.64358,0.59229,1,0,0.09726,MYB15_AT3G23250
chr5,7564430,7564451,PEAR2_AT5G02460,9.7868,-,chr5,7564316,7575116,peak_31861,.,.,AT5G22750,RAD5,0.0,0.0,0,0,0.0,PEAR2_AT5G02460
chr3,18188968,18188983,NAC062_AT3G49530,9.8648,-,chr3,18186850,18190173,peak_21587,.,.,AT3G49055,AT3G49055,0.30031,0.32387,0,0,-0.07468,NAC062_AT3G49530
chr3,17401534,17401549,bZIP2_AT2G18160,5.16558,+,chr3,17400002,17402123,peak_21357,.,.,AT3G47250,AT3G47250,0.41742,0.3928,0,0,0.0648,bZIP2_AT2G18160
chr1,5505758,5505767,ABI3_AT3G24650,8.23405,+,chr1,5500773,5507055,peak_1943,.,.,AT1G16030,Hsp70b,0.24022,0.21625,0,0,0.09315,ABI3_AT3G24650
chr1,20956958,20956973,EIN3_AT3G20770,7.46622,+,chr1,20955217,20957455,peak_6391,.,.,AT1G56030,AT1G56030,0.65499,0.3745,1,0,0.62469,EIN3_AT3G20770
chr3,3858209,3858236,CDF5_AT1G69570,11.92167,+,chr3,3857009,3861591,peak_17134,.,.,AT3G12110,ACT11,0.21452,0.30337,0,0,-0.32012,CDF5_AT1G69570
chr1,12867749,12867765,NGA4_AT4G01500,8.82356,+,chr1,12866907,12869168,peak_4206,.,.,AT1G35160,GF14 PHI,0.0,0.0,0,0,0.0,NGA4_AT4G01500
chr1,18667977,18667998,DOF1_AT1G51700,8.20072,+,chr1,18667323,18670112,peak_5702,.,.,AT1G50400,AT1G50400,0.34493,0.46155,0,0,-0.30912,DOF1_AT1G51700
chr5,3831676,3831697,IDD11_AT3G13810,8.2966,+,chr5,3827803,3835663,peak_30631,.,.,AT5G11890,EMB3135,0.53102,0.39513,0,0,0.32477,IDD11_AT3G13810


In [6]:
# Keep only sites whose bound flag differs between COL and DM, then split
# TFBS_name ("<TF name>_<gene id>") into the TF gene id and the TF name.
all_overviews_diff <- all_overviews %>%
  filter(COL_bound != DM_bound) %>%
  mutate(
    TF_gene_id = str_extract(TFBS_name, "AT\\dG\\d+$"),
    TF_gene_name = str_remove(TFBS_name, "_AT\\dG\\d+$")
  )

In [7]:
all_overviews_diff

TFBS_chr,TFBS_start,TFBS_end,TFBS_name,TFBS_score,TFBS_strand,peak_chr,peak_start,peak_end,peak_id,⋯,gene_id,gene_name,COL_score,DM_score,COL_bound,DM_bound,COL_DM_log2fc,source,TF_gene_id,TF_gene_name
<chr>,<dbl>,<dbl>,<chr>,<dbl>,<chr>,<chr>,<dbl>,<dbl>,<chr>,⋯,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>,<chr>,<chr>
chr1,99431,99450,ABF1_AT1G49720,8.76893,-,chr1,98692,99969,peak_38,⋯,AT1G01240,AT1G01240,0.73900,0.57699,1,0,0.29276,ABF1_AT1G49720,AT1G49720,ABF1
chr1,113545,113564,ABF1_AT1G49720,6.19401,-,chr1,109941,114262,peak_44,⋯,AT1G01270,,0.49767,0.62584,0,1,-0.26306,ABF1_AT1G49720,AT1G49720,ABF1
chr1,127405,127424,ABF1_AT1G49720,4.36561,-,chr1,127066,129490,peak_48,⋯,AT1G01320,AT1G01320,0.59524,0.55805,1,0,0.07456,ABF1_AT1G49720,AT1G49720,ABF1
chr1,185754,185773,ABF1_AT1G49720,9.90047,+,chr1,184569,189554,peak_71,⋯,AT1G01510,AN,0.59252,0.62805,0,1,-0.06805,ABF1_AT1G49720,AT1G49720,ABF1
chr1,219284,219303,ABF1_AT1G49720,7.31334,+,chr1,218753,220585,peak_86,⋯,AT1G01600,CYP86A4,0.59746,0.53326,1,0,0.13082,ABF1_AT1G49720,AT1G49720,ABF1
chr1,346273,346292,ABF1_AT1G49720,4.11855,-,chr1,346218,347584,peak_131,⋯,AT1G02010,SEC1A,0.55104,0.59831,0,1,-0.09504,ABF1_AT1G49720,AT1G49720,ABF1
chr1,426412,426431,ABF1_AT1G49720,3.92804,+,chr1,426379,427157,peak_157,⋯,AT1G02210,AT1G02210,0.67119,0.54692,1,0,0.23890,ABF1_AT1G49720,AT1G49720,ABF1
chr1,482433,482452,ABF1_AT1G49720,3.35768,-,chr1,482061,484073,peak_177,⋯,AT1G02390,GPAT2,0.57727,0.64330,0,1,-0.12652,ABF1_AT1G49720,AT1G49720,ABF1
chr1,511287,511306,ABF1_AT1G49720,6.44328,-,chr1,511044,511343,peak_188,⋯,AT1G02470,AT1G02470,0.71268,0.59109,1,0,0.22108,ABF1_AT1G49720,AT1G49720,ABF1
chr1,564517,564536,ABF1_AT1G49720,4.11340,-,chr1,564442,567539,peak_208,⋯,AT1G02640,BXL2,0.63258,0.59025,1,0,0.08096,ABF1_AT1G49720,AT1G49720,ABF1


In [8]:
all_overviews_diff %>% pull(COL_bound) %>% unique

In [9]:
# Show rows with a missing COL_bound. Comparing against the string 'NA' never
# matches a real missing value, so is.na() is required here.
all_overviews_diff %>% filter(is.na(COL_bound))

“number of rows of result is not a multiple of vector length (arg 2)”
“number of rows of result is not a multiple of vector length (arg 2)”
“number of rows of result is not a multiple of vector length (arg 2)”
“number of rows of result is not a multiple of vector length (arg 2)”


TFBS_chr,TFBS_start,TFBS_end,TFBS_name,TFBS_score,TFBS_strand,peak_chr,peak_start,peak_end,peak_id,⋯,gene_id,gene_name,COL_score,DM_score,COL_bound,DM_bound,COL_DM_log2fc,source,TF_gene_id,TF_gene_name
<chr>,<dbl>,<dbl>,<chr>,<dbl>,<chr>,<chr>,<dbl>,<dbl>,<chr>,⋯,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>,<chr>,<chr>


In [10]:
# Load the serialized Res_combined table (the printed preview shows columns AGI, log2FC and source).
Res_combined <-  readRDS('../Pathogen_JC14_3DM/Rdata/Res_combined.rds')

In [11]:
Res_combined

AGI,log2FC,source
<chr>,<dbl>,<fct>
AT1G30814,5.18459620,DM6/DM7
AT1G78930,0.01991274,DM6/DM7
AT1G71695,-2.30890364,DM6/DM7
AT1G58983,-0.17951020,DM6/DM7
AT1G12980,2.27948272,DM6/DM7
AT1G56250,0.77412597,DM6/DM7
AT1G66852,0.00000000,DM6/DM7
AT1G69810,2.76176191,DM6/DM7
AT1G72450,-0.41945456,DM6/DM7
AT1G76280,-0.42485352,DM6/DM7


In [12]:
# Load the NLR / TIR-only gene table (later cells use its AGI, Class and Symbol columns).
NLRs_and_TIRs <- read.csv(file = "../gene_table_and_other_scripts/NLRs_and_TIR_only.csv") 

In [50]:
# Drop entries whose Class is exactly 'TIR' and label everything that remains as type 'NLR'.
# Note: filter() also drops rows where Class is NA, because the comparison yields NA for them.
NLRs_and_TIRs <- NLRs_and_TIRs %>% filter(Class != 'TIR') %>% mutate(type = 'NLR')

In [51]:
# Load the pathway gene table from the Excel sheet (the next cell uses its Target and GeneFamily columns).
Pathways_Ngou_2021 <- readxl::read_excel("../gene_table_and_other_scripts/Ngou_2021_Nature_figure2E_pathways_mod.xlsx")

In [52]:
# Gene -> gene-family table as a plain data frame, with the gene id column
# renamed from Target to AGI.
Pathways <- Pathways_Ngou_2021 %>%
  dplyr::select(Target, GeneFamily) %>%
  as.data.frame() %>%
  dplyr::rename(AGI = Target)

# Families whose name contains 'LRR'; the family column becomes Class.
LRRs <- Pathways %>%
  filter(str_detect(GeneFamily, 'LRR')) %>%
  dplyr::rename(Class = GeneFamily)

In [53]:
# Map each LRR class to a short receptor type. case_when() has no fallback branch here,
# so any Class other than 'LRR-RLKs' / 'LRR-RLPs' gets type NA.
LRRs <- LRRs %>% mutate(type = case_when(Class == 'LRR-RLKs' ~ 'RLK', Class == 'LRR-RLPs' ~ 'RLP'))
LRRs

AGI,Class,type
<chr>,<chr>,<chr>
AT1G51820,LRR-RLKs,RLK
AT2G19190,LRR-RLKs,RLK
AT1G17750,LRR-RLKs,RLK
AT1G55610,LRR-RLKs,RLK
AT1G51890,LRR-RLKs,RLK
AT1G74360,LRR-RLKs,RLK
AT1G73080,LRR-RLKs,RLK
AT3G13380,LRR-RLKs,RLK
AT1G56140,LRR-RLKs,RLK
AT1G51800,LRR-RLKs,RLK


In [56]:
# Stack the NLR table and the LRR receptor table into one annotation table
# with columns AGI, Class, Symbol and type. LRRs has no Symbol column, so it
# is added as missing text before binding.
#
# rbind() has no `by` argument: the former `by = 'AGI'` was bound as an extra
# row of "AGI" strings that then had to be filtered out again. It is removed.
NLRs_and_LRRs <- NLRs_and_TIRs %>%
  dplyr::select(AGI, Class, Symbol, type) %>%
  rbind(LRRs %>% mutate(Symbol = NA_character_)) %>%
  # Keeps exactly the rows the previous `AGI != 'AGI'` filter kept: rows with
  # a missing AGI or a literal "AGI" value are dropped.
  filter(!is.na(AGI), AGI != 'AGI')
NLRs_and_LRRs

AGI,Class,Symbol,type
<chr>,<chr>,<chr>,<chr>
AT1G10920,NBS-LRR,LOV1,NLR
AT1G12210,CC-NBS-LRR,RFL1,NLR
AT1G12220,CC-NBS-LRR,RPS5,NLR
AT1G12280,CC-NBS-LRR,SUMM2,NLR
AT1G12290,CC-NBS-LRR,L5,NLR
AT1G15890,CC-NBS-LRR,AT1G15890,NLR
AT1G17600,TIR-NBS-LRR,SOC3,NLR
AT1G17610,TIR-NBS,CHS1,NLR
AT1G17615,TIR-NBS,TN2,NLR
AT1G27170,TIR-NBS-LRR,AT1G27170,NLR


In [83]:
#Res_TF_bound <- Res_combined %>% filter(source == 'DM1/DM2d') %>% 
#    right_join(NLRs_and_LRRs, by = 'AGI')  %>% 
#    left_join(all_overviews_diff %>% dplyr::select(-source), by = c('AGI' = 'gene_id')) %>% 
#    mutate(Symbol = ifelse(is.na(Symbol), gene_name, Symbol)) %>% 
#    filter(COL_bound ==1 | DM_bound == 1) %>% filter( log2FC %>% abs > 2)
#NA is not bound in COL or DM

# Receptor genes (NLRs / LRRs) from the DM1/DM2d comparison, joined to the
# differentially bound TF sites assigned to them.
Res_TF_bound <- Res_combined %>% 
    filter(source == 'DM1/DM2d') %>% 
    # right_join keeps every gene of NLRs_and_LRRs, even without a log2FC row.
    right_join(NLRs_and_LRRs, by = 'AGI')  %>% 
    # `source` is dropped from the site table so it does not clash with the
    # `source` column of Res_combined.
    left_join(all_overviews_diff %>% dplyr::select(-source), by = c('AGI' = 'gene_id')) %>% 
    # Use gene_name when Symbol is missing or is just a bare gene id.
    mutate(Symbol = ifelse(is.na(Symbol) | grepl("^AT\\dG\\d{5}$", Symbol), gene_name, Symbol)) %>% 
    # Rows without any joined site have NA flags and are removed here.
    filter(COL_bound == 1 | DM_bound == 1) %>% 
    filter(abs(log2FC) > 2)


# Label where each site is bound and attach the TF family.
# case_when() takes the first matching branch, so the two-flag conditions are
# tested before the single-flag ones; with the old order the
# "bound in both" branch could never be reached.
Res_TF_bound <- Res_TF_bound %>%
  mutate(bound = case_when(
      COL_bound == 1 & DM_bound == 1 ~ '2',
      COL_bound == 0 & DM_bound == 0 ~ '0',
      COL_bound == 1 ~ 'COL',
      DM_bound == 1 ~ 'DM',
      TRUE ~ 'NA'
  ))%>%left_join(Ath_TF_list %>% dplyr::rename('TF_Family' = 'Family'), by = c('TF_gene_id' = 'Gene_ID'))
Res_TF_bound

AGI,log2FC,source,Class,Symbol,type,TFBS_chr,TFBS_start,TFBS_end,TFBS_name,⋯,gene_name,COL_score,DM_score,COL_bound,DM_bound,COL_DM_log2fc,TF_gene_id,TF_gene_name,bound,TF_Family
<chr>,<dbl>,<fct>,<chr>,<chr>,<chr>,<chr>,<dbl>,<dbl>,<chr>,⋯,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>,<chr>,<chr>,<chr>
AT1G71390,3.127214,DM1/DM2d,LRR-RLPs,RLP11,RLP,chr1,26905363,26905383,AHBP-1B_AT5G06950,⋯,RLP11,0.33005,0.59534,0,1,-0.64216,AT5G06950,AHBP-1B,DM,bZIP
AT1G71390,3.127214,DM1/DM2d,LRR-RLPs,RLP11,RLP,chr1,26905292,26905309,AT5G01380_AT5G01380,⋯,RLP11,0.38530,0.74687,0,1,-0.75210,AT5G01380,AT5G01380,DM,Trihelix
AT1G71390,3.127214,DM1/DM2d,LRR-RLPs,RLP11,RLP,chr1,26905283,26905298,BIM2_AT1G69010,⋯,RLP11,0.38530,0.74687,0,1,-0.75210,AT1G69010,BIM2,DM,bHLH
AT1G71390,3.127214,DM1/DM2d,LRR-RLPs,RLP11,RLP,chr1,26905364,26905374,BZIP60_AT1G42990,⋯,RLP11,0.34255,0.61809,0,1,-0.64831,AT1G42990,BZIP60,DM,bZIP
AT1G71390,3.127214,DM1/DM2d,LRR-RLPs,RLP11,RLP,chr1,26905283,26905298,LRL2_AT4G30980,⋯,RLP11,0.38530,0.74687,0,1,-0.75210,AT4G30980,LRL2,DM,bHLH
AT1G71390,3.127214,DM1/DM2d,LRR-RLPs,RLP11,RLP,chr1,26905327,26905339,NAC012_AT1G32770,⋯,RLP11,0.39535,0.67907,0,1,-0.61060,AT1G32770,NAC012,DM,NAC
AT1G71390,3.127214,DM1/DM2d,LRR-RLPs,RLP11,RLP,chr1,26905363,26905377,OBF5_AT5G06960,⋯,RLP11,0.34255,0.61809,0,1,-0.64831,AT5G06960,OBF5,DM,bZIP
AT1G71390,3.127214,DM1/DM2d,LRR-RLPs,RLP11,RLP,chr1,26905311,26905322,SPL11_AT1G27360,⋯,RLP11,0.38020,0.74687,0,1,-0.76609,AT1G27360,SPL11,DM,SBP
AT1G71390,3.127214,DM1/DM2d,LRR-RLPs,RLP11,RLP,chr1,26905363,26905378,TGA1_AT5G65210,⋯,RLP11,0.34255,0.61809,0,1,-0.64831,AT5G65210,TGA1,DM,bZIP
AT1G71390,3.127214,DM1/DM2d,LRR-RLPs,RLP11,RLP,chr1,26905360,26905375,TGA10_AT5G06839,⋯,RLP11,0.35136,0.61809,0,1,-0.62238,AT5G06839,TGA10,DM,bZIP


In [84]:
## only DEG TF
# Attach each TF's own log2FC from the DM1/DM2d comparison (as TF_log2FC) and
# keep only sites whose TF has |TF_log2FC| > 2.
Res_TF_bound <- Res_TF_bound %>%
  left_join(
    Res_combined %>%
      filter(source == 'DM1/DM2d') %>%
      dplyr::select(AGI, TF_log2FC = log2FC),
    by = c('TF_gene_id' = 'AGI')
  ) %>%
  filter(abs(TF_log2FC) > 2)

In [85]:
Res_TF_bound

AGI,log2FC,source,Class,Symbol,type,TFBS_chr,TFBS_start,TFBS_end,TFBS_name,⋯,COL_score,DM_score,COL_bound,DM_bound,COL_DM_log2fc,TF_gene_id,TF_gene_name,bound,TF_Family,TF_log2FC
<chr>,<dbl>,<fct>,<chr>,<chr>,<chr>,<chr>,<dbl>,<dbl>,<chr>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>,<chr>,<chr>,<chr>,<dbl>
AT1G71390,3.127214,DM1/DM2d,LRR-RLPs,RLP11,RLP,chr1,26905292,26905309,AT5G01380_AT5G01380,⋯,0.38530,0.74687,0,1,-0.75210,AT5G01380,AT5G01380,DM,Trihelix,4.570795
AT1G71390,3.127214,DM1/DM2d,LRR-RLPs,RLP11,RLP,chr1,26905363,26905377,OBF5_AT5G06960,⋯,0.34255,0.61809,0,1,-0.64831,AT5G06960,OBF5,DM,bZIP,2.251105
AT1G71390,3.127214,DM1/DM2d,LRR-RLPs,RLP11,RLP,chr1,26905363,26905378,TGA1_AT5G65210,⋯,0.34255,0.61809,0,1,-0.64831,AT5G65210,TGA1,DM,bZIP,3.391054
AT1G71390,3.127214,DM1/DM2d,LRR-RLPs,RLP11,RLP,chr1,26905360,26905375,TGA10_AT5G06839,⋯,0.35136,0.61809,0,1,-0.62238,AT5G06839,TGA10,DM,bZIP,4.635365
AT1G71390,3.127214,DM1/DM2d,LRR-RLPs,RLP11,RLP,chr1,26905359,26905376,TGA7_AT1G77920,⋯,0.35136,0.61809,0,1,-0.62238,AT1G77920,TGA7,DM,bZIP,3.765220
AT1G71390,3.127214,DM1/DM2d,LRR-RLPs,RLP11,RLP,chr1,26905364,26905375,TGA9_AT1G08320,⋯,0.34255,0.61809,0,1,-0.64831,AT1G08320,TGA9,DM,bZIP,5.438124
AT1G17240,2.297504,DM1/DM2d,LRR-RLPs,RLP2,RLP,chr1,5897744,5897765,DREB2C_AT2G40340,⋯,0.39720,0.62866,0,1,-0.51437,AT2G40340,DREB2C,DM,ERF,2.703876
AT1G17240,2.297504,DM1/DM2d,LRR-RLPs,RLP2,RLP,chr1,5897736,5897764,ERF1_AT3G23240,⋯,0.39720,0.62866,0,1,-0.51437,AT3G23240,ERF1,DM,ERF,2.311994
AT1G17240,2.297504,DM1/DM2d,LRR-RLPs,RLP2,RLP,chr1,5897743,5897764,ERF6_AT4G17490,⋯,0.39720,0.62866,0,1,-0.51437,AT4G17490,ERF6,DM,ERF,-2.159299
AT1G17240,2.297504,DM1/DM2d,LRR-RLPs,RLP2,RLP,chr1,5897720,5897731,MYB52_AT1G17950,⋯,0.39720,0.61194,0,1,-0.48277,AT1G17950,MYB52,DM,MYB,2.908423


In [86]:
Res_TF_bound %>% filter(COL_bound == 1)

AGI,log2FC,source,Class,Symbol,type,TFBS_chr,TFBS_start,TFBS_end,TFBS_name,⋯,COL_score,DM_score,COL_bound,DM_bound,COL_DM_log2fc,TF_gene_id,TF_gene_name,bound,TF_Family,TF_log2FC
<chr>,<dbl>,<fct>,<chr>,<chr>,<chr>,<chr>,<dbl>,<dbl>,<chr>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>,<chr>,<chr>,<chr>,<dbl>
AT1G80080,-4.265392,DM1/DM2d,LRR-RLPs,TMM,RLP,chr1,30129694,30129705,AIL7_AT5G65510,⋯,0.71036,0.46817,1,0,0.48150,AT5G65510,AIL7,COL,AP2,4.178110
AT1G80080,-4.265392,DM1/DM2d,LRR-RLPs,TMM,RLP,chr1,30128814,30128829,AT1G01250_AT1G01250,⋯,0.62651,0.41600,1,0,0.46100,AT1G01250,AT1G01250,COL,ERF,-2.257223
AT1G80080,-4.265392,DM1/DM2d,LRR-RLPs,TMM,RLP,chr1,30129832,30129843,AT1G47655_AT1G47655,⋯,0.82511,0.58057,1,0,0.41998,AT1G47655,AT1G47655,COL,Dof,-3.741772
AT1G80080,-4.265392,DM1/DM2d,LRR-RLPs,TMM,RLP,chr1,30128814,30128828,AT1G77200_AT1G77200,⋯,0.62651,0.41600,1,0,0.46100,AT1G77200,AT1G77200,COL,ERF,-4.541328
AT1G80080,-4.265392,DM1/DM2d,LRR-RLPs,TMM,RLP,chr1,30128808,30128829,AT1G77640_AT1G77640,⋯,0.72065,0.53369,1,0,0.35189,AT1G77640,AT1G77640,COL,ERF,5.117836
AT1G80080,-4.265392,DM1/DM2d,LRR-RLPs,TMM,RLP,chr1,30128813,30128828,AT3G46070_AT3G46070,⋯,0.65349,0.44534,1,0,0.43693,AT3G46070,AT3G46070,COL,C2H2,7.540517
AT1G80080,-4.265392,DM1/DM2d,LRR-RLPs,TMM,RLP,chr1,30129673,30129694,AT4G33280_AT4G33280,⋯,0.72413,0.47231,1,0,0.49485,AT4G33280,AT4G33280,COL,B3,-2.714846
AT1G80080,-4.265392,DM1/DM2d,LRR-RLPs,TMM,RLP,chr1,30130072,30130089,AT5G01380_AT5G01380,⋯,0.77312,0.50397,1,0,0.50176,AT5G01380,AT5G01380,COL,Trihelix,4.570795
AT1G80080,-4.265392,DM1/DM2d,LRR-RLPs,TMM,RLP,chr1,30129819,30129848,ATDOF5.8_AT5G66940,⋯,0.83875,0.58057,1,0,0.44016,AT5G66940,ATDOF5.8,COL,Dof,-2.456498
AT1G80080,-4.265392,DM1/DM2d,LRR-RLPs,TMM,RLP,chr1,30129824,30129844,BBM_AT5G17430,⋯,0.82610,0.58057,1,0,0.42145,AT5G17430,BBM,COL,AP2,2.418350


In [87]:
Res_TF_bound %>% filter(DM_bound == 1)

AGI,log2FC,source,Class,Symbol,type,TFBS_chr,TFBS_start,TFBS_end,TFBS_name,⋯,COL_score,DM_score,COL_bound,DM_bound,COL_DM_log2fc,TF_gene_id,TF_gene_name,bound,TF_Family,TF_log2FC
<chr>,<dbl>,<fct>,<chr>,<chr>,<chr>,<chr>,<dbl>,<dbl>,<chr>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<chr>,<chr>,<chr>,<chr>,<dbl>
AT1G71390,3.127214,DM1/DM2d,LRR-RLPs,RLP11,RLP,chr1,26905292,26905309,AT5G01380_AT5G01380,⋯,0.38530,0.74687,0,1,-0.75210,AT5G01380,AT5G01380,DM,Trihelix,4.570795
AT1G71390,3.127214,DM1/DM2d,LRR-RLPs,RLP11,RLP,chr1,26905363,26905377,OBF5_AT5G06960,⋯,0.34255,0.61809,0,1,-0.64831,AT5G06960,OBF5,DM,bZIP,2.251105
AT1G71390,3.127214,DM1/DM2d,LRR-RLPs,RLP11,RLP,chr1,26905363,26905378,TGA1_AT5G65210,⋯,0.34255,0.61809,0,1,-0.64831,AT5G65210,TGA1,DM,bZIP,3.391054
AT1G71390,3.127214,DM1/DM2d,LRR-RLPs,RLP11,RLP,chr1,26905360,26905375,TGA10_AT5G06839,⋯,0.35136,0.61809,0,1,-0.62238,AT5G06839,TGA10,DM,bZIP,4.635365
AT1G71390,3.127214,DM1/DM2d,LRR-RLPs,RLP11,RLP,chr1,26905359,26905376,TGA7_AT1G77920,⋯,0.35136,0.61809,0,1,-0.62238,AT1G77920,TGA7,DM,bZIP,3.765220
AT1G71390,3.127214,DM1/DM2d,LRR-RLPs,RLP11,RLP,chr1,26905364,26905375,TGA9_AT1G08320,⋯,0.34255,0.61809,0,1,-0.64831,AT1G08320,TGA9,DM,bZIP,5.438124
AT1G17240,2.297504,DM1/DM2d,LRR-RLPs,RLP2,RLP,chr1,5897744,5897765,DREB2C_AT2G40340,⋯,0.39720,0.62866,0,1,-0.51437,AT2G40340,DREB2C,DM,ERF,2.703876
AT1G17240,2.297504,DM1/DM2d,LRR-RLPs,RLP2,RLP,chr1,5897736,5897764,ERF1_AT3G23240,⋯,0.39720,0.62866,0,1,-0.51437,AT3G23240,ERF1,DM,ERF,2.311994
AT1G17240,2.297504,DM1/DM2d,LRR-RLPs,RLP2,RLP,chr1,5897743,5897764,ERF6_AT4G17490,⋯,0.39720,0.62866,0,1,-0.51437,AT4G17490,ERF6,DM,ERF,-2.159299
AT1G17240,2.297504,DM1/DM2d,LRR-RLPs,RLP2,RLP,chr1,5897720,5897731,MYB52_AT1G17950,⋯,0.39720,0.61194,0,1,-0.48277,AT1G17950,MYB52,DM,MYB,2.908423


In [88]:
# Number of distinct TF_gene_name values and of distinct gene_name values.
n_distinct(pull(Res_TF_bound, TF_gene_name))
n_distinct(pull(Res_TF_bound, gene_name))

In [89]:
# Rows per TF name, most frequent first.
Res_TF_bound %>%
  group_by(TF_gene_name) %>%
  tally(sort = TRUE)

TF_gene_name,n
<chr>,<int>
BBM,52
ATDOF5.8,34
JKD,34
TZF9,34
CDF5,31
AT3G46070,29
DOF4.7,29
DREB26,25
DREB2C,25
PI,25


In [90]:
# Gene ids of TFs that occur in more than 15 rows, most frequent first.
TF_top <- Res_TF_bound %>%
  count(TF_gene_id, sort = TRUE) %>%
  filter(n > 15) %>%
  pull(TF_gene_id)

In [91]:
# TFs of interest: the literal list of gene ids below, extended with every id in TF_top (union() removes repeats).
TF_of_interest <- c("AT1G02230", "AT1G02250", "AT1G08320", "AT1G12260", "AT1G28470", "AT1G29160", "AT1G29860", "AT1G30650", "AT1G32870", "AT1G34180", "AT1G65910", "AT1G66600", "AT1G69570", "AT1G71930", "AT1G77920", "AT1G79180", "AT2G18060", "AT2G24260", "AT2G24430", "AT2G27300", "AT2G30250", "AT2G40740", "AT2G43010", "AT2G45420", "AT2G46400", "AT3G01970", "AT3G04060", "AT3G10500", "AT3G10800", "AT3G17730", "AT3G18400", "AT3G23250", "AT3G45610", "AT3G56400", "AT3G57600", "AT4G00050", "AT4G14770", "AT4G22070", "AT4G23550", "AT4G28500", "AT4G29230", "AT4G31800", "AT4G36160", "AT4G38000", "AT5G01380", "AT5G01900", "AT5G04390", "AT5G04410", "AT5G06960", "AT5G07100", "AT5G13180", "AT5G18270", "AT5G22570", "AT5G39610", "AT5G40330", "AT5G48560", "AT5G49330", "AT5G62380", "AT5G62940", "AT5G65210", "AT5G66300", "AT5G66940") %>% union(TF_top)

In [92]:
TF_of_interest

### filter by TF_of_interest

In [93]:
# Restrict the site table to TFs listed in TF_of_interest.
Res_TF_bound <- filter(Res_TF_bound, TF_gene_id %in% TF_of_interest)

In [94]:
# Edge table of the TF -> target network: one edge per row of Res_TF_bound
# (repeated TF/target pairs are not collapsed).
TF_interactions_edges <- Res_TF_bound %>%
  dplyr::select(TF_gene_id, AGI, bound) %>%
  dplyr::rename(source = TF_gene_id, target = AGI, interaction = bound)
TF_interactions_edges

source,target,interaction
<chr>,<chr>,<chr>
AT5G01380,AT1G71390,DM
AT5G06960,AT1G71390,DM
AT5G65210,AT1G71390,DM
AT1G77920,AT1G71390,DM
AT1G08320,AT1G71390,DM
AT2G40340,AT1G17240,DM
AT5G25390,AT1G17240,DM
AT3G46070,AT1G80080,COL
AT5G01380,AT1G80080,COL
AT5G66940,AT1G80080,COL


In [95]:
Res_TF_bound %>% str

'data.frame':	1201 obs. of  29 variables:
 $ AGI          : chr  "AT1G71390" "AT1G71390" "AT1G71390" "AT1G71390" ...
 $ log2FC       : num  3.13 3.13 3.13 3.13 3.13 ...
 $ source       : Factor w/ 12 levels "DM10/DM11","DM1/DM2d",..: 2 2 2 2 2 2 2 2 2 2 ...
 $ Class        : chr  "LRR-RLPs" "LRR-RLPs" "LRR-RLPs" "LRR-RLPs" ...
 $ Symbol       : chr  "RLP11" "RLP11" "RLP11" "RLP11" ...
 $ type         : chr  "RLP" "RLP" "RLP" "RLP" ...
 $ TFBS_chr     : chr  "chr1" "chr1" "chr1" "chr1" ...
 $ TFBS_start   : num  26905292 26905363 26905363 26905359 26905364 ...
 $ TFBS_end     : num  26905309 26905377 26905378 26905376 26905375 ...
 $ TFBS_name    : chr  "AT5G01380_AT5G01380" "OBF5_AT5G06960" "TGA1_AT5G65210" "TGA7_AT1G77920" ...
 $ TFBS_score   : num  8.07 7.82 8.06 8.17 8.35 ...
 $ TFBS_strand  : chr  "+" "+" "-" "-" ...
 $ peak_chr     : chr  "chr1" "chr1" "chr1" "chr1" ...
 $ peak_start   : num  26905191 26905191 26905191 26905191 26905191 ...
 $ peak_end     : num  26905806 26905806

In [96]:
# Node table for the target genes: one row per distinct combination of the
# selected columns, tagged with type2 = 'target'.
target_nodes <- Res_TF_bound %>%
  dplyr::select(AGI, Symbol, Class, log2FC, type) %>%
  distinct() %>%
  dplyr::rename(id = AGI, label = Symbol) %>%
  mutate(type2 = 'target')
target_nodes

id,label,Class,log2FC,type,type2
<chr>,<chr>,<chr>,<dbl>,<chr>,<chr>
AT1G71390,RLP11,LRR-RLPs,3.127214,RLP,target
AT1G17240,RLP2,LRR-RLPs,2.297504,RLP,target
AT1G80080,TMM,LRR-RLPs,-4.265392,RLP,target
AT1G12460,AT1G12460,LRR-RLKs,-3.619635,RLK,target
AT1G61300,AT1G61300,NBS-LRR,-2.360119,NLR,target
AT1G28440,HSL1,LRR-RLKs,-2.417076,RLK,target
AT1G31540,AT1G31540,TIR-NBS-LRR,3.184026,NLR,target
AT1G55610,BRL1,LRR-RLKs,5.013700,RLK,target
AT1G75820,CLV1,LRR-RLKs,-2.330373,RLK,target
AT1G74360,AT1G74360,LRR-RLKs,4.299102,RLK,target


In [97]:
# Node table for the TFs, using the same column names as target_nodes so the
# two tables can be stacked; both type and type2 are set to 'TF'.
TF_nodes <- Res_TF_bound %>%
  dplyr::select(TF_gene_id, TF_gene_name, TF_Family, TF_log2FC) %>%
  distinct() %>%
  filter(TF_gene_id %in% TF_of_interest) %>%
  dplyr::rename(
    id = TF_gene_id,
    label = TF_gene_name,
    Class = TF_Family,
    log2FC = TF_log2FC
  ) %>%
  mutate(type = 'TF', type2 = 'TF')
TF_nodes

id,label,Class,log2FC,type,type2
<chr>,<chr>,<chr>,<dbl>,<chr>,<chr>
AT5G01380,AT5G01380,Trihelix,4.570795,TF,TF
AT5G06960,OBF5,bZIP,2.251105,TF,TF
AT5G65210,TGA1,bZIP,3.391054,TF,TF
AT1G77920,TGA7,bZIP,3.765220,TF,TF
AT1G08320,TGA9,bZIP,5.438124,TF,TF
AT2G40340,DREB2C,ERF,2.703876,TF,TF
AT5G25390,SHN3,ERF,-8.001517,TF,TF
AT3G46070,AT3G46070,C2H2,7.540517,TF,TF
AT5G66940,ATDOF5.8,Dof,-2.456498,TF,TF
AT5G17430,BBM,AP2,2.418350,TF,TF


In [98]:
# Row-bind a list of data frames that may have different column sets.
#
# Args:
#   df_list: list of data frames; NULL entries are ignored.
#
# Returns:
#   One data frame holding the rows of all inputs. Its columns are the union
#   of the input columns in order of first appearance, and a column missing
#   from an input is filled with NA. Returns NULL when there is nothing to
#   combine.
combine_data_frames <- function(df_list) {
  # A NULL entry would otherwise become a named NA vector below and be bound
  # as a spurious all-NA row.
  df_list <- Filter(Negate(is.null), df_list)
  if (length(df_list) == 0L) {
    return(NULL)
  }

  # Union of column names across all data frames.
  all_col_names <- unique(unlist(lapply(df_list, names), use.names = FALSE))

  # Add the missing columns as NA and put every table in the same column
  # order so that rbind() lines the columns up.
  aligned <- lapply(df_list, function(df) {
    missing_cols <- setdiff(all_col_names, names(df))
    df[missing_cols] <- NA
    df[all_col_names]
  })

  do.call(rbind, aligned)
}

# Build the full node table by stacking the target-gene nodes and the TF nodes.
# NOTE(review): a gene present in both tables would appear twice; confirm ids are unique if the consumer requires it.
node_combined <- combine_data_frames(list(target_nodes, TF_nodes))
node_combined %>% nrow
node_combined

id,label,Class,log2FC,type,type2
<chr>,<chr>,<chr>,<dbl>,<chr>,<chr>
AT1G71390,RLP11,LRR-RLPs,3.127214,RLP,target
AT1G17240,RLP2,LRR-RLPs,2.297504,RLP,target
AT1G80080,TMM,LRR-RLPs,-4.265392,RLP,target
AT1G12460,AT1G12460,LRR-RLKs,-3.619635,RLK,target
AT1G61300,AT1G61300,NBS-LRR,-2.360119,NLR,target
AT1G28440,HSL1,LRR-RLKs,-2.417076,RLK,target
AT1G31540,AT1G31540,TIR-NBS-LRR,3.184026,NLR,target
AT1G55610,BRL1,LRR-RLKs,5.013700,RLK,target
AT1G75820,CLV1,LRR-RLKs,-2.330373,RLK,target
AT1G74360,AT1G74360,LRR-RLKs,4.299102,RLK,target


In [99]:
# Export the node and edge tables as CSV files.
# NOTE(review): with quote = FALSE a value containing a comma would break the
# column layout; confirm labels never contain commas.
write.csv(
  node_combined,
  file = 'TF_GO_module/TF_bound_nodes.csv',
  row.names = FALSE,
  quote = FALSE
)
write.csv(
  TF_interactions_edges,
  file = 'TF_GO_module/TF_bound_edges.csv',
  row.names = FALSE,
  quote = FALSE
)

In [103]:
node_combined %>% sample_n(30)
TF_interactions_edges %>% sample_n(10)

id,label,Class,log2FC,type,type2
<chr>,<chr>,<chr>,<dbl>,<chr>,<chr>
AT1G62630,AT1G62630,CC-NBS-LRR,-2.756869,NLR,target
AT5G04390,ZAT14L,C2H2,3.379464,TF,TF
AT3G11010,RLP34,LRR-RLPs,3.165699,RLP,target
AT3G23010,RLP36,LRR-RLPs,3.143319,RLP,target
AT3G51740,IMK2,LRR-RLKs,-3.229973,RLK,target
AT5G07100,WRKY26,WRKY,2.540367,TF,TF
AT3G49670,BAM2,LRR-RLKs,-5.296197,RLK,target
AT5G38350,AT5G38350,NBS-LRR,9.423456,NLR,target
AT5G04720,ADR1-L2,RPW8-NBS-LRR,2.204074,NLR,target
AT1G58190,RLP9,LRR-RLPs,3.474514,RLP,target


source,target,interaction
<chr>,<chr>,<chr>
AT3G10800,AT5G56040,COL
AT5G25390,AT1G48480,DM
AT2G46400,AT4G33300,DM
AT5G20240,AT3G46370,COL
AT1G30650,AT4G34220,COL
AT3G04060,AT1G48480,DM
AT1G69570,AT3G23120,DM
AT4G38000,AT3G02880,COL
AT3G08500,AT1G45616,DM
AT5G03150,AT3G05650,COL
