In [6]:
library(tidyverse)

── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.3     ✔ readr     2.1.4
✔ forcats   1.0.0     ✔ stringr   1.5.0
✔ ggplot2   3.4.4     ✔ tibble    3.2.1
✔ lubridate 1.9.3     ✔ tidyr     1.3.0
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors


In [4]:
# Load the scan results from a headerless, tab-separated file. Because there
# is no header row, read.csv() assigns the default column names V1, V2, ...
# `FALSE` is spelled out because `F` is an ordinary, reassignable variable.
tfbs <- read.csv(
  "TOBIAS/TFBScan_DEG/TFBScan_activators_or_repressors_merged.bed",
  sep = "\t",
  header = FALSE,
  stringsAsFactors = FALSE
)

In [10]:
# Preview every row whose V4 label contains "WRKY" anywhere in the string.
filter(tfbs, str_detect(V4, "WRKY"))

V1,V2,V3,V4,V5,V6
<chr>,<int>,<int>,<chr>,<dbl>,<chr>
chr1,849,868,WRKY31_AT4G22070,8.73609,+
chr1,856,868,WRKY45_AT3G01970,6.55903,-
chr1,857,868,WRKY29_AT4G23550,8.74031,+
chr1,857,868,WRKY46_AT2G46400,7.40225,+
chr1,857,868,WRKY71_AT1G29860,7.80105,+
chr1,857,870,WRKY14_AT1G30650,8.56409,+
chr1,857,870,WRKY26_AT5G07100,7.37960,+
chr1,857,870,WRKY70_AT3G56400,8.54019,-
chr1,858,870,WRKY55_AT2G40740,8.82754,-
chr1,859,867,WRKY62_AT5G01900,7.37274,+


In [13]:
library(dplyr)

# Merge overlapping intervals in a data frame of BED-like rows.
#
# Rows are sorted by start (V2) and swept left to right. A row joins the
# current cluster when its start is <= the furthest end (V3) seen so far, so
# an interval nested inside an earlier, longer interval can no longer cut a
# cluster short. Touching intervals (start == previous end) are merged too.
# Assumes each row has V3 >= V2.
#
# Args:
#   df: data frame with columns V2 (start), V3 (end), V5 (numeric score) and
#       V6 (label, e.g. strand). All other columns are taken from the
#       left-most row of each cluster.
#
# Returns:
#   A data frame with one row per cluster: V2 is the cluster start, V3 the
#   largest end, V5 the mean score and V6 the most frequent label (ties go to
#   whichever value table() lists first). An empty input is returned as an
#   empty data frame.
merge_rows <- function(df) {
  df <- df[order(df$V2), ]
  n <- nrow(df)
  if (n == 0L) {
    return(df)
  }
  stopifnot(!anyNA(df$V2), !anyNA(df$V3))

  # Furthest end among rows 1..k; a row opens a new cluster only when its
  # start lies beyond everything seen before it.
  furthest_end <- cummax(df$V3)
  starts_cluster <- c(TRUE, df$V2[-1L] > furthest_end[-n])
  cluster_id <- cumsum(starts_cluster)

  # Apply `f` to the values of `x` in each cluster, in cluster order.
  per_cluster <- function(x, f) {
    unlist(lapply(split(x, cluster_id), f), use.names = FALSE)
  }

  # Most frequent value; NA when the cluster has no non-missing label.
  majority_label <- function(values) {
    counts <- table(values)
    if (length(counts) == 0L) {
      return(NA_character_)
    }
    names(which.max(counts))
  }

  merged_df <- df[starts_cluster, ]
  merged_df$V3 <- per_cluster(df$V3, max)
  merged_df$V5 <- per_cluster(df$V5, mean)
  merged_df$V6 <- per_cluster(df$V6, majority_label)
  merged_df
}

# Collapse every hit whose V4 label starts with "WRKY" into merged "WRKYs"
# intervals, processing each V1 group separately.
merged_tfbs_WRKY <- tfbs %>%
  filter(stringr::str_detect(V4, "^WRKY")) %>%
  mutate(V4 = "WRKYs") %>%
  group_by(V1) %>%
  # reframe() replaces the superseded do() and returns an ungrouped table;
  # pick(everything()) hands merge_rows() the non-grouping columns of the
  # current group, and V1 is added back as the first column.
  reframe(merge_rows(pick(everything())))

merged_tfbs_WRKY


V1,V2,V3,V4,V5,V6
<chr>,<int>,<int>,<chr>,<dbl>,<chr>
chr1,849,870,WRKYs,7.992289,+
chr1,1465,1477,WRKYs,7.732020,-
chr1,1511,1531,WRKYs,8.443352,+
chr1,2115,2136,WRKYs,8.144580,+
chr1,2945,2953,WRKYs,8.715460,+
chr1,2975,2996,WRKYs,9.933377,+
chr1,3864,3885,WRKYs,7.827881,+
chr1,3995,4006,WRKYs,7.676670,+
chr1,4119,4130,WRKYs,8.380230,+
chr1,4693,4714,WRKYs,7.847552,+


In [12]:
library(dplyr)

# Collapse every hit whose V4 label starts with "NAC" into merged "NACs"
# intervals, processing each V1 group separately.
# Note: the anchored pattern "^NAC" does not match labels that start with
# "ANAC".
merged_tfbs_nac <- tfbs %>%
  filter(stringr::str_detect(V4, "^NAC")) %>%
  mutate(V4 = "NACs") %>%
  group_by(V1) %>%
  # reframe() replaces the superseded do() and returns an ungrouped table;
  # pick(everything()) hands merge_rows() the non-grouping columns of the
  # current group, and V1 is added back as the first column.
  reframe(merge_rows(pick(everything())))

merged_tfbs_nac


V1,V2,V3,V4,V5,V6
<chr>,<int>,<int>,<chr>,<dbl>,<chr>
chr1,665,682,NACs,7.250465,-
chr1,1025,1039,NACs,6.547470,-
chr1,1326,1347,NACs,5.832590,-
chr1,1497,1514,NACs,8.076907,-
chr1,2099,2116,NACs,7.920475,-
chr1,2449,2461,NACs,8.085810,-
chr1,2801,2813,NACs,7.675930,-
chr1,3085,3097,NACs,9.152340,-
chr1,3183,3202,NACs,8.797185,+
chr1,3783,3799,NACs,7.086300,-


In [18]:
# Number of rows in the raw table.
nrow(tfbs)

In [22]:
library(dplyr)

# Merge overlapping intervals in a data frame of BED-like rows.
#
# Rows are sorted by start (V2) and swept left to right. A row joins the
# current cluster when its start is <= the furthest end (V3) seen so far, so
# an interval nested inside an earlier, longer interval can no longer cut a
# cluster short. Touching intervals (start == previous end) are merged too.
# Assumes each row has V3 >= V2.
#
# Args:
#   df: data frame with columns V2 (start), V3 (end), V5 (numeric score) and
#       V6 (label, e.g. strand). All other columns are taken from the
#       left-most row of each cluster.
#
# Returns:
#   A data frame with one row per cluster: V2 is the cluster start, V3 the
#   largest end, V5 the mean score and V6 the most frequent label (ties go to
#   whichever value table() lists first). An empty input is returned as an
#   empty data frame.
merge_rows <- function(df) {
  df <- df[order(df$V2), ]
  n <- nrow(df)
  if (n == 0L) {
    return(df)
  }
  stopifnot(!anyNA(df$V2), !anyNA(df$V3))

  # Furthest end among rows 1..k; a row opens a new cluster only when its
  # start lies beyond everything seen before it.
  furthest_end <- cummax(df$V3)
  starts_cluster <- c(TRUE, df$V2[-1L] > furthest_end[-n])
  cluster_id <- cumsum(starts_cluster)

  # Apply `f` to the values of `x` in each cluster, in cluster order.
  per_cluster <- function(x, f) {
    unlist(lapply(split(x, cluster_id), f), use.names = FALSE)
  }

  # Most frequent value; NA when the cluster has no non-missing label.
  majority_label <- function(values) {
    counts <- table(values)
    if (length(counts) == 0L) {
      return(NA_character_)
    }
    names(which.max(counts))
  }

  merged_df <- df[starts_cluster, ]
  merged_df$V3 <- per_cluster(df$V3, max)
  merged_df$V5 <- per_cluster(df$V5, mean)
  merged_df$V6 <- per_cluster(df$V6, majority_label)
  merged_df
}

# Collapse every hit whose V4 label starts with "NAC" or "ANAC" into merged
# "NACs" intervals, processing each V1 group separately.
merged_tfbs_nac <- tfbs %>%
  filter(stringr::str_detect(V4, "^(NAC|ANAC)")) %>%
  mutate(V4 = "NACs") %>%
  group_by(V1) %>%
  # reframe() replaces the superseded do() and returns an ungrouped table;
  # pick(everything()) hands merge_rows() the non-grouping columns of the
  # current group, and V1 is added back as the first column.
  reframe(merge_rows(pick(everything())))

# Rebuild the merged WRKY intervals in this cell so that the combination
# step below does not depend on an earlier cell having been run.
merged_tfbs_WRKY <- tfbs %>%
  filter(stringr::str_detect(V4, "^WRKY")) %>%
  mutate(V4 = "WRKYs") %>%
  group_by(V1) %>%
  # reframe() replaces the superseded do() and returns an ungrouped table;
  # pick(everything()) hands merge_rows() the non-grouping columns of the
  # current group, and V1 is added back as the first column.
  reframe(merge_rows(pick(everything())))

# Final table: all rows of `tfbs` whose V4 label does not start with WRKY,
# NAC or ANAC, followed by the merged NAC rows and then the merged WRKY rows.
tfbs_WRKYs_NACs <- rbind(
  filter(tfbs, !stringr::str_detect(V4, "^(WRKY|NAC|ANAC)")),
  merged_tfbs_nac,
  merged_tfbs_WRKY
)

tfbs_WRKYs_NACs


V1,V2,V3,V4,V5,V6
<chr>,<int>,<int>,<chr>,<dbl>,<chr>
chr1,107,126,LBD18_AT2G45420,8.65726,+
chr1,248,259,ZAT14L_AT5G04390,7.86147,-
chr1,434,462,DOF4.7_AT4G38000,7.91403,+
chr1,732,760,DOF4.7_AT4G38000,7.18249,+
chr1,1058,1079,MYB15_AT3G23250,10.31562,+
chr1,1066,1077,AT3G57600_AT3G57600,6.64451,+
chr1,1076,1097,MYB63_AT1G79180,11.12822,+
chr1,1081,1102,MYB15_AT3G23250,7.98873,+
chr1,1085,1096,AT3G57600_AT3G57600,6.64451,+
chr1,1154,1182,DOF4.7_AT4G38000,14.18404,+


In [27]:
# Export the combined table, sorted by V1 and then V2, as a tab-separated
# file without header, row names or quoting.
tfbs_WRKYs_NACs %>%
  arrange(V1, V2) %>%
  write.table(
    file = "TOBIAS/TFBScan_DEG/TFBScan_activators_or_repressors_WRKYs_NACs.bed",
    quote = FALSE,
    sep = "\t",
    row.names = FALSE,
    col.names = FALSE
  )


In [31]:
# Export a second copy of the sorted table in which a trailing
# "_AT<digit>G<5 digits>" suffix (e.g. "_AT4G22070") is removed from V4.
tfbs_WRKYs_NACs %>%
  arrange(V1, V2) %>%
  mutate(V4 = str_remove_all(V4, '_AT\\dG\\d{5}$')) %>%
  write.table(
    'TOBIAS/TFBScan_DEG/TFBScan_activators_or_repressors_WRKYs_NACs_cleaned.bed',
    quote = FALSE,
    sep = "\t",
    row.names = FALSE,
    col.names = FALSE
  )