Test correlations between the physical size of each linkage group and (a) its sex-averaged map length and (b) its recombination rate (run in R 4.2.1)

In [None]:
library(statsExpressions)

# Nonparametric correlation tests on `data`, which must already be in the
# session and provide the columns `size`, `avgMap` and `avgRate`.
# NOTE(review): presumably one row per linkage group -- confirm upstream.

# physical size vs. sex-averaged map length
corr_test(
  data = data,
  x    = size,
  y    = avgMap,
  type = "nonparametric"
)

# physical size vs. recombination rate
corr_test(
  data = data,
  x    = size,
  y    = avgRate,
  type = "nonparametric"
)

Define non-overlapping intervals of 1 Mbp along each chromosome and extract the physical positions of the first and last SNP in each interval (run in R 4.2.1)

In [None]:
# Linkage groups to process: 1-32 with groups 16, 25, 29 and 31 removed.
ilist <- 1:32
ilist2 <- ilist[-c(16, 25, 29, 31)]
# Chromosome labels, paired element-wise with ilist2 in the mapply() call
# below: 1-24 without 16, followed by "26", "27", "28", "1A" and "4A".
jlist <- 1:24
jlist1 <- jlist[-16]
jlist2 <- append(jlist1, c("26", "27", "28", "1A", "4A"))

# Change into the data directory once, before iterating. Calling setwd()
# inside the loop fails from the second iteration on for any relative path.
setwd("path/")

# For every linkage group i this builds and stores in the global environment:
#   dataChr<i> : one row per marker (marker, male, female, avg, phy,
#                markerOrder)
#   freqChr<i> : one row per non-empty unit-width physical bin, holding the
#                number of SNPs, the bin limits, the index of the first/last
#                marker in the bin, the male/female/avg values at those
#                markers with their differences, and the markers' actual
#                physical positions
# Requires `lepPos2` (columns `marker`, `pos`) to exist in the session.
# The chromosome label j is passed alongside i but is not used in the body;
# all outputs are named and labelled by linkage group i.
# The call is wrapped in invisible() because only the assign() side effects
# are wanted; otherwise every table would be auto-printed.
invisible(mapply(function(i, j) {
  # import physical map: exactly one file must match the pattern
  map_file <- Sys.glob(paste0("order-physical-", i, "-*.txt"))
  if (length(map_file) != 1L) {
    stop("Expected exactly one 'order-physical-", i, "-*.txt' file, found ",
         length(map_file), call. = FALSE)
  }
  data <- read.delim(map_file, header = FALSE)

  # drop the first three rows and keep the first four columns
  data2 <- data[-(1:3), 1:4]
  colnames(data2) <- c("marker", "male", "female", "avg")
  data2$male <- as.numeric(data2$male)
  data2$female <- as.numeric(data2$female)
  data2$avg <- as.numeric(data2$avg)
  rownames(data2) <- seq_len(nrow(data2))

  # get physical position
  data2$phy <- lepPos2$pos[match(data2$marker, lepPos2$marker)]
  if (anyNA(data2$phy)) {
    stop("Linkage group ", i, ": ", sum(is.na(data2$phy)),
         " marker(s) have no physical position in lepPos2", call. = FALSE)
  }
  data2$markerOrder <- seq_len(nrow(data2))
  assign(paste0("dataChr", i), data2, envir = .GlobalEnv)

  # count SNPs per unit-width bin (0,1], (1,2], ...
  # NOTE(review): assumes lepPos2$pos is expressed in Mbp so that each bin
  # spans 1 Mbp -- confirm.
  value <- ceiling(max(data2$phy))
  freq <- as.data.frame(table(cut(data2$phy, seq(0, value, 1))))
  # turn the "(a,b]" labels into "a,b" and split them into two columns
  freq$Var1 <- gsub("^\\(|\\]$", "", as.character(freq$Var1))
  freq2 <- splitstackshape::cSplit(freq, "Var1", ",")
  colnames(freq2) <- c("numSNP", "phyStart", "phyEnd")

  # Index of the first and last marker in every bin from the running total
  # of SNPs per bin (replaces the row-by-row rbind() accumulation).
  # NOTE(review): this indexing assumes the rows of data2 are sorted by
  # ascending physical position -- confirm.
  marker_end <- cumsum(as.numeric(freq2$numSNP))
  marker_start <- c(0, head(marker_end, -1)) + 1
  freq3 <- cbind(freq2, markerStart = marker_start, markerEnd = marker_end)
  freq3 <- freq3[freq3$numSNP > 0, ]

  # male / female / averaged values at the bin's first and last marker and
  # the difference between them
  freq3$maleStart <- data2$male[freq3$markerStart]
  freq3$maleEnd <- data2$male[freq3$markerEnd]
  freq3$maleInt <- freq3$maleEnd - freq3$maleStart
  freq3$femaleStart <- data2$female[freq3$markerStart]
  freq3$femaleEnd <- data2$female[freq3$markerEnd]
  freq3$femaleInt <- freq3$femaleEnd - freq3$femaleStart
  freq3$avgStart <- data2$avg[freq3$markerStart]
  freq3$avgEnd <- data2$avg[freq3$markerEnd]
  freq3$avgInt <- freq3$avgEnd - freq3$avgStart

  # physical positions of the first and last SNP actually present in the bin
  freq3$phyStartActual <- data2$phy[freq3$markerStart]
  freq3$phyEndActual <- data2$phy[freq3$markerEnd]
  freq3$chr <- i
  assign(paste0("freqChr", i), freq3, envir = .GlobalEnv)
},
ilist2, jlist2))


Extract the list of genes within each 1 Mbp interval and count them (run in R 4.2.1)

In [None]:
library(data.table)

# import list of gene annotations (no header row); columns 2, 3, 4, 1 of the
# file become chr, start, end, geneid
ann7 <- read.delim("annotations.txt", header = FALSE)
ann8 <- ann7[, c(2, 3, 4, 1)]
colnames(ann8) <- c("chr", "start", "end", "geneid")

# The interval tables are not created here and must already be in the
# session:
#   intAll2 : intervals with columns "chr", "start", "end"
#   intAll  : intervals with (at least) columns "chr" and "phyStart"
# Fail early with a clear message instead of merely printing intAll2.
stopifnot(
  is.data.frame(intAll2),
  all(c("chr", "start", "end") %in% names(intAll2)),
  is.data.frame(intAll),
  all(c("chr", "phyStart") %in% names(intAll))
)

## get genes WITHIN notable intervals
intAll2 <- as.data.table(intAll2)
ann8 <- as.data.table(ann8)
setkey(intAll2, chr, start, end)
# type = "within": keep a gene only if it lies entirely inside an interval;
# nomatch = NULL drops genes that fall in no interval
overlap <- foverlaps(ann8, intAll2, type = "within", nomatch = NULL)
#overlap <- overlap[,-7]

# count number of distinct genes per interval, grouping directly on the
# interval columns (in the foverlaps() result chr/start/end are the interval
# coordinates). This avoids pasting the coordinates into a string and
# parsing them back.
overlapCount <- overlap[, .(numGenes = uniqueN(geneid)),
                        by = .(chr, start, end)]
overlapCount <- as.data.frame(overlapCount)
overlapCount2 <- overlapCount[, c("chr", "start", "end", "numGenes")]
overlapCount2$start <- as.numeric(overlapCount2$start)
overlapCount2$end <- as.numeric(overlapCount2$end)
# chr becomes numeric only when every label converts cleanly; labels such as
# "1A" are kept as they are instead of being turned into NA
chr_numeric <- suppressWarnings(as.numeric(as.character(overlapCount2$chr)))
if (!anyNA(chr_numeric)) {
  overlapCount2$chr <- chr_numeric
}

# Attach the counts to intAll by (chr, interval start); intervals without any
# gene get NA. Both start columns are coerced to numeric before the keys are
# built so that integer and double positions produce identical key strings
# (e.g. 100000L prints as "100000" but 100000 prints as "1e+05").
intAll$numGenes <- overlapCount2$numGenes[
  match(
    paste0(intAll$chr, ",", as.numeric(intAll$phyStart)),
    paste0(overlapCount2$chr, ",", as.numeric(overlapCount2$start))
  )
]
