Format a PLINK pedigree file (.tfam) for input into Lep-MAP3 (run with R 4.2.1)


In [None]:
#### load packages ####
library(plyr)
library(xlsx)
library(stringr)
library(dplyr)
library(tidyr)

#### import plink file ####
# The .tfam is read as a space-separated table without a header line; its six
# columns are named here.
tfamColumns <- c("family", "individual", "dad", "mom", "sex", "phenotype")
data <- read.delim("plink.tfam", sep = " ", header = FALSE)
names(data) <- tfamColumns

# standardise missing data: every cell equal to '-9' becomes '0'
isMinusNine <- data == "-9"
data[isMinusNine] <- "0"

#### subset data of individuals with both parents missing ####

# get individuals with both parents missing
orphans <- data %>%
  filter(dad == 0 & mom == 0)

## Each orphan becomes its own one-offspring "family" row, instead of all
## orphans being merged into one family because their parents look identical
## (both missing).
# Parent key in the same "dad , mom" format used for the rest of the data.
# Guarded because paste() on zero-length input returns " , " (length 1),
# which cannot be assigned to a zero-row data frame.
orphans$combineParents <- if (nrow(orphans) > 0) {
  paste(orphans$dad, ",", orphans$mom)
} else {
  character(0)
}
orphans1 <- orphans[, c(7, 1:6)] # parent key first, then the six tfam columns

# One row per orphan. numOffsprings is set to 1 explicitly: the previous
# version reused the plink `family` column here, which is only correct when
# every family ID happens to be 1.
orphans2 <- data.frame(
  `dad,mom` = orphans1$combineParents,
  numOffsprings = rep(1L, nrow(orphans1)),
  offspringList = orphans1$individual,
  check.names = FALSE, # keep the comma in the "dad,mom" column name
  stringsAsFactors = FALSE
)

#### get information about plink file ####
## how often each ID occurs across the individual, dad and mom columns
nonuniqueIndiv <- cbind(data$individual, data$dad, data$mom)
idTally <- table(as.vector(nonuniqueIndiv))
uniqueIndivCounts <- setNames(
  as.data.frame(idTally),
  c("individual", "frequency")
)

## distinct (dad, mom) combinations
parentPairs <- unique(cbind(data$dad, data$mom))
uniqueParents <- setNames(as.data.frame(parentPairs), c("dad", "mom"))

# single-column parent key of the form "dad , mom"
data$combineParents <- paste(data$dad, data$mom, sep = " , ")

## per parent key: number of distinct offspring and their comma-joined IDs
familyInfo <- ddply(
  data, .(combineParents), summarise,
  length(unique(individual)),
  paste(unique(individual), collapse = ",")
)
names(familyInfo) <- c("dad,mom", "numOffsprings", "offspringList")

#### filtering steps ####
# remove the pooled group of offsprings with both parents unknown
# (parent key "0 , 0"); those individuals are re-added one per row below
familyInfo <- familyInfo[familyInfo$`dad,mom` != "0 , 0", ]
if (nrow(familyInfo) == 0) {
  # fail with a clear message instead of max() returning -Inf further down
  stop("no individual has a known parent; cannot build families",
       call. = FALSE)
}

# largest family size = number of offspring columns needed.
# The variable name `max` is kept from the original script; calls to max()
# still resolve to the base function.
max <- max(familyInfo$numOffsprings)

# append rows for orphan individuals
familyInfo2 <- rbind(familyInfo, orphans2)

# re-order columns: parent key, offspring list, offspring count
familyInfo3 <- familyInfo2[, c(1, 3, 2)]
# split dad mom ID into 2 columns
dadMom <- as.data.frame(str_split_fixed(familyInfo3$`dad,mom`, " , ", 2))
colnames(dadMom) <- c("dad", "mom")
# split offspring list into separate columns (padded with "" up to `max`)
offsprings <- as.data.frame(
  str_split_fixed(familyInfo3$offspringList, ",", max)
)
offspringsNames <- str_c("offspring", seq_len(max))
colnames(offsprings) <- offspringsNames
# create new dataframe; the count column is named directly rather than being
# renamed afterwards via its deparsed expression
familyInfo4 <- cbind(
  dadMom,
  offsprings,
  numOffsprings = familyInfo3$numOffsprings
)
# number of families with >1 offspring (printed explicitly so it is also
# shown when the script is run through source())
print(sum(familyInfo4$numOffsprings > 1))

# add unique family ID for each parent pair
familyInfo4$familyID <- seq_len(nrow(familyInfo4))

## for each family's dad and mom, get "grandparents" information

# Look up the recorded dad and mom of each ID in `parentIds`, considering only
# rows of `data` whose sex code equals `sexCode`.
# Returns a data frame with one row per element of `parentIds`, in the same
# order; IDs without a matching row give NA. If an ID occurs more than once in
# `data`, the first occurrence is used, so the row count always matches the
# input (the earlier merge()-based version produced extra rows in that case,
# which broke the later cbind()).
lookupGrandparents <- function(parentIds, sexCode) {
  # which() drops rows with missing sex instead of producing all-NA rows
  candidates <- data[which(data$sex == sexCode),
                     c("individual", "dad", "mom")]
  hit <- match(parentIds, candidates$individual)
  data.frame(
    granddad = candidates$dad[hit],
    grandmom = candidates$mom[hit],
    stringsAsFactors = FALSE
  )
}

# dads are looked up among individuals with sex "1", moms among sex "2"
paternal <- lookupGrandparents(familyInfo4$dad, "1")
maternal <- lookupGrandparents(familyInfo4$mom, "2")

## merge grandparent information into familyInfo
familyInfo6 <- familyInfo4
familyInfo6$granddad1 <- paternal$granddad
familyInfo6$grandmom1 <- paternal$grandmom
familyInfo6$granddad2 <- maternal$granddad
familyInfo6$grandmom2 <- maternal$grandmom

# replace missing grandparent info with 0
familyInfo6[is.na(familyInfo6)] <- 0

# output data summary: two sheets in one workbook, the second call appends
summaryFile <- "snps-summary.xlsx"
write.xlsx(
  familyInfo6,
  file = summaryFile,
  sheetName = "family information",
  row.names = FALSE
)
write.xlsx(
  uniqueIndivCounts,
  file = summaryFile,
  sheetName = "unique individual counts",
  row.names = FALSE,
  append = TRUE
)

#### subset familyInfo for Lep-MAP3 ####
# TRUE where an ID names a real individual, i.e. it is not the missing code
# "0", not empty and not NA. IDs are compared as text: `id > 0` on character
# IDs is a locale-dependent string comparison and can misclassify IDs.
isKnownId <- function(id) {
  !(as.character(id) %in% c("0", "", NA))
}

# keep families with all parents known
subset1 <- filter(familyInfo6, isKnownId(dad), isKnownId(mom))

# keep families with 2 or more offsprings
subset2 <- filter(subset1, numOffsprings > 1)

# rescue families with both parents known but with 1 offspring, if all
# grandparents are known
rescue1 <- filter(subset1, numOffsprings == 1)
rescue2 <- filter(
  rescue1,
  isKnownId(granddad1), isKnownId(grandmom1),
  isKnownId(granddad2), isKnownId(grandmom2)
)

# families with both parents known and at least 2 children; 1 child is
# allowed only if all grandparents are known
subset3 <- rbind(subset2, rescue2)
# restore the original family order, then renumber the families 1..n
subset4 <- subset3[order(subset3$familyID), ]
subset4$familyID <- seq_len(nrow(subset4)) # safe when no family is left

# remove unnecessary columns
drops2 <- c("numOffsprings")
subset5 <- subset4[, !(names(subset4) %in% drops2)]

# re-order columns: parents and grandparents first, everything else after
input <- subset5 %>%
  select(dad, mom, granddad1, grandmom1, granddad2, grandmom2, everything())
matrixNcol <- ncol(input)

#### reformat familyInfo file for input into Lep-MAP3 ####
# For every family (row) in `input`, build a 6-row block with one column per
# family member and the rows: family ID, individual, dad, mom, sex, phenotype.
# All blocks are bound side by side into `allFam`.
#
# The member columns are every column of `input` except `familyID`, so any
# number of offspring columns is handled (the previous version hard-coded
# columns 7-14, i.e. exactly 8 offspring columns).

if (nrow(input) == 0) {
  stop("no families left after filtering; nothing to format for Lep-MAP3",
       call. = FALSE)
}

individualCols <- setdiff(colnames(input), "familyID")
grandparentCols <- c("granddad1", "grandmom1", "granddad2", "grandmom2")
# sex codes fixed by role instead of being looked up in `data`
ancestorSex <- c(
  dad = "1", mom = "2",
  granddad1 = "1", grandmom1 = "2",
  granddad2 = "1", grandmom2 = "2"
)

# Build the character matrix (6 rows x members) for family number `row`.
buildFamilyBlock <- function(row) {
  ids <- vapply(input[row, individualCols], as.character, character(1))
  pedRow <- match(ids, data$individual) # NA when the ID has no row in `data`

  # parents as recorded in `data`; grandparents are written with unknown
  # parents ("0")
  dadIds <- as.character(data$dad[pedRow])
  momIds <- as.character(data$mom[pedRow])
  isGrandparent <- individualCols %in% grandparentCols
  dadIds[isGrandparent] <- "0"
  momIds[isGrandparent] <- "0"

  # sex: fixed for parents/grandparents, looked up for offspring
  sexCodes <- as.character(data$sex[pedRow])
  isAncestor <- individualCols %in% names(ancestorSex)
  sexCodes[isAncestor] <- unname(ancestorSex[individualCols[isAncestor]])

  block <- rbind(
    rep(as.character(input$familyID[row]), length(ids)), # family
    unname(ids),                                         # individual
    dadIds,                                              # dad
    momIds,                                              # mom
    sexCodes,                                            # sex
    rep("0", length(ids))                                # phenotype
  )
  dimnames(block) <- list(NULL, individualCols)

  # drop the padding columns of families with fewer offspring than the maximum
  block[, !(ids %in% ""), drop = FALSE]
}

familyBlocks <- lapply(seq_len(nrow(input)), buildFamilyBlock)

# The first column is an all-NA placeholder, kept because the following step
# removes the first column of `allFam`.
allFam <- cbind(
  data.frame(matrix(NA, nrow = 6, ncol = 1)),
  as.data.frame(do.call(cbind, familyBlocks), stringsAsFactors = FALSE)
)

# drop the first column of allFam, then standardise missing values
allFam2 <- allFam[-1]
allFam2[allFam2 == "-9"] <- "0"
allFam2[is.na(allFam2)] <- 0

# remove columns whose individual name (row 2) is missing, i.e. "0";
# this removes missing grandparents
hasName <- allFam2[2, ] != "0"
allFam3 <- allFam2[, hasName]

# add the "1_" prefix to the names in rows 2 to 4 (individual, dad, mom) ...
for (idRow in 2:4) {
  allFam3[idRow, ] <- paste0("1_", allFam3[idRow, ])
}
# ... and turn prefixed missing values back into plain "0"
allFam3[allFam3 == "1_0"] <- "0"

# first 2 columns 'CHR' and 'POS', each filled with its own name on all 6 rows
addChrPos <- data.frame(
  CHR = rep("CHR", 6),
  POS = rep("POS", 6),
  stringsAsFactors = FALSE
)

# final dataset
pedigree <- cbind(addChrPos, allFam3)
pedigreeRows <- c("family", "individual", "dad", "mom", "sex", "phenotype")
rownames(pedigree) <- pedigreeRows

# write .txt file without headers
write.table(
  pedigree,
  file = "pedigree-subset.txt",
  sep = "\t",
  row.names = FALSE,
  col.names = FALSE,
  quote = FALSE
)
# write .csv file with headers
write.csv(pedigree, file = "pedigree-subset.csv")
