-
Notifications
You must be signed in to change notification settings - Fork 0
/
convertDCas20_5360_ToVCF.Rmd
140 lines (118 loc) · 5.49 KB
/
convertDCas20_5360_ToVCF.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
---
title: "Convert DCas20-5360 to VCF"
site: workflowr::wflow_site
author: "Marnin Wolfe"
date: "2020-August-26"
output:
workflowr::wflow_html:
editor_options:
chunk_output_type: inline
---
# Input Parameters
```{r, eval=F}
#' @dartvcfInput input name and path of "vcf" file from DArT
#' @dartcountsInput input name and path of counts file from DArT
#' @outName output path and name
#' @nskipvcf number of "VCF" rows to skip on read-in
#' @nskipcounts number of "counts file" rows to skip on read in
#' @ncores number of cores to use, could be VERY memory intensive
#' @dartVars chr vector, column names that _are not_ sample IDs in the read-counts file. I use this internally to assign the sampleIDs in the VCF file
library(tidyverse); library(magrittr)
dartvcfInput<-here::here("data/Report-DCas20-5360","Report_DCas20-5360_SNP_VCF_Version_6.csv")
dartcountsInput<-here::here("data/Report-DCas20-5360","Report_DCas20-5360_SNP_Counts_Version_6_final.csv")
outName<-here::here("data/Report-DCas20-5360","DCas20_5360")
# From report to report, there has been minor (NOT SO MINOR?) variation
# in format among files, requiring user input
# rather frustrating
# can't pipeline their stuff
nskipvcf<-2
nskipcounts<-3
ncores<-75
dartVars<-read.csv(dartcountsInput,
stringsAsFactors = F,
header = T,
skip=nskipcounts,
nrows = 1) %>% colnames(.) %>% .[1:26]
```
# Prelim. check format
Start manual. Check that the files read in according to previous code. Adjust code if necessary. Make a function and apply it to the input files.
```{r, eval=F}
vcf<-read.table(dartvcfInput,
stringsAsFactors = F,skip = nskipvcf, header = T, sep = "\t", comment.char = "")
readCounts<-read.csv(dartcountsInput, stringsAsFactors = F,header = T,skip=nskipcounts)
# dim(vcf)
# [1] 13603 3749
# dim(readCounts)
# [1] 27206 4105
# colnames(readCounts)[1:100]
# [1] "AlleleID" "CloneID"
# [3] "AlleleSequence" "TrimmedSequence"
# [5] "TrimmedSequence_plus_Strand" "Chrom_Cassava_v61"
# [7] "ChromPos_Cassava_v61" "SNP_ChromPos_Cassava_v61"
# [9] "AlnCnt_Cassava_v61" "AlnEvalue_Cassava_v61"
# [11] "Strand_Cassava_v61" "SeqDiff_Cassava_v61"
# [13] "SNP" "SnpPosition.on.Tag"
# [15] "CallRate" "OneRatioRef"
# [17] "OneRatioSnp" "FreqHomRef"
# [19] "FreqHomSnp" "FreqHets"
# [21] "PICRef" "PICSnp"
# [23] "AvgPIC" "AvgCountRef"
# [25] "AvgCountSnp" "RepAvg"
# [27] "X9624.09" "BGM.0004"
# [29] "BGM.0005" "BGM.0006"
# [31] "BGM.0014" "BGM.0018"
# [33] "BGM.0019" "BGM.0020"
# [35] "BGM.0021" "BGM.0022"
# [37] "BGM.0023" "BGM.0024"
# colnames(vcf)[1:30]
# [1] "X.CHROM" "POS" "ID"
# [4] "REF" "ALT" "QUAL"
# [7] "FILTER" "INFO" "FORMAT"
# [10] "X911118200001_A_1" "X911118200001_A_10" "X911118200001_A_11"
# [13] "X911118200001_A_12" "X911118200001_A_2" "X911118200001_A_3"
# [16] "X911118200001_A_4" "X911118200001_A_5" "X911118200001_A_6"
# [19] "X911118200001_A_7" "X911118200001_A_8" "X911118200001_A_9"
# [22] "X911118200001_B_1" "X911118200001_B_10" "X911118200001_B_11"
# [25] "X911118200001_B_12" "X911118200001_B_2" "X911118200001_B_3"
# [28] "X911118200001_B_4" "X911118200001_B_5" "X911118200001_B_6"
colnames(vcf)[10:length(colnames(vcf))]<-colnames(readCounts)[length(dartVars):length(colnames(readCounts))]
vcf %<>%
mutate(X.CHROM=gsub("Chromosome","",X.CHROM)) %>%
rename(Pos=POS) %>%
mutate(Chr=as.numeric(gsub("Chromosome","",X.CHROM)),
Pos=as.numeric(Pos))
# dim(readCounts)
# readCounts %>% .[,1:30] %>% str
# readCounts[1:5,1:30]
# readCounts[1:10,] %>% select(SNP,CloneID,AlleleID,Chrom_Cassava_v61,SNP_ChromPos_Cassava_v61)
# vcf[1:10,1:5]
# Note: The ID column in "vcf" is the value for the ALT allele in AlleleID of "readCounts"
readCounts %<>%
mutate(RefAlt=ifelse(SNP=="","REF","ALT"),
Chr=as.numeric(gsub("Chromosome","",Chrom_Cassava_v61)),
Pos=as.numeric(SNP_ChromPos_Cassava_v61))
```
# Conversion function
Printed here, but available and sourced from `code/` subdirectory.
```{r, code = readLines(here::here("code/","convertDart2vcf.R"))}
```
# Run conversion function
```{r, eval=F}
source(here::here("code/","convertDart2vcf.R"))
convertDart2vcf(dartvcfInput,dartcountsInput,outName,
nskipvcf=2,nskipcounts=3,ncores)
```
# Genomewide to per-chrom VCFs
Split the genome-wide VCF into per-chromosome VCFs for imputation.
```{r}
require(furrr); options(mc.cores=18); plan(multiprocess)
source(here::here("code","imputationFunctions.R"))
vcfIn<-here::here("data/Report-DCas20-5360","DCas20_5360.vcf.gz")
filters<-"--minDP 4 --maxDP 50" # because using GT not PL for impute (Beagle5)
outPath<-here::here("data/Report-DCas20-5360/")
outSuffix<-"DCas20_5360"
future_map(list(Chr=1:18),
~splitVCFbyChr(Chr=.),
vcfIn=vcfIn,filters=filters,
outPath=outPath,outSuffix=outSuffix)
```