-
Notifications
You must be signed in to change notification settings - Fork 0
/
VCF2FASTA_IUPACcoding.r
86 lines (67 loc) · 5.82 KB
/
VCF2FASTA_IUPACcoding.r
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
## I wrote this script to generate a large number of 2000 bp sequences randomly drawn from the reference genome for RAxML
## note that the scripts prints out one sequence per diploid sample using the IUPAC coding for heterozygote genotypes.
## If you simply want to output all haplotypes in a population (2 haploytpes per individual) for a given set of sequences please refer to the script VCF2indFASTA.r
## use VCFtools to generate --012 output of VCF file, create an .Allele file by copying the CHROM,REF,ALT columns from the VCF file
## I advise to subset reference fasta using samtools faidx to contain only those loci for which alignment is required. Large fasta files will make this a painfully slow process.
## faidx <ref_fasta> <regions_file> > <fastafile_subset>
# read in SNPs
library(seqinr)
geno<-t(read.delim("VCF.012", header=FALSE))[-1,]
allele<-read.delim("VCF.Allele", sep="/", header=FALSE)
snps<-read.delim("VCF.pos", header=FALSE)
ind<-read.delim("VCF.indv", header=FALSE)
genotypes<-data.frame(cbind(snps,allele,geno))
colnames(genotypes)<-c("contig","pos","ref","alt",as.character(ind[,1]))
## sample all SNPs to retain one SNP per contig at random and select nucleotide in the range of 2 kb around the SNP (plus 1000 bp and minus 1000 bp).
one_SNP_per_contig<-data.frame(unique(snps[,1]))
one_SNP_per_contig$SNP<-NA
for(i in 1:nrow(one_SNP_per_contig)) {
contig_snps<-snps[snps[,1]==as.character(one_SNP_per_contig[i,1]),]
if(nrow(contig_snps)!=1){
one_SNP_per_contig$SNP[i]<-sample(contig_snps[,2],1)
} else {
one_SNP_per_contig$SNP[i]<-contig_snps[,2]}
}
one_SNP_per_contig$plus1000<-one_SNP_per_contig$SNP+1000
one_SNP_per_contig$min1000<-one_SNP_per_contig$SNP-1000
# update original SNP positions relative to fragment start and end
one_SNP_per_contig_adjusted<-merge(one_SNP_per_contig_adjusted,genotypes,"contig",sort=FALSE)
one_SNP_per_contig_adjusted$pos_adjusted<-one_SNP_per_contig_adjusted$pos-one_SNP_per_contig_adjusted$min1000_adjusted
one_SNP_per_contig_adjusted<-one_SNP_per_contig_adjusted[one_SNP_per_contig_adjusted[,"pos_adjusted"] > 0,]
one_SNP_per_contig_adjusted<-one_SNP_per_contig_adjusted[one_SNP_per_contig_adjusted[,"pos_adjusted"] < 2002,]
fa<-read.fasta(fastafilename, seqtype="DNA", as.string=TRUE, forceDNAtolower = FALSE)
alignment<-list()
# alignment is going to be you output list of fasta sequences with one entry for each individual, which will be all sequences for that individual concatenated
for(i in 1:nrow(ind)) {
temp.loci<-NULL
for(j in 1:length(fa)) {
contig_name<-strsplit(attributes(fa[[j]])$name,":")[[1]][1] # this depends on how the sequences are named in the fasta file - in my case: <contig_name>:<region>. So I only retain the contig name by splitting based on ":" and taking the first element
temp.genotypes<-one_SNP_per_contig_adjusted[one_SNP_per_contig_adjusted[,"contig"]==contig_name,c("ref","alt","pos_adjusted",paste(ind[i,1]))]
temp.seq<-fa[[j]]
## replace nucleotides in reference fasta ("temp.seq") with genotypes in VCF file
## 0 -> REF, 2 -> ALT, 1 -> ambiguous
## IUPAC Ambiguities: M (A or C), R (A or G), W (A or T), S (C or G), Y (C or T), K (G or T)
for(k in 1:nrow(temp.genotypes)) {
if(temp.genotypes[k,paste(ind[i,1])]==0) { temp.seq[temp.genotypes[k,"pos_adjusted"]]<-as.character(temp.genotypes[k,"ref"])}
if(temp.genotypes[k,paste(ind[i,1])]==2) { temp.seq[temp.genotypes[k,"pos_adjusted"]]<-as.character(temp.genotypes[k,"alt"])}
if(temp.genotypes[k,paste(ind[i,1])]==1 && paste0(temp.genotypes[k,"ref"],temp.genotypes[k,"alt"])=="AC" ) { temp.seq[temp.genotypes[k,"pos_adjusted"]]<-"M"}
if(temp.genotypes[k,paste(ind[i,1])]==1 && paste0(temp.genotypes[k,"ref"],temp.genotypes[k,"alt"])=="CA" ) { temp.seq[temp.genotypes[k,"pos_adjusted"]]<-"M"}
if(temp.genotypes[k,paste(ind[i,1])]==1 && paste0(temp.genotypes[k,"ref"],temp.genotypes[k,"alt"])=="AG" ) { temp.seq[temp.genotypes[k,"pos_adjusted"]]<-"R"}
if(temp.genotypes[k,paste(ind[i,1])]==1 && paste0(temp.genotypes[k,"ref"],temp.genotypes[k,"alt"])=="GA" ) { temp.seq[temp.genotypes[k,"pos_adjusted"]]<-"R"}
if(temp.genotypes[k,paste(ind[iwrite.fasta(alignment, names=attributes(alignment)$name, file.out="filename.fa")
,1])]==1 && paste0(temp.genotypes[k,"ref"],temp.genotypes[k,"alt"])=="AT" ) { temp.seq[temp.genotypes[k,"pos_adjusted"]]<-"W"}
if(temp.genotypes[k,paste(ind[i,1])]==1 && paste0(temp.genotypes[k,"ref"],temp.genotypes[k,"alt"])=="TA" ) { temp.seq[temp.genotypes[k,"pos_adjusted"]]<-"W"}
if(temp.genotypes[k,paste(ind[i,1])]==1 && paste0(temp.genotypes[k,"ref"],temp.genotypes[k,"alt"])=="CG" ) { temp.seq[temp.genotypes[k,"pos_adjusted"]]<-"S"}
if(temp.genotypes[k,paste(ind[i,1])]==1 && paste0(temp.genotypes[k,"ref"],temp.genotypes[k,"alt"])=="GC" ) { temp.seq[temp.genotypes[k,"pos_adjusted"]]<-"S"}
if(temp.genotypes[k,paste(ind[i,1])]==1 && paste0(temp.genotypes[k,"ref"],temp.genotypes[k,"alt"])=="CT") { temp.seq[temp.genotypes[k,"pos_adjusted"]]<-"Y"}
if(temp.genotypes[k,paste(ind[i,1])]==1 && paste0(temp.genotypes[k,"ref"],temp.genotypes[k,"alt"])=="TC") { temp.seq[temp.genotypes[k,"pos_adjusted"]]<-"Y"}
if(temp.genotypes[k,paste(ind[i,1])]==1 && paste0(temp.genotypes[k,"ref"],temp.genotypes[k,"alt"])=="GT" ) { temp.seq[temp.genotypes[k,"pos_adjusted"]]<-"K"}
if(temp.genotypes[k,paste(ind[i,1])]==1 && paste0(temp.genotypes[k,"ref"],temp.genotypes[k,"alt"])=="TG" ) { temp.seq[temp.genotypes[k,"pos_adjusted"]]<-"K"}
}
temp.loci<-c(temp.loci,unlist(temp.seq)) ## this creates a list with each element being all the 2000 bp regions of a single contig pasted together for a given individual
}
alignment[[i]]<-temp.loci
attributes(alignment)$name[i]<-paste0(ind[i,1])
}
# write.fasta(alignment, names=attributes(alignment)$name, file.out="filename.fa")
# this write the output to file.out