-
Notifications
You must be signed in to change notification settings - Fork 6
/
haploid.go
169 lines (157 loc) · 6.89 KB
/
haploid.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
package sam
import (
"math"
"github.com/vertgenlab/gonomics/dna"
"github.com/vertgenlab/gonomics/numbers"
"github.com/vertgenlab/gonomics/numbers/logspace"
)
// HaploidCall is a struct to represent the data of a haploid genotype called
// from a Pileup from a haploid genome.
type HaploidCall struct {
Base dna.Base
Insertion string
Deletion int
}
// HaploidCallFromPile calls a haploid variant (including Base identity at the Pile position, and INDELs immediately after the position),
// for an input Pile.
func HaploidCallFromPile(p Pile, refBase dna.Base, epsilon float64, lambda float64, haploidBasePriorCache [][]float64, haploidIndelPriorCache []float64, homoBaseCache [][]float64, heteroBaseCache [][]float64, homoIndelCache [][]float64, ancientCache AncientLikelihoodCache) HaploidCall {
var answer HaploidCall = HaploidCall{Base: refBase, Insertion: "", Deletion: 0}
var aCount = p.CountF[dna.A] + p.CountR[dna.A]
var cCount = p.CountF[dna.C] + p.CountR[dna.C]
var gCount = p.CountF[dna.G] + p.CountR[dna.G]
var tCount = p.CountF[dna.T] + p.CountR[dna.T]
var nCount = p.CountF[dna.N] + p.CountR[dna.N]
var N = aCount + cCount + gCount + tCount + nCount // this is the read depth for the pile
var maxPosterior, currPosterior float64
if refBase != dna.N { //if we have a real base, we'll calculate posteriors.
var maxBase = []dna.Base{dna.A}
if lambda > 0 {
maxPosterior = logspace.Multiply(ancientBaseLikelihood(aCount, cCount, gCount, tCount, AA, epsilon, lambda, ancientCache), haploidBasePriorCache[refBase][dna.A])
} else {
maxPosterior = logspace.Multiply(baseLikelihood(aCount, cCount, gCount, tCount, AA, epsilon, homoBaseCache, heteroBaseCache), haploidBasePriorCache[refBase][dna.A])
}
//CC
if lambda > 0 {
currPosterior = logspace.Multiply(ancientBaseLikelihood(aCount, cCount, gCount, tCount, CC, epsilon, lambda, ancientCache), haploidBasePriorCache[refBase][dna.C])
} else {
currPosterior = logspace.Multiply(baseLikelihood(aCount, cCount, gCount, tCount, CC, epsilon, homoBaseCache, heteroBaseCache), haploidBasePriorCache[refBase][dna.C])
}
if currPosterior > maxPosterior {
maxBase = maxBase[:1] //clear ties
maxBase[0] = dna.C
maxPosterior = currPosterior
} else if currPosterior == maxPosterior {
maxBase = append(maxBase, dna.C)
}
//GG
if lambda > 0 {
currPosterior = logspace.Multiply(ancientBaseLikelihood(aCount, cCount, gCount, tCount, GG, epsilon, lambda, ancientCache), haploidBasePriorCache[refBase][dna.G])
} else {
currPosterior = logspace.Multiply(baseLikelihood(aCount, cCount, gCount, tCount, GG, epsilon, homoBaseCache, heteroBaseCache), haploidBasePriorCache[refBase][dna.G])
}
if currPosterior > maxPosterior {
maxBase = maxBase[:1] //clear ties
maxBase[0] = dna.G
maxPosterior = currPosterior
} else if currPosterior == maxPosterior {
maxBase = append(maxBase, dna.G)
}
//TT
if lambda > 0 {
currPosterior = logspace.Multiply(ancientBaseLikelihood(aCount, cCount, gCount, tCount, TT, epsilon, lambda, ancientCache), haploidBasePriorCache[refBase][dna.T])
} else {
currPosterior = logspace.Multiply(baseLikelihood(aCount, cCount, gCount, tCount, TT, epsilon, homoBaseCache, heteroBaseCache), haploidBasePriorCache[refBase][dna.T])
}
if currPosterior > maxPosterior {
maxBase = maxBase[:1] //clear ties
maxBase[0] = dna.T
maxPosterior = currPosterior
} else if currPosterior == maxPosterior {
maxBase = append(maxBase, dna.T)
}
answer.Base = maxBase[numbers.RandIntInRange(0, len(maxBase))]
}
//next we evaluate haploid insertion. We only consider Ia
var iTot int
var insertions = make(map[string]int, 0) //this will be the combined insertion depth map, merging the for and rev observations
for key, value := range p.InsCountF {
iTot += value
insertions[key] = value
}
for key, value := range p.InsCountR {
iTot += value
if _, ok := insertions[key]; ok { // if insertion on reverse strand was seen already on positive strand
insertions[key] = insertions[key] + value //add reverse read counts to forward read counts
} else {
insertions[key] = value
}
}
var IaKey string
var IaValue int
for key, value := range insertions {
if value > IaValue {
IaKey = key
IaValue = value
}
}
var B = N - iTot
var noInsertionPosterior float64
var insertionPosterior float64
if IaValue > 0 { //if IaValue == 0, keep initial answer insertion value of ""
noInsertionPosterior = logspace.Multiply(homozygousIndelLikelihoodExpression(B, IaValue, epsilon, homoIndelCache), haploidIndelPriorCache[0])
insertionPosterior = logspace.Multiply(homozygousIndelLikelihoodExpression(IaValue, B, epsilon, homoIndelCache), haploidIndelPriorCache[1])
if insertionPosterior > noInsertionPosterior {
answer.Insertion = IaKey
}
}
//finally we evaluate haploid deletion. We only consider Da
var dTot int
var deletions = make(map[int]int, 0) //this will be the combined deletion depth map, merging the for and rev observations
for key, value := range p.DelCountF {
dTot += value
deletions[key] = value
}
for key, value := range p.DelCountR {
dTot += value
if _, ok := deletions[key]; ok { // if insertion on reverse strand was seen already on positive strand
deletions[key] = deletions[key] + value //add reverse read counts to forward read counts
} else {
deletions[key] = value
}
}
var DaKey, DaValue int
for key, value := range deletions {
if value > DaValue {
DaKey = key
DaValue = value
}
}
B = N - iTot
var noDeletionPosterior, deletionPosterior float64
if DaValue > 0 { //if DaValue == 0, keep initial answer deletion value of 0
noDeletionPosterior = logspace.Multiply(homozygousIndelLikelihoodExpression(B, DaValue, epsilon, homoIndelCache), haploidIndelPriorCache[0])
deletionPosterior = logspace.Multiply(homozygousIndelLikelihoodExpression(DaValue, B, epsilon, homoIndelCache), haploidIndelPriorCache[1])
if deletionPosterior > noDeletionPosterior {
answer.Deletion = DaKey
}
}
return answer
}
// MakeHaploidBasePriorCache is a helper function used in samAssembler before running HaploidCallFromPile.
// Constructs a matrix ([][]float64) of form answer[refBase][outputBase], where the index of both bases 0=a, 1=c, 2=g, 3=t.
// Values are log transformed.
func MakeHaploidBasePriorCache(delta float64, gamma float64) [][]float64 {
Tv := math.Log(delta / (2.0 + gamma)) //the probability of transversions
Tr := math.Log(gamma * delta / (2.0 + gamma)) //the probability of transitions
oneMinusDelta := math.Log(1.0 - delta)
return [][]float64{
{oneMinusDelta, Tv, Tr, Tv},
{Tv, oneMinusDelta, Tv, Tr},
{Tr, Tv, oneMinusDelta, Tv},
{Tv, Tr, Tv, oneMinusDelta}}
}
// MakeHaploidIndelPriorCache is a helper function used in before calling HaploidCallFromPile.
// Stores two values: answer[0] = p(Base), answer[1] = p(Indel).
func MakeHaploidIndelPriorCache(delta float64, kappa float64) []float64 {
return []float64{math.Log(1.0 - delta*kappa), math.Log(delta * kappa)}
}