# 1. What is GWAS?

## Genetic variation

### Genotypes and Hardy-Weinberg equilibrium

##### Example 1.1

In [None]:
# Observed genotype counts, ordered by genotype code 0, 1, 2.
geno = c(66, 29, 4)

# Number of genotyped individuals.
n = sum(geno)

# Frequency of the allele counted by the genotype code: each individual
# carries two alleles, hence the denominator 2 * n.
f = sum(0:2 * geno) / (2 * n)
f

In [None]:
# Genotype proportions expected under Hardy-Weinberg equilibrium for allele
# frequency f, in genotype order 0, 1, 2. local() keeps the helper names out
# of the notebook's global namespace.
hwe.prop = local({
    p.counted = f
    p.other = 1 - f
    c(p.other^2, 2 * p.counted * p.other, p.counted^2)
})

# Observed proportions (top row) next to the HWE expectation (bottom row).
rbind(obs = geno / n, hwe = hwe.prop)

In [None]:
# Pearson chi-square statistic comparing the observed genotype counts with the
# counts expected under HWE (n * hwe.prop).
hwe.test = local({
    expected = n * hwe.prop
    sum((geno - expected)^2 / expected)
})

# Upper-tail P-value from the chi-square distribution with 1 degree of freedom.
hwe.p = pchisq(hwe.test, df = 1, lower.tail = FALSE)

# Bar chart of the observed genotype counts; the HWE P-value goes in the title.
barplot(
    geno,
    names.arg = c(0, 1, 2),
    main = paste0("rs429358 FIN in 1000G Phase3; HWE P=", signif(hwe.p, 3)),
    xlab = "genotype",
    col = "skyblue"
)

##### Synthetic data

In [None]:
# Draw two synthetic samples of n genotypes (coded 0/1/2) and tabulate them:
#   sample.from.geno - drawn with weights given by the observed counts `geno`
#   sample.from.hwe  - drawn as Binomial(2, f), i.e. under HWE with frequency f
# The seed is fixed so that a fresh "Restart & Run All" reproduces the same
# draws (and therefore the same tables, intervals and test below).
set.seed(19)

# NOTE: this rebinds `n` (previously sum(geno)); the following cells read this
# value as the synthetic sample size.
n = 1000

# sample() rescales `prob` internally, so the raw counts can serve as weights.
sample.from.geno = sample(c(0, 1, 2), size = n, replace = TRUE, prob = geno)
tab = table(sample.from.geno)

sample.from.hwe = rbinom(n, size = 2, prob = f)

# Count genotypes 0, 1, 2 the same way for both samples. tabulate() with
# nbins = 3 always returns three entries, so a genotype that was never drawn
# gets an explicit 0 instead of being dropped.
counts.from.geno = as.numeric(tabulate(sample.from.geno + 1, nbins = 3))
counts.from.hwe = as.numeric(tabulate(sample.from.hwe + 1, nbins = 3))

# Every draw must land in exactly one of the three genotype classes.
stopifnot(sum(counts.from.geno) == n, sum(counts.from.hwe) == n)

rbind(geno = counts.from.geno/n, hwe = counts.from.hwe/n)

In [None]:
# Stacked horizontal bars of the genotype proportions in the two synthetic
# samples; within each bar the segments follow genotype order 0, 1, 2.
# TRUE/FALSE are spelled out because T and F are ordinary variables that can
# be reassigned, and `names.arg` is given in full instead of relying on
# partial argument matching.
barplot(cbind(counts.from.geno/n, hwe = counts.from.hwe/n),
        names.arg = c("geno", "HWE"), beside = FALSE, horiz = TRUE,
        xlab = "proportion",
        main = "Genotype proportions in synthetic samples")

In [None]:
# Jeffreys interval for each category proportion: quantiles of a
# Beta(count + 0.5, total - count + 0.5) distribution.
#
# counts: vector of category counts from one sample.
# probs:  the two quantile levels (lower, upper); default gives a 95% interval.
# Returns a length(counts) x 2 matrix with columns (lower, upper).
#
# The total is taken from the counts themselves rather than from the global
# `n`, which is rebound between cells; this keeps the result independent of
# the order in which cells were executed.
jeffreys.interval = function(counts, probs = c(0.025, 0.975)) {
    total = sum(counts)
    shape1 = counts + 0.5
    shape2 = total - counts + 0.5
    cbind(qbeta(probs[1], shape1, shape2),
          qbeta(probs[2], shape1, shape2))
}

interval.from.geno = jeffreys.interval(counts.from.geno)
interval.from.hwe = jeffreys.interval(counts.from.hwe)

In [None]:
# Side-by-side summary, one row per genotype (0, 1, 2): the point estimate of
# each genotype proportion followed by its interval bounds, first for the
# sample drawn from the observed counts and then for the HWE sample.
cbind(
    geno.est = counts.from.geno / n,
    interval.from.geno,
    hwe.est = counts.from.hwe / n,
    interval.from.hwe
)

In [None]:
# Pearson chi-square test on the 2 x 3 table of genotype counts (rows: the two
# synthetic samples, columns: genotypes 0, 1, 2), i.e. a test of whether the
# two samples share the same genotype distribution.
# The call is kept as a single expression because the "data:" line of the
# printed result is taken from the argument text.
chisq.test(rbind(counts.from.geno, counts.from.hwe))