# 1. What is GWAS?

## 1.1 Genetic variation

### 1.1.2 Genotypes and Hardy-Weinberg equilibrium

##### Example 1.1

In [None]:
# Observed genotype counts, ordered by genotype code 0, 1, 2.
geno <- c(66, 29, 4)
# Total number of genotyped individuals.
n <- sum(geno)
# Allele frequency: genotype codes weighted by their counts, divided by
# the 2 * n alleles carried by n individuals.
f <- sum(0:2 * geno) / (2 * n)
f

In [None]:
# Genotype proportions (codes 0, 1, 2) implied by Hardy-Weinberg equilibrium
# at allele frequency f: (1 - f)^2, 2 * f * (1 - f), f^2.
hwe.prop <- local({
  q <- 1 - f  # frequency of the other allele; kept local to this expression
  c(q^2, 2 * f * q, f^2)
})
# Observed proportions (top row) next to the HWE expectation (bottom row).
rbind(obs = geno / n, hwe = hwe.prop)

In [None]:
# Chi-square goodness-of-fit statistic: observed genotype counts (geno)
# against the counts expected under HWE (n * hwe.prop).
hwe.test <- sum((geno - n * hwe.prop)^2 / (n * hwe.prop))
# Upper-tail P-value on 1 degree of freedom. The argument is spelled out as
# `lower.tail` instead of relying on partial matching of `lower`.
hwe.p <- pchisq(hwe.test, df = 1, lower.tail = FALSE)

# Bar chart of the observed genotype counts with the HWE P-value in the title.
# `names.arg` is the full name of barplot's label argument.
barplot(geno,
        main = paste0("rs429358 FIN in 1000G Phase3; HWE P=", signif(hwe.p, 3)),
        names.arg = c(0, 1, 2), xlab = "genotype", col = "skyblue")

##### Synthetic data

In [None]:
# Simulate two samples of n genotypes and count how often each genotype
# (0, 1, 2) occurs in each of them.
set.seed(19)  # fixed seed so the simulated counts are reproducible
n <- 1000

# Sample 1: genotypes drawn with probabilities proportional to the observed
# counts in `geno` (sample() rescales `prob` to sum to one).
sample.from.geno <- sample(c(0, 1, 2), prob = geno, size = n, replace = TRUE)
tab <- table(sample.from.geno)
# Fill by genotype label so that a genotype absent from the sample keeps
# a count of 0 instead of shifting the other counts.
counts.from.geno <- rep(0, 3)
counts.from.geno[1 + as.numeric(names(tab))] <- as.numeric(tab)

# Sample 2: genotypes drawn as Binomial(2, f), i.e. under HWE with allele
# frequency f.
sample.from.hwe <- rbinom(n, size = 2, prob = f)
# tabulate() counts the values 1..3, hence the + 1 shift of the 0/1/2 codes;
# nbins = 3 keeps a zero for any genotype that was never drawn.
counts.from.hwe <- as.numeric(tabulate(sample.from.hwe + 1, nbins = 3))

# Genotype proportions in the two samples, one row per sample.
rbind(geno = counts.from.geno / n, hwe = counts.from.hwe / n)

In [None]:
# Stacked horizontal bars, one per sample, showing the genotype proportions.
# TRUE/FALSE are spelled out (T and F are ordinary variables that can be
# reassigned) and `names.arg` is used instead of the partial match `names`.
barplot(cbind(counts.from.geno / n, hwe = counts.from.hwe / n),
        names.arg = c("geno", "HWE"), beside = FALSE, horiz = TRUE)

In [None]:
# 95% Jeffreys intervals for each genotype proportion: the 2.5% and 97.5%
# quantiles of Beta(count + 0.5, n - count + 0.5).
# Each result is a 3 x 2 matrix: one row per genotype, columns = lower, upper.
interval.from.geno <- cbind(
  qbeta(0.025, counts.from.geno + 0.5, n - counts.from.geno + 0.5),
  qbeta(0.975, counts.from.geno + 0.5, n - counts.from.geno + 0.5)
)
interval.from.hwe <- cbind(
  qbeta(0.025, counts.from.hwe + 0.5, n - counts.from.hwe + 0.5),
  qbeta(0.975, counts.from.hwe + 0.5, n - counts.from.hwe + 0.5)
)

In [None]:
# One row per genotype: point estimate followed by its interval (lower, upper),
# first for the sample drawn from `geno`, then for the sample drawn under HWE.
cbind(
  geno.est = counts.from.geno / n,
  interval.from.geno,
  hwe.est = counts.from.hwe / n,
  interval.from.hwe
)

In [None]:
# Chi-square test on the 2 x 3 table of genotype counts
# (rows = the two simulated samples, columns = genotypes 0, 1, 2).
chisq_result <- chisq.test(
  rbind(counts.from.geno, counts.from.hwe)
)
chisq_result

## 1.2 What is a genome-wide association study?

### 1.2.1 Quantitative traits

In [None]:
# Simulation settings for a quantitative trait.
n <- 10000                      # number of individuals
f <- 0.04                       # allele frequency
mu <- c(0.02, -0.40, -2.00)     # mean of each genotype
sigma <- c(1, 1, 1)             # SD for each genotype

# Genotypes (0, 1 or 2 copies) drawn as Binomial(2, f). `prob` is spelled out
# instead of relying on partial matching of `p`.
x <- rbinom(n, size = 2, prob = f)
# Observed genotype proportions in the simulated sample.
table(x) / n

In [None]:
# Simulate the trait: within each genotype group (0, 1, 2), draw y from a
# Normal distribution with that group's mean (mu) and SD (sigma).
# The groups are filled one at a time, in genotype order.
y <- rep(NA, n)
for (ii in 0:2) {
  y[x == ii] <- rnorm(sum(x == ii), mu[1 + ii], sigma[1 + ii])
}

# Trait distribution per genotype. The y-axis label must be passed as `ylab`;
# `ylabel` is not an argument of boxplot, so the label was never applied.
boxplot(y ~ x, main = "Simulated rs11591147 in Finns", ylab = "LDL",
        xlab = "Copies of T", col = "limegreen")

##### Additive model

In [None]:
# Additive model: linear regression of the trait y on the genotype x
# coded as the number of allele copies (0, 1, 2).
lm.fit <- lm(y ~ x)
# Coefficient table, residual summary and fit statistics.
summary(lm.fit)

In [None]:
# Scatter plot of the trait against genotype; a small uniform jitter on the
# x-axis spreads out points that share the same genotype.
plot(x + runif(n, min = -0.05, max = 0.05), y,
     xlab = "genotype", ylab = "LDL",
     xaxt = "n",  # default x-axis suppressed; drawn below with ticks at 0, 1, 2
     pch = 3, cex = 0.50, col = "gray")
axis(side = 1, at = 0:2, labels = 0:2)
# Mark the mean trait value of each genotype group.
points(0:2,
       c(mean(y[x == 0]),
         mean(y[x == 1]),
         mean(y[x == 2])),
       col = "red", pch = "X", cex = 1.3)
# Overlay the fitted regression line from lm.fit.
abline(lm.fit, col = "orange", lwd = 2)
legend("topright", pch = "X", legend = "group means", col = "red")

In [None]:
# Indicator (1/0) of carrying two copies. Adding it to the additive term x
# lets the genotype-2 mean deviate from the straight line through the means
# of genotypes 0 and 1.
z <- as.numeric(x == 2)
lm.full <- lm(y ~ x + z)
summary(lm.full)

In [None]:
# Same trait regressed on genotype treated as a categorical factor:
# one coefficient per non-reference genotype level.
lm.full2 <- lm(y ~ as.factor(x))
summary(lm.full2)

##### Quantile normalization

In [None]:
# Simulated trait for n individuals: first half males, second half females.
n <- 200                                   # males + females
fem <- rep(c(0, 1), each = n / 2)          # 1 = female, 0 = male
# Gamma-distributed values shifted by 2 for everyone (the male shift) ...
y <- 2 + rgamma(n, shape = 1.5, scale = 1.5)
# ... plus a further 4 for females, giving them a total shift of 6 = 2 + 4.
y <- y + 4 * fem
# Shows some outliers compared to a mixture of 2 Normals.
hist(y, breaks = 30, col = "khaki")