# P-values in GWAS

## 2.1 What is a P-value?

In [None]:
# Simulate one variant and a phenotype drawn independently of it, then fit
# a simple linear regression of the phenotype on the genotype.
set.seed(29) # fix the RNG state so the summary below is reproducible on re-run
n = 100 # individuals
f = 0.30 # MAF
x = rbinom(n, 2, f) # example genotypes for n individuals (counts in 0, 1, 2)
y = rnorm(n) # outcome that is independent of x
lm.fit = lm(y ~ x)

summary(lm.fit)

In [None]:
# Two side-by-side panels showing where the observed statistic of lm.fit
# falls relative to its null distribution.
par(mfrow = c(1, 2)) # grid with 1 row and 2 columns

# Coefficient table of the fit; row 2 is the genotype term,
# column 3 its t-statistic and column 4 its P-value.
fit.coef = summary(lm.fit)$coefficients
p.label = paste0("P=", signif(fit.coef[2, 4], 3)) # same label is printed on both panels

# Panel 1: the t-statistic's scale
x.grid = seq(-3, 3, 0.05) # plotting region
plot(x.grid, dt(x.grid, df = n - 2), type = "l", lty = 2, lwd = 2,
     xlab = expression(hat(beta)/SE), ylab = "density", main = "NULL DISTR of t") # null distribution of t-statistic
t.stat = fit.coef[2, 3]
points(t.stat, 0, pch = 19, cex = 1.5, col = "red")
# vertical bars at +t and -t, each reaching up to the density curve
segments(t.stat * c(1, -1), c(0, 0), t.stat * c(1, -1), rep(dt(t.stat, df = n - 2), 2), col = "red")
text(2, 0.25, p.label, col = "red")
legend("topright", pch = 19, col = "red", legend = "observed")

# Panel 2: the t^2 statistic's scale
x.grid = seq(0, 10, 0.05)
plot(x.grid, dchisq(x.grid, df = 1), type = "l", lty = 2, lwd = 2,
     xlab = expression((hat(beta)/SE)^2), ylab = "density", main = "NULL DISTR of t^2") # null distribution of t^2
t2.stat = fit.coef[2, 3]^2
points(t2.stat, 0, pch = 19, cex = 1.5, col = "red")
segments(t2.stat, 0, t2.stat, dchisq(t2.stat, df = 1), col = "red")
text(2.5, 0.25, p.label, col = "red")
legend("topright", pch = 19, col = "red", legend = "observed")

In [None]:
# The t-statistic of the genotype term, also called the z-score under the Normal approximation
z = summary(lm.fit)$coefficients[2, 3]
# Two-sided P-value from N(0, 1): left tail below -|z| plus right tail above |z|
pnorm(-abs(z), mean = 0, sd = 1, lower.tail = TRUE) +
  pnorm(abs(z), mean = 0, sd = 1, lower.tail = FALSE)

In [None]:
# Upper-tail probability of the chi-square distribution with 1 df, evaluated at z^2
pchisq(z^2, df = 1, lower.tail = FALSE)

## 2.2 Distribution of P-values

In [None]:
set.seed(39)
n = 100 # individuals
p = 1000 # variants measured on each individual
f = 0.40 # MAF is assumed the same for all variants; doesn't actually matter here
X = matrix(rbinom(n * p, 2, f), nrow = n, ncol = p) # just random genotypes
y = rnorm(n) # phenotype that is not associated with any of the genotypes

# Regress y on one variant at a time and keep the genotype row (row 2) of each
# coefficient table. lm.res has 4 rows (beta, SE, t-stat, P-value) and one
# column per variant.
lm.res = vapply(seq_len(p),
                function(j) summary(lm(y ~ X[, j]))$coefficients[2, ],
                numeric(4))
pval = lm.res[4, ] # P-values, one per variant

par(mfrow = c(1, 2))
# Left panel: kernel density of the observed t-statistics; should be t with n-2 df
t.stats = lm.res[3, ]
plot(density(t.stats), sub = "", xlab = "t-stat", main = "", lwd = 2)
x.seq = seq(-4, 4, 0.1) # x-coordinates for the reference curves
lines(x.seq, dt(x.seq, df = n - 2), col = "blue", lty = 2) # t distribution in blue
lines(x.seq, dnorm(x.seq), col = "red", lty = 3) # normal distribution in red
# Right panel: histogram of the P-values; should be uniformly distributed
hist(pval, breaks = 10, xlab = "P-value", main = "", col = "limegreen")

In [None]:
par(mfrow = c(1, 2)) # QQ-plots for the t-statistics (left) and the P-values (right)

# Left: t-statistics (row 3 of lm.res) against Normal quantiles
t.stats = lm.res[3, ]
qqnorm(t.stats, cex = 0.5, pch = 3)
qqline(t.stats, col = "red")

# Right: P-values compared to the Uniform(0, 1) distribution.
# ppoints(p) gives p equally spaced values in (0, 1) that represent the
# quantiles of Uniform(0, 1); the -log10 transformation makes the small
# P-values particularly easy to see.
unif.logq = -log10(ppoints(p))
obs.logp = -log10(pval)
qqplot(unif.logq, obs.logp, xlab = "theoretical", ylab = "obs'd",
       main = "Q-Q Plot for -log10 Pval", cex = 0.5, pch = 3)
abline(a = 0, b = 1, col = "red") # identity line

In [None]:
# Empirical CDF of the P-values, drawn in a square plotting region
par(pty = "s")
pval.ecdf = ecdf(pval)
plot(pval.ecdf, xlab = "sig thresh", ylab = "proportion Pval < thresh",
     main = "ECDF of Pvalues")

In [None]:
# Number of variants whose P-value is below the 0.05 threshold
nominal.thresh = 0.05
sum(pval < nominal.thresh)

In [None]:
set.seed(49)
n = 1000 # individuals
p = 1000 # genotypes measured on each individual
m = 50 # number of variants that have an effect: they are x_1, ..., x_m
f = 0.4 # MAF
b = 0.5 # effect size of variants that have an effect
X = matrix(rbinom(n * p, 2, f), nrow = n, ncol = p) # just random genotypes at SNPs
effects = rep(b, m) # the same effect size for each of the first m variants
y = X[, seq_len(m)] %*% effects + rnorm(n) # phenotype that is associated with x_1, ..., x_m

# Regress y on one variant at a time and keep the genotype row (row 2) of each
# coefficient table. lm.res has 4 rows (beta, SE, t-stat, P-value) and one
# column per variant.
lm.res = vapply(seq_len(p),
                function(j) summary(lm(y ~ X[, j]))$coefficients[2, ],
                numeric(4))
pval = lm.res[4, ] # P-values, one per variant

par(mfrow = c(1, 2))
# Left panel: kernel density of the observed t-statistics; under the null it is t with n-2 df
t.stats = lm.res[3, ]
plot(density(t.stats), sub = "", xlab = "t-stat", main = "", lwd = 2)
x.seq = seq(-4, 4, 0.1) # x-coordinates for the reference curve
lines(x.seq, dnorm(x.seq), col = "red", lty = 3) # normal distribution in red
# Right panel: histogram of the P-values; under the null it is uniformly distributed
hist(pval, breaks = 10, xlab = "P-value", main = "", col = "skyblue")

In [None]:
par(mfrow = c(1, 2)) # QQ-plots for the t-statistics (left) and the P-values (right)

# Left: t-statistics (row 3 of lm.res) against Normal quantiles
t.stats = lm.res[3, ]
qqnorm(t.stats, cex = 0.5, pch = 3)
qqline(t.stats, col = "red")

# Right: -log10 P-values against -log10 of p equally spaced points in (0, 1)
unif.logq = -log10(ppoints(p))
obs.logp = -log10(pval)
qqplot(unif.logq, obs.logp, xlab = "theoretical", ylab = "obs'd",
       main = "Q-Q Plot for -log10 Pval", cex = 0.50, pch = 3)
abline(a = 0, b = 1, col = "red") # identity line

In [None]:
# Significance threshold alpha obtained from prior odds, power and a target
# posterior odds via: alpha = prior.odds * power / post.odds
p.T = 1e-6 # probability that is turned into the prior odds
prior.odds = p.T / (1 - p.T)
pwr = 1 # upper bound for power --> upper bound for alpha
post.prob = 0.95 # probability that is turned into the target posterior odds
post.odds = post.prob / (1 - post.prob)
alpha = prior.odds * pwr / post.odds
as.character(signif(alpha, 3)) # shown as a string with 3 significant digits