# 1. What is GWAS?

In [None]:
using DataFrames
using CategoricalArrays # For CategoricalArray
using GLM # For linear models
using Distributions
using HypothesisTests # For ChisqTest
using Plots
using StatsPlots # For grouped bar plot
using Printf

## Genetic variation

### Genotypes and Hardy-Weinberg equilibrium

##### Example 1.1

In [None]:
# Observed genotype counts for 0, 1 and 2 copies of the allele
geno = [66, 29, 4]
n = sum(geno) # number of individuals

# Allele frequency: copies of the allele carried in the sample divided by the
# 2n alleles in total, i.e. (66*0 + 29*1 + 4*2) / (2 * (66+29+4))
f = sum(copies * cnt for (copies, cnt) in zip(0:2, geno)) / (2 * n)

In [None]:
# Genotype frequencies expected under Hardy-Weinberg equilibrium (HWE) for
# 0, 1 and 2 copies of an allele with frequency f
hwe_prop = let q = 1 - f
    [q^2, 2 * f * q, f^2]
end

# Show the observed genotype frequencies next to the HWE expectations
DataFrame(obs = geno ./ n, hwe = hwe_prop)

In [None]:
# Test HWE with a chi-square test, even though the count in the last genotype
# class is quite small. The statistic sums (observed - expected)^2 / expected
# over the three genotype classes.
hwe_test = let expected = n * hwe_prop
    sum((geno - expected).^2 ./ expected)
end

# P-value: upper tail of the chi-square distribution with 1 degree of freedom
hwe_p = ccdf(Chisq(1), hwe_test)

# Bar plot of the observed genotype counts, with the HWE P-value in the title
bar(
    0:2,
    geno,
    xaxis=("genotype", 0:2),
    legend=false,
    color="skyblue",
    title=@sprintf("rs429358 FIN in 1000G Phase3; HWE P = %.3f", hwe_p),
)

##### Synthetic Data

In [None]:
n = 1000 # size of each synthetic sample (replaces the earlier value of n)

# Draw n genotypes with replacement, weighting 0/1/2 by the observed counts
sample_from_geno = wsample(0:2, geno, n; replace=true)
# Number of occurrences of each genotype in the sample
counts_from_geno = [count(==(g), sample_from_geno) for g in 0:2]

# Draw n genotypes from Binomial(2, f), i.e. assuming HWE
sample_from_hwe = rand(Binomial(2, f), n)
counts_from_hwe = [count(==(g), sample_from_hwe) for g in 0:2]

# Genotype frequencies of the two samples side by side
DataFrame(geno = counts_from_geno ./ n, hwe = counts_from_hwe ./ n)

In [None]:
# Stacked horizontal bars of the genotype frequencies in the two samples.
# Each row of the 2×3 matrix is one sample; the counts are reversed as
# vectors and only then transposed into a row. Reversing the vector (rather
# than the 1×3 adjoint, which needs `reverse` without `dims` on a matrix and
# is only supported on Julia ≥ 1.6) keeps both rows built the same way.
groupedbar(
    ["geno", "HWE"],
    [reverse(counts_from_geno)' / n; reverse(counts_from_hwe)' / n],
    orientation=:h, bar_position=:stack, legend=false, xticks=0:0.2:1
)

In [None]:
# Genotype frequencies of both samples together with 95% intervals: for a
# count k out of n, the interval is given by the 2.5% and 97.5% quantiles of
# Beta(k + 0.5, n - k + 0.5).
df_count_int = DataFrame(
    geno = counts_from_geno ./ n,
    geno_interval = [
        quantile(Beta(k + 0.5, n - k + 0.5), [0.025, 0.975]) for k in counts_from_geno
    ],
    hwe = counts_from_hwe ./ n,
    hwe_interval = [
        quantile(Beta(k + 0.5, n - k + 0.5), [0.025, 0.975]) for k in counts_from_hwe
    ],
)

In [None]:
# Chi-square test on the 3×2 table of genotype counts (rows: genotypes 0/1/2,
# columns: sample drawn from the observed counts vs. sample drawn under HWE)
ChisqTest(hcat(counts_from_geno, counts_from_hwe))

## 1.2 What is a genome-wide association study?

### 1.2.1 Quantitative traits

In [None]:
n = 10000 # number of simulated individuals
f = 0.04 # allele frequency
μ = [0.02, -0.40, -2.00] # mean of each genotype (0, 1, 2 copies)
σ = ones(length(μ)) # SD of each genotype

# Simulate genotypes as Binomial(2, f) draws
x = rand(Binomial(2, f), n)

# Observed frequency of each genotype in the simulated sample
DataFrame(genotype = collect(0:2), freq = [count(==(g), x) / n for g in 0:2])

In [None]:
# Sample an LDL level for every individual: the mean and SD are looked up by
# genotype (x .+ 1 turns genotypes 0/1/2 into indices 1/2/3) and scaled
# standard normal noise is added.
ldl_data = DataFrame(
    x = x,
    y = μ[x .+ 1] .+ randn(n) .* σ[x .+ 1],
)

# Box plot of the simulated LDL levels for each genotype
boxplot(
    ldl_data.x,
    ldl_data.y,
    title="Simulated rs11591147 in Finns",
    ylabel="LDL",
    xlabel="Copies of T",
    xticks=0:2,
    color="limegreen",
    legend=false,
)

##### Additive model

In [None]:
# Additive model: regress the phenotype y linearly on the genotype x (0/1/2)
lm_fit = fit(LinearModel, @formula(y ~ x), ldl_data)

In [None]:
# Scatter the LDL data; uniform jitter is added to the genotype on the x-axis
# so that the points of one genotype do not all fall on one vertical line
scatter(
    ldl_data.x .+ 0.20 .* (0.50 .- rand(n)),
    ldl_data.y,
    xlabel="genotype",
    ylabel="LDL",
    label=nothing,
    marker=:+,
    markersize=3,
    color="gray",
    xticks=0:2,
)

# Overlay the mean LDL level of each genotype group
scatter!(
    0:2,
    [mean(ldl_data.y[ldl_data.x .== g]) for g in 0:2],
    markershape=:d,
    markersize=5,
    markercolor="red",
    label="group means",
)

# Draw the fitted regression line. coef(lm_fit) is ordered (intercept, slope),
# whereas Plots.abline! expects (slope, intercept).
Plots.abline!(coef(lm_fit)[2], coef(lm_fit)[1], color="orange", label=nothing)

##### Full model

In [None]:
# Full model: besides the additive term x, include z, an indicator for
# genotype 2. The model data is built as a fresh DataFrame so that no extra
# column is added to ldl_data itself.
lm_full = lm(
    @formula(y ~ x + z),
    DataFrame(
        x = ldl_data.x,
        y = ldl_data.y,
        z = ldl_data.x .== 2,
    ),
)

In [None]:
# The same formula, but with the genotype stored as a categorical variable
# instead of a number
lm_full2 = lm(
    @formula(y ~ x),
    DataFrame(
        x = CategoricalArray(ldl_data.x),
        y = ldl_data.y,
    ),
)

##### Quantile Normalisation

In [None]:
# Generate a phenotype whose distribution depends on sex: males follow
# 2 + Γ(shape=1.5, scale=1.5) and females follow 6 + Γ(shape=1.5, scale=1.5)
n = 200 # males + females
# Sex indicator: 0 for the first n÷2 individuals (males), 1 for the rest (females)
fem = repeat([0, 1], inner=n÷2)
# Gamma(α, θ) from Distributions takes the shape α and the scale θ. One value
# is drawn per entry of fem, so the lengths agree even if n is odd.
y = 2 .+ 4 .* fem .+ rand(Gamma(1.5, 1.5), length(fem))

In [None]:
# NOTE(review): this cell held the incomplete identifier `Distri`, which is not
# defined anywhere in this file and raises UndefVarError when evaluated. It is
# presumably unfinished input for the Distributions module; confirm the intent
# or remove the cell.
Distributions

In [None]:
# Integer half of n (the `inner` repeat count used when building fem)
div(n, 2)