In [1]:
import numpy as np
import pandas as pd
import polars as pl

# Replicate feature GWAS

## Load data

In [2]:
# Lazily scan the tab-separated genotype file and keep only the sample ID and
# the "SNP_1_." column, which is exposed under the short name `g`.
genotype_df = (
    pl.scan_csv("data/simulated-phenotypes/Genotypes.raw", separator="\t")
    .select("IID", g=pl.col("SNP_1_."))
    .collect()
)
genotype_df.head(2)

IID,g
str,i64
"""ID_1""",1
"""ID_2""",1


In [3]:
# Load the covariates, drop FID, and append a constant column of ones.
# The columns are then renamed to x_1 ... x_6 (x_6 being the constant) so that
# they can later be selected together with a single "^x_.+$" pattern.
covar_df = (
    pl.scan_csv("data/simulated-phenotypes/Covs_plink.txt", separator="\t")
    .select(
        "IID",
        pl.all().exclude("FID", "IID"),
        pl.lit(1.0).alias("const"),
    )
    .rename(
        {
            "sharedConfounder_norm1": "x_1",
            "sharedConfounder_norm2": "x_2",
            "sharedConfounder_norm3": "x_3",
            "independentConfounder_norm1": "x_4",
            "independentConfounder_norm2": "x_5",
            "const": "x_6",
        }
    )
    .collect()
)
covar_df.head(2)

IID,x_1,x_2,x_3,x_4,x_5,x_6
str,f64,f64,f64,f64,f64,f64
"""ID_1""",1.079511,0.675817,0.335099,1.812403,-0.383385,1.0
"""ID_2""",2.308693,-0.211148,-1.933398,1.139712,0.087484,1.0


In [4]:
# Keep the sample ID and the single phenotype under study ("feature_01"),
# exposed under the short name `y`.
phenotype_df = (
    pl.scan_csv("data/simulated-phenotypes/Ysim_plink_names.txt", separator="\t")
    .select("IID", y=pl.col("feature_01"))
    .collect()
)
phenotype_df.head(2)

IID,y
str,f64
"""ID_1""",-1.946783
"""ID_2""",-2.037948


## Combine data

In [5]:
# Inner-join genotype, covariates and phenotype on the sample ID so that every
# retained row has g, x_1..x_6 and y. Samples missing from any table are dropped.
merged_df = (
    genotype_df
    .join(covar_df, on="IID", how="inner")
    .join(phenotype_df, on="IID", how="inner")
)
merged_df.head(2)

IID,g,x_1,x_2,x_3,x_4,x_5,x_6,y
str,i64,f64,f64,f64,f64,f64,f64,f64
"""ID_1""",1,1.079511,0.675817,0.335099,1.812403,-0.383385,1.0,-1.946783
"""ID_2""",1,2.308693,-0.211148,-1.933398,1.139712,0.087484,1.0,-2.037948


## Residualize data

In [6]:
# Covariate design matrix: every column matching x_* (this includes the
# constant column x_6 added when the covariates were loaded).
X = merged_df.select("^x_.+$").to_numpy()

# Read the sample count and covariate count from the data instead of
# hard-coding them, so the cell keeps working if the merge returns a different
# number of rows or the covariate set changes.
N, C = X.shape

# Residual-maker matrix P = I - X (X'X)^{-1} X'.
# np.linalg.solve(X'X, X') yields (X'X)^{-1} X' without forming the explicit
# inverse, which is the numerically preferred formulation.
P = np.eye(N) - X @ np.linalg.solve(X.T @ X, X.T)

g = merged_df["g"].to_numpy()
y = merged_df["y"].to_numpy()

# Project the covariates out of the genotype and the phenotype.
gt_unscaled = P @ g
yt_unscaled = P @ y

# Rescale the residuals by their standard deviation (numpy default, ddof=0).
gt_scaled = gt_unscaled / gt_unscaled.std()
yt_scaled = yt_unscaled / yt_unscaled.std()

## Indirect GWAS

In [7]:
# Test statistic from the unscaled residuals: the inner product of the two
# residual vectors divided by both of their Euclidean norms, times sqrt(N - C).
T = (
    (gt_unscaled @ yt_unscaled)
    * np.sqrt(N - C)
    / np.linalg.norm(gt_unscaled)
    / np.linalg.norm(yt_unscaled)
)
T

np.float64(2.702984345815617)

In [8]:
# Same statistic computed from the rescaled residuals. `T` is overwritten here;
# the recorded output is identical to the unscaled version.
T = (
    (gt_scaled @ yt_scaled)
    * np.sqrt(N - C)
    / np.linalg.norm(gt_scaled)
    / np.linalg.norm(yt_scaled)
)
T

np.float64(2.702984345815617)

In [9]:
# Effect size and standard error on the scaled residuals.
# The standard error uses the total sum of squares of the residualized
# phenotype (yt'yt) divided by (N - C) and by gt'gt.
beta_hat_scaled = np.dot(gt_scaled, yt_scaled) / np.dot(gt_scaled, gt_scaled)
se_hat_scaled = np.sqrt(
    np.dot(yt_scaled, yt_scaled) / (N - C) / np.dot(gt_scaled, gt_scaled)
)

# The ratio beta / se must reproduce the test statistic T.
assert np.allclose(T, beta_hat_scaled / se_hat_scaled)

In [10]:
# Effect size and standard error on the unscaled residuals (same formulas as
# for the scaled residuals).
beta_hat_unscaled = np.dot(gt_unscaled, yt_unscaled) / np.dot(gt_unscaled, gt_unscaled)
se_hat_unscaled = np.sqrt(
    np.dot(yt_unscaled, yt_unscaled) / (N - C) / np.dot(gt_unscaled, gt_unscaled)
)

# The ratio beta / se must reproduce the test statistic T regardless of scaling.
assert np.allclose(T, beta_hat_unscaled / se_hat_unscaled)

In [11]:
# Chi-square statistic is the squared test statistic.
chisq = np.square(T)
chisq

np.float64(7.306124373724278)

In [12]:
# Display (beta, se, beta / se) for the scaled residuals.
(
    beta_hat_scaled,
    se_hat_scaled,
    beta_hat_scaled / se_hat_scaled,
)

(np.float64(0.08573345746027532),
 np.float64(0.03171807398477756),
 np.float64(2.7029843458156173))

In [13]:
# Display (beta, se, beta / se) for the unscaled residuals.
(
    beta_hat_unscaled,
    se_hat_unscaled,
    beta_hat_unscaled / se_hat_unscaled,
)

(np.float64(0.11419474053588408),
 np.float64(0.04224765145705132),
 np.float64(2.7029843458156173))

## Compare to direct GWAS (regenie)

In [14]:
# Pull the row for SNP_1 from the regenie result file for feature_01 so it can
# be compared against the indirect estimates computed above.
# NOTE(review): in the recorded output BETA has the opposite sign to
# beta_hat_unscaled while SE and CHISQ agree; presumably the two pipelines
# count different alleles -- confirm.
(
    pl.scan_csv("data/gwas/regenie/features/result_feature_01.regenie", separator=" ")
    .filter(pl.col("ID") == "SNP_1")
    .collect()
)

CHROM,GENPOS,ID,ALLELE0,ALLELE1,A1FREQ,N,TEST,BETA,SE,CHISQ,LOG10P,EXTRA
i64,i64,str,i64,i64,f64,i64,str,f64,f64,f64,f64,str
1,0,"""SNP_1""",0,0,0.5965,1000,"""ADD""",-0.114195,0.0422477,7.30612,2.16292,"""NA"""


# Recovering $\tilde{g}^\intercal \tilde{g}$ from summary statistics

In [15]:
# Direct computation of g~'g~ from the residualized genotype vectors.
gtg_unscaled = np.dot(gt_unscaled, gt_unscaled)
gtg_scaled = np.dot(gt_scaled, gt_scaled)

print(f"Unscaled: {gtg_unscaled}, Scaled: {gtg_scaled}")

# Recover the same quantity from y~'y~, the degrees of freedom (N - C) and the
# squared standard error, i.e. by inverting the standard-error formula.
gtg_hat_unscaled = np.dot(yt_unscaled, yt_unscaled) / (
    (N - C) * np.square(se_hat_unscaled)
)
gtg_hat_scaled = np.dot(yt_scaled, yt_scaled) / (
    (N - C) * np.square(se_hat_scaled)
)

print(f"Unscaled: {gtg_hat_unscaled}, Scaled: {gtg_hat_scaled}")

Unscaled: 467.18324528470043, Scaled: 1000.0
Unscaled: 467.18324528470055, Scaled: 1000.0000000000001
