In [1]:
import polars as pl
import pandas as pd
import tqdm
import numpy as np

In [2]:
# Load the two tab-separated phenotype tables; the FID column is dropped from each.
# NOTE(review): the binary table is read from "../data/..." while the other paths
# visible in this notebook start at "data/..." — confirm both locations are intended.
phecode_df = pl.read_csv(
    "data/pheno/phecodes.tsv",
    separator="\t",
    null_values="NA",  # literal "NA" entries are parsed as nulls
).drop("FID")
binary_df = pl.read_csv(
    "../data/pheno/binary_pheno.tsv",
    separator="\t",
).drop("FID")

# Inner join on IID keeps only rows present in both tables; the key is then dropped
# so that only phenotype columns remain.
merged_df = binary_df.join(phecode_df, on="IID", how="inner").drop("IID")

# Column groups are identified by regex on the column name.
binary_names = merged_df.select(pl.col("^b_.+$")).columns
phecode_names = merged_df.select(pl.col("^phecode_.+$")).columns

# Float table to be filled with coefficients: one row per binary column,
# one column per phecode column.
coef_df = pd.DataFrame(dtype=float, index=binary_names, columns=phecode_names)

In [3]:
# Fit one least-squares regression per phecode: the phecode column is the response
# and the binary columns are the predictors (no intercept column is added here).
for phecode in tqdm.tqdm(phecode_names):
    # Rows with a null in any binary column or in this phecode are excluded,
    # so each phecode is fit on its own set of complete rows.
    complete_rows = merged_df.select([*binary_names, phecode]).drop_nulls()
    design = complete_rows.select(binary_names).to_numpy()
    response = complete_rows.get_column(phecode).to_numpy()
    # lstsq returns (solution, residuals, rank, singular values); only the
    # solution vector is kept.
    solution, *_ = np.linalg.lstsq(design, response, rcond=None)
    coef_df[phecode] = solution

  0%|          | 0/433 [00:00<?, ?it/s]

In [4]:
# Write the coefficient table as TSV. The index is named "feature" and each label
# is wrapped as "plink.<name>.glm.linear.zst".
# NOTE(review): presumably this matches PLINK GLM output file names — confirm.
coef_labelled_df = coef_df.rename_axis(index="feature")
coef_labelled_df = coef_labelled_df.rename(
    index=lambda label: f"plink.{label}.glm.linear.zst"
)
coef_labelled_df.to_csv("data/coef/binary_to_phecode.tsv", sep="\t")

In [5]:
# Load the inclusion-only phecode table (tab-separated) and drop its FID column.
# NOTE(review): unlike the phecodes.tsv read, no null_values is passed here —
# confirm this file contains no "NA" entries.
phecode_inclusion_df = pl.read_csv(
    "data/pheno/phecodes_inclusion_only.tsv",
    separator="\t",
).drop("FID")

# Inner join with the binary phenotypes on IID, then drop the key column.
merged_inclusion_df = binary_df.join(phecode_inclusion_df, on="IID", how="inner").drop("IID")

In [6]:
# Regress all phecode columns on the binary columns in a single least-squares
# solve (no intercept column is added). With a 2-D Y, lstsq returns one
# coefficient column per column of Y, so `coef` is (len(binary_names), len(phecode_names)).
# NOTE(review): phecode_names was derived from the phecodes.tsv join; this assumes
# the inclusion-only table has the same phecode column names — confirm.
# Convert to NumPy explicitly (as the per-phecode loop does) rather than relying
# on implicit array coercion of polars DataFrames.
X = merged_inclusion_df.select(binary_names).to_numpy()
Y = merged_inclusion_df.select(phecode_names).to_numpy()
coef = np.linalg.lstsq(X, Y, rcond=None)[0]
coef_inclusion_df = pd.DataFrame(coef, index=binary_names, columns=phecode_names, dtype=float)

In [7]:
# Write the inclusion-only coefficient table as TSV. The index is named "feature"
# and each label is wrapped as "plink.<name>.glm.linear.zst".
# NOTE(review): presumably this matches PLINK GLM output file names — confirm.
coef_inclusion_labelled_df = coef_inclusion_df.rename_axis(index="feature")
coef_inclusion_labelled_df = coef_inclusion_labelled_df.rename(
    index=lambda label: f"plink.{label}.glm.linear.zst"
)
coef_inclusion_labelled_df.to_csv("data/coef/binary_to_phecode_inclusion.tsv", sep="\t")