In [None]:
import polars as pl
import pandas as pd
import numpy as np
import sklearn.metrics
import seaborn as sns

In [None]:
# Load the anonymized (reduced) phenotype table and the full phenotype table.
# The full table is tab-separated and carries FID/IID identifier columns that
# the anonymized table does not have.
anon_pheno_df = pl.read_csv("../../../webgwas-data/ukb_wb_100k_reduced_anon/phenotype_data.csv.zst")
pheno_df = pl.read_csv("../../../webgwas-data/ukb_wb_100k_full/phenotype_data.csv", separator="\t")
assert pheno_df.drop("FID", "IID").shape == anon_pheno_df.shape

# Feature matrices: every phenotype column is shifted by -2 and a constant
# (intercept) column is appended.
# NOTE(review): the -2 shift presumably undoes an offset applied when the files
# were written (the export cell adds 2 back) -- confirm.
anon_features_df = anon_pheno_df.select(pl.all().sub(2)).to_pandas().assign(const=1.0)

# Drop the literal "b_" prefix from the full table's column names.
# str.lstrip("b_") must NOT be used here: it strips any leading run of the
# characters 'b' and '_', so a name such as "b_bmi" would become "mi".
features_df = (
    pheno_df
    .select(
        pl.all()
        .exclude(["FID", "IID"])
        .sub(2)
        .name.map(lambda x: x[len("b_"):] if x.startswith("b_") else x)
    )
    .to_pandas()
    .assign(const=1.0)
)

# Coefficients fitted on the anonymized features are later applied to the real
# features with a label-aligned matrix product, so both frames must expose
# exactly the same column labels.
assert set(features_df.columns) == set(anon_features_df.columns), (
    "feature columns differ between the full and anonymized phenotype tables"
)

anon_pheno_df.head(2)

In [None]:
# Draw random phenotype definitions until N_random_phenotypes non-constant ones
# have been collected.
# NOTE(review): `features`, `NOT`, `AND`, `knowledge_base` and the `webgwas`
# package are not defined/imported in the cells visible here -- they must be
# in scope before this cell runs.
N_random_phenotypes = 10000
N_features_per_phenotype = 3

random_phenotype_definitions = []

np.random.seed(0)  # fixed seed so the sampled definitions are reproducible
while len(random_phenotype_definitions) < N_random_phenotypes:
    # Pick distinct features, then flip a fair coin per feature for negation.
    selected_features = np.random.choice(features, size=N_features_per_phenotype, replace=False).tolist()
    negations = np.random.binomial(1, 0.5, size=N_features_per_phenotype)

    # Node order per feature: the feature itself, an optional NOT, and -- for
    # every feature after the first -- an AND.
    # NOTE(review): this looks like a postfix encoding of a conjunction of
    # (possibly negated) features -- confirm against webgwas.
    phenotype_definition = []
    for position, (feature, negated) in enumerate(zip(selected_features, negations)):
        nodes = [feature]
        if negated:
            nodes.append(NOT)
        if position > 0:
            nodes.append(AND)
        phenotype_definition.extend(nodes)

    phenotype_definition = webgwas.phenotype_definitions.validate_nodes(phenotype_definition, knowledge_base)
    webgwas.phenotype_definitions.type_check_nodes(phenotype_definition)

    # Keep the definition only if it takes more than one distinct value on the
    # real features; a constant phenotype is discarded and a new one is drawn.
    phenotype_values = webgwas.phenotype_definitions.apply_definition_pandas(phenotype_definition, features_df)
    if len(phenotype_values.drop_duplicates()) != 1:
        random_phenotype_definitions.append(phenotype_definition)

In [None]:
def _apply_random_definitions(source_features_df):
    """Evaluate every sampled definition on `source_features_df`.

    Returns a DataFrame with one column per definition, named R1, R2, ... in
    the order of `random_phenotype_definitions`.
    """
    return pd.DataFrame({
        f"R{i+1}": webgwas.phenotype_definitions.apply_definition_pandas(defn, source_features_df)
        for i, defn in enumerate(random_phenotype_definitions)
    })


# Definitions evaluated on the real features.
random_phenotypes_df = _apply_random_definitions(features_df)

# Definitions evaluated on the anonymized features; these are the targets used
# to fit the coefficients, mirroring what indirect GWAS would produce.
anon_random_phenotypes_df = _apply_random_definitions(anon_features_df)

# Fit on the anonymized data.
# NOTE(review): presumably the left inverse times the targets gives the
# least-squares coefficients -- confirm in webgwas.regression.
anon_left_inverse_df = webgwas.regression.compute_left_inverse(anon_features_df)
anon_betas_df = anon_left_inverse_df @ anon_random_phenotypes_df

# Apply the fitted coefficients to the anonymized features (in-sample
# predictions) and to the real features.
anon_random_phenotypes_preds_df = anon_features_df @ anon_betas_df
predicted_phenotypes_df = features_df @ anon_betas_df

In [None]:
# Persist the anonymized phenotypes and their in-sample predictions.
parquet_outputs = {
    "data/anon_random_phenotypes.parquet": anon_random_phenotypes_df,
    "data/anon_random_phenotypes_pred.parquet": anon_random_phenotypes_preds_df,
}
for out_path, out_df in parquet_outputs.items():
    out_df.to_parquet(out_path)

In [None]:
# Assemble the table written out for GWAS: sample identifiers, the phenotypes
# computed on the real features (R*), and the predicted phenotypes, whose
# columns get a "P" prefix (PR*).
id_columns_pd = pheno_df.select("FID", "IID").to_pandas()
predicted_renamed_pd = predicted_phenotypes_df.rename(columns=lambda name: f"P{name}")
combined_pd = pd.concat([id_columns_pd, random_phenotypes_df, predicted_renamed_pd], axis=1)

# Every non-identifier column is shifted by +2 and stored as Float32.
gwas_pheno_df = pl.DataFrame(combined_pd).with_columns(
    pl.all().exclude("FID", "IID").add(2).cast(pl.Float32)
)

# Sanity check: no phenotype column may be constant.
unique_counts = gwas_pheno_df.drop(["FID", "IID"]).select(pl.all().n_unique()).row(0)
assert min(unique_counts) > 1

gwas_pheno_df.write_csv("data/random_phenotypes.tsv", separator="\t")