# Knockoff on Linear Gaussian Model


+ generate knockoff with closed formula, in case x ~ Normal

In [1]:
import os
import sys

module_paths = [
    '../deepknockoffs/DeepKnockoffs/DeepKnockoffs',
    '../deepknockoffs/examples'
]
module_paths = [os.path.abspath(os.path.join(x)) for x in module_paths]
for module_path in module_paths:
    if module_path not in sys.path:
        sys.path.append(module_path)


import math
import numpy as np
import matplotlib.pyplot as plt
np.random.seed(0)

from DeepKnockoffs.gaussian import GaussianKnockoffs
from DeepKnockoffs.kfilter import kfilter

In [62]:
from sklearn import preprocessing
from glmnet import LogitNet

def lasso_stats(X,Xk,y,alpha=0.1,scale=True):
    X  = X.astype("float")
    Xk = Xk.astype("float")
    p = X.shape[1]
    if scale:
        X_concat = preprocessing.scale(np.concatenate((X,Xk),1))
    else:
        X_concat = np.concatenate((X,Xk),1)
    cols_order = np.random.choice(X_concat.shape[1],X_concat.shape[1],replace=False)
    m = LogitNet(alpha=0.1, n_splits=3)
    m.fit(X_concat[:,cols_order].copy(), y.copy())
    Z = np.zeros((2*p,))
    Z[cols_order] = m.coef_.squeeze()
    W = np.abs(Z[0:p]) - np.abs(Z[p:(2*p)])
    return W.squeeze(), Z

In [63]:
# number of data point
n = 1000
# number of variables 
p = 1000
# number of variables with nonzero coefficients
k = 60
# magnitude of nonzero coefficients
amplitude = 15
# noise level 
sigma = 1
# target FDR 
q = 0.1

In [64]:
Sigma = np.eye(p)
mu = np.zeros(p)

S0 = np.random.choice(n, k, replace=False)
beta = np.zeros(p)
beta[S0] = amplitude/np.sqrt(n)

def sigmoid(x):
    return np.exp(x)/(1 + np.exp(x))

def sample_logistic(X):
    return np.random.binomial(1, sigmoid(X@beta))

def summary(S):
    
    true_discovery = sum(beta[S] > 0)
    power = true_discovery*100/k
    FDP = sum(beta[S] == 0) / max(1, np.size(S))
    
    print(f"""
        true_discovery = {true_discovery} (power = {power})
        FDP = {FDP} % (target FDR = {q})
    """)

In [65]:
X = np.random.multivariate_normal(mu, Sigma, size=n)
y = sample_logistic(X)

knockoff_generator = GaussianKnockoffs(Sigma, method='equi')
X_k = knockoff_generator.generate(X)

W, Z = lasso_stats(X,X_k,y,alpha=0,scale=False)
t = kfilter(W, q=q)
print(f'threshold: {t}')

S = np.where(W >= t)[0]
summary(S)

threshold: 0.008534440448662441

        true_discovery = 39 (power = 65.0)
        FDP = 0.15217391304347827 % (target FDR = 0.1)
    


In [67]:
X = np.random.multivariate_normal(mu, Sigma, size=n)
y = sample_logistic(X)

knockoff_generator = GaussianKnockoffs(Sigma, method='sdp')
X_k = knockoff_generator.generate(X)

W, Z = lasso_stats(X,X_k,y,alpha=0,scale=False)
t = kfilter(W, q=q)
print(f'threshold: {t}')

S = np.where(W >= t)[0]
summary(S)

threshold: 0.023464253992070666

        true_discovery = 29 (power = 48.333333333333336)
        FDP = 0.09375 % (target FDR = 0.1)
    
