Import useful libraries

In [1]:
import numpy as np
import random
from scipy import stats
import time
from collections import defaultdict
import warnings
from scipy.stats import rankdata
import networkx as nx
import matplotlib.pyplot as plt
import pandas as pd


Useful functions

In [2]:
def same(x):
    '''Identity transformation: hand the argument back untouched.'''
    unchanged = x
    return unchanged
    
def cube(x):
    '''Element-wise third power of x, computed with np.power.'''
    exponent = 3
    return np.power(x, exponent)

def negexp(x):
    '''Element-wise e^{-|x|}.'''
    magnitude = np.abs(x)
    return np.exp(np.negative(magnitude))

Simulation of useful examples

Example 1 

Independent random variables 

In [3]:
def generate_samples_indep(size,fixed_function1,fixed_function2, normalize = True, seed = None):
    '''Generate two independent samples, each a fixed transformation of Gaussian noise.

    X = f1(N1) and Y = f2(N2), where N1 and N2 are two separate draws of
    `size` standard normal values.

    Arguments:
        size : number of samples
        fixed_function1, fixed_function2 : name of the transformation applied
            to build X and Y respectively. Recognised names are
            'linear' (x), 'carre' (x^2), 'cube' (x^3) and 'negexp' (e^{-|x|});
            any other value (e.g. 'cos') selects cos x.
        normalize : if true, X and Y are each min-max rescaled to [0, 1]
        seed : value forwarded to np.random.seed (the global NumPy generator
            is reseeded on every call); None gives a non-reproducible draw
    Output:
        List [X, Y] of two arrays of shape (size,)
    '''
    def _transform(name):
        # Resolved lazily so that only the requested helper is looked up.
        if name == 'linear':
            return same
        if name == 'carre':
            return np.square
        if name == 'cube':
            return cube
        if name == 'negexp':
            return negexp
        # Backward-compatible fallback: every unrecognised name means cosine.
        return np.cos

    # np.random.seed(None) behaves like np.random.seed(), so the seed can be
    # forwarded directly. The former `seed == None` test failed with an
    # "ambiguous truth value" error for array-like seeds.
    np.random.seed(seed)

    f1 = _transform(fixed_function1)
    f2 = _transform(fixed_function2)

    # X is drawn before Y; keeping this order keeps seeded runs unchanged.
    X = f1(np.random.normal(0, 1.0, size))
    Y = f2(np.random.normal(0, 1.0, size))

    if normalize:
        X = (X - X.min()) / (X.max() - X.min())
        Y = (Y - Y.min()) / (Y.max() - Y.min())

    return [X,Y]

In [22]:
# Example 1 data: 1000 independent pairs, X built with 'cube' and Y with 'carre'
Xind, Yind = generate_samples_indep(
    size=1000,
    fixed_function1='cube',
    fixed_function2='carre',
    normalize=True,
    seed=None,
)
df_ind = pd.DataFrame({'X': Xind, 'Y': Yind})

Example 2

Fork: $Z$ is a common cause of $X$ and $Y$. To simulate a fork, we use the following structural causal model (SCM):
$$X=f_1(a_1\cdot Z+\varepsilon_1)$$
$$Y=f_2(a_2\cdot Z+\varepsilon_2)$$
where $Z$, $\varepsilon_1$ and $\varepsilon_2$ are independent random variables and each of the two noise variables $\varepsilon_1$, $\varepsilon_2$ has variance $\sigma^2$ (in the code below, $\sigma$ is the `nstd` argument).

In [8]:
def generate_samples_fork(size,fixed_function1,fixed_function2, dist_z,dz,aX,aY,nstd,normalize = True, seed = None):
    '''Generate samples from a fork X <- Z -> Y (Z is a common cause of X and Y).

    X = f1(aX . Z + nstd * N1) and Y = f2(aY . Z + nstd * N2), where N1 and N2
    are separate standard normal draws of shape (size, 1).

    Arguments:
        size : number of samples
        fixed_function1, fixed_function2 : name of the transformation applied
            to build X and Y respectively. Recognised names are
            'linear' (x), 'carre' (x^2), 'cube' (x^3) and 'negexp' (e^{-|x|});
            any other value (e.g. 'cos') selects cos x.
        dist_z : distribution of Z, either 'gaussian' (mean 1 in every
            coordinate, identity covariance) or 'laplace' (location 0, scale 1)
        dz : dimension of Z
        aX, aY : coefficients combined with Z through np.dot(a, Z); with
            scalar coefficients X and Y have the same (size, dz) shape as Z
        nstd : standard deviation of the additive noise
        normalize : if true, X, Y and Z are each min-max rescaled to [0, 1]
            (using the global minimum and maximum of each array)
        seed : value forwarded to np.random.seed (the global NumPy generator
            is reseeded on every call); None gives a non-reproducible draw
    Output:
        List [X, Y, Z]
    Raises:
        ValueError : if dist_z is neither 'gaussian' nor 'laplace'
    '''
    def _transform(name):
        # Resolved lazily so that only the requested helper is looked up.
        if name == 'linear':
            return same
        if name == 'carre':
            return np.square
        if name == 'cube':
            return cube
        if name == 'negexp':
            return negexp
        # Backward-compatible fallback: every unrecognised name means cosine.
        return np.cos

    # Fail early and explicitly: previously an unknown distribution left Z
    # unbound and surfaced later as an UnboundLocalError.
    if dist_z not in ('gaussian', 'laplace'):
        raise ValueError(
            "dist_z must be 'gaussian' or 'laplace', got {!r}".format(dist_z))

    # np.random.seed(None) behaves like np.random.seed(), so the seed can be
    # forwarded directly. The former `seed == None` test failed with an
    # "ambiguous truth value" error for array-like seeds.
    np.random.seed(seed)

    f1 = _transform(fixed_function1)
    f2 = _transform(fixed_function2)

    if dist_z == 'gaussian':
        Z = np.random.multivariate_normal(np.ones(dz), np.eye(dz), size)
    else:
        Z = np.random.laplace(loc=0.0, scale=1.0, size=size*dz)
        Z = np.reshape(Z, (size, dz))

    # Draw order (Z, noise of X, noise of Y) is kept so seeded runs are unchanged.
    X = f1(np.dot(aX,Z)+nstd * np.random.normal(0.0, 1.0, (size,1)))
    Y = f2(np.dot(aY,Z)+nstd * np.random.normal(0.0, 1.0, (size,1)))

    if normalize:
        X = (X - X.min()) / (X.max() - X.min())
        Y = (Y - Y.min()) / (Y.max() - Y.min())
        Z = (Z - Z.min()) / (Z.max() - Z.min())

    return [X,Y,Z]

In [28]:
# Example 2 data: 1000 fork samples with a one-dimensional Gaussian Z
Xfork, Yfork, Zfork = generate_samples_fork(
    size=1000,
    fixed_function1='cube',
    fixed_function2='carre',
    dist_z='gaussian',
    dz=1,
    aX=1.0,
    aY=2.0,
    nstd=1.0,
)
# Each array is two-dimensional; keep its first (only) column in the frame.
df_fork = pd.DataFrame({
    'X': Xfork[:, 0],
    'Y': Yfork[:, 0],
    'Z': Zfork[:, 0],
})

Example 3

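Collider: $Z$ is a common effect of two causes $X$ and $Y$. We simulate the following structural causal model (SCM), where $X$, $Y$ and the noise $\varepsilon$ are drawn independently: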
$$Z=f(a_X\cdot X+a_Y\cdot Y+\varepsilon)$$

In [12]:
def generate_samples_collider(size,fixed_function, dist_x,dist_y,dx,dy,aX,aY,nstd,normalize = True, seed = None):
    '''Generate samples from a collider X -> Z <- Y (Z has the two causes X and Y).

    Z = f(aX . X + aY . Y + nstd * N), where X, Y and the standard normal
    noise N (shape (size, 1)) are drawn independently.

    Arguments:
        size : number of samples
        fixed_function : name of the transformation f. Recognised names are
            'linear' (x), 'carre' (x^2), 'cube' (x^3) and 'negexp' (e^{-|x|});
            any other value (e.g. 'cos') selects cos x.
        dist_x, dist_y : distribution of X and Y, either 'gaussian' (mean 1 in
            every coordinate, identity covariance) or 'laplace' (location 0,
            scale 1)
        dx, dy : dimensions of X and Y
        aX, aY : coefficients combined with X and Y through np.dot(a, .)
        nstd : standard deviation of the additive noise
        normalize : if true, X, Y and Z are each min-max rescaled to [0, 1]
            (using the global minimum and maximum of each array)
        seed : value forwarded to np.random.seed (the global NumPy generator
            is reseeded on every call); None gives a non-reproducible draw
    Output:
        Tuple (X, Y, Z) of plain NumPy arrays
    Raises:
        ValueError : if dist_x or dist_y is neither 'gaussian' nor 'laplace'
    '''
    def _transform(name):
        # Resolved lazily so that only the requested helper is looked up.
        if name == 'linear':
            return same
        if name == 'carre':
            return np.square
        if name == 'cube':
            return cube
        if name == 'negexp':
            return negexp
        # Backward-compatible fallback: every unrecognised name means cosine.
        return np.cos

    def _draw(dist, dim):
        # One (size, dim) sample of a cause variable.
        if dist == 'gaussian':
            return np.random.multivariate_normal(np.ones(dim), np.eye(dim), size)
        return np.reshape(
            np.random.laplace(loc=0.0, scale=1.0, size=size*dim), (size, dim))

    # Fail early and explicitly: previously an unknown distribution left X or Y
    # unbound and surfaced later as an UnboundLocalError.
    for label, dist in (('dist_x', dist_x), ('dist_y', dist_y)):
        if dist not in ('gaussian', 'laplace'):
            raise ValueError(
                "{} must be 'gaussian' or 'laplace', got {!r}".format(label, dist))

    # np.random.seed(None) behaves like np.random.seed(), so the seed can be
    # forwarded directly. The former `seed == None` test failed with an
    # "ambiguous truth value" error for array-like seeds.
    np.random.seed(seed)

    f = _transform(fixed_function)

    # Plain ndarrays are used throughout; the earlier np.matrix round trip was
    # undone by np.array(...) on return and np.matrix is no longer recommended.
    # Draw order (X, Y, noise) is kept so seeded runs are unchanged.
    X = _draw(dist_x, dx)
    Y = _draw(dist_y, dy)

    Z = f(np.dot(aX,X)+np.dot(aY,Y)+nstd * np.random.multivariate_normal(np.zeros(1), np.eye(1), size))

    if normalize:
        X = (X - X.min()) / (X.max() - X.min())
        Y = (Y - Y.min()) / (Y.max() - Y.min())
        Z = (Z - Z.min()) / (Z.max() - Z.min())

    return np.array(X), np.array(Y),np.array(Z)

In [33]:
# Example 3 data: 1000 collider samples with one-dimensional Gaussian X and Y
Xcoll, Ycoll, Zcoll = generate_samples_collider(
    size=1000,
    fixed_function='cube',
    dist_x='gaussian',
    dist_y='gaussian',
    dx=1,
    dy=1,
    aX=1.0,
    aY=2.0,
    nstd=1.0,
)
# Each array is two-dimensional; keep its first (only) column in the frame.
df_coll = pd.DataFrame({
    'X': Xcoll[:, 0],
    'Y': Ycoll[:, 0],
    'Z': Zcoll[:, 0],
})

Useful functions for hypothesis testing

Test of independence and conditional independence with mutual information : https://pypi.org/project/pycit/

In [19]:
# %pip installs into the environment of the running kernel; the version is
# pinned to the one this notebook was run with so that re-runs are reproducible.
%pip install pycit==0.0.7

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pycit
  Downloading pycit-0.0.7-py3-none-any.whl (18 kB)
Installing collected packages: pycit
Successfully installed pycit-0.0.7


In [25]:
from pycit import itest

# Independence test on the Example 1 samples (Xind, Yind)
pval_ind = itest(
    Xind,
    Yind,
    test_args={'statistic': 'ksg_mi', 'n_jobs': 2},
)

In [26]:
# Display the value returned by the independence test on (Xind, Yind)
pval_ind

0.124

In [29]:
from pycit import citest

# Fork example: conditional independence test of Xfork and Yfork given Zfork
pval_fork = citest(
    Xfork,
    Yfork,
    Zfork,
    test_args={'statistic': 'ksg_mi', 'n_jobs': 2},
)
pval_fork

0.943

In [34]:
# Collider example: (unconditional) independence test of Xcoll and Ycoll
pval_coll1 = itest(
    Xcoll,
    Ycoll,
    test_args={'statistic': 'ksg_mi', 'n_jobs': 2},
)
pval_coll1

0.661

In [35]:
# Collider example: conditional independence test of Xcoll and Ycoll given Zcoll
pval_coll2 = citest(
    Xcoll,
    Ycoll,
    Zcoll,
    test_args={'statistic': 'ksg_mi', 'n_jobs': 2},
)
pval_coll2

0.0

Test independence with RKHS : https://pypi.org/project/PyRKHSstats/ 

In [36]:
# %pip installs into the environment of the running kernel; the version is
# pinned to the one this notebook was run with so that re-runs are reproducible.
%pip install PyRKHSstats==2.1.0

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting PyRKHSstats
  Downloading PyRKHSstats-2.1.0-py3-none-any.whl (54 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 54.4/54.4 KB 2.8 MB/s eta 0:00:00
Collecting GPy
  Downloading GPy-1.10.0.tar.gz (959 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 959.4/959.4 KB 17.9 MB/s eta 0:00:00
  Preparing metadata (setup.py) ... done
Collecting paramz>=0.9.0
  Downloading paramz-0.9.5.tar.gz (71 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 71.3/71.3 KB 7.9 MB/s eta 0:00:00
  Preparing metadata (setup.py) ... done
Building wheels for collected packages: GPy, paramz
  Building wheel for GPy (setup.py) ... done
  Created wheel for GPy: filename=GPy-1.10.0-cp38-cp38-linux_x86_64.whl size=3424783 sha256=e7773220c9947697ea

In [40]:
import PyRKHSstats as rkhs

In [46]:
# %pip installs into the environment of the running kernel; the version is
# pinned to the one this notebook was run with so that re-runs are reproducible.
%pip install conditional_independence==0.1a6

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting conditional_independence
  Downloading conditional_independence-0.1a6-py3-none-any.whl (23 kB)
Collecting ipdb
  Downloading ipdb-0.13.11-py3-none-any.whl (12 kB)
Collecting dataclasses
  Downloading dataclasses-0.6-py3-none-any.whl (14 kB)
Collecting pygam
  Downloading pygam-0.8.0-py2.py3-none-any.whl (1.8 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 1.8/1.8 MB 18.5 MB/s eta 0:00:00
Collecting typing
  Downloading typing-3.7.4.3.tar.gz (78 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 78.6/78.6 KB 9.5 MB/s eta 0:00:00
  Preparing metadata (setup.py) ... done
Collecting ipython>=7.31.1
  Downloading ipython-8.11.0-py3-none-any.whl (793 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 793.3/793.3 KB 45.6 MB/s eta 0:00:00
Collecting je

In [47]:
import conditional_independence

https://conditional-independence.readthedocs.io/en/latest/

https://github.com/uhlerlab/conditional_independence

https://conditional-independence.readthedocs.io/en/latest/ci_tests/index.html

Causal discovery toolbox

https://github.com/FenTechSolutions/CausalDiscoveryToolbox

In [48]:
# %pip installs into the environment of the running kernel; the version is
# pinned to the one this notebook was run with so that re-runs are reproducible.
%pip install cdt==0.6.0

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting cdt
  Downloading cdt-0.6.0-py3-none-any.whl (921 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 921.1/921.1 KB 13.8 MB/s eta 0:00:00
Collecting skrebate
  Downloading skrebate-0.62.tar.gz (19 kB)
  Preparing metadata (setup.py) ... done
Collecting GPUtil
  Downloading GPUtil-1.4.0.tar.gz (5.5 kB)
  Preparing metadata (setup.py) ... done
Building wheels for collected packages: GPUtil, skrebate
  Building wheel for GPUtil (setup.py) ... done
  Created wheel for GPUtil: filename=GPUtil-1.4.0-py3-none-any.whl size=7409 sha256=b1460a5e9ee7a7511910236a505944c5a740c50d1344b285fc646c8ece67c08a
  Stored in directory: /root/.cache/pip/wheels/ba/03/bb/7a97840eb54479b328672e15a536e49dc60da200fb21564d53
  Building wheel for skrebate (setup.py) ... done
  Created wheel for skrebate: filename=skrebate-0.

In [49]:
import cdt

No GPU automatically detected. Setting SETTINGS.GPU to 0, and SETTINGS.NJOBS to cpu_count.
