In [1]:
from causality_functions import *

In [2]:
import time
import random
import functools
import numpy as np
import collections

import pandas as pd
import altair as alt
alt.data_transformers.disable_max_rows()

# ---------------------------------------------------------------------------
# Module-level simulation parameters.
# The pathway functions below read these as globals;
# run_simulation_with_params(), update_slopes() and update_errors()
# overwrite some of them at run time.
# ---------------------------------------------------------------------------

# Passed to simulations_data() by overall_simulation(); overwritten by
# run_simulation_with_params().
samplesize = 100

# Means (…m) and standard deviations (…std) of the normal distributions used
# to draw the starting value of each variable.
Am, Bm, Cm, = 60, 10, 30
Xm, Ym = 1, 1
Astd,Bstd,Cstd = 1000, 15, 21
Xstd,Ystd = 1000, 1000

# Slope (m) and intercept (k) of each causal edge. The slopes are redrawn by
# update_slopes(); nothing in this file changes the intercepts.
m1, k1 = 1, 0
m2, k2 = 1.25, 0
m3, k3 = 0.5, 0
m4, k4 = 1.5, 0

# Per-edge delays: how many extra entries back in the history a cause is
# read from. Overwritten by run_simulation_with_params().
d1 = 0
d2 = 0
d3 = 0
d4 = 0

# Coefficient of the "- dg*previous" term in every update equation; with
# dg = 1 the previous value of the effect cancels out of the update.
# Overwritten by run_simulation_with_params().
dg = 1


# Second argument handed to jitter() for each variable. These are only the
# initial values: update_errors() redraws all five before every run started
# by overall_simulation().
errorA = 2 # initial value only; errorA is passed to jitter() in several pathways
errorB = 0.001
errorC = 0.002
errorX = 0.75
errorY = 0.6

# (lower, upper) bounds from which update_slopes() draws m1..m4.
mrange1 = 0,2 # slope range for m1
mrange2 = 0,2
mrange3 = 0,2
mrange4 = 0,2

# (lower, upper) bounds from which update_errors() draws errorA..errorY.
erangeA = 0.2,2000 # error range for errorA
erangeB = 0.2,200 # error range for errorB
erangeC = 1000,2000 # error range for errorC
erangeX = 0.2,200 # error range for errorX
erangeY = 0.2,200 # error range for errorY



def linear(n=200, debug=False):
    """Simulate the chain pathway A -> B -> C and return its final state.

    A starts from a normal draw (mean ``Am``, std ``Astd``) and is passed
    through ``jitter`` with ``errorA`` every generation. B is driven by A
    (slope ``m1``, intercept ``k1``, delay ``d1``) and C by B (slope ``m2``,
    intercept ``k2``, delay ``d2``). All coefficients are module globals.

    Args:
        n: Number of generations to run on top of the longest delay.
        debug: When true, dump every generation to ``data.csv``.

    Returns:
        NumPy array built from the last ``(A, B, C)`` entry of the history.
    """

    def step(history, generation):
        prev_a, prev_b, prev_c = history[-1]

        # A only depends on its own previous value.
        next_a = jitter(prev_a, errorA)

        # B reads the freshly computed A unless the d1 delay applies, in
        # which case it reads A from further back in the history.
        if check_generation(generation, d1):
            cause_a = history[-1 - d1][0]
        else:
            cause_a = next_a
        next_b = jitter(prev_b + cause_a*m1 - dg*prev_b + k1, errorB)

        # C reads B the same way, with delay d2.
        if check_generation(generation, d2):
            cause_b = history[-1 - d2][1]
        else:
            cause_b = next_b
        next_c = jitter(prev_c + cause_b*m2 - dg*prev_c + k2, errorC)

        return next_a, next_b, next_c

    longest_delay = max([d1, d2])

    # Starting value of A; B and C start at zero.
    start_a = np.random.normal(loc=Am, scale=Astd)
    # The deque only keeps as many generations as the longest delay needs.
    history = collections.deque([[start_a, 0, 0]], maxlen=longest_delay + 1)

    trace = []
    for generation in range(n + longest_delay):
        state = step(history, generation)
        history.append(state)
        trace.append(state)

    if debug:
        pd.DataFrame(np.array(trace)).to_csv("data.csv")
    return np.array(history[-1])


def radiating(n=200):
    """Simulate the radiating pathway A <- B -> C and return its final state.

    B starts from a normal draw (mean ``Bm``, std ``Bstd``) and is passed
    through ``jitter`` with ``errorB`` every generation. A is driven by B
    (slope ``m1``, intercept ``k1``, delay ``d1``) and C is driven by B
    (slope ``m2``, intercept ``k2``, delay ``d2``). All coefficients are
    module globals.

    Args:
        n: Number of generations to run on top of the longest delay.

    Returns:
        NumPy array built from the last ``(A, B, C)`` entry of the history.
    """

    def compute_B(history, generation):
        # B only depends on its own previous value.
        _, B, _ = history[-1]
        return jitter(B, errorB)

    def compute_A(B, history, generation):
        # Use the current B unless the d1 delay applies.
        if check_generation(generation, d1):
            _, B, _ = history[-1 - d1]
        A, _, _ = history[-1]
        return jitter(A + B*m1 - dg*A + k1, errorA)

    def compute_C(B, history, generation):
        # Use the current B unless the d2 delay applies.
        # The original also called dependent(...) here into a local that was
        # never read; that dead call is removed so this function mirrors
        # compute_A.
        if check_generation(generation, d2):
            _, B, _ = history[-1 - d2]
        _, _, C = history[-1]
        return jitter(C + B*m2 - dg*C + k2, errorC)

    def next_generation(history, generation):
        B_ = compute_B(history, generation)
        A_ = compute_A(B_, history, generation)
        C_ = compute_C(B_, history, generation)
        return A_, B_, C_

    longest_delay = max([d1, d2])

    # Starting value of B; A and C start at zero.
    B = np.random.normal(loc=Bm, scale=Bstd)
    # The deque only keeps as many generations as the longest delay needs.
    history = collections.deque(maxlen=longest_delay + 1)
    history.append((0, B, 0))
    for i in range(n + longest_delay):
        history.append(next_generation(history, i))

    return np.array(history[-1])


def convergent(n=200):
    """Simulate the convergent pathway A -> B <- C and return its final state.

    A and C start from normal draws (``Am``/``Astd`` and ``Cm``/``Cstd``) and
    are each passed through ``jitter`` every generation. B is driven by A
    (slope ``m1``, delay ``d1``) and by C (slope ``m2``, delay ``d2``) with
    intercept ``k1``. All coefficients are module globals.

    Args:
        n: Number of generations to run on top of the longest delay.

    Returns:
        NumPy array built from the last ``(A, B, C)`` entry of the history.
    """

    def step(history, generation):
        last_a, last_b, last_c = history[-1]

        # Pick the A and C values that drive B: delayed ones when the
        # corresponding delay applies, otherwise the previous generation's.
        if check_generation(generation, d1):
            cause_a = history[-1 - d1][0]
        else:
            cause_a = last_a

        if check_generation(generation, d2):
            cause_c = history[-1 - d2][2]
        else:
            cause_c = last_c

        # B is computed first, then A and C, matching the original order of
        # jitter() calls.
        new_b = jitter(last_b + m1*cause_a + m2*cause_c - dg*last_b + k1, errorB)
        new_a = jitter(last_a, errorA)
        new_c = jitter(last_c, errorC)
        return new_a, new_b, new_c

    longest_delay = max([d1, d2])

    # Starting values of the two causes; B starts at zero.
    start_a = np.random.normal(loc=Am, scale=Astd)
    start_c = np.random.normal(loc=Cm, scale=Cstd)
    history = collections.deque([(start_a, 0, start_c)], maxlen=longest_delay + 1)

    for generation in range(n + longest_delay):
        history.append(step(history, generation))

    return np.array(history[-1])


def common_cause(n=200):
    """Simulate a hidden common cause X driving A, B and C.

    X starts from a normal draw (mean ``Xm``, std ``Xstd``) and is passed
    through ``jitter`` with ``errorX`` every generation. A, B and C are each
    driven by X with their own slope (``m1``/``m2``/``m3``), intercept
    (``k1``/``k2``/``k3``) and delay (``d1``/``d2``/``d3``). All
    coefficients are module globals.

    Args:
        n: Number of generations to run on top of the longest delay.

    Returns:
        NumPy array ``[A, B, C]`` from the last generation; X is not returned.
    """

    def compute_A(X, history, generation):
        # Use the current X unless the d1 delay applies.
        if check_generation(generation, d1):
            _, _, _, X = history[-1 - d1]

        A, _, _, _ = history[-1]
        return jitter(A + X*m1 - dg*A + k1, errorA)

    def compute_B(X, history, generation):
        # Use the current X unless the d2 delay applies.
        if check_generation(generation, d2):
            _, _, _, X = history[-1 - d2]

        _, B, _, _ = history[-1]
        return jitter(B + X*m2 - dg*B + k2, errorB)

    def compute_C(X, history, generation):
        # Use the current X unless the d3 delay applies.
        if check_generation(generation, d3):
            _, _, _, X = history[-1 - d3]

        _, _, C, _ = history[-1]
        return jitter(C + X*m3 - dg*C + k3, errorC)

    def next_generation(history, generation):
        _, _, _, X = history[-1]
        X_ = jitter(X, errorX)
        A_ = compute_A(X_, history, generation)
        B_ = compute_B(X_, history, generation)
        C_ = compute_C(X_, history, generation)

        # Store the updated X_. The original returned the old X, which froze
        # X at its starting value and made every delayed lookup read the same
        # number.
        return A_, B_, C_, X_

    longest_delay = max([d1, d2, d3])

    # Starting value of X; A, B and C start at zero.
    X = np.random.normal(loc=Xm, scale=Xstd)
    # The deque only keeps as many generations as the longest delay needs.
    history = collections.deque(maxlen=longest_delay + 1)
    history.append((0, 0, 0, X))
    for i in range(n + longest_delay):
        history.append(next_generation(history, i))

    A, B, C, _ = history[-1]
    return np.array([A, B, C])


def single_difference_cause(n=200):
    """Simulate A -> B <- X -> C, where X is a hidden variable.

    A and X start from normal draws (``Am``/``Astd`` and ``Xm``/``Xstd``) and
    are each passed through ``jitter`` every generation. The three edges are:

    * A -> B: slope ``m1``, delay ``d1``
    * X -> B: slope ``m2``, delay ``d2`` (B uses intercept ``k1``)
    * X -> C: slope ``m3``, delay ``d3``, intercept ``k3``

    All coefficients are module globals.

    Args:
        n: Number of generations to run on top of the longest delay.

    Returns:
        NumPy array ``[A, B, C]`` from the last generation; X is not returned.
    """

    def next_generation(history, generation):
        A, _, _, X = history[-1]
        A_ = jitter(A, errorA)
        X_ = jitter(X, errorX)

        # compute B: pick the (possibly delayed) A and X that drive it
        if d1 > 0 and generation >= d1:
            A, _, _, _ = history[-1 - d1]
        else:
            A, _, _, _ = history[-1]

        if d2 > 0 and generation >= d2:
            _, _, _, X = history[-1 - d2]
        else:
            _, _, _, X = history[-1]

        _, B, _, _ = history[-1]
        B_ = jitter(B + m1*A + m2*X - dg*B + k1, errorB)

        # compute C: pick the (possibly delayed) X that drives it
        if d3 > 0 and generation >= d3:
            _, _, _, X = history[-1 - d3]
        else:
            _, _, _, X = history[-1]

        _, _, C, _ = history[-1]
        # The X -> C edge uses its own slope m3, matching its own intercept
        # k3 and delay d3. The original reused m2 here, which tied the
        # X -> B and X -> C slopes together.
        C_ = jitter(C + X*m3 - dg*C + k3, errorC)

        return A_, B_, C_, X_

    longest_delay = max([d1, d2, d3])

    # Starting values of X and A; B and C start at zero.
    X = np.random.normal(loc=Xm, scale=Xstd)
    A = np.random.normal(loc=Am, scale=Astd)
    # The deque only keeps as many generations as the longest delay needs.
    history = collections.deque(maxlen=longest_delay + 1)
    history.append((A, 0, 0, X))
    for i in range(n + longest_delay):
        history.append(next_generation(history, i))

    A, B, C, _ = history[-1]
    return np.array([A, B, C])


def double_difference_cause(n=200):
    """Simulate A <- X -> B <- Y -> C, where X and Y are hidden variables.

    X and Y start from normal draws (``Xm``/``Xstd`` and ``Ym``/``Ystd``) and
    are each passed through ``jitter`` every generation. The four edges are:

    * X -> A: slope ``m1``, intercept ``k1``, delay ``d1``
    * X -> B: slope ``m2``, delay ``d2`` (B uses intercept ``k2``)
    * Y -> B: slope ``m3``, delay ``d3``
    * Y -> C: slope ``m4``, intercept ``k4``, delay ``d4``

    All coefficients are module globals.

    Args:
        n: Number of generations to run on top of the longest delay.

    Returns:
        NumPy array ``[A, B, C]`` from the last generation; X and Y are not
        returned.
    """
    X_COL, Y_COL = 3, 4

    def step(history, generation):
        prev_a, prev_b, prev_c, prev_x, prev_y = history[-1]

        def lookup(column, delay, delayed):
            # Read one column either from `delay` extra entries back or from
            # the previous generation.
            row = history[-1 - delay] if delayed else history[-1]
            return row[column]

        next_x = jitter(prev_x, errorX)
        next_y = jitter(prev_y, errorY)

        # A is driven by X (the d1 test keeps the original truthiness form).
        x_for_a = lookup(X_COL, d1, d1 and generation >= d1)
        next_a = jitter(prev_a + x_for_a*m1 - dg*prev_a + k1, errorA)

        # B is driven by both X and Y.
        x_for_b = lookup(X_COL, d2, d2 > 0 and generation >= d2)
        y_for_b = lookup(Y_COL, d3, d3 > 0 and generation >= d3)
        next_b = jitter(prev_b + m2*x_for_b + m3*y_for_b - dg*prev_b + k2, errorB)

        # C is driven by Y.
        y_for_c = lookup(Y_COL, d4, d4 > 0 and generation >= d4)
        next_c = jitter(prev_c + y_for_c*m4 - dg*prev_c + k4, errorC)

        return next_a, next_b, next_c, next_x, next_y

    longest_delay = max([d1, d2, d3, d4])

    # Starting values of the hidden variables; A, B and C start at zero.
    start_x = np.random.normal(loc=Xm, scale=Xstd)
    start_y = np.random.normal(loc=Ym, scale=Ystd)
    history = collections.deque([(0, 0, 0, start_x, start_y)],
                                maxlen=longest_delay + 1)

    for generation in range(n + longest_delay):
        history.append(step(history, generation))

    final_a, final_b, final_c, _, _ = history[-1]
    return np.array([final_a, final_b, final_c])

def update_slopes():
    """Redraw the global slopes ``m1``..``m4`` from their configured ranges.

    Each ``mK`` is sampled uniformly between the two bounds stored in
    ``mrangeK``. The draws happen in the order m1, m2, m3, m4.
    """
    global m1, m2, m3, m4

    # random.uniform(a, b) evaluates a + (b - a) * random.random(), i.e. one
    # random.random() call per slope.
    m1 = random.uniform(*mrange1)
    m2 = random.uniform(*mrange2)
    m3 = random.uniform(*mrange3)
    m4 = random.uniform(*mrange4)


def update_errors():
    """Redraw the global error terms ``errorA``..``errorY`` from their ranges.

    Each ``errorK`` is sampled uniformly between the two bounds stored in
    ``erangeK``. The draws happen in the order A, B, C, X, Y.
    """
    global errorA, errorB, errorC, errorX, errorY

    # random.uniform(a, b) evaluates a + (b - a) * random.random(), i.e. one
    # random.random() call per error term.
    errorA = random.uniform(*erangeA)
    errorB = random.uniform(*erangeB)
    errorC = random.uniform(*erangeC)
    errorX = random.uniform(*erangeX)
    errorY = random.uniform(*erangeY)


def overall_simulation(n=100):
    """Run ``n`` independent simulations with freshly drawn slopes and errors.

    Before each run the global slopes and error terms are redrawn. The run
    itself is delegated to ``simulations_data`` using the global ``pathway``
    and ``samplesize``. Regression and correlation statistics are computed
    for every run and merged into one record.

    Args:
        n: Number of runs.

    Returns:
        Tuple ``(stats, data)``: a DataFrame with one row of merged statistics
        per run, and a NumPy array built from the per-run simulation outputs.
    """
    records = []
    samples = []
    for run_index in range(n):
        update_slopes()
        update_errors()
        # `pathway` is a module global; change it to simulate another pathway.
        abc = simulations_data(pathway, samplesize, run_index)
        record = compute_regression(abc)
        record.update(compute_correlation(abc))
        records.append(record)
        samples.append(abc)
    return pd.DataFrame(records), np.array(samples)



def run_simulation_with_params(pathway_=linear,
                               samplesize_=100,
                               dg_=0,
                               d1_=0,
                               d2_=0,
                               d3_=0,
                               d4_=0,
                               numpoints=100):
    """Configure the module globals, run the simulation and save its outputs.

    The arguments are copied into the module globals that the pathway
    functions and ``overall_simulation`` read. The statistics table is
    written to ``stats.csv`` inside the folder returned by ``mkfolder``
    before ``add_confidence_stats`` is called on it.

    Args:
        pathway_: Pathway function to simulate (e.g. ``linear``).
        samplesize_: Value stored in the global ``samplesize``.
        dg_: Value stored in the global ``dg``. The default here is 0,
            whereas the module-level initial value is 1.
        d1_, d2_, d3_, d4_: Values stored in the global delays ``d1``..``d4``.
        numpoints: Number of runs passed to ``overall_simulation``.

    Returns:
        Tuple ``(chart, ABC_all)``: the result of ``confidence_graphs`` and
        the array of per-run simulation outputs.
    """
    # `os` is not imported at the top of this file; import it here so the
    # function does not rely on the star import leaking it.
    import os

    global pathway, samplesize, dg, d1, d2, d3, d4
    # Seeding is disabled, so results differ between calls. Uncomment for
    # reproducible runs.
    #np.random.seed(0)
    #random.seed(0)
    pathway = pathway_
    samplesize = samplesize_
    dg = dg_
    d1 = d1_
    d2 = d2_
    d3 = d3_
    d4 = d4_
    d, ABC_all = overall_simulation(numpoints)
    folder = mkfolder(pathway, samplesize, dg, d1, d2, d3, d4)
    # stats.csv is written before add_confidence_stats() is called.
    d.to_csv(os.path.join(folder, "stats.csv"))
    add_confidence_stats(d, ABC_all)
    return confidence_graphs(folder, d), ABC_all

In [3]:
import itertools

# Build the full grid of keyword-argument dicts for
# run_simulation_with_params(): every pathway x sample size x dg x delays.
pathway__ = [linear, radiating, convergent, common_cause, single_difference_cause, double_difference_cause]
samplesize__ = [100, 500, 1000]
dg__ = [1, 0.9, 0.2, 0.1]
params = []
for pathway_ in pathway__:
    for samplesize_ in samplesize__:
        for dg_ in dg__:
            d1__ = [0, 30, 50]
            d2__ = [0, 30, 50]
            # d3 and d4 are only varied for the pathways listed below.
            d3__ = [0]
            d4__ = [0]
            if pathway_ in [common_cause, single_difference_cause, double_difference_cause]:
                d3__ = [0, 10, 30, 50]
            if pathway_ == double_difference_cause:
                d4__ = [0, 10, 30, 50]

            # itertools.product varies the last list fastest, which is the
            # same order as the original nested comprehension.
            # "d4_" takes the loop value d4_; the original read the module
            # global d4, so the d4__ options were never applied.
            params.extend({"pathway_": pathway_,
                           "samplesize_": samplesize_,
                           "dg_": dg_,
                           "d1_": d1_,
                           "d2_": d2_,
                           "d3_": d3_,
                           "d4_": d4_}
                          for d1_, d2_, d3_, d4_
                          in itertools.product(d1__, d2__, d3__, d4__))
print(len(params))

2916


In [5]:
# Run one batch (entries 210..499) of the parameter grid.
# The printed index is relative to the start of the slice, not to `params`.
for i, params_ in enumerate(params[210:500]):
    print(i)
    run_simulation_with_params(**params_)

0


KeyboardInterrupt: 

In [36]:
# Linear pathway with dg=0 and no delays; keep both the chart and the
# per-run data, then show the chart as the cell result.
chart, ABC_all =  run_simulation_with_params(pathway_=linear,
                               samplesize_=100,
                               dg_=0,
                               d1_=0,
                               d2_=0,
                               d3_=0,
                               d4_=0)
chart

In [4]:
# Single-difference-cause pathway with dg=0.1, sample size 200 and
# 1000 runs; show the chart as the cell result.
chart, ABC_all = run_simulation_with_params(pathway_=single_difference_cause,
                               samplesize_=200,
                               dg_=0.1,
                               d1_=0,
                               d2_=0,
                               d3_=0,
                               d4_=0, numpoints=1000)
chart

In [5]:
# Radiating pathway with dg=0.2 and 1000 runs; show the chart as the
# cell result.
chart, ABC_all = run_simulation_with_params(pathway_=radiating,
                               samplesize_=100,
                               dg_=0.2,
                               d1_=0,
                               d2_=0,
                               d3_=0,
                               d4_=0,numpoints=1000)
chart 

In [43]:
# Pick one entry of ABC_all (whatever the most recently executed simulation
# cell assigned) at random and unpack its transpose into A, B and C.
A, B, C = random.choice(ABC_all).transpose()
# Pairwise scatter plots of the three variables.
AB = alt.Chart(pd.DataFrame({"A":A, "B":B})).mark_circle().encode(
    x="A",
    y="B")
BC= alt.Chart(pd.DataFrame({"B":B, "C":C})).mark_circle().encode(
    x="B",
    y="C")
AC = alt.Chart(pd.DataFrame({"A":A, "C":C})).mark_circle().encode(
    x="A",
    y="C")

# Lay the three plots out side by side.
alt.hconcat(AB, BC, AC)

In [59]:
# Radiating pathway with dg=0.3. The result is assigned so that the `chart`
# shown below belongs to this run; the original discarded the return value
# and displayed a stale `chart` from an earlier cell.
chart, ABC_all = run_simulation_with_params(pathway_=radiating,
                               samplesize_=100,
                               dg_=0.3,
                               d1_=0,
                               d2_=0,
                               d3_=0,
                               d4_=0)
chart

In [64]:
# Radiating pathway with dg=0.5. The result is assigned so that the `chart`
# shown below belongs to this run; the original discarded the return value
# and displayed a stale `chart` from an earlier cell.
chart, ABC_all = run_simulation_with_params(pathway_=radiating,
                               samplesize_=100,
                               dg_=0.5,
                               d1_=0,
                               d2_=0,
                               d3_=0,
                               d4_=0)
chart

In [51]:
# Radiating pathway with dg=0.7. The result is assigned so that the `chart`
# shown below belongs to this run; the original discarded the return value
# and displayed a stale `chart` from an earlier cell.
chart, ABC_all = run_simulation_with_params(pathway_=radiating,
                               samplesize_=100,
                               dg_=0.7,
                               d1_=0,
                               d2_=0,
                               d3_=0,
                               d4_=0)
chart

In [52]:
# Radiating pathway with dg=0.9. The result is assigned so that the `chart`
# shown below belongs to this run; the original discarded the return value
# and displayed a stale `chart` from an earlier cell.
chart, ABC_all = run_simulation_with_params(pathway_=radiating,
                               samplesize_=100,
                               dg_=0.9,
                               d1_=0,
                               d2_=0,
                               d3_=0,
                               d4_=0)
chart

In [63]:
# Single-difference-cause pathway with dg=1. The result is assigned so that
# the `chart` shown below belongs to this run; the original discarded the
# return value and displayed a stale `chart` from an earlier cell.
chart, ABC_all = run_simulation_with_params(pathway_=single_difference_cause,
                               samplesize_=100,
                               dg_=1,
                               d1_=0,
                               d2_=0,
                               d3_=0,
                               d4_=0)
chart