In [84]:
import time
import random
import numpy as np
from sklearn.linear_model import LinearRegression

import altair as alt
import pandas as pd
import plotly.express as px
alt.data_transformers.disable_max_rows()

# Means (loc) of the initial normal draws for the observed variables A, B, C
# and for the latent variables X, Y.
Am = Bm = Cm = 1
Xm = Ym = 1

# Standard deviations (scale) of those initial normal draws.
Astd = Bstd = Cstd = 1
Xstd = Ystd = 1

# Slopes (m*) and intercepts (k*) of the linear relations ``m*x + k`` used by
# the pathway functions.
m1, m2, m3, m4 = 0.1, 0.25, 0.5, 1.5
k1, k2, k3, k4 = -1.0, 2.0, -1.5, 0.0

# NOTE(review): d1..d4 are not referenced in any cell visible here -- confirm
# whether they are still needed.
d1, d2, d3, d4 = 5, 3, 7, 4

# Standard deviations of the Gaussian noise added to each variable
# (passed as ``scale`` to ``jitter``).
errorA = 2
errorB = 1.5
errorC = 0.5
errorX = 0.75
errorY = 0.6

def jitter(x, scale=1):
    """Return ``x`` plus one zero-mean Gaussian draw.

    Parameters
    ----------
    x : value to perturb.
    scale : standard deviation of the noise (passed to ``np.random.normal``).

    A single scalar draw is made per call, taken from NumPy's global
    random state.
    """
    noise = np.random.normal(scale=scale)
    return x + noise

def dependent(x, m, c, error=1):
    """Return a noisy linear function of ``x``: ``m*x + c`` plus Gaussian noise.

    Parameters
    ----------
    x : value of the driving variable.
    m, c : slope and intercept of the linear relation.
    error : standard deviation of the noise added by ``jitter``.
    """
    noiseless = m*x + c
    return jitter(noiseless, scale=error)


def linear(n=100):
    """Simulate the chain pathway A -> B -> C and return the final sample.

    A starts as a normal draw (mean ``Am``, std ``Astd``) and is perturbed
    with noise ``errorA`` on each of ``n`` generations; in every generation
    B is a noisy linear function of A and C a noisy linear function of B.

    Returns
    -------
    numpy array ``[A, B, C]`` holding the values after the last generation.
    """

    def step(previous_a):
        # One generation: perturb A, then derive B from A and C from B.
        a = jitter(previous_a, errorA)
        b = dependent(a, m1, k1, error=errorB)
        c = dependent(b, m2, k2, error=errorC)
        return a, b, c

    start_a = np.random.normal(loc=Am, scale=Astd)
    start_b = dependent(start_a, m1, k1, errorB)
    state = (start_a, start_b, dependent(start_b, m2, k2, errorC))
    for _ in range(n):
        state = step(state[0])

    return np.array(state)


def radiating(n=100):
    """Simulate the pathway A <- B -> C and return the final sample.

    B starts as a normal draw (mean ``Bm``, std ``Bstd``) and is perturbed
    with noise ``errorB`` on each of ``n`` generations; A and C are both
    noisy linear functions of the same B.

    Returns
    -------
    numpy array ``[A, B, C]`` holding the values after the last generation.
    """

    def step(previous_b):
        # One generation: perturb B, then derive A and C from it.
        b = jitter(previous_b, errorB)
        a = dependent(b, m1, k1, errorA)
        c = dependent(b, m2, k2, errorC)
        return a, b, c

    start_b = np.random.normal(loc=Bm, scale=Bstd)
    start_a = dependent(start_b, m1, k1, errorA)
    start_c = dependent(start_b, m2, k2, errorC)
    state = (start_a, start_b, start_c)

    for _ in range(n):
        state = step(state[1])

    return np.array(state)


def common_cause(n=100):
    """Simulate A, B and C all driven by one latent variable X.

    X starts as a normal draw (mean ``Xm``, std ``Xstd``) and is perturbed
    with noise ``errorX`` on each of ``n`` generations; A, B and C are each
    an independent noisy linear function of the current X.

    Returns
    -------
    numpy array ``[A, B, C]`` after the last generation (X is not returned).
    """

    def observe(x):
        # Draw A, B, C (in that order) from the current latent value.
        return (
            dependent(x, m1, k1, errorA),
            dependent(x, m2, k2, errorB),
            dependent(x, m3, k3, errorC),
        )

    latent = np.random.normal(loc=Xm, scale=Xstd)
    observed = observe(latent)

    for _ in range(n):
        latent = jitter(latent, errorX)
        observed = observe(latent)

    return np.array(observed)


def single_difference_cause(n=100):
    """Simulate A -> B <- X -> C, where X is a latent variable.

    A and X start as normal draws and are perturbed on every generation.
    B is a noisy linear combination of the current A and X; C is a noisy
    linear function of X alone. The loop runs ``n + 1`` times, so B and C
    are defined even for ``n=0``.

    Returns
    -------
    numpy array ``[A, B, C]`` after the last generation (X is not returned).
    """
    a = np.random.normal(loc=Am, scale=Astd)
    latent = np.random.normal(loc=Xm, scale=Xstd)

    for _ in range(n + 1):
        latent = jitter(latent, errorX)
        a = jitter(a, errorA)
        b = jitter(m1*a + m2*latent + k1, errorB)
        c = dependent(latent, m3, k3, errorC)

    return np.array([a, b, c])


def double_difference_cause(n=100):
    """Simulate A <- X -> B <- Y -> C, where X and Y are latent variables.

    X and Y start as normal draws (means ``Xm``/``Ym``, stds ``Xstd``/``Ystd``)
    and are perturbed on every generation. A is a noisy linear function of
    X, C a noisy linear function of Y, and B a noisy linear combination of
    both. The loop runs ``n + 1`` times, so A, B and C are defined even for
    ``n=0``.

    Returns
    -------
    numpy array ``[A, B, C]`` after the last generation (X, Y not returned).
    """
    def next_generation(X, Y):
        X = jitter(X, errorX)
        Y = jitter(Y, errorY)
        A = dependent(X, m1, k1, errorA)
        B = jitter(m2*X + m3*Y + k2, errorB)
        C = dependent(Y, m4, k4, errorC)
        return A, B, C, X, Y

    # The initial latent values come from the configured normal distributions,
    # as in the other pathways. (Previously these draws were immediately
    # overwritten with uniform ``random.random()`` values, which made
    # Xm/Xstd/Ym/Ystd dead parameters.)
    X = np.random.normal(loc=Xm, scale=Xstd)
    Y = np.random.normal(loc=Ym, scale=Ystd)

    for _ in range(n + 1):
        A, B, C, X, Y = next_generation(X, Y)

    return np.array([A, B, C])

def convergent(n=100):
    """Simulate the collider pathway A -> B <- C and return the final sample.

    A and C start as independent normal draws and are perturbed with noise
    ``errorA`` / ``errorC`` on each of ``n`` generations; B is a noisy linear
    combination ``m1*A + m2*C + k1`` of the same generation's A and C.

    Returns
    -------
    numpy array ``[A, B, C]`` holding the values after the last generation.
    """

    def next_generation(A, C):
        # Update the parents first, then derive B from the *updated* values so
        # the returned triple belongs to one generation. (Previously B was
        # computed from the pre-jitter A and C while the jittered A_, C_ were
        # returned, unlike the initialisation below and the other pathways.)
        A_ = jitter(A, errorA)
        C_ = jitter(C, errorC)
        B_ = jitter(m1*A_ + m2*C_ + k1, errorB)
        return A_, B_, C_

    A = np.random.normal(loc=Am, scale=Astd)  # initial A ~ N(Am, Astd)
    C = np.random.normal(loc=Cm, scale=Cstd)  # initial C ~ N(Cm, Cstd)

    B = jitter(m1*A + m2*C + k1, errorB)
    for _ in range(n):
        A, B, C = next_generation(A, C)

    return np.array([A, B, C])

def simulations_data(pathway, n=1000, seed=None):
    """Draw ``n`` independent samples from ``pathway`` and stack them.

    Parameters
    ----------
    pathway : zero-argument callable returning one sample (e.g. ``linear``).
    n : number of samples to draw.
    seed : optional int. When given, both Python's ``random`` module and
        NumPy's global random state are seeded with it, so the returned
        array is reproducible. When ``None`` (default) the original
        behaviour is kept: only ``random`` is re-seeded from the clock and
        NumPy's global state is left untouched.

    Returns
    -------
    numpy array with one row per sample.
    """
    if seed is None:
        # Clock-based seed for the ``random`` module only. Draws made through
        # ``np.random`` are not affected by this call.
        random.seed(time.time())
    else:
        random.seed(seed)
        np.random.seed(seed)
    return np.array([pathway() for _ in range(n)])


def regress(X, Y):
    """Fit the simple linear regression ``Y ~ X`` with scikit-learn.

    Parameters
    ----------
    X : 1-D array of predictor values (reshaped to a single feature column).
    Y : array of response values.

    Returns
    -------
    tuple ``(intercept, slope, score, residual)`` where ``score`` is the
    value of ``LinearRegression.score`` on the training data and
    ``residual`` is ``Y`` minus the fitted prediction.
    """
    features = X.reshape(-1, 1)
    fitted = LinearRegression().fit(features, Y)
    r_sqr = fitted.score(features, Y)
    residual = Y - fitted.predict(features)
    return fitted.intercept_, fitted.coef_[0], r_sqr, residual


def get_slope_intercept(model):
    """Return ``(slope, intercept)`` of a fitted single-feature linear model.

    Reads the ``coef_`` and ``intercept_`` attributes, the same ones
    ``regress`` reads from its fitted ``LinearRegression``. The previous
    ``_slopt`` / ``_intercept`` attribute names do not exist on that model
    and raised ``AttributeError``.

    Only the first coefficient is returned, so this is meant for models
    fitted on one feature.
    """
    return model.coef_[0], model.intercept_

def compute_regresssion(ABC):
    """Regress B~A, C~B and C~A on one batch and summarise the fits.

    Parameters
    ----------
    ABC : array whose transpose unpacks into the three series A, B, C
        (i.e. one row per sample, three columns).

    Returns
    -------
    dict with intercepts (``k*``), slopes (``m*``) and scores (``r_sqr*``)
    of the three regressions, plus
    ``r_E``      - correlation between the residuals of B~A and of C~B, and
    ``r_E_BA_C`` - correlation between the squared residuals of B~A and C.
    """
    A, B, C = ABC.transpose()
    k_ab, m_ab, score_ab, resid_ab = regress(A, B)
    k_bc, m_bc, score_bc, resid_bc = regress(B, C)
    k_ac, m_ac, score_ac, _ = regress(A, C)

    residual_corr = np.corrcoef(resid_ab, resid_bc)[0, 1]
    squared_residual_corr = np.corrcoef(np.square(resid_ab), C)[0, 1]

    # m_ab*m_bc - m_ac is worth inspecting as a distribution over batches;
    # it is derived later from the returned slopes.
    summary = {}
    summary["kAB"], summary["kBC"], summary["kAC"] = k_ab, k_bc, k_ac
    summary["mAB"], summary["mBC"], summary["mAC"] = m_ab, m_bc, m_ac
    summary["r_sqrAB"], summary["r_sqrBC"], summary["r_sqrAC"] = score_ab, score_bc, score_ac
    summary["r_E"] = residual_corr
    summary["r_E_BA_C"] = squared_residual_corr
    return summary

def compute_correlation(ABC):
    """Return the pairwise Pearson correlations of the columns of ``ABC``.

    Parameters
    ----------
    ABC : array with one row per sample and columns A, B, C.

    Returns
    -------
    dict with keys ``rAB``, ``rBC`` and ``rAC``.
    """
    matrix = np.corrcoef(ABC.T)
    # rAB**2 * rBC**2 - rAC**2 is better examined as a distribution over
    # batches (original note: it should be centred on 0); alternatively,
    # regressing one quantity on the other should give a slope of 1.
    pairs = {"rAB": (0, 1), "rBC": (1, 2), "rAC": (0, 2)}
    return {label: matrix[row, col] for label, (row, col) in pairs.items()}


def overall_simulation(n=100, pathway=None):
    """Run ``n`` simulated batches and collect per-batch statistics.

    Parameters
    ----------
    n : number of batches to simulate.
    pathway : zero-argument callable producing one ``[A, B, C]`` sample
        (e.g. ``linear``, ``radiating``, ``common_cause``). Defaults to
        ``convergent``, the pathway that used to be hard-coded here.

    Returns
    -------
    tuple ``(stats, data)`` where ``stats`` is a DataFrame with one row per
    batch (regression and correlation summaries) and ``data`` is an array
    stacking the raw batches.
    """
    if pathway is None:
        pathway = convergent

    stats = []
    data = []
    for _ in range(n):
        ABC = simulations_data(pathway)
        row = compute_regresssion(ABC)
        row.update(compute_correlation(ABC))
        stats.append(row)
        data.append(ABC)
    return pd.DataFrame(stats), np.array(data)

# Run the simulation with its defaults: `d` is the DataFrame of per-batch
# statistics, `ABC_all` the array of raw simulated batches.
d, ABC_all = overall_simulation()

In [85]:
d

Unnamed: 0,kAB,kBC,kAC,mAB,mBC,mAC,r_sqrAB,r_sqrBC,r_sqrAC,r_E,r_E_BA_C,rAB,rBC,rAC
0,-0.980722,1.148047,0.488268,0.101459,0.799691,0.002615,0.495274,0.198233,0.000102,0.335469,0.042261,0.703757,0.445233,0.010100
1,-0.835460,1.228520,0.496288,0.094141,0.811101,-0.005355,0.474878,0.197013,0.000460,0.347363,-0.041450,0.689114,0.443861,-0.021452
2,-0.782660,1.206107,0.621321,0.102520,0.758121,0.004691,0.525831,0.185985,0.000356,0.342977,0.125656,0.725142,0.431260,0.018876
3,-0.820002,1.424780,0.732477,0.101288,0.779308,0.004095,0.533057,0.188396,0.000270,0.356343,0.001281,0.730107,0.434046,0.016441
4,-0.912313,0.983267,0.384163,0.093253,0.767908,-0.007021,0.483791,0.191280,0.000890,0.359586,0.060075,0.695551,0.437356,-0.029827
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,-0.941156,1.250148,0.431989,0.095674,0.903105,-0.003311,0.456099,0.221995,0.000149,0.343016,0.016554,0.675351,0.471164,-0.012195
96,-0.852812,1.245275,0.567746,0.100031,0.854971,0.010296,0.514791,0.212783,0.001588,0.337970,0.022797,0.717489,0.461285,0.039845
97,-0.799083,1.176329,0.577892,0.097381,0.831828,0.000462,0.492355,0.200140,0.000003,0.343695,0.024663,0.701680,0.447370,0.001792
98,-0.863784,1.196570,0.375324,0.102057,0.891894,0.009106,0.488172,0.229584,0.001122,0.335229,-0.029556,0.698693,0.479149,0.033491


In [86]:
def stats_graphs(d):
    """Build four stacked histograms summarising the per-batch statistics.

    Parameters
    ----------
    d : DataFrame with (at least) the columns ``rAB``, ``rBC``, ``rAC``,
        ``r_E``, ``r_E_BA_C``, ``mAB``, ``mBC`` and ``mAC``.

    Side effect: three derived columns are added to ``d`` in place
    (``rAB2*rBC2-rAC2``, ``r_E_BA_C2-rBC2`` and ``mAB*mBC-mAC``).

    Returns
    -------
    Altair vertical concatenation of the four histograms.
    """
    d['rAB2*rBC2-rAC2'] = d.rAB**2 * d.rBC**2 - d.rAC**2
    d['r_E_BA_C2-rBC2'] = d.r_E_BA_C**2 - d.rBC**2
    d['mAB*mBC-mAC'] = d.mAB*d.mBC - d.mAC

    def histogram(field, title, **x_options):
        # Binned count histogram of one column of ``d``.
        return alt.Chart(d).mark_bar().encode(
            x=alt.X(field, bin=True, **x_options),
            y='count()').properties(title=title)

    ticks = 10
    slope_histogram = histogram('mAB*mBC-mAC:Q', "slope diff histogram")
    correlation_graph = histogram(
        'rAB2*rBC2-rAC2:Q', "Correlation",
        axis=alt.Axis(tickCount=ticks, grid=False))
    residual_correlation = histogram('r_E:Q', "Correlation of residuals")
    corrected_correlation = histogram("r_E_BA_C2-rBC2:Q", "Corrected Correlation")

    return alt.vconcat(slope_histogram, correlation_graph,
                       residual_correlation, corrected_correlation)

# Render the summary histograms; note that this call also adds the derived
# difference columns to `d` in place.
stats_graphs(d)

In [87]:
#ABC_all = ABC_all.reshape(100*1000, 3)

In [88]:
d

Unnamed: 0,kAB,kBC,kAC,mAB,mBC,mAC,r_sqrAB,r_sqrBC,r_sqrAC,r_E,r_E_BA_C,rAB,rBC,rAC,rAB2*rBC2-rAC2,r_E_BA_C2-rBC2,mAB*mBC-mAC
0,-0.980722,1.148047,0.488268,0.101459,0.799691,0.002615,0.495274,0.198233,0.000102,0.335469,0.042261,0.703757,0.445233,0.010100,0.098078,-0.196447,0.078520
1,-0.835460,1.228520,0.496288,0.094141,0.811101,-0.005355,0.474878,0.197013,0.000460,0.347363,-0.041450,0.689114,0.443861,-0.021452,0.093097,-0.195295,0.081714
2,-0.782660,1.206107,0.621321,0.102520,0.758121,0.004691,0.525831,0.185985,0.000356,0.342977,0.125656,0.725142,0.431260,0.018876,0.097440,-0.170196,0.073031
3,-0.820002,1.424780,0.732477,0.101288,0.779308,0.004095,0.533057,0.188396,0.000270,0.356343,0.001281,0.730107,0.434046,0.016441,0.100155,-0.188394,0.074839
4,-0.912313,0.983267,0.384163,0.093253,0.767908,-0.007021,0.483791,0.191280,0.000890,0.359586,0.060075,0.695551,0.437356,-0.029827,0.091650,-0.187671,0.078631
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,-0.941156,1.250148,0.431989,0.095674,0.903105,-0.003311,0.456099,0.221995,0.000149,0.343016,0.016554,0.675351,0.471164,-0.012195,0.101103,-0.221721,0.089715
96,-0.852812,1.245275,0.567746,0.100031,0.854971,0.010296,0.514791,0.212783,0.001588,0.337970,0.022797,0.717489,0.461285,0.039845,0.107951,-0.212264,0.075228
97,-0.799083,1.176329,0.577892,0.097381,0.831828,0.000462,0.492355,0.200140,0.000003,0.343695,0.024663,0.701680,0.447370,0.001792,0.098537,-0.199532,0.080542
98,-0.863784,1.196570,0.375324,0.102057,0.891894,0.009106,0.488172,0.229584,0.001122,0.335229,-0.029556,0.698693,0.479149,0.033491,0.110955,-0.228710,0.081918


In [89]:
# Pick one simulated batch at random and plot its three pairwise scatters.
A, B, C = random.choice(ABC_all).transpose()


def _pair_scatter(x_name, x_values, y_name, y_values):
    """Scatter plot of ``y_values`` against ``x_values`` with named axes."""
    frame = pd.DataFrame({x_name: x_values, y_name: y_values})
    return alt.Chart(frame).mark_circle().encode(x=x_name, y=y_name)


AB = _pair_scatter("A", A, "B", B)
BC = _pair_scatter("B", B, "C", C)
AC = _pair_scatter("A", A, "C", C)

alt.vconcat(AB, BC, AC)

In [131]:
# NOTE(review): interactive help lookup left over from exploration. `OLS` is
# not imported in any cell visible here (presumably statsmodels' OLS, judging
# by the help text) -- import it or drop this cell before a Restart & Run All.
OLS?

Init signature: OLS(endog, exog=None, missing='none', hasconst=None, **kwargs)
Docstring:
Ordinary Least Squares

Parameters
----------
endog : array_like
    A 1-d endogenous response variable. The dependent variable.
exog : array_like
    A nobs x k array where `nobs` is the number of observations and `k`
    is the number of regressors. An intercept is not included by default
    and should be added by the user. See
    :func:`statsmodels.tools.add_constant`.
missing : str
    Available options are 'none', 'drop', and 'raise'. If 'none', no nan
    checking is done. If 'drop', any observations with nans are dropped.
    If 'raise', an error is raised. Default is 'none'.
hasconst : None or bool
    Indicates whether the RHS includes a user-supp

In [14]:
# Sanity check: histogram of 1000 draws from a zero-mean normal distribution
# with standard deviation 1.
alt.Chart(
    pd.DataFrame({"X": np.random.normal(scale=1, size=1000)})
).mark_bar().encode(
    x=alt.X("X:Q", bin=True),
    y="count()",
)