In [1]:
import time
import random
import pandas as pd
import numpy as np
import os
from sklearn.linear_model import LinearRegression
import altair as alt
import pytest


def jitter(x, scale=1):
    return x + np.random.normal(scale=scale)


def dependent(x, m, c, error=1):
    return jitter(m*x + c, scale=error)  # mx +c + error


def check_generation(generation, d):
    return d > 0 and generation >= d


def mkfolder(pathway, samplesize, dg, d1, d2, d3, d4):
    tokens = [f"{pathway.__name__}",
              f"samplesize-{samplesize}"]
    if dg < 1:
        tokens.append("dg-{}".format(dg))
    if d1 > 0:
        tokens.append("d1-{}".format(d1))
    if d2 > 0:
        tokens.append("d2-{}".format(d2))
    if d3 > 0:
        tokens.append("d3-{}".format(d3))
    if d4 > 0:
        tokens.append("d4-{}".format(d4))

    path = os.path.join(*tokens)
    os.makedirs(path, exist_ok=True)
    return path


def simulations_data(pathway, n=1000, run=0):
    def save_data(r):
        folder = mkfolder(pathway, samplesize, dg, d1, d2, d3, d4)
        np.savetxt(os.path.join(folder, f"sample{run}.csv"), r, delimiter=",")

    r = np.array([pathway() for i in range(n)])
    # save_data(r)
    return r


def regress(X, Y):
    model = LinearRegression()
    mXY = model.fit(X.reshape(-1, 1), Y)
    r_sqr = mXY.score(X.reshape(-1, 1), Y)
    residual = Y - model.predict(X.reshape(-1, 1))
    return mXY.intercept_, mXY.coef_[0], r_sqr, residual


def get_slope_intercept(model):
    return model._slopt, model._intercept


def compute_regression(ABC):
    A, B, C = ABC.transpose()
    RAB = regress(A, B)
    RBC = regress(B, C)
    RAC = regress(A, C)
    corrE = np.corrcoef(np.array([RAB[3], RBC[3]]))
    corrE_BA_C = np.corrcoef(np.array([RAB[3], C]))

    # print(RAB[1]*RBC[1]-RAC[1]) ## better to look at distribution of this error..it should come with center as 0
    return {"kAB": RAB[0], "kBC": RBC[0], "kAC": RAC[0],
            "mAB": RAB[1], "mBC": RBC[1], "mAC": RAC[1],
            "r_sqrAB": RAB[2], "r_sqrBC": RBC[2], "r_sqrAC": RAC[2],
            "r_E": corrE[0, 1],
            "r_E_BA_C": corrE_BA_C[0, 1],
            "n": len(A)}


def compute_correlation(ABC):
    corr = np.corrcoef(ABC.transpose())
    rAB, rBC, rAC = corr[0, 1], corr[1, 2], corr[0, 2]
    # print(rAB**2*rBC**2-rAC**2) ## better to look at distribution of this error..it should come with center as 0
    # or see correlation between these two quantities should be 1 and if we regress ,
    # it should have slope 1
    return {"rAB": rAB, "rBC": rBC, "rAC": rAC}


def compute_confidence_interval(r, n):
    def boundary(zeta):
        return ((np.exp(2*zeta))-1)/((np.exp(2*zeta))+1)

    z = 0.5*np.log(((1+r)/(1-r)))  # check this
    zetal = z-1.96*np.sqrt(1/(n-3))
    rl = boundary(zetal)
    zetau = z+1.96*np.sqrt(1/(n-3))
    ru = boundary(zetau)
    return rl, ru


def test_compute_confidence_interval():
    pass


def confidence_status(L, U, v, debug=False):
    if debug:
        for l, u, m in zip(L, U, v):
            print(l, u, m)

    l = pd.Series([""]*len(v))
    l = l.mask(v < L, "less")
    if debug:
        print(l)
    l.mask(v > U, "more", inplace=True)
    if debug:
        print(l)
    w = l.mask((v >= L) & (v <= U), "within")
    if debug:
        print((v >= L) & (v <= U))
    return w


def confidence_status_(r, n):
    L, U = compute_confidence_interval(r, n)
    return confidence_status(L, U, r)


def slope_confidence(m, X, Y, k):
    """ as per this pdf
    https://www.ncss.com/wp-content/themes/ncss/pdf/Procedures/PASS/Confidence_Intervals_for_Linear_Regression_Slope.pdf
    """
    n = len(Y)
    Y_ = m*X + k
    numer = np.sqrt(np.sum((Y - Y_)**2)/(n-2))
    denom = np.sqrt(np.sum((X - np.mean(X))**2))
    term = 1.96*numer/denom
    return m-term, m+term


def compute_slope_confidence(m_all, ABC_all, k_all, m_check):
    L, U = [], []

    for m, k, ABC in zip(m_all, k_all, ABC_all):
        A, B, C = ABC.transpose()
        l, u = slope_confidence(m, A, C, k)
        L.append(l)
        U.append(u)

    return confidence_status(pd.Series(L), pd.Series(U), m_check)


def get_value(d, variable):
    if isinstance(variable, str):
        return d[variable]
    else:
        value = d[variable[0]]
        for v in variable[1:]:
            value = value*d[v]
        return value


def compute_confidence_handle_square(d, boundary, variable):
    L, U = compute_confidence_interval(get_value(d, boundary), d['n'])
    conf = confidence_status(L, U, get_value(d, variable))
    sqrdV = get_value(d, variable)**2
    sqrdB = get_value(d, boundary)**2
    outside = conf != "within"
    outside_ = pd.Series([""]*len(outside))
    outside_ = outside_.mask(sqrdV[outside] < sqrdB[outside], "less")
    outside_ = outside_.mask(sqrdV[outside] > sqrdB[outside], "more")

    conf.mask(outside, outside_)
    return conf


def compute_confidence_rAC(d):
    L, U = compute_confidence_interval(
        d.rAC, d['n'])  # whether to remove sqr
    conf_rAC = confidence_status(L, U, d.rAB*d.rBC)
    sqrd = d.rAB**2*d.rBC**2
    sqrAC = d.rAC**2
    outside = conf_rAC != "within"
    outside_ = pd.Series([""]*len(outside))
    outside_ = outside_.mask(sqrd[outside] > sqrAC[outside], "more")
    outside_ = outside_.mask(sqrd[outside] < sqrAC[outside], "less")
    conf_rAC.mask(outside, outside_)
    return conf_rAC


def test_squred(d):
    x = compute_confidence_handle_square(d, 'rAC', ('rAB', 'rBC'))
    assert d['confidence_rAC'].equals(x)


def add_confidence_stats(d, ABC_all):
    d['rAB2*rBC2-rAC2'] = d.rAB**2 * d.rBC**2 - d.rAC**2
    d['r_E_BA_C2-rBC2'] = d.r_E_BA_C**2 - d.rBC**2
    # d['rAC2'] = d.rAC**2
    d['mAB*mBC-mAC'] = d.mAB*d.mBC - d.mAC
    d['mAB*mBC'] = d.mAB*d.mBC
    d['confidence_rAC'] = compute_confidence_rAC(d)
    test_squred(d)
    L, U = compute_confidence_interval(d.r_E, d['n'])
    d['confidence_residual_corr'] = confidence_status(
        L, U, pd.Series(np.zeros_like(L)))
    # L, U = compute_confidence_interval(d.rBC**2, d['n'])  # ???
    d['confidence_corrected_bc_corr'] = compute_confidence_handle_square(d,
                                                                         'rBC',
                                                                         'r_E_BA_C')
    confidence_slope_AC = compute_slope_confidence(d.mAC,
                                                   ABC_all,
                                                   d.kAC,
                                                   d['mAB*mBC'])
    d['confidence_slope_AC'] = confidence_slope_AC


def confidence_graphs(folder, d):
    confidence = alt.Chart(d, title=f"Correlation confidence {(d['confidence_rAC']=='within').sum()}/{len(d)}").mark_point().encode(
        x=alt.X('rAB2:Q'),
        y=alt.Y('rBC2:Q'),
        color=alt.Color('confidence_rAC:N',
                        scale=alt.Scale(domain=['less', 'within', 'more'],
                                        range=['orange', 'green', 'red'])),
    ).transform_calculate(
        rAB2='datum.rAB*datum.rAB',
        rBC2='datum.rBC*datum.rBC'
    )

    confidence_res_corr = alt.Chart(d, title=f"residual corr confidence {(d['confidence_residual_corr']=='within').sum()}/{len(d)}").mark_point().encode(
        x=alt.X('rAB2:Q'),
        y=alt.Y('rBC2:Q'),
        color=alt.Color('confidence_residual_corr:N',
                        scale=alt.Scale(domain=['less', 'within', 'more'],
                                        range=['orange', 'green', 'red'])),
    ).transform_calculate(
        rAB2='datum.rAB*datum.rAB',
        rBC2='datum.rBC*datum.rBC'
    )

    confidence_corrected_bc_corr = alt.Chart(d, title=f"corrected bc corr confidence {(d['confidence_corrected_bc_corr']=='within').sum()}/{len(d)}").mark_point().encode(
        x=alt.X('rAB2:Q'),
        y=alt.Y('rBC2:Q'),
        color=alt.Color('confidence_corrected_bc_corr:N',
                        scale=alt.Scale(domain=['less', 'within', 'more'],
                                        range=['orange', 'green', 'red'])),
    ).transform_calculate(
        rAB2='datum.rAB*datum.rAB',
        rBC2='datum.rBC*datum.rBC'
    )

    confidence_slope_AC = alt.Chart(d, title=f"slope AC confidence {(d['confidence_slope_AC']=='within').sum()}/{len(d)}").mark_point().encode(
        x=alt.X('rAB2:Q'),
        y=alt.Y('rBC2:Q'),
        color=alt.Color('confidence_slope_AC:N',
                        scale=alt.Scale(domain=['less', 'within', 'more'],
                                        range=['orange', 'green', 'red'])),
    ).transform_calculate(
        rAB2='datum.rAB*datum.rAB',
        rBC2='datum.rBC*datum.rBC'
    )
    row1 = alt.hconcat(confidence_slope_AC, confidence)
    row2 = alt.hconcat(confidence_res_corr, confidence_corrected_bc_corr)
    chart = alt.vconcat(row1, row2).interactive()
    chart.save(os.path.join(folder, "charts.html"))
    return chart


def stats_graphs(d):
    slope_histogram = alt.Chart(d).mark_bar().encode(
        x=alt.X('mAB*mBC-mAC:Q', bin=True),
        y='count()').properties(title="slope diff histogram")

    bincount = 100
    ticks = 10
    correlation_graph = alt.Chart(d).mark_bar().encode(
        x=alt.X('rAB2*rBC2-rAC2:Q', bin=True,
                axis=alt.Axis(
                    tickCount=ticks,
                    grid=False)),
        y='count()').properties(
            title="Correlation")
    residual_correlation = alt.Chart(d).mark_bar().encode(
        x=alt.X('r_E:Q', bin=True),
        y='count()').properties(title="Correlation of residuals")
    corrected_correlation = alt.Chart(d).mark_bar().encode(
        x=alt.X("r_E_BA_C2-rBC2:Q", bin=True),
        y='count()').properties(
        title="Corrected Correlation")

    row1 = alt.hconcat(slope_histogram, correlation_graph)
    row2 = alt.hconcat(residual_correlation, corrected_correlation)
    chart = alt.vconcat(row1, row2).interactive()
    return chart


In [2]:
import time
import random
import collections
import numpy as np

import altair as alt
import pandas as pd
alt.data_transformers.disable_max_rows()

samplesize = 500
Am, Bm, Cm, = 40, 10, 30
Xm, Ym = 1, 1
Astd,Bstd,Cstd = 5, 13, 12
Xstd,Ystd = 1, 1

m1, k1 = 1, 0
m2, k2 = 1.5, 0
m3, k3 = 0.15, 0 # this is feedback term
m4, k4 = 0.75, 0.0

dg = 0.1
d1 = 1
d2 = 1
d3 = 1
d4 = 1

errorA = 2
errorB = 4
errorC = 3.5
errorX = 0.75
errorY = 0.6

mrange1 = 0,2 #slope ranges
mrange2 = 0,2
mrange3 = 1,1.1
mrange4 = 0,2

erangeA = 0.2,2000 #error range
erangeB = 3000,7000 #error range
erangeC = 3000,7000 #error range
erangeX = 0.2,5 #error range
erangeY = 0.2,5 #error range


def linear_positive_C_to_B(n=100):

    def compute_B(history, generation):
        if check_generation(generation, d1):
             A, _, _ = history[-1 - d1]   
        else:
             A, _, _ = history[-1]
        if check_generation(generation, d3):
            _, _, C = history[-1 - d3]
        else:
            _, _, C = history[-1]
        _, B, _ = history[-1]
        return jitter(B + A*m1 + C*m3 -dg*B + k1 ,  errorB)
        

    def compute_C(history, generation):
        if check_generation(generation, d2):
            _, B, _ = history[-1 - d2]
        else:
            _, B, _ = history[-1]
        _, _, C = history[-1]
        return jitter(C + B*m2 - dg*C + k2, errorC)

    def next_generation(history, generation):
        A, _, _ = history[-1]
        A_ = jitter(A, errorA) # previous A + error

        # compute B
        B_ = compute_B(history, generation)
        # compute C
        C_ = compute_C(history, generation)
        
        return A_, B_, C_

    A = np.random.normal(loc=Am, scale=Astd) # normal distribution, Am - mean and Astd - standard deviation
    history = collections.deque(maxlen=max([d1, d2, d3]) + 1)
    history.append((A, 0, 0))
    for i in range(n + max([d1, d2, d3])):
        history.append(next_generation(history, i))
    
    return np.array(history[-1])


def linear_positive_A_to_C(n=100):    

    
    def compute_A(history, generation):
        A, _, _ = history[-1]
        return jitter(A, errorA)
        
    def compute_B(history, generation):
        if check_generation(generation, d1):
            A, _, _ = history[-1 - d1]
        else:
            A, _, _ = history[-1]
            
        return dependent(A, m1, k1 ,  errorB)
        

    def compute_C(history, generation):
        if check_generation(generation, d1):
            A, _, _ = history[-1 - d1]
        else:
            A, _, _ = history[-1]
        
        if check_generation(generation, d2):
            _, B, _ = history[-1 - d2]
        else:
            _, B, _ = history[-1]
            
        _, _, C = history[-1]
        return jitter(A*m3+ B*m2+ -dg*C + k2, errorC)
        
            
    
    def next_generation(history, generation):
        A_ = compute_A(history, generation)
        B_ = compute_B(history, generation)
        C_ = compute_C(history, generation)
        return A_, B_, C_

    A = np.random.normal(loc=Am, scale=Astd) # normal distribution, Am - mean and Astd - standard deviation
    history = collections.deque(maxlen=max([d1, d2, d3]) + 1)
    history.append((A, 0, 0))
    for i in range(n + max([d1, d2, d3])):
        history.append(next_generation(history, i))

    return np.array(history[-1])

def update_slopes():
    def select(lower, upper):
        return lower + random.random()*(upper-lower)
    
    global m1, m2, m3, m4
  
    m1 = select(*mrange1)
    m2 = select(*mrange2)
    m3 = select(*mrange3)
    m4 = select(*mrange4)

def update_errors():
    def select(lower, upper):
        return lower + random.random()*(upper-lower)
    
    global errorA, errorB, errorC, errorX, errorY

    errorA = select(*erangeA)
    errorB = select(*erangeB)
    errorC = select(*erangeC)
    errorX = select(*erangeX)
    errorY = select(*erangeY)
    
def overall_simulation(n=100):
    stats = []
    data = []
    for i in range(n):
        update_slopes()
        update_errors()
        #print(m1,m2, m3)
        ABC = simulations_data(pathway, samplesize) # you can change pathway here
        r = compute_regression(ABC)
        r.update(compute_correlation(ABC))
        stats.append(r)
        data.append(ABC)
    return pd.DataFrame(stats), np.array(data)

def run_simulation_with_params(pathway_=linear_positive_A_to_C,
                               samplesize_=100,
                               dg_=0,
                               d1_=0,
                               d2_=0,
                               d3_=0,
                               d4_=0, 
                               numpoints=100):
    global pathway, samplesize, dg, d1, d2, d3, d4
    #np.random.seed(0)
    #random.seed(0)
    pathway = pathway_
    samplesize = samplesize_
    dg = dg_
    d1 = d1_
    d2 = d2_
    d3 = d3_
    d4 = d4_
    print(pathway)
    d, ABC_all = overall_simulation(numpoints)
    folder = mkfolder(pathway, samplesize, dg, d1, d2, d3, d4)
    d.to_csv(os.path.join(folder, "stats.csv"))
    add_confidence_stats(d, ABC_all)
    return confidence_graphs(folder, d), ABC_all,d

In [3]:

mrange1 = 0,2 #slope ranges
mrange2 = 0,2
mrange3 = 0,-2
mrange4 = 0,2

erangeA = 0,500 #error range
erangeB = 0,2000 #error range
erangeC = 0,2000 #error range
erangeX = 0.2,5 #error range
erangeY = 0.2,5 #error range

chart, ABC_all,d =  run_simulation_with_params(pathway_=linear_positive_A_to_C,
                               samplesize_=1000,
                               dg_=0.9,
                               d1_=0,
                               d2_=0,
                               d3_=0,
                               d4_=0, 
                               numpoints=100)
chart

<function linear_positive_A_to_C at 0x7f35835c0430>


In [1]:
A, B, C = random.choice(ABC_all).transpose()
AB = alt.Chart(pd.DataFrame({"A":A, "B":B})).mark_circle().encode(
    x="A",
    y="B")
BC= alt.Chart(pd.DataFrame({"B":B, "C":C})).mark_circle().encode(
    x="B",
    y="C")
AC = alt.Chart(pd.DataFrame({"A":A, "C":C})).mark_circle().encode(
    x="A",
    y="C")

alt.hconcat(AB, BC, AC)

NameError: name 'random' is not defined

In [5]:
d

Unnamed: 0,kAB,kBC,kAC,mAB,mBC,mAC,r_sqrAB,r_sqrBC,r_sqrAC,r_E,...,rBC,rAC,rAB2*rBC2-rAC2,r_E_BA_C2-rBC2,mAB*mBC-mAC,mAB*mBC,confidence_rAC,confidence_residual_corr,confidence_corrected_bc_corr,confidence_slope_AC
0,-7.352851,-276.978264,-289.089613,1.040876,0.092969,0.304345,0.351083,0.001269,0.004409,-0.033331,...,0.035630,0.066397,-0.003963,-0.001248,-0.207576,0.096769,within,within,within,within
1,98.953363,2.259401,-13.064808,1.172122,-0.196819,-0.260272,0.875319,0.283431,0.315783,0.199877,...,-0.532383,-0.561946,-0.067691,-0.283078,0.029576,-0.230696,more,less,more,more
2,-5.253612,-41.084416,-32.902871,0.979404,-0.762839,-0.780597,0.950238,0.856101,0.888017,0.465445,...,-0.925257,-0.942347,-0.074517,-0.855210,0.033469,-0.747128,more,less,more,more
3,-10.157969,-7.191368,-5.697916,0.652281,-0.159309,-0.098831,0.909421,0.015614,0.012844,-0.018618,...,-0.124957,-0.113333,0.001355,-0.012469,-0.005084,-0.103915,within,within,more,within
4,11.533889,2.236102,1.012229,1.167792,-0.102394,-0.116844,0.927528,0.032963,0.029193,-0.014526,...,-0.181557,-0.170861,0.001380,-0.028974,-0.002731,-0.119575,within,within,more,within
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,17.986123,1.666134,-0.180772,1.025382,-0.081719,-0.081218,0.952120,0.028323,0.025335,-0.022831,...,-0.168294,-0.159169,0.001632,-0.024803,-0.002575,-0.083793,within,within,more,within
96,-5.797653,-74.063938,-57.583943,0.265686,-3.068109,-0.861614,0.942982,0.313921,0.330729,0.152258,...,-0.560286,-0.575090,-0.034707,-0.313862,0.046462,-0.815152,within,less,more,within
97,-3.651953,-88.373457,-85.278520,0.646538,-0.773991,-0.528517,0.943907,0.746102,0.785572,0.383673,...,-0.863772,-0.886325,-0.081321,-0.745976,0.028103,-0.500414,more,less,more,more
98,81.851813,39.918655,53.739301,1.638785,0.194532,0.385902,0.858638,0.050127,0.063068,-0.110430,...,0.223890,0.251133,-0.020027,-0.049577,-0.067105,0.318797,within,more,less,within
