In [13]:
import pandas as pd
from scmodels import SCM

# The model is declared one layer at a time; each entry is an scmodels
# assignment string of the form "<var> = <expression>, N ~ <noise distribution>".

# A*: exogenous roots -- every variable is just its own noise term.
exogenous_layer = [
    "A1 = N, N ~ LogLogistic(alpha=5, beta=2)",
    "A2 = N, N ~ LogLogistic(alpha=2, beta=2)",
    "A3 = N, N ~ Normal(mean=0, std=1)",
    "A4 = N, N ~ LogNormal(mean=1, std=0.5)",
]

# B*: functions of the A* roots plus fresh noise.
# NOTE(review): in B4 the noise N ~ Normal(2, 1) can be negative, so the
# sqrt argument A1 + A4*N may drop below zero -- confirm that is intended.
first_layer = [
    "B1 = A1 + A2 + N,  N ~ LogNormal(mean=1, std=1)",
    "B2 = A1*A2 + A3*N, N ~ LogNormal(mean=1, std=1)",
    "B3 = A3 + A4*N,    N ~ Benini(alpha=0.1, beta=0.4, sigma=0.4)",
    "B4 = A3**2 - sqrt(A1 + A4*N), N ~ Normal(mean=2, std=1)",
]

# C*: functions of A* and B* variables, each scaled by Beta(0.5, 0.5) noise.
second_layer = [
    "C1 = A1*A3*0.5 + B1*N, N ~ Beta(alpha=0.5, beta=0.5)",
    "C2 = A1*A2     + B2*N, N ~ Beta(alpha=0.5, beta=0.5)",
    "C3 = A1*B3     + B3*N, N ~ Beta(alpha=0.5, beta=0.5)",
    "C4 = B1**2     + B4*N, N ~ Beta(alpha=0.5, beta=0.5)",
]

# D*: final layer, combining variables from the A*, B* and C* layers.
last_layer = [
    "D1 = A1 + A2 + C1*0.5*N, N ~ Normal(mean=2, std=1)",
    "D2 = B1 + B2 + C1*0.5*N, N ~ Normal(mean=0, std=1)",
    "D3 = C1 + C2 + C3*N,     N ~ Normal(mean=0, std=1)",
    "D4 = (A1 + B1 + C1) * N, N ~ Normal(mean=1, std=1)",
]

# Concatenation keeps the assignments in the same order as the layers above.
scm = SCM(exogenous_layer + first_layer + second_layer + last_layer)
print(scm)

Structural Causal Model of 16 variables: A1, A2, A3, A4, B1, B2, B3, B4, C1, C2, ...
Variables with active interventions: []
Assignments:
A1 := f(N) = N	 [ N ~ LogLogistic(alpha=5, beta=2) ]
A2 := f(N) = N	 [ N ~ LogLogistic(alpha=2, beta=2) ]
A3 := f(N) = N	 [ N ~ Normal(mean=0, std=1) ]
A4 := f(N) = N	 [ N ~ LogNormal(mean=1, std=0.500000000000000) ]
B1 := f(N, A1, A2) = A1 + A2 + N	 [ N ~ LogNormal(mean=1, std=1) ]
B2 := f(N, A1, A2, A3) = A1*A2 + A3*N	 [ N ~ LogNormal(mean=1, std=1) ]
B3 := f(N, A3, A4) = A3 + A4*N	 [ N ~ Benini(alpha=0.100000000000000, beta=0.400000000000000, sigma=0.400000000000000) ]
B4 := f(N, A3, A1, A4) = A3**2 - sqrt(A1 + A4*N)	 [ N ~ Normal(mean=2, std=1) ]
C1 := f(N, A1, A3, B1) = A1*A3*0.5 + B1*N	 [ N ~ Beta(alpha=0.500000000000000, beta=0.500000000000000) ]
C2 := f(N, A1, A2, B2) = A1*A2     + B2*N	 [ N ~ Beta(alpha=0.500000000000000, beta=0.500000000000000) ]
C3 := f(N, A1, B3) = A1*B3     + B3*N	 [ N ~ Beta(alpha=0.500000000000000, beta=0.5000000000000

In [15]:
# Small 7-variable model, declared as scmodels assignment strings
# ("<var> = <expression>, N ~ <noise distribution>").
simple_assignments = [
    # roots: pure Gaussian noise
    "A1 = N, N ~ Normal(mean=2, std=1)",
    "A2 = N, N ~ Normal(mean=0, std=1)",
    # first layer: additive and multiplicative combinations of the roots
    "B1 = A1 + A2 + N, N ~ Normal(mean=0, std=1)",
    "B2 = A1 * A2 * N, N ~ Normal(mean=0, std=1)",
    # second layer: C1 is noisy, C2 has no noise term of its own
    "C1 = B1 + B2 + N, N ~ Normal(mean=0, std=1)",
    "C2 = B1 + B2",
    # binary variable with no parents
    "D1 = N, N ~ Bernoulli(p=0.8)",
]

simple_scm = SCM(simple_assignments)
print(simple_scm)

Structural Causal Model of 7 variables: A1, A2, D1, B1, B2, C1, C2
Variables with active interventions: []
Assignments:
A1 := f(N) = N	 [ N ~ Normal(mean=2, std=1) ]
A2 := f(N) = N	 [ N ~ Normal(mean=0, std=1) ]
B1 := f(N, A1, A2) = A1 + A2 + N	 [ N ~ Normal(mean=0, std=1) ]
B2 := f(N, A1, A2) = A1 * A2 * N	 [ N ~ Normal(mean=0, std=1) ]
C1 := f(N, B1, B2) = B1 + B2 + N	 [ N ~ Normal(mean=0, std=1) ]
C2 := f(B1, B2) = B1 + B2
D1 := f(N) = N	 [ N ~ Bernoulli(p=0.800000000000000, succ=1, fail=0) ]


In [18]:
# Preview: draw 10 joint samples from the small model; the result is displayed
# as a table with one column per variable.
# The deprecation warning printed with the output quotes a
# sympy.stats.sample(..., numsamples=n, ...) call as its source line, i.e. it
# is not triggered by an argument written in this cell.
simple_scm.sample(10)


The numsamples parameter to sympy.stats.sample() is deprecated.
Either use a list comprehension, like

[sample(...) for i in range(10)]

or add a dimension to size, like

sample(..., size=(10,))

See https://docs.sympy.org/latest/explanation/active-deprecations.html#deprecated-sympy-stats-numsamples
for details.

This has been deprecated since SymPy version 1.9. It
will be removed in a future version of SymPy.

  list(sample(noise_gen, numsamples=n, seed=seed)), dtype=float


Unnamed: 0,A1,A2,D1,B1,B2,C1,C2
0,1.577068,0.386635,0.0,1.769287,0.678207,2.312114,2.447494
1,2.647351,0.793169,1.0,2.612836,-1.842925,2.053882,0.76991
2,2.24233,0.443557,0.0,3.530265,-1.889432,1.95024,1.640833
3,1.682866,-1.250713,1.0,0.996758,2.352043,2.848457,3.348801
4,2.427859,0.029516,1.0,2.630466,-0.002203,3.778266,2.628263
5,1.831662,-1.347743,1.0,1.293677,-0.157169,0.998696,1.136508
6,1.172411,-0.664964,1.0,0.526395,0.969788,2.54967,1.496183
7,2.472986,-0.676057,1.0,0.470703,3.430889,3.428561,3.901592
8,2.90902,-0.706356,1.0,3.020703,3.230326,6.146821,6.25103
9,2.757263,-0.761453,0.0,1.449719,1.462665,1.596173,2.912384


## Generate the large dataset

In [30]:
# Generate the dataset that is later summarised and written to disk.
#   n    -- number of joint samples (rows) to draw from simple_scm
#   seed -- fixed RNG seed so that Restart & Run All reproduces the exact same
#           rows; without it every run produced a different dataset
# Values are rounded to 2 decimals before any further use.
# NOTE(review): relies on SCM.sample accepting a `seed` keyword -- confirm
# against the installed scmodels version.
n = 50000
seed = 0
d = simple_scm.sample(n, seed=seed).round(2)


The numsamples parameter to sympy.stats.sample() is deprecated.
Either use a list comprehension, like

[sample(...) for i in range(50000)]

or add a dimension to size, like

sample(..., size=(50000,))

See https://docs.sympy.org/latest/explanation/active-deprecations.html#deprecated-sympy-stats-numsamples
for details.

This has been deprecated since SymPy version 1.9. It
will be removed in a future version of SymPy.

  list(sample(noise_gen, numsamples=n, seed=seed)), dtype=float


In [31]:
# Per-column summary statistics of the sampled dataset, rounded to 2 decimals
# for display.
d.describe().round(2)

Unnamed: 0,A1,A2,D1,B1,B2,C1,C2
count,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0
mean,2.0,0.0,0.8,2.0,0.01,2.0,2.0
std,1.0,1.0,0.4,1.73,2.27,3.02,2.85
min,-2.39,-3.79,0.0,-6.61,-25.64,-26.85,-24.68
25%,1.33,-0.68,1.0,0.83,-0.62,0.25,0.46
50%,2.0,0.01,1.0,2.0,0.0,1.89,1.87
75%,2.67,0.68,1.0,3.17,0.62,3.55,3.35
max,5.93,4.17,1.0,8.61,29.01,35.84,35.84


In [32]:
# Show the first 10 rows of the sampled dataset.
d.head(10)

Unnamed: 0,A1,A2,D1,B1,B2,C1,C2
0,3.07,2.52,0.0,5.99,7.93,11.81,13.92
1,1.41,-0.14,1.0,-0.6,-0.11,-0.84,-0.71
2,2.47,0.51,0.0,3.26,-0.65,2.31,2.61
3,3.51,-1.51,1.0,2.88,-4.02,-1.7,-1.14
4,1.91,0.74,0.0,2.47,-0.08,1.57,2.39
5,2.62,-0.03,1.0,1.16,-0.07,2.0,1.08
6,3.87,0.18,1.0,3.54,-0.13,3.16,3.41
7,2.63,1.44,1.0,5.41,1.65,8.32,7.06
8,1.59,-0.88,0.0,0.97,-0.65,0.6,0.32
9,3.14,-0.01,1.0,3.23,-0.03,1.9,3.21


In [33]:
# Write the sampled dataset to 'simple_scm.csv' (a relative path, so it is
# resolved against the current working directory).
# No `index` argument is passed, so pandas' default applies and the row index
# is written as the first, unnamed column of the file.
d.to_csv('simple_scm.csv')