# Meta-config generation for synthetic data

We generate configuration files that are, in turn, used to generate synthetic data. The entire process is seeded so that it is reproducible, which is useful for testing the pipeline. The working directory contains one sub-directory per number of covariates.
In each of these directories, we generate a set of configurations that produce causal graphs with that number of covariates. The functional form of each covariate is one of the following:

1. Linear non-Gaussian datasets
2. Non-linear Gaussian datasets with parametric assumptions
    1. The invertible function is a polynomial of degree 3
    2. The invertible function is $x + \sin(x)$
3. Non-linear Gaussian datasets with no parametric assumptions

We will explain in detail how we generate each of these datasets. The causal graph generators are separate entities and we will sample from three different causal graph types:

1. Chains (only one correct ordering and really sparse)
2. Stars (many correct orderings and sparse)
3. Erdős–Rényi (many correct orderings and dense)

In [15]:
import numpy as np

def generate_graph_generator_args(n: int):
    """Randomly pick the arguments handed to the graph generator.

    Two values are always drawn from NumPy's global random state, in this
    order: the graph seed (in ``[0, 1000)``) and a graph-family index (in
    ``[0, 5)``).  A third draw, a coin flip, only happens when the family
    index is 1 and ``n > 2``.

    Args:
        n: Number of covariates (nodes) of the graph.

    Returns:
        A dict with the keys ``'n'``, ``'seed'`` and ``'graph_type'``.  The
        graph type is one of ``'chain'``, ``'fork'``, ``'v_structure'``,
        ``'full'`` or ``'erdos_renyi'``; the latter additionally carries
        ``'p'`` fixed to 0.4.  Graphs with ``n <= 2`` are always chains.
    """
    # The order of these two draws is part of the reproducible behaviour.
    seed = np.random.randint(1000)
    family = np.random.randint(5)
    args = {'n': n, 'seed': seed}

    # Tiny graphs are forced to be chains regardless of the family drawn.
    if n <= 2 or family == 0:
        args['graph_type'] = 'chain'
        return args

    if family == 1:
        # Coin flip between the two orientations.
        coin = np.random.randint(2)
        args['graph_type'] = 'fork' if coin else 'v_structure'
        return args

    if family == 2:
        args['graph_type'] = 'full'
        return args

    if family == 3:
        args['graph_type'] = 'erdos_renyi'
        args['p'] = 0.4
        return args

    # family == 4: a second route to 'fork', which makes forks more frequent.
    args['graph_type'] = 'fork'
    return args

Set the following for reproducibility:

In [16]:
import typing as th
from ruamel import yaml
# Seed NumPy's global random state once. Every generator function below draws
# from this shared stream via np.random.randint, so the written configurations
# are only reproducible when the cells are run in order starting from this one.
np.random.seed(100)

## Functional form Generator

### Linear non-Gaussian datasets

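In these datasets, each covariate is a linear function of its parents plus additive non-Gaussian noise, $x_i = \sum_{j \in \mathrm{pa}(i)} w_{ij} x_j + \epsilon_i$, where the noise $\epsilon_i$ is either Laplace or uniform.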

In [17]:
def generate_linear_non_gaussan_configurations(n_cov: th.List[int],
                                               n_configs: th.List[int],
                                               observation_sizes: th.List[int]):
    """Write YAML data-module configurations for linear non-Gaussian SCMs.

    The three arguments are parallel sequences: position ``i`` of each one
    describes a single group.  For every group, ``n_configs[i]``
    configurations are drawn; each one gets random graph arguments (from
    ``generate_graph_generator_args``), a random noise type (``"laplace"`` or
    ``"uniform"``) and a random dataset seed, all taken from NumPy's global
    random state.

    Every configuration is written to the current working directory as
    ``linear_non_gaussian_{n}_{observation_size}_{graph_type}_{noise_type}.yaml``.

    NOTE(review): the file name does not contain the seeds, so two draws of
    the same group that share graph type and noise type overwrite each other
    and fewer than ``n_configs[i]`` files may remain -- confirm this is
    intended.

    Args:
        n_cov: Number of covariates for each group.
        n_configs: Number of configurations to draw for each group.
        observation_sizes: Value stored as ``observation_size`` in every
            configuration of the group.

    Raises:
        ValueError: If the three sequences differ in length (``zip`` would
            otherwise silently drop the unmatched trailing groups).
    """
    n_cov = list(n_cov)
    n_configs = list(n_configs)
    observation_sizes = list(observation_sizes)
    if not (len(n_cov) == len(n_configs) == len(observation_sizes)):
        raise ValueError(
            "n_cov, n_configs and observation_sizes must have the same length, "
            f"got {len(n_cov)}, {len(n_configs)} and {len(observation_sizes)}"
        )

    for n, observation_size, n_config in zip(n_cov, observation_sizes, n_configs):
        for _ in range(n_config):
            # Random draws happen in a fixed order (graph args, noise type,
            # dataset seed) so that a seeded run stays reproducible.
            scm_generator_args = {}
            scm_generator_args['graph_generator'] = 'ocd.data.scm.GraphGenerator'
            scm_generator_args['graph_generator_args'] = generate_graph_generator_args(n)
            scm_generator_args['weight'] = [-1.0, 1.0]

            scm_generator_args['noise_type'] = "laplace" if np.random.randint(2) == 0 else "uniform"

            scm_generator = 'ocd.data.synthetic.LinearNonGaussianSCMGenerator'

            dataset_args = {
                'seed': np.random.randint(1000),
                'scm_generator': scm_generator,
                'scm_generator_args': scm_generator_args,
                'observation_size': observation_size
            }

            conf = {
                'class_path': 'lightning_toolbox.DataModule',
                'init_args': {
                    'dataset': 'ocd.data.SyntheticOCDDataset',
                    'dataset_args': dataset_args,
                    'val_size': 0.1,
                    'batch_size': 128,
                },
            }

            conf_name = f"linear_non_gaussian_{n}_{observation_size}_{scm_generator_args['graph_generator_args']['graph_type']}_{scm_generator_args['noise_type']}.yaml"

            # NOTE(review): module-level safe_dump is the legacy ruamel.yaml
            # API; newer releases may no longer provide it -- confirm the
            # pinned version.
            with open(conf_name, 'w') as f:
                yaml.safe_dump(conf, f, indent=4)
                

            

In [17]:
# Parallel lists, one position per experiment size: the number of covariates,
# how many configurations to draw for it, and the `observation_size` written
# into each of those configurations.
ns, n_configs, observation_size = (
    [2, 3, 4, 5, 10, 25, 50, 100],
    [30, 30, 30, 30, 15, 10, 3, 3],
    [500, 500, 1000, 1000, 10000, 10000, 10000, 10000],
)

generate_linear_non_gaussan_configurations(
    n_cov=ns,
    n_configs=n_configs,
    observation_sizes=observation_size,
)

## Parametric non-linear Gaussian datasets


In [7]:

# Source text of the softplus function stored under 's_function' in the
# generated configurations; the text refers to `numpy` by its full name.
# NOTE(review): this naive form overflows in numpy.exp for large x;
# numpy.logaddexp(0, x) is the numerically stable equivalent, but switching
# would change the emitted configs -- confirm before changing.
s_func = (
    "def func(x):\n"
    "    return numpy.log(1 + numpy.exp(x))"
)

def get_t_func_1(t):
    """Return the source text of a standardise-cube-shift function.

    The returned text defines ``func(x)``, which standardises ``x`` with its
    own mean and standard deviation (a zero standard deviation is replaced by
    1), cubes the result and adds the constant ``t``.

    Args:
        t: Offset that is formatted literally into the generated source.

    Returns:
        The function definition as a string; it refers to ``numpy`` by name.
    """
    source_lines = [
        "def func(x):",
        "    x_mean = numpy.mean(x)",
        "    x_std = numpy.std(x)",
        "    if x_std == 0:",
        "        x_std = 1",
        "    x = (x - x_mean) / x_std",
        "    return x**3 + {}".format(t),
    ]
    return "\n".join(source_lines)

def get_t_func_2():
    """Return the source text of ``func(x) = sin(x) + x``.

    Returns:
        The function definition as a string; it refers to ``numpy`` by name.
    """
    header = "def func(x):"
    body = "    return numpy.sin(x) + x"
    return header + "\n" + body
    
def non_linear_gaussan_configurations(n_cov: th.List[int],
                                      n_configs: th.List[int],
                                      observation_sizes: th.List[int]):
    """Write YAML data-module configurations for parametric non-linear SCMs.

    The three arguments are parallel sequences: position ``i`` of each one
    describes a single group.  For every group, ``n_configs[i]``
    configurations are drawn.  Each one gets random graph arguments (from
    ``generate_graph_generator_args``), the module-level ``s_func`` source as
    ``s_function`` and, chosen by a coin flip, either ``get_t_func_1`` with a
    random offset in ``[0, 10)`` or ``get_t_func_2`` as ``t_function``.  All
    random values come from NumPy's global random state.

    Every configuration is written to the current working directory as
    ``parametric_non_linear_gaussian_{n}_{observation_size}_{graph_type}_{t_name}.yaml``
    where ``t_name`` is ``cube_dislocate`` or ``sin_plus_x``.

    NOTE(review): the file name does not contain the seeds or the offset, so
    two draws of the same group that share graph type and function type
    overwrite each other and fewer than ``n_configs[i]`` files may remain --
    confirm this is intended.

    Args:
        n_cov: Number of covariates for each group.
        n_configs: Number of configurations to draw for each group.
        observation_sizes: Value stored as ``observation_size`` in every
            configuration of the group.

    Raises:
        ValueError: If the three sequences differ in length (``zip`` would
            otherwise silently drop the unmatched trailing groups).
    """
    n_cov = list(n_cov)
    n_configs = list(n_configs)
    observation_sizes = list(observation_sizes)
    if not (len(n_cov) == len(n_configs) == len(observation_sizes)):
        raise ValueError(
            "n_cov, n_configs and observation_sizes must have the same length, "
            f"got {len(n_cov)}, {len(n_configs)} and {len(observation_sizes)}"
        )

    for n, observation_size, n_config in zip(n_cov, observation_sizes, n_configs):
        for _ in range(n_config):
            # Random draws happen in a fixed order (graph args, function
            # choice, SCM seed, optional offset, dataset seed) so that a
            # seeded run stays reproducible.
            scm_generator_args = {}
            scm_generator_args['graph_generator'] = 'ocd.data.scm.GraphGenerator'
            scm_generator_args['graph_generator_args'] = generate_graph_generator_args(n)

            # 0 -> cube-and-dislocate, 1 -> sin(x) + x
            t_choice = np.random.randint(2)

            scm_generator_args['seed'] = np.random.randint(1000)
            scm_generator_args['std'] = 1.0
            scm_generator_args['mean'] = 0.0
            scm_generator_args['weight_s'] = [0.5, 1.5]
            scm_generator_args['weight_t'] = [0.5, 1.5]
            scm_generator_args['s_function'] = {
                'function_descriptor': s_func,
                'function_of_interest': 'func'
            }
            scm_generator_args['s_function_signature'] = 'softplus'

            if t_choice == 0:
                # Cube and dislocate function
                type_naming = 'cube_dislocate'
                scm_generator_args['t_function'] = {
                    'function_descriptor': get_t_func_1(np.random.randint(10)),
                    'function_of_interest': 'func'
                }
                scm_generator_args['t_function_signature'] = 'cube_and_dislocate'
            else:
                # Sine function
                type_naming = 'sin_plus_x'
                scm_generator_args['t_function'] = {
                    'function_descriptor': get_t_func_2(),
                    'function_of_interest': 'func'
                }
                scm_generator_args['t_function_signature'] = 'sin_plus_x'

            scm_generator = 'ocd.data.synthetic.InvertibleModulatedGaussianSCMGenerator'

            dataset_args = {
                'seed': np.random.randint(1000),
                'scm_generator': scm_generator,
                'scm_generator_args': scm_generator_args,
                'observation_size': observation_size
            }

            conf = {
                'class_path': 'lightning_toolbox.DataModule',
                'init_args': {
                    'dataset': 'ocd.data.SyntheticOCDDataset',
                    'dataset_args': dataset_args,
                    'val_size': 0.1,
                    'batch_size': 128,
                },
            }

            conf_name = f"parametric_non_linear_gaussian_{n}_{observation_size}_{scm_generator_args['graph_generator_args']['graph_type']}_{type_naming}.yaml"

            # NOTE(review): module-level safe_dump is the legacy ruamel.yaml
            # API; newer releases may no longer provide it -- confirm the
            # pinned version.
            with open(conf_name, 'w') as f:
                yaml.safe_dump(conf, f, indent=4)
                

            

In [8]:
# Parallel lists, one position per experiment size: the number of covariates,
# how many configurations to draw for it, and the `observation_size` written
# into each of those configurations.
ns, n_configs, observation_size = (
    [2, 3, 4, 5, 10, 25, 50, 100],
    [10, 10, 10, 10, 10, 10, 3, 3],
    [10000, 10000, 10000, 10000, 10000, 10000, 10000, 10000],
)

non_linear_gaussan_configurations(
    n_cov=ns,
    n_configs=n_configs,
    observation_sizes=observation_size,
)

## Non-parametric generators with Gaussian Processes



In [13]:

# Source text of a softplus function; the text refers to `numpy` by its full
# name.  This cell re-assigns the same value an earlier cell already defined,
# and the generator defined in this cell does not reference it.
s_func = "\n".join([
    "def func(x):",
    "    return numpy.log(1 + numpy.exp(x))",
])

def get_t_func_1(t):
    """Build the source text of a standardise-cube-shift ``func(x)``.

    The generated function standardises ``x`` with its own mean and standard
    deviation (falling back to 1 when the standard deviation is zero), raises
    the result to the third power and adds ``t``.

    Args:
        t: Offset that is formatted literally into the generated source.

    Returns:
        The function definition as a string; it refers to ``numpy`` by name.
    """
    template = (
        "def func(x):\n"
        "    x_mean = numpy.mean(x)\n"
        "    x_std = numpy.std(x)\n"
        "    if x_std == 0:\n"
        "        x_std = 1\n"
        "    x = (x - x_mean) / x_std\n"
        "    return x**3 + {shift}"
    )
    return template.format(shift=t)

def get_t_func_2():
    """Build the source text of ``func(x) = sin(x) + x``.

    Returns:
        The function definition as a string; it refers to ``numpy`` by name.
    """
    return "\n".join((
        "def func(x):",
        "    return numpy.sin(x) + x",
    ))
    
def non_parametric_gaussian_process_generator(
                                    n_cov: th.List[int],
                                    n_configs: th.List[int],
                                    observation_sizes: th.List[int]):
    """Write YAML data-module configurations for Gaussian-process-based SCMs.

    The three arguments are parallel sequences: position ``i`` of each one
    describes a single group.  For every group, ``n_configs[i]``
    configurations are drawn; each one gets random graph arguments (from
    ``generate_graph_generator_args``), fixed noise and RBF-kernel settings,
    and a random dataset seed taken from NumPy's global random state.

    Every configuration is written to the current working directory as
    ``non_parametric_non_linear_gaussian_{n}_{observation_size}_{graph_type}.yaml``.

    NOTE(review): the file name does not contain the seeds, so two draws of
    the same group that share the graph type overwrite each other and fewer
    than ``n_configs[i]`` files may remain -- confirm this is intended.

    Args:
        n_cov: Number of covariates for each group.
        n_configs: Number of configurations to draw for each group.
        observation_sizes: Value stored as ``observation_size`` in every
            configuration of the group.

    Raises:
        ValueError: If the three sequences differ in length (``zip`` would
            otherwise silently drop the unmatched trailing groups).
    """
    n_cov = list(n_cov)
    n_configs = list(n_configs)
    observation_sizes = list(observation_sizes)
    if not (len(n_cov) == len(n_configs) == len(observation_sizes)):
        raise ValueError(
            "n_cov, n_configs and observation_sizes must have the same length, "
            f"got {len(n_cov)}, {len(n_configs)} and {len(observation_sizes)}"
        )

    for n, observation_size, n_config in zip(n_cov, observation_sizes, n_configs):
        for _ in range(n_config):
            # Random draws happen in a fixed order (graph args, then dataset
            # seed) so that a seeded run stays reproducible.
            scm_generator_args = {}
            scm_generator_args['graph_generator'] = 'ocd.data.scm.GraphGenerator'
            scm_generator_args['graph_generator_args'] = generate_graph_generator_args(n)

            scm_generator_args['noise_std'] = 1.0
            scm_generator_args['noise_mean'] = 0.0
            scm_generator_args['s_gamma_rbf_kernel'] = 1.0
            scm_generator_args['s_variance_rbf_kernel'] = 1.0
            scm_generator_args['s_mean_function_weights'] = [0.0, 0.0]
            scm_generator_args['t_gamma_rbf_kernel'] = 1.0
            scm_generator_args['t_variance_rbf_kernel'] = 1.0
            scm_generator_args['t_mean_function_weights'] = [0.0, 0.0]

            # NOTE(review): 'SCMGeberator' looks like a typo of 'SCMGenerator'
            # (the other generators in this file end in 'SCMGenerator') --
            # confirm against ocd.data.synthetic before changing the path.
            scm_generator = 'ocd.data.synthetic.GaussianProcessBasedSCMGeberator'

            dataset_args = {
                'seed': np.random.randint(1000),
                'scm_generator': scm_generator,
                'scm_generator_args': scm_generator_args,
                'observation_size': observation_size
            }

            conf = {
                'class_path': 'lightning_toolbox.DataModule',
                'init_args': {
                    'dataset': 'ocd.data.SyntheticOCDDataset',
                    'dataset_args': dataset_args,
                    'val_size': 0.1,
                    'batch_size': 128,
                },
            }

            conf_name = f"non_parametric_non_linear_gaussian_{n}_{observation_size}_{scm_generator_args['graph_generator_args']['graph_type']}.yaml"

            # NOTE(review): module-level safe_dump is the legacy ruamel.yaml
            # API; newer releases may no longer provide it -- confirm the
            # pinned version.
            with open(conf_name, 'w') as f:
                yaml.safe_dump(conf, f, indent=4)
                

            

In [14]:
# Parallel lists, one position per experiment size: the number of covariates,
# how many configurations to draw for it, and the `observation_size` written
# into each of those configurations.
ns, n_configs, observation_size = (
    [2, 3, 4, 5, 10, 25, 50, 100],
    [10, 10, 10, 10, 10, 10, 3, 3],
    [5000, 5000, 5000, 5000, 5000, 5000, 5000, 5000],
)

non_parametric_gaussian_process_generator(
    n_cov=ns,
    n_configs=n_configs,
    observation_sizes=observation_size,
)