# From simulator to inference with HDDM (LAN version)

In [None]:
# package to help train networks
# !pip install git+https://github.com/AlexanderFengler/LANfactory

In [None]:
# !conda install --quiet --yes scipy

In [1]:
# HDDM
import hddm

# Package to help train networks (explained above)
# import lanfactory

# Package containing simulators for ssms (explained above)
import ssms

# Other misc packages
import os
import numpy as np
from copy import deepcopy
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import torch

In [2]:
# MAKE CONFIGS
from ssms.config import data_generator_config

# Initialize the generator config (for MLP LANs)

# (We start from a supplied example in the ssms package)
# generator_config = deepcopy(data_generator_config['lan']['mlp'])
generator_config = deepcopy(data_generator_config['lan'])

# Specify generative model (one from the list of included models in the ssms package)
generator_config['dgp_list'] = 'ddm'

# Specify number of parameter sets to simulate
generator_config['n_parameter_sets'] = 5000

# Specify how many samples a simulation run should entail
generator_config['n_samples'] = 2000

# Specify how many training examples to extract from
# a single parametervector
generator_config['n_training_examples_by_parameter_set'] = 2000

# Specify folder in which to save generated data
generator_config['output_folder'] = 'lan_to_hddm_tmp_data/lan_mlp/'

# Make model config dict
model_config = ssms.config.model_config['ddm']

# Show
model_config

{'name': 'ddm',
 'params': ['v', 'a', 'z', 't'],
 'param_bounds': [[-3.0, 0.3, 0.1, 0.0], [3.0, 2.5, 0.9, 2.0]],
 'boundary': <function ssms.basic_simulators.boundary_functions.constant(t=0)>,
 'n_params': 4,
 'default_params': [0.0, 1.0, 0.5, 0.001],
 'hddm_include': ['z'],
 'nchoices': 2}

In [3]:
generator_config

{'output_folder': 'lan_to_hddm_tmp_data/lan_mlp/',
 'dgp_list': 'ddm',
 'nbins': 0,
 'n_samples': 2000,
 'n_parameter_sets': 5000,
 'n_parameter_sets_rejected': 100,
 'n_training_samples_by_parameter_set': 1000,
 'max_t': 20.0,
 'delta_t': 0.001,
 'pickleprotocol': 4,
 'n_cpus': 'all',
 'kde_data_mixture_probabilities': [0.8, 0.1, 0.1],
 'simulation_filters': {'mode': 20,
  'choice_cnt': 0,
  'mean_rt': 17,
  'std': 0,
  'mode_cnt_rel': 0.9},
 'negative_rt_cutoff': -66.77497,
 'n_subruns': 10,
 'bin_pointwise': False,
 'separate_response_channels': False,
 'n_training_examples_by_parameter_set': 2000}

In [4]:
my_dataset_generator = ssms.dataset_generators.data_generator(generator_config = generator_config,
                                                              model_config = model_config)

training_data = my_dataset_generator.generate_data_training_uniform(save = True)

n_cpus used:  5
checking:  lan_to_hddm_tmp_data/lan_mlp/
simulation round: 1  of 10
simulation round: 2  of 10
simulation round: 3  of 10
simulation round: 4  of 10
simulation round: 5  of 10
simulation round: 6  of 10
simulation round: 7  of 10
simulation round: 8  of 10
simulation round: 9  of 10
simulation round: 10  of 10
Writing to file:  lan_to_hddm_tmp_data/lan_mlp//training_data_75134f52266511eeb41a0242ac110002.pickle


In [7]:
import pickle

In [8]:
fn = 'lan_to_hddm_tmp_data/lan_mlp/training_data_75134f52266511eeb41a0242ac110002.pickle'
tmp_data = pickle.load(open(fn, "rb"))

Structure of training data:
`data`: Simulated data. Last column is choice and second to last it RT. Columns before are parameters that generated the observation. Has shape (`n_parameter_sets` x `n_training_samples_by_parameter_set`, `n_params` + 2)
`labels`: KDE of likelihood of simulated data. Used (?) as the labels to train the network. Has shape (`n_parameter_sets` x `n_training_samples_by_parameter_set`,)
`choice_p`: Thought these would be choice proportions for each parameter combination but they don't seem to match the proportions in simulated data.
`thetas`: Parameter combinations used to generate simulated data. Has shape (`n_parameter_sets` x `n_params`)
`binned_128`:
`binned_256`:
`generator_config`: Same as defined above
`model_config`: Same as defined above

In [10]:
tmp_data.keys()

dict_keys(['data', 'labels', 'choice_p', 'thetas', 'binned_128', 'binned_256', 'generator_config', 'model_config'])

In [15]:
tmp_data['data'].shape

(5000000, 6)

In [16]:
tmp_data['data']

array([[-1.4388764 ,  0.5708366 ,  0.5322329 ,  1.1107221 ,  1.2620727 ,
        -1.        ],
       [-1.4388764 ,  0.5708366 ,  0.5322329 ,  1.1107221 ,  1.2400848 ,
        -1.        ],
       [-1.4388764 ,  0.5708366 ,  0.5322329 ,  1.1107221 ,  1.3780861 ,
        -1.        ],
       ...,
       [ 2.7743032 ,  0.32321885,  0.84490323,  1.4482005 , -0.9035801 ,
         1.        ],
       [ 2.7743032 ,  0.32321885,  0.84490323,  1.4482005 , -0.53397965,
         1.        ],
       [ 2.7743032 ,  0.32321885,  0.84490323,  1.4482005 , -0.28388363,
         1.        ]], dtype=float32)

In [17]:
tmp_data['thetas']

array([[-1.4388764 ,  0.5708366 ,  0.5322329 ,  1.1107221 ],
       [ 1.6167252 ,  1.3564736 ,  0.32852015,  1.4234009 ],
       [ 0.6532003 ,  1.6370366 ,  0.61324537,  1.4685416 ],
       ...,
       [ 0.9171699 ,  0.6016883 ,  0.73059785,  1.2719735 ],
       [-0.4157134 ,  0.3144693 ,  0.32055506,  1.034292  ],
       [ 2.7743032 ,  0.32321885,  0.84490323,  1.4482005 ]],
      dtype=float32)

In [18]:
tmp_data['thetas'].shape

(5000, 4)

In [19]:
# kde of likelihood for simulated data OR max negative RT
# called "label" because (?) used as labels for training the network    
tmp_data['labels']

array([  0.8377411,   0.8177988,   0.4450194, ..., -66.77497  ,
       -66.77497  , -66.77497  ], dtype=float32)

In [20]:
tmp_data['labels'].shape

(5000000,)

In [39]:
tmp_data['choice_p']

array([0.1845, 0.9575, 0.9445, ..., 0.894 , 0.266 , 0.9755], dtype=float32)

In [49]:
tmp_data['choice_p'].shape

(5000,)

In [53]:
[i[5] for i in tmp_data['data'][0:1000,]].count(1)/1000

0.252

In [57]:
[i[5] for i in tmp_data['data'][(5000000-(999*2)+1):(5000000-999),]].count(1)/1000

0.312

In [None]:
len(tmp)