# Stimulus batching

Creates metadata for corpus collection on gadget and structure domains.

Partitions train/test data into batches of 25 stimuli (20 train and 5 test).

Multiple families of batches can be specified.

In [2]:
from __future__ import division

import numpy as np
import os, sys
from PIL import Image
import pandas as pd
import json
import pickle

from matplotlib import pylab, mlab, pyplot
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
from matplotlib.path import Path
import matplotlib.patches as patches
%matplotlib inline

from IPython.core.pylabtools import figsize, getfigs

import seaborn as sns
from sklearn.model_selection import StratifiedKFold

import random

from scipy.stats import norm
from IPython.display import clear_output

import copy
import importlib

# import urllib library
from urllib.request import urlopen

### Add Paths

## root paths
curr_dir = os.getcwd()
# the project root is the parent of the working directory (use relative paths)
proj_dir = os.path.abspath(os.path.join(curr_dir, os.pardir))

## make the helpers under <proj_dir>/stimuli importable (appended at most once)
import sys
stimuli_dir = os.path.join(proj_dir, 'stimuli')
if stimuli_dir not in sys.path:
    sys.path.append(stimuli_dir)

### Data storage setup

In [3]:
# URL template for the S3 bucket that hosts the stimuli; the two placeholders
# are filled with (domain, subdomain) when the bucket path is generated.
s3_bucket_path_template = "https://lax-{}-{}-all.s3.amazonaws.com/"

### Structure setup

In [180]:
# Configuration for the structures domain.
# NOTE: the gadgets setup cell assigns the same three names (domain,
# subdomains, subdomain); whichever setup cell ran last wins, so run only the
# one for the domain you want before loading the stimulus data.
domain = 'structures'
subdomains = ['bridge', 'castle', 'house', 'city']

# select subdomain (index into the list above; 1 -> 'castle')
subdomain = subdomains[1]

### Gadgets setup

In [169]:
# Configuration for the gadgets ('drawing') domain.
# NOTE: the structures setup cell assigns the same three names (domain,
# subdomains, subdomain); whichever setup cell ran last wins, so run only the
# one for the domain you want before loading the stimulus data.
domain = 'drawing'
subdomains = ['nuts-bolts','wheels','furniture','dials']

# select subdomain (index into the list above; 0 -> 'nuts-bolts')
subdomain = subdomains[0]

### Get stimulus data

In [181]:
# name of the experiment; also used as the mongo collection name further down
experiment_name = 'lax_{}_{}_all'.format(domain, subdomain)

# generate bucket path
s3_bucket_path = s3_bucket_path_template.format(domain, subdomain)

# Load the stimulus manifest and normalise it so that both domains expose the
# same two columns downstream: 'group' (split label) and 'stim_id'.
if domain == 'structures':
    # read manifest data
    df = pd.read_csv(s3_bucket_path + 'df_{}.csv'.format(subdomain))

    # assign grouping column: boolean 'train' flag -> 'train' / 'test' label
    df['group'] = np.where(df['train'], 'train', 'test')

    # assign id column
    df['stim_id'] = df['structure_number']

elif domain == 'drawing':
    # read manifest data (the file name is fixed, so no formatting is needed)
    df = pd.read_csv(s3_bucket_path + 'manifest.csv')

    # assign grouping column
    df['group'] = df['data_split']

    # assign id column: keep only the last three characters of the manifest id
    df['stim_id'] = df['stim_id'].apply(lambda x: x[-3:])

else:
    # fail loudly instead of leaving `df` undefined (or stale from an earlier run)
    raise ValueError(
        "Unknown domain {!r}; expected 'structures' or 'drawing'.".format(domain))

In [182]:
df

Unnamed: 0.1,Unnamed: 0,structure_type,blocks,structure_width,structure_height,n_blocks,wall_height,wall_size,tower_height,roof_name,central_roof_size,tower_roof_size,dreamcoder_program,program_whole_squares,structure_number,train,group,stim_id
0,0,castle,"[{'x': 0, 'y': 0, 'height': 1, 'width': 2}, {'...",14.0,7.0,40.0,1.0,2.0,1.0,dome,6.0,4.0,(h h (r 1) t (l 2) t (r 6) t (l 2) t (r 1) h h...,(h h (r 1) t (l 1) t (r 3) t (l 1) t h h (l 1)...,0,True,train,0
1,1,castle,"[{'x': 1, 'y': 0, 'height': 2, 'width': 1}, {'...",16.0,8.0,37.0,1.0,2.0,1.0,dome,8.0,4.0,((r 1) t (r 3) h (l 1) t (l 3) h (r 2) h (l 2)...,((r 1) t (r 1) h t (l 2) h (r 1) h (l 1) h (r ...,1,True,train,1
2,2,castle,"[{'x': 1, 'y': 0, 'height': 2, 'width': 1}, {'...",16.0,8.0,41.0,1.0,2.0,1.0,dome,8.0,4.0,((r 1) t (r 2) t (l 3) h (r 4) h (l 2) h (l 2)...,((r 1) t (r 1) t (l 2) h (r 2) h (l 1) h (l 1)...,2,True,train,2
3,3,castle,"[{'x': 0, 'y': 0, 'height': 1, 'width': 2}, {'...",16.0,11.0,45.0,1.0,2.0,1.0,dome,8.0,4.0,(h (r 3) t (l 4) t (r 5) h (l 2) h (l 2) h (r ...,(h (r 2) t (l 2) t (r 2) h (l 1) h (l 1) h (r ...,3,True,train,3
4,4,castle,"[{'x': 0, 'y': 0, 'height': 1, 'width': 2}, {'...",14.0,7.0,32.0,1.0,2.0,1.0,dome,6.0,4.0,(h (r 3) t (l 4) t (r 5) h (l 2) h (l 2) h (r ...,(h (r 2) t (l 2) t (r 2) h (l 1) h (l 1) h (r ...,4,True,train,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
245,245,castle,"[{'x': 0, 'y': 0, 'height': 1, 'width': 2}, {'...",20.0,18.0,107.0,4.0,4.0,1.0,pyramid,12.0,4.0,(h (r 4) h (l 5) t t (r 6) t t (l 5) h (r 4) h...,(h (r 2) h (l 2) t t (r 3) t t (l 3) h (r 2) h...,245,True,train,245
246,246,castle,"[{'x': 0, 'y': 0, 'height': 2, 'width': 1}, {'...",20.0,18.0,99.0,4.0,4.0,1.0,pyramid,12.0,4.0,((l 1) t (r 6) t (l 5) h (r 4) h (l 4) h (r 4)...,(t (r 3) t (l 3) h (r 2) h (l 2) h (r 2) h (l ...,246,False,test,246
247,247,castle,"[{'x': 1, 'y': 0, 'height': 2, 'width': 1}, {'...",20.0,18.0,107.0,4.0,4.0,2.0,pyramid,12.0,4.0,((r 1) t (r 2) t (l 3) h (r 4) h (l 3) t (r 2)...,((r 1) t (r 1) t (l 2) h (r 2) h (l 1) t (r 1)...,247,True,train,247
248,248,castle,"[{'x': 0, 'y': 0, 'height': 1, 'width': 2}, {'...",20.0,18.0,127.0,4.0,4.0,3.0,pyramid,12.0,4.0,(h (r 4) h (l 5) t (r 6) t (l 5) h (r 4) h (l ...,(h (r 2) h (l 2) t (r 3) t (l 3) h (r 2) h (l ...,248,True,train,248


## Generate splits

In [158]:
# pair every stimulus id with its split label, then count stimuli per split
groups = df.loc[:, ['stim_id', 'group']]
groups['group'].value_counts()

train    200
test      50
Name: group, dtype: int64

In [159]:
# Partition the stimuli into `n_splits` batches, once per family.
# Each family is an independent shuffle (seeded by the family number); the
# batches are the test folds of a k-fold that is stratified on 'group', so each
# batch keeps the overall train/test proportion of the manifest.
n_families = 1
n_splits = 10
X = df['stim_id']
y = df['group']

for family in range(n_families):

    # set up partitioning
    skf = StratifiedKFold(n_splits=n_splits,
                          random_state=family,  # tie random state to family number
                          shuffle=True)

    # skf.split yields *positional* row indices, so collect the batch number of
    # every row in a plain array and assign the whole column at once. Indexing
    # df.loc with those positions would only be correct for a default RangeIndex.
    split_of_row = np.full(len(df), -1, dtype=int)
    for split_num, (_, test_index) in enumerate(skf.split(X, y)):
        split_of_row[test_index] = split_num

    # every row lands in exactly one test fold, so the column is fully populated
    # (and stays integer-typed instead of becoming NaN-filled float)
    df['family_' + str(family)] = split_of_row


## To run multiple versions, upload the same metadata to separate collections, and update stimColName in configs accordingly

In [160]:
# version tag that is copied into every metadata record under 'versionInd'
versionInd = 0

In [183]:
# connect to mongo

import pymongo as pm
from urllib.parse import quote_plus

# set vars
auth = pd.read_csv('../auth.txt', header = None) # this auth.txt file contains the password for the sketchloop user
pswd = auth.values[0][0]
user = 'sketchloop'
host = 'cogtoolslab.org' ## experiment server ip address

# NOTE(review): `host` is not used for the connection -- the client talks to
# 127.0.0.1, presumably through an SSH tunnel to the experiment server; confirm.
# Credentials are percent-escaped because reserved characters (':', '/', '@',
# '+', '%', ...) in a user name or password are otherwise rejected or mis-decoded
# when the URI is parsed.
# NOTE(review): this assumes auth.txt holds the raw (unescaped) password.
conn = pm.MongoClient('mongodb://{}:{}@127.0.0.1'.format(quote_plus(user),
                                                        quote_plus(str(pswd))))
db = conn['stimuli']
# one collection per experiment, named after experiment_name
coll = db[experiment_name]

In [162]:
s3_bucket_path

'https://lax-drawing-furniture-all.s3.amazonaws.com/'

In [163]:
# convert to lists of stimulus names
# Builds one metadata record per (family, split) batch: the zero-padded stimulus
# ids in the batch, their image URLs, their split labels and bookkeeping fields.

metadata = []

for f in range(n_families):
    # group once per family instead of once per stimulus
    family_splits = df.groupby('family_' + str(f))

    for s in range(n_splits):
        split_df = family_splits.get_group(s)

        # ids are zero-padded to three characters to match the image file names
        stimIDs = [str(stim_id).zfill(3) for stim_id in split_df['stim_id']]

        metadata.append(
            {
                'partitionFamily': f,
                'splitNumber': s,
                'stimIDs': stimIDs,
                'stimURLS': [s3_bucket_path + "lax-{}-{}-{}-all.png".format(domain,
                                                              subdomain,
                                                              stimID) for stimID in stimIDs],
                'ntrials': len(stimIDs),
                # stimulus id -> split label, paired row by row within the batch
                'stimGroups': dict(zip(stimIDs, split_df['group'])),
                'numGames': 0,
                'games': [],
                'experimentType': 'corpus_collection',
                'experimentName': experiment_name,
                's3_bucket_url': s3_bucket_path,
                'versionInd': versionInd
            })

In [164]:
metadata

[{'partitionFamily': 0,
  'splitNumber': 0,
  'stimIDs': ['004',
   '009',
   '034',
   '046',
   '047',
   '056',
   '070',
   '071',
   '090',
   '094',
   '111',
   '113',
   '126',
   '132',
   '143',
   '145',
   '154',
   '161',
   '166',
   '171',
   '199',
   '216',
   '228',
   '235',
   '243'],
  'stimURLS': ['https://lax-drawing-furniture-all.s3.amazonaws.com/lax-drawing-furniture-004-all.png',
   'https://lax-drawing-furniture-all.s3.amazonaws.com/lax-drawing-furniture-009-all.png',
   'https://lax-drawing-furniture-all.s3.amazonaws.com/lax-drawing-furniture-034-all.png',
   'https://lax-drawing-furniture-all.s3.amazonaws.com/lax-drawing-furniture-046-all.png',
   'https://lax-drawing-furniture-all.s3.amazonaws.com/lax-drawing-furniture-047-all.png',
   'https://lax-drawing-furniture-all.s3.amazonaws.com/lax-drawing-furniture-056-all.png',
   'https://lax-drawing-furniture-all.s3.amazonaws.com/lax-drawing-furniture-070-all.png',
   'https://lax-drawing-furniture-all.s3.amaz

In [165]:
experiment_name

'lax_drawing_furniture_all'

In [166]:
# Clear metadata collection
# (destructive: drops the whole collection stored under this experiment's name)

really_run = True

if really_run:
    db.drop_collection(experiment_name)

In [167]:
## insert every batch record into the mongo collection, one document per record
## (set reallyRun to False to skip the upload)
reallyRun = True
if not reallyRun:
    print('Did not insert any new data.')
else:
    for record in metadata:
        coll.insert_one(record)
        print('Inserted version {} of stimDict.'.format(record['versionInd']))
        # keep only the most recent progress line in the cell output
        clear_output(wait=True)

Inserted version 0 of stimDict.


In [185]:
list(coll.find())

[{'_id': ObjectId('615b35a11e075f60ec090570'),
  'partitionFamily': 0,
  'splitNumber': 0,
  'stimIDs': ['000',
   '006',
   '008',
   '016',
   '021',
   '032',
   '035',
   '036',
   '044',
   '054',
   '075',
   '095',
   '106',
   '111',
   '125',
   '137',
   '145',
   '153',
   '157',
   '168',
   '177',
   '178',
   '186',
   '238',
   '241'],
  'stimURLS': ['https://lax-structures-castle-all.s3.amazonaws.com/lax-structures-castle-000-all.png',
   'https://lax-structures-castle-all.s3.amazonaws.com/lax-structures-castle-006-all.png',
   'https://lax-structures-castle-all.s3.amazonaws.com/lax-structures-castle-008-all.png',
   'https://lax-structures-castle-all.s3.amazonaws.com/lax-structures-castle-016-all.png',
   'https://lax-structures-castle-all.s3.amazonaws.com/lax-structures-castle-021-all.png',
   'https://lax-structures-castle-all.s3.amazonaws.com/lax-structures-castle-032-all.png',
   'https://lax-structures-castle-all.s3.amazonaws.com/lax-structures-castle-035-all.png'

## Test which versions have been run

This grabs the dataframe created by the data generator notebook to see which records need to be run more times.

It wipes the metadata from mongo, and replaces it with individual records for each additional partition that needs to be run.

In [125]:
# which experiment version's results to load; this string is spliced into the
# name of the results CSV that is read below
version = 'procedural'
# version = 'categorization'

In [126]:
# WARNING: THIS JUST GRABS A LOCAL CSV. ENSURE YOU HAVE CREATED THIS WITH DATA GENERATOR NOTEBOOK

run_results_through_data_generator = True

if run_results_through_data_generator:
    # the trial-level results live in a CSV whose name embeds the chosen version
    results_csv_directory = "../../results/csv"
    result_csv = ''.join(['lax-tower-4-block-unique-silhouettes-', version, '-pilot_3.csv'])

    results_csv_path = os.path.join(results_csv_directory, result_csv)
    df_trial = pd.read_csv(results_csv_path)

In [127]:
# count the non-null 'datatype' entries per (partitionFamily, splitNumber)
# and divide by 13
# NOTE(review): 13 is presumably the number of rows one completed session
# contributes to the results CSV -- confirm
complete_counts = (
    df_trial
    .groupby(['partitionFamily', 'splitNumber'])['datatype']
    .count()
    .div(13)
    .reset_index()
)
complete_counts

Unnamed: 0,partitionFamily,splitNumber,datatype
0,0.0,0.0,6.0
1,0.0,1.0,5.0
2,0.0,2.0,5.0
3,0.0,3.0,5.0
4,0.0,4.0,5.0
5,0.0,5.0,5.0
6,0.0,6.0,5.0
7,0.0,7.0,5.0
8,0.0,8.0,5.0
9,0.0,9.0,5.0


In [128]:
# add a record in metadata for each additional time it needs to be run
# For every (family, split) batch, compare the number of completed runs with
# n_expected and append one top-up record per missing run.

n_expected = 5

extra_metadata = []

for f in range(n_families):
    for s in range(n_splits):

        completed = complete_counts.loc[
            (complete_counts.partitionFamily == f) & (complete_counts.splitNumber == s),
            'datatype']
        # a batch with no rows in the results has been completed zero times
        n_completed = completed.iloc[0] if len(completed) else 0

        # nothing to top up for this batch; skipping here also means the
        # stimulus frame below is only touched when it is actually needed
        if not (n_completed < n_expected):
            continue

        # NOTE(review): `silhouette_superset` is not defined in this notebook;
        # it must already exist in the kernel for any top-up to be generated.
        # The batch contents do not change between top-ups, so look them up once.
        split_df = silhouette_superset.groupby('family_' + str(f)).get_group(s)
        stim_numbers = list(split_df['tower_number_str'])
        stim_groups = dict(zip(stim_numbers, split_df['group']))

        i = n_completed

        while i < n_expected:

            i = i + 1

            extra_metadata.append(
                {
                    'partitionFamily': f,
                    'splitNumber': s,
                    # fresh copies so that records never share mutable state
                    'stimNumbers': list(stim_numbers),
                    'ntrials': len(stim_numbers),
                    'stimGroups': dict(stim_groups),
                    'numGames': 0,
                    'games': [],
                    'experimentType': 'prior_elicitation',
                    'experimentName': experiment_name,
                    's3_bucket': 'lax-tower-4-block-unique-silhouettes-json',
                    'versionInd': versionInd,
                    'extra_metadata_index': i
                })

In [129]:
# report how many top-up records were generated
print('{} extra records to upload'.format(len(extra_metadata)))

0 extra records to upload


In [174]:
# Delete metadata from db
# Guard flag: the collection is only dropped when this is set to True, so as
# written this cell leaves the existing records in place.
really_run = False

if really_run:
    # drops the whole collection named by experiment_name
    db.drop_collection(experiment_name)

In [175]:
## insert every top-up record into the mongo collection, one document per record
## (set reallyRun to False to skip the upload)
reallyRun = True
if not reallyRun:
    print('Did not insert any new data.')
else:
    for record in extra_metadata:
        coll.insert_one(record)
        print('Inserted version {} of stimDict.'.format(record['versionInd']))
        # keep only the most recent progress line in the cell output
        clear_output(wait=True)

NameError: name 'extra_metadata' is not defined

In [177]:
list(coll.find())

[{'_id': ObjectId('615b36761e075f60ec090591'),
  'partitionFamily': 0,
  'splitNumber': 0,
  'stimIDs': ['004',
   '009',
   '034',
   '046',
   '047',
   '056',
   '070',
   '071',
   '090',
   '094',
   '111',
   '113',
   '126',
   '132',
   '143',
   '145',
   '154',
   '161',
   '166',
   '171',
   '199',
   '216',
   '228',
   '235',
   '243'],
  'stimURLS': ['https://lax-drawing-nuts-bolts-all.s3.amazonaws.com/lax-drawing-nuts-bolts-004-all.png',
   'https://lax-drawing-nuts-bolts-all.s3.amazonaws.com/lax-drawing-nuts-bolts-009-all.png',
   'https://lax-drawing-nuts-bolts-all.s3.amazonaws.com/lax-drawing-nuts-bolts-034-all.png',
   'https://lax-drawing-nuts-bolts-all.s3.amazonaws.com/lax-drawing-nuts-bolts-046-all.png',
   'https://lax-drawing-nuts-bolts-all.s3.amazonaws.com/lax-drawing-nuts-bolts-047-all.png',
   'https://lax-drawing-nuts-bolts-all.s3.amazonaws.com/lax-drawing-nuts-bolts-056-all.png',
   'https://lax-drawing-nuts-bolts-all.s3.amazonaws.com/lax-drawing-nuts-bolt

In [None]:
experiment_name

In [97]:
# metadata_pre_top_up = pd.DataFrame(coll.find())

# metadata_pre_top_up.to_csv('./metadata/first_batch_procedural.csv')

In [64]:
experiment_name

'ca_prior_elicitation_4_block_unique_silhouettes_categorization'