In [1]:
import pandas as pd
import pymongo
from pprint import pprint

# Basic usage of SACRED with PyMongo

In [15]:
# Open a connection to the MongoDB server running on this machine
from pymongo import MongoClient
client = MongoClient(host='localhost', port=27017)

### List databases, collections, and select from them

In [16]:
# Ask the server which databases it currently holds, then show them
database_names = client.list_database_names()
print('databases:', database_names)

databases: ['ZSdebug', 'ZSdebug-CUB_Xian', 'ZSdebug|CUB_Xian', 'ZSdebug|None', 'admin', 'config', 'hyperopt', 'labwatch_demo_keras', 'local', 'sacred_keras_example', 'sacred_mnist']


In [17]:
# Database whose runs are inspected in the rest of the notebook
DB_NAME = 'sacred_mnist'
db = client[DB_NAME]

In [18]:
# Show the collections stored in the selected database
collection_names = db.list_collection_names()
print('Collections of {} db: {}'.format(db.name, collection_names))


Collections of sacred_mnist db: ['metrics', 'fs.files', 'fs.chunks', 'runs']


### Example of a record (a "document") for a single experiment

In [19]:
# Materialise every run and keep the last one the cursor returned as the example.
exp = list(db.runs.find())[-1]

# Plain print for the header: pprint on a string would emit its quoted repr.
# The stray "configurations of several experiments" header that used to be
# printed here belongs to the next section and has been dropped.
print('Example of a "document" that represents a single experiment:')
pprint(exp.keys())
pprint(exp)

dict_keys(['_id', 'experiment', 'format', 'command', 'host', 'start_time', 'config', 'meta', 'status', 'resources', 'artifacts', 'captured_out', 'info', 'heartbeat', 'result', 'stop_time'])
'Example of a "document" that represents a single experiment:'
'\n\n Example of configurations of several experiemts:'
{'_id': 530,
 'artifacts': [],
 'captured_out': "INFO - My_Experiment - Running command 'main'\n"
                 'INFO - My_Experiment - Started run with ID "530"\n'
                 "{'seed': 0, 'lr': 0.0003, 'dropout_rate': 0.6, 'fc_dim': "
                 "600, 'epochs': 10, 'batch_size': 32}\n"
                 "{'seed': 0, 'lr': 0.0003, 'dropout_rate': 0.6, 'fc_dim': "
                 "600, 'epochs': 10, 'batch_size': 32, 'disable_logging': 0, "
                 "'verbose': 1, 'gpu_memory_fraction': -1}\n"
                 '2018-08-28 17:54:11.407927: I '
                 'tensorflow/core/platform/cpu_feature_guard.cc:141] Your CPU '
                 'supports instructions 

### Example of configurations of several experiments

In [20]:
print('Example of configurations of several experiments:\n')
# Let the cursor stop after three documents instead of loading the whole
# collection into a list and slicing it afterwards.
for exp in db.runs.find().limit(3):
    pprint(exp['config'])
    print('')
    

Example of configurations of several experiments:

{'batch_size': 32,
 'dropout_rate': 0.30000000000000004,
 'epochs': 10,
 'fc_dim': 96.0,
 'lr': 0.00042,
 'seed': 0}

{'batch_size': 32,
 'dropout_rate': 0.8,
 'epochs': 10,
 'fc_dim': 192.0,
 'lr': 0.0033900000000000002,
 'seed': 0}

{'batch_size': 32,
 'dropout_rate': 0.5,
 'epochs': 10,
 'fc_dim': 112.0,
 'lr': 6.000000000000001e-05,
 'seed': 0}



# Pandas + SACRED MongoDB

## Get a dataframe that summarizes the experiments, according to a PANDAS query

In [24]:
from collections import OrderedDict
import pandas as pd
import re

def slice_dict(d, keys):
    """Return a new dict holding only the given keys of ``d``, in the given order.

    Parameters
    ----------
    d : dict
        Mapping to take the values from.
    keys : list or str
        Either an iterable of keys, or a single string in which the keys are
        separated by commas and/or spaces (e.g. ``'a, b,c d'``).

    Returns
    -------
    dict
        ``{k: d[k]}`` for every requested key, inserted in the order requested.

    Raises
    ------
    KeyError
        If a requested key is not present in ``d``.
    """
    if isinstance(keys, str):
        # Split on any run of commas/spaces and drop empty tokens, so that an
        # empty string, a trailing comma or doubled separators ("a , b") do not
        # produce bogus '' keys (or an IndexError on an empty string).
        keys = [k for k in re.split('[, ]+', keys) if k]

    return dict((k, d[k]) for k in keys)

def sacred_to_df(db_runs, mongo_query=None):
    """Summarize the runs of a collection as a flat pandas DataFrame.

    Each document returned by ``db_runs.find(mongo_query)`` becomes one row.
    The nested ``config`` and ``info`` dictionaries are flattened, so every one
    of their keys becomes its own column.

    Parameters
    ----------
    db_runs : collection-like
        Object exposing ``find(filter, projection=...)``; usually ``db.runs``.
    mongo_query : dict, optional
        Filter passed to ``find``. ``None`` selects every document.

    Returns
    -------
    pandas.DataFrame
        Columns: ``_id``, ``name`` (taken from ``experiment['name']``), the
        keys of ``config``, the keys of ``info``, then ``result``, ``status``
        and ``start_time``. When no document matches, an empty frame holding
        only the fixed columns is returned.

    Notes
    -----
    On a name clash the later source wins: ``info`` overrides ``config``, and
    ``result``/``status``/``start_time`` override both.
    Fields missing from a document (or stored as ``None``) are treated as
    empty / ``None`` instead of raising.
    """
    # Only these fields are used below; asking the server for just them avoids
    # transferring everything else stored with a run (e.g. 'captured_out').
    fields = ['_id', 'experiment', 'config', 'result', 'info', 'status', 'start_time']

    records = []
    for doc in db_runs.find(mongo_query, projection=fields):
        row = OrderedDict()
        row['_id'] = doc.get('_id')
        row['name'] = (doc.get('experiment') or {}).get('name')
        # `or {}` covers both an absent field and an explicit None value
        row.update(doc.get('config') or {})
        row.update(doc.get('info') or {})
        for key in ('result', 'status', 'start_time'):
            row[key] = doc.get(key)
        records.append(row)

    if not records:
        # Keep the fixed columns so that callers can still filter on them
        return pd.DataFrame(columns=['_id', 'name', 'result', 'status', 'start_time'])

    # Build the frame once from the list of records (no row-by-row growth)
    return pd.DataFrame(records)

In [25]:
# Completed runs with validation accuracy above 0.92, best first (top rows only)
(
    sacred_to_df(db.runs)
    .query('status=="COMPLETED" and val_acc>0.92')
    .sort_values('val_acc', ascending=False)
    .head()
)

Unnamed: 0,_id,name,batch_size,epochs,lr,fc_dim,dropout_rate,seed,result,status,start_time,metrics,test_acc,val_acc
101,102,My_Experiment,32,10,0.00755,72.0,0.65,0,0.942,COMPLETED,2018-08-24 18:50:49.721,,0.922,0.942
134,483,My_Experiment,32,10,0.003,1000.0,0.6,0,0.94,COMPLETED,2018-08-28 10:20:55.482,,0.945,0.94
104,105,My_Experiment,32,10,0.0019,56.0,0.55,0,0.937,COMPLETED,2018-08-24 18:50:53.974,,0.933,0.937
120,469,My_Experiment,32,10,0.003,30.0,0.3,0,0.93,COMPLETED,2018-08-28 10:15:17.061,,0.936,0.93
135,484,My_Experiment,32,10,0.001,1000.0,0.2,0,0.929,COMPLETED,2018-08-28 10:20:55.721,,0.921,0.929


## Get a mongo iterator, according to a PANDAS query

In [26]:
def query_by_pandas(mongo_db_runs, query):
    """Return a cursor over the runs that match a pandas query expression.

    The runs of ``mongo_db_runs`` are first summarized with ``sacred_to_df``;
    ``query`` is evaluated on that summary with ``DataFrame.query``, and the
    ``_id`` values of the surviving rows are used to look the documents up
    again in the same collection.

    Parameters
    ----------
    mongo_db_runs : collection-like
        Collection to search; usually ``db.runs``.
    query : str
        Expression for ``DataFrame.query``, e.g. ``'status=="COMPLETED"'``.

    Returns
    -------
    The result of ``mongo_db_runs.find(...)`` restricted to the matching ids.
    """
    ids = sacred_to_df(mongo_db_runs).query(query).loc[:, '_id']
    # Query the collection that was passed in, not the notebook-global `db.runs`
    mongo_cursor = mongo_db_runs.find(dict(_id={'$in': ids.tolist()}))
    return mongo_cursor

In [27]:
# Cursor over the runs selected by a PANDAS query
mongo_cursor = query_by_pandas(db.runs, 'status=="COMPLETED" and val_acc>0.92')

# Load the selected documents into a frame and show a single field of each
selected_runs = pd.DataFrame(list(mongo_cursor))
selected_runs.loc[:, 'artifacts']

0     []
1     []
2     []
3     []
4     []
5     []
6     []
7     []
8     []
9     []
10    []
11    []
Name: artifacts, dtype: object