In [13]:
import pandas as pd
import pymongo
from pprint import pprint

# Basic usage of SACRED with PyMongo

In [14]:
# Connect to the MongoDB server (local host, default port) that stores the Sacred runs
from pymongo import MongoClient
client = MongoClient(host='localhost', port=27017)

### List databases, collections, and select from them

In [15]:
# Names of all the databases available on the server
database_names = client.list_database_names()
print('databases:', database_names)

databases: ['ZSdebug', 'ZSdebug-CUB_Xian', 'ZSdebug|CUB_Xian', 'ZSdebug|None', 'admin', 'config', 'hyperopt', 'labwatch_demo_keras', 'local', 'sacred_keras_example', 'sacred_mnist', 'sacred_mnist_example']


In [16]:
# Select the database that holds the runs of the MNIST example
db = client.get_database('sacred_mnist_example')

In [17]:
# Collections stored in the selected database
collection_names = db.list_collection_names()
print('Collections of {} db: {}'.format(db.name, collection_names))


Collections of sacred_mnist_example db: ['runs', 'fs.files', 'fs.chunks', 'metrics']


### Example of a record (a "document") for a single experiment

In [None]:
# Take the last run returned by the cursor as a representative example
exp = list(db.runs.find())[-1]

# Each header is printed right before the content it describes. Plain print() is
# used for the headers because pprint() would show the strings with their quotes.
print('Top-level fields of the document:')
pprint(exp.keys())
print('\nExample of a "document" that represents a single experiment:')
pprint(exp)

### Example of configurations of several experiments

In [19]:
print('Example of configurations of several experiments:\n')
# Ask MongoDB for the first three runs only, instead of loading every run and slicing.
# The loop variable is named `run` so it does not overwrite `exp` from the previous cell.
for run in db.runs.find().limit(3):
    pprint(run['config'])
    print('')
    

Example of configurations of several experiments:

{'batch_size': 32,
 'dropout_rate': 0.4,
 'epochs': 10,
 'fc_dim': 30,
 'lr': 0.001,
 'seed': 0}

{'batch_size': 32,
 'dropout_rate': 0.2,
 'epochs': 10,
 'fc_dim': 300,
 'lr': 0.01,
 'seed': 0}

{'batch_size': 32,
 'dropout_rate': 0.2,
 'epochs': 10,
 'fc_dim': 200,
 'lr': 0.001,
 'seed': 0}



# Pandas + SACRED MongoDB

## Get a dataframe that summarizes the experiments, according to a pandas query

In [20]:
from collections import OrderedDict
import pandas as pd
import re

def slice_dict(d, keys):
    """Return a new dictionary holding only the requested keys of ``d``.

    Args:
        d: Mapping to take the values from.
        keys: Either an iterable of keys, or a single string of key names
            separated by commas and/or whitespace (e.g. ``'a, b,c d'``).
            Empty names produced by stray separators (a trailing comma,
            doubled spaces, an empty string) are ignored.

    Returns:
        dict: ``{key: d[key]}`` for every requested key, inserted in the
        requested order.

    Raises:
        KeyError: If a requested key is missing from ``d``.
    """
    if isinstance(keys, str):
        # Split on any run of commas/whitespace and drop the empty pieces, so that
        # '', 'a, b, ' or 'a,  b' neither crash nor yield a bogus '' key.
        keys = [k for k in re.split(r'[,\s]+', keys) if k]

    return dict((k, d[k]) for k in keys)

def sacred_to_df(db_runs, mongo_query=None):
    """Summarize Sacred runs as a DataFrame with one row per run.

    The nested ``config`` and ``info`` dictionaries of every run are flattened
    into top-level columns, so the summary holds the columns:
    ``name`` (from ``experiment.name``), ``**config``, ``**info``, ``result``,
    ``status`` and ``start_time``, and is indexed by ``_id``.

    Args:
        db_runs: Collection-like object with a ``find(query)`` method that yields
            run documents (usually ``db.runs``).
        mongo_query: Optional MongoDB filter passed to ``find``; ``None`` selects
            every run.

    Returns:
        pandas.DataFrame: The summary, indexed by ``_id``. Fields that a run does
        not have are left missing (None/NaN) instead of raising. When no run
        matches, an empty DataFrame with an ``_id`` index is returned.
    """
    records = []
    for run in db_runs.find(mongo_query):
        record = {
            '_id': run['_id'],
            'name': (run.get('experiment') or {}).get('name'),
        }
        # Flatten the nested dictionaries; a missing or empty field contributes
        # nothing, so one incomplete run cannot break the whole summary.
        record.update(run.get('config') or {})
        record.update(run.get('info') or {})
        # Written last, so these top-level fields win over same-named config/info keys
        for field in ('result', 'status', 'start_time'):
            record[field] = run.get(field)
        records.append(record)

    if not records:
        # set_index('_id') would raise on a frame that has no columns at all
        return pd.DataFrame(index=pd.Index([], name='_id'))

    return pd.DataFrame(records).set_index('_id')

### Filter according to some query

In [99]:
# Keep the COMPLETED experiments with fc_dim <= 100 and validation accuracy > 0.91,
# sorted in descending order of validation accuracy (best performer first).
query = 'status=="COMPLETED" and val_acc>0.91 and fc_dim<=100'
df_summary = (
    sacred_to_df(db.runs)
    .query(query)
    .sort_values('val_acc', ascending=False)
)
display(df_summary)

Unnamed: 0_level_0,name,batch_size,dropout_rate,epochs,fc_dim,lr,seed,test_acc,val_acc,result,status,start_time,metrics
_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
36,My_Experiment,32,0.2,10,60,0.003,0,0.933,0.938,0.938,COMPLETED,2018-09-03 11:16:17.615,
57,My_Experiment,32,0.2,10,60,0.001,0,0.928,0.938,0.938,COMPLETED,2018-09-03 11:18:01.669,
43,My_Experiment,32,0.2,10,100,0.001,0,0.916,0.927,0.927,COMPLETED,2018-09-03 11:17:07.184,
16,My_Experiment,32,0.3,10,60,0.001,0,0.917,0.925,0.925,COMPLETED,2018-09-03 11:07:38.196,
20,My_Experiment,32,0.6,10,60,0.01,0,0.904,0.923,0.923,COMPLETED,2018-09-03 11:07:43.808,
31,My_Experiment,32,0.2,10,30,0.01,0,0.923,0.92,0.92,COMPLETED,2018-09-03 11:16:11.016,
54,My_Experiment,32,0.3,10,20,0.001,0,0.922,0.912,0.912,COMPLETED,2018-09-03 11:17:56.249,
38,My_Experiment,32,0.2,10,60,0.01,0,0.914,0.911,0.911,COMPLETED,2018-09-03 11:16:17.698,
45,My_Experiment,32,0.5,10,20,0.003,0,0.918,0.911,0.911,COMPLETED,2018-09-03 11:17:07.236,


## Get raw experiments, according to the summary dataframe

In [171]:
def query_by_df(mongo_db_runs, df=None, ids=None):
    """Fetch the raw run documents for a summary dataframe or a list of ids.

    Args:
        mongo_db_runs: Collection-like object with a ``find(filter)`` method
            (usually ``db.runs``).
        df: Summary dataframe whose index holds the run ``_id`` values. Only used
            when ``ids`` is None.
        ids: Explicit run ids to fetch; takes precedence over ``df``.

    Returns:
        pandas.DataFrame: One row per requested id, indexed by ``_id`` and ordered
        like the requested ids. Ids that are not found in the collection give
        rows of missing values; when nothing at all is found the frame has the
        requested ids as index and no columns.

    Raises:
        ValueError: If neither ``df`` nor ``ids`` is given.
    """
    if ids is None:
        if df is None:
            raise ValueError('query_by_df needs either a summary dataframe (df) or a list of ids')
        ids = df.index.tolist()
    else:
        ids = list(ids)

    # Get a mongo cursor over the requested runs
    mongo_cursor = mongo_db_runs.find(dict(_id={'$in': ids}))
    raw_runs = list(mongo_cursor)

    if not raw_runs:
        # set_index('_id') would raise on a frame that has no columns at all
        return pd.DataFrame(index=pd.Index(ids, name='_id'))

    # Put the raw results into a dataframe
    df_raw_results = pd.DataFrame(raw_runs).set_index('_id')

    # The cursor does not preserve the order of the requested ids, so restore it
    return df_raw_results.reindex(ids)

### Show the artifacts of the raw experiments

In [100]:
df_raw = query_by_df(db.runs, df_summary)
# Show the 'artifacts' field (the files produced by each run)
display(df_raw['artifacts'])
# Show the first rows of the dataframe that holds the raw experiment documents
df_raw.head()

_id
36    [{'name': 'mnist_model.h5', 'file_id': 5b8d182...
57    [{'name': 'mnist_model.h5', 'file_id': 5b8d188...
43    [{'name': 'mnist_model.h5', 'file_id': 5b8d186...
16    [{'name': 'mnist_model.h5', 'file_id': 5b8d161...
20    [{'name': 'mnist_model.h5', 'file_id': 5b8d162...
31    [{'name': 'mnist_model.h5', 'file_id': 5b8d182...
54    [{'name': 'mnist_model.h5', 'file_id': 5b8d188...
38    [{'name': 'mnist_model.h5', 'file_id': 5b8d182...
45    [{'name': 'mnist_model.h5', 'file_id': 5b8d185...
Name: artifacts, dtype: object

Unnamed: 0_level_0,artifacts,captured_out,command,config,experiment,format,heartbeat,host,info,meta,resources,result,start_time,status,stop_time
_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
36,"[{'name': 'mnist_model.h5', 'file_id': 5b8d182...",INFO - My_Experiment - Running command 'main'\...,main,"{'batch_size': 32, 'dropout_rate': 0.2, 'epoch...","{'name': 'My_Experiment', 'base_dir': '/home/l...",MongoObserver-0.7.0,2018-09-03 11:16:59.617,"{'hostname': 'ctx19', 'os': ['Linux', 'Linux-3...","{'test_acc': 0.933, 'val_acc': 0.938}","{'command': 'main', 'options': {'--capture': N...",[],0.938,2018-09-03 11:16:17.615,COMPLETED,2018-09-03 11:16:59.598
57,"[{'name': 'mnist_model.h5', 'file_id': 5b8d188...",INFO - My_Experiment - Running command 'main'\...,main,"{'batch_size': 32, 'dropout_rate': 0.2, 'epoch...","{'name': 'My_Experiment', 'base_dir': '/home/l...",MongoObserver-0.7.0,2018-09-03 11:18:38.465,"{'hostname': 'ctx19', 'os': ['Linux', 'Linux-3...","{'test_acc': 0.928, 'val_acc': 0.938}","{'command': 'main', 'options': {'--print_confi...",[],0.938,2018-09-03 11:18:01.669,COMPLETED,2018-09-03 11:18:38.449
43,"[{'name': 'mnist_model.h5', 'file_id': 5b8d186...",INFO - My_Experiment - Running command 'main'\...,main,"{'batch_size': 32, 'dropout_rate': 0.2, 'epoch...","{'name': 'My_Experiment', 'base_dir': '/home/l...",MongoObserver-0.7.0,2018-09-03 11:17:55.196,"{'hostname': 'ctx19', 'os': ['Linux', 'Linux-3...","{'test_acc': 0.916, 'val_acc': 0.927}","{'command': 'main', 'options': {'--comment': N...",[],0.927,2018-09-03 11:17:07.184,COMPLETED,2018-09-03 11:17:55.178
16,"[{'name': 'mnist_model.h5', 'file_id': 5b8d161...",INFO - My_Experiment - Running command 'main'\...,main,"{'batch_size': 32, 'dropout_rate': 0.3, 'epoch...","{'name': 'My_Experiment', 'base_dir': '/home/l...",MongoObserver-0.7.0,2018-09-03 11:08:16.104,"{'hostname': 'ctx19', 'os': ['Linux', 'Linux-3...","{'test_acc': 0.917, 'val_acc': 0.925}","{'command': 'main', 'options': {'--priority': ...",[],0.925,2018-09-03 11:07:38.196,COMPLETED,2018-09-03 11:08:16.093
20,"[{'name': 'mnist_model.h5', 'file_id': 5b8d162...",INFO - My_Experiment - Running command 'main'\...,main,"{'batch_size': 32, 'dropout_rate': 0.6, 'epoch...","{'name': 'My_Experiment', 'base_dir': '/home/l...",MongoObserver-0.7.0,2018-09-03 11:08:26.072,"{'hostname': 'ctx19', 'os': ['Linux', 'Linux-3...","{'test_acc': 0.904, 'val_acc': 0.923}","{'command': 'main', 'options': {'--loglevel': ...",[],0.923,2018-09-03 11:07:43.808,COMPLETED,2018-09-03 11:08:26.052


### Load and evaluate the best model

#### Load the best model

In [28]:
!mkdir /tmp/mnist_model

In [77]:
import gridfs
# GridFS handle on the selected database; used further down to read stored artifact files
fs = gridfs.GridFS(db)

def exp_artifacts_to_dict(list_artifacts):
    """Map artifact names to their file ids.

    Each artifact is a dictionary with a 'name' and a 'file_id' entry; the
    result is ``{name: file_id}``. Raises RuntimeError when two artifacts
    share the same name.
    """
    file_id_by_name = {}
    for artifact in list_artifacts:
        name = artifact['name']
        # A repeated name would silently shadow an earlier file, so refuse it
        if name in file_id_by_name:
            raise RuntimeError('%s artifact has duplicates' % name)
        file_id_by_name[name] = artifact['file_id']
    return file_id_by_name

In [79]:
model_fname = 'mnist_model.h5'

# The first row of df_raw is taken as the best run: look up the GridFS file_id
# of its model artifact.
best_run_artifacts = exp_artifacts_to_dict(df_raw.artifacts.iloc[0])
gfs_best_model = best_run_artifacts[model_fname]

# Keras can't load a model directly from GridFS. Therefore the model is first
# copied to /tmp (on the OS filesystem) and then loaded from there.
tmp_model_fname = '/tmp/mnist_model/model.h5'

# GridFS -> bytes in memory -> file on the OS filesystem
model_bytes = fs.get(gfs_best_model).read()
with open(tmp_model_fname, 'wb') as f:
    f.write(model_bytes)

# Load the model into Keras from the OS filesystem
import tensorflow.keras as keras
model = keras.models.load_model(tmp_model_fname)
    

#### Evaluate data on the model

In [81]:
# Load data. The first two values returned by prepare_data() are discarded; the
# remaining six are unpacked as the train / validation / test inputs and labels.
import mnist_keras
_, _, x_train, y_train, x_val, y_val, x_test, y_test = mnist_keras.prepare_data()

x_train shape: (1000, 28, 28, 1)
1000 train samples
1000 val samples
1000 test samples


In [82]:
# Evaluate the loaded model on the validation split.
# NOTE(review): unpacking exactly two values assumes evaluate() returns [loss, accuracy],
# i.e. that the model was compiled with a single accuracy metric — confirm.
val_loss, val_accuracy = model.evaluate(x_val, y_val)
print('val accuracy = ', val_accuracy)

val accuracy =  0.938


## Delete model files ("artifacts") from all experiments except the best 3

In [168]:
import gridfs
def delete_artifact(name_to_delete, experiment_id, db):
    """Delete every artifact called ``name_to_delete`` from one experiment.

    The matching files are removed from GridFS, and the ``artifacts`` list of the
    run document is rewritten without them.

    Args:
        name_to_delete: Name of the artifact(s) to remove (e.g. a model file name).
        experiment_id: ``_id`` of the run document in ``db.runs``.
        db: Database object that exposes the ``runs`` collection and backs GridFS.

    Raises:
        ValueError: If no run with ``experiment_id`` exists.
        RuntimeError: If GridFS still reports a file as existing after deleting it.
    """
    fs = gridfs.GridFS(db)
    ex = db.runs.find_one(dict(_id=experiment_id))
    if ex is None:
        raise ValueError('No experiment with _id={!r}'.format(experiment_id))

    updated_artifacts = []
    for artifact in ex.get('artifacts', []):
        if artifact['name'] != name_to_delete:
            # Not the artifact we are after: keep it in the run document
            updated_artifacts.append(artifact)
            continue
        fs.delete(artifact['file_id'])
        if fs.exists(artifact['file_id']):
            raise RuntimeError('Failed to delete artifact, {}'.format(artifact))
        # Deleted successfully, so it is simply left out of updated_artifacts

    # Write back only the field that changed. Sending the whole fetched document in
    # $set would overwrite every other field with the (possibly stale) copy read
    # above, e.g. while the run is still being updated by its observer.
    db.runs.update_one({'_id': experiment_id}, {'$set': {'artifacts': updated_artifacts}})


In [169]:
# Delete the model file of every experiment except the 3 best ones
# (df_summary is sorted with the best performer first)
ids_to_prune = df_summary.index.tolist()[3:]
for ex_id in ids_to_prune:
    delete_artifact('mnist_model.h5', ex_id, db)

In [172]:
# Re-read the raw runs and show which of them still list a model artifact
df_raw = query_by_df(db.runs, df_summary)
df_raw['artifacts']

_id
36    [{'name': 'mnist_model.h5', 'file_id': 5b8d182...
57    [{'name': 'mnist_model.h5', 'file_id': 5b8d188...
43    [{'name': 'mnist_model.h5', 'file_id': 5b8d186...
16                                                   []
20                                                   []
31                                                   []
54                                                   []
38                                                   []
45                                                   []
Name: artifacts, dtype: object