# regress.ipynb
Author:  Kevin Tran <ktran@andrew.cmu.edu>

This Python notebook performs regressions on data pulled from a processed Mongo database created by GASpy. It then saves the fitted regressions as pickles (for later use) and creates parity plots of the regression fits.

# Initialize

## Importing

In [1]:
# Debugging & other Python tools
import pdb
import sys
from pprint import pprint
import itertools
# Saving/loading
import dill as pickle
pickle.settings['recurse'] = True     # required to pickle lambdify functions (for alamopy)
# Regression
from sklearn.gaussian_process import GaussianProcessRegressor
from tpot import TPOTRegressor
from sklearn.gaussian_process.kernels import RBF, WhiteKernel, RationalQuadratic, ExpSineSquared
# Plotting
import matplotlib.pyplot as plt
from plotly.offline import init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)
import plotly.plotly as py
import plotly.graph_objs as go
# GASpy
from regression_processor import RegressionProcessor
from pull_features import PullFeatures
sys.path.append('..')
from gaspy.utils import vasp_settings_to_str

## Scope

In [2]:
# Feature sets to investigate. Each entry is the string name of the
# PullFeatures method to use; uncomment an entry to include it.
FEATURE_SETS = [
    #'energy_fr_coordcount',
    'energy_fr_coordcount_nncoord',
    #'energy_fr_coordcount_ads',
    #'energy_fr_coordcount_nncoord_ads',
    #'energy_fr_nncoord',
    #'energy_fr_gcn_ads',
]

# Only data calculated with these vasp settings is pulled. The settings are
# converted to a string by GASpy's `vasp_settings_to_str`.
_vasp_filter = {'gga': 'RP',
                'pp_version': '5.4',
                'encut': 350}
VASP_SETTINGS = vasp_settings_to_str(_vasp_filter)
#VASP_SETTINGS = None

# Container for everything the plotting cell needs; the regression and load
# cells below add one entry per regressor type.
DATA = {}

# Regress

## SKLearn Gaussian Process

### Execute

In [None]:
# Specify the kernel to use. If it's `None`, then it uses SKLearn's default RBF
K = (1.0 * RBF(length_scale=0.05)
     + 1.0 * RBF(length_scale=0.2)
     + 1.0 * WhiteKernel(noise_level=0.05**2.0))
#K = None
n_restarts = 2
# Create the model that you want to use to perform the regression.
# The optimizer restarts draw random starting hyperparameters, so seed them
# (same seed as the TPOT regressor) to make repeated runs reproducible.
regressor = GaussianProcessRegressor(kernel=K,
                                     n_restarts_optimizer=n_restarts,
                                     random_state=42)
# Specify the model blocking. Use [] if you don't want blocking (this will help with saving)
blocks = ['adsorbate']
#blocks = []

# Initialize the results. Each dictionary is keyed by feature set.
models = dict.fromkeys(FEATURE_SETS)
rmses = dict.fromkeys(FEATURE_SETS)
errors = dict.fromkeys(FEATURE_SETS)
x = dict.fromkeys(FEATURE_SETS)
y = dict.fromkeys(FEATURE_SETS)
p_docs = dict.fromkeys(FEATURE_SETS)
block_list = dict.fromkeys(FEATURE_SETS)
pp = dict.fromkeys(FEATURE_SETS)
norm = dict.fromkeys(FEATURE_SETS)

for feature_set in FEATURE_SETS:
    # Pull the data out and store some of the processing information for plotting purposes
    rp = RegressionProcessor(feature_set, blocks=blocks, vasp_settings=VASP_SETTINGS)
    x[feature_set] = rp.x
    y[feature_set] = rp.y
    p_docs[feature_set] = rp.p_docs
    block_list[feature_set] = rp.block_list
    pp[feature_set] = rp.pp
    norm[feature_set] = rp.norm
    # Perform the regression
    models[feature_set], rmses[feature_set], errors[feature_set] = \
            rp.sk_regressor(regressor)

# Package the data that'll be used for plotting (and by the Save cell)
DATA['GP'] = {'models': models,
              'rmses': rmses,
              'errors': errors,
              'x': x,
              'y': y,
              'p_docs': p_docs,
              'blocks': blocks,
              'block_list': block_list,
              'pp': pp,
              'norm': norm}

### Save

In [None]:
# Write two pickles per feature set: a slim one holding only what is needed to
# make predictions, and a full "data ball" that the Load cell reads back.
for feature_set in FEATURE_SETS:
    gp_results = DATA['GP']
    # Shared tail of both file names: <feature set>_<blocks joined by '-'>.pkl
    file_tail = feature_set + '_' + '-'.join(gp_results['blocks']) + '.pkl'

    # Save the models alone for GASpy_predict to use
    pkl = {'model': gp_results['models'][feature_set]}
    for key in ('pp', 'norm'):
        pkl[key] = gp_results[key][feature_set]
    with open('pkls/models/GP_model_' + file_tail, 'wb') as f:
        pickle.dump(pkl, f)

    # Save the entire package to use later in this notebook
    data = {}
    for datum in ['models', 'rmses', 'errors', 'x', 'y', 'p_docs', 'block_list', 'pp']:
        data[datum] = gp_results[datum][feature_set]
    with open('pkls/data/GP_data_' + file_tail, 'wb') as f:
        pickle.dump(data, f)

### Load

In [3]:
# What blocking-types do we want to open?
blocks = ['adsorbate']
#blocks = []

# Initialize the data ball: one dictionary per stored quantity, keyed by feature set
DATA['GP'] = {}
for datum in ['models', 'rmses', 'errors', 'x', 'y', 'p_docs', 'block_list', 'pp']:
    DATA['GP'][datum] = dict.fromkeys(FEATURE_SETS)

# Open all the databalls and put them into DATA.
# NOTE: unpickling can execute arbitrary code, so only load pickle files that
# you trust (e.g., ones written by this notebook's Save cell).
for feature_set in FEATURE_SETS:
    with open('pkls/data/GP_data_' + feature_set + '_' + '-'.join(blocks) + '.pkl', 'rb') as f:
        data = pickle.load(f)
    # `.items()` works on both Python 2 and 3; `.iteritems()` is Python 2 only
    for key, value in data.items():
        DATA['GP'][key][feature_set] = value

## TPOT

### Execute

In [None]:
# The TPOT model used for every regression in this cell
regressor = TPOTRegressor(generations=4, population_size=16,
                          verbosity=2, random_state=42)
# Specify the model blocking. Use [] if you don't want blocking (this will help with saving)
blocks = ['adsorbate']
#blocks = []

# Nine independent result dictionaries, each keyed by feature set and
# initialized to `None`
(models, rmses, errors,
 x, y, p_docs,
 block_list, pp, norm) = (dict.fromkeys(FEATURE_SETS) for _unused in range(9))

for feature_set in FEATURE_SETS:
    # Pull the data for this feature set
    rp = RegressionProcessor(feature_set, blocks=blocks, vasp_settings=VASP_SETTINGS)
    # Keep some of the processing information for plotting purposes. These
    # attributes are read before the regression is performed.
    for store, attribute in ((x, 'x'),
                             (y, 'y'),
                             (p_docs, 'p_docs'),
                             (block_list, 'block_list'),
                             (pp, 'pp'),
                             (norm, 'norm')):
        store[feature_set] = getattr(rp, attribute)
    # Perform the regression; it yields the models, the RMSEs and the errors
    fit_results = rp.tpot(regressor)
    models[feature_set], rmses[feature_set], errors[feature_set] = fit_results

# Package the data that'll be used for plotting
DATA['TPOT'] = dict(models=models,
                    rmses=rmses,
                    errors=errors,
                    x=x,
                    y=y,
                    p_docs=p_docs,
                    blocks=blocks,
                    block_list=block_list,
                    pp=pp,
                    norm=norm)

### Save

In [None]:
# Write two pickles per feature set: a slim one holding only what is needed to
# make predictions, and a full "data ball" that the Load cell reads back.
for feature_set in FEATURE_SETS:
    tpot_results = DATA['TPOT']
    # Shared tail of both file names: <feature set>_<blocks joined by '-'>.pkl
    file_tail = feature_set + '_' + '-'.join(tpot_results['blocks']) + '.pkl'

    # Save the models alone for GASpy_predict to use
    pkl = {'model': tpot_results['models'][feature_set]}
    for key in ('pp', 'norm'):
        pkl[key] = tpot_results[key][feature_set]
    with open('pkls/models/TPOT_model_' + file_tail, 'wb') as f:
        pickle.dump(pkl, f)

    # Save the entire package to use later in this notebook
    data = {}
    for datum in ['models', 'rmses', 'errors', 'x', 'y', 'p_docs', 'block_list', 'pp']:
        data[datum] = tpot_results[datum][feature_set]
    with open('pkls/data/TPOT_data_' + file_tail, 'wb') as f:
        pickle.dump(data, f)

### Load

In [4]:
# What blocking-types do we want to open?
blocks = ['adsorbate']
#blocks = []

# Initialize the data ball: one dictionary per stored quantity, keyed by feature set
DATA['TPOT'] = {}
for datum in ['models', 'rmses', 'errors', 'x', 'y', 'p_docs', 'block_list', 'pp']:
    DATA['TPOT'][datum] = dict.fromkeys(FEATURE_SETS)

# Open all the databalls and put them into DATA.
# NOTE: unpickling can execute arbitrary code, so only load pickle files that
# you trust (e.g., ones written by this notebook's Save cell).
for feature_set in FEATURE_SETS:
    with open('pkls/data/TPOT_data_' + feature_set + '_' + '-'.join(blocks) + '.pkl', 'rb') as f:
        data = pickle.load(f)
    # `.items()` works on both Python 2 and 3; `.iteritems()` is Python 2 only
    for key, value in data.items():
        DATA['TPOT'][key][feature_set] = value

## Hierarchical
TODO:  Test that these cells still work when iterating over more than one combination of outer and inner feature sets.

### Execute

In [5]:
# Specify the model blocking. Use [] if you don't want blocking (this will help with saving)
blocks = ['adsorbate']
#blocks = []

# Outer regression information. The three lists are parallel: entry `i` of each
# one describes the same outer system (feature set, regressor, and the name of
# the RegressionProcessor method that performs the regression).
OUTER_FEATURE_SETS = ['energy_fr_coordcount']
OUTER_REGRESSORS = [TPOTRegressor(generations=4,
                                  population_size=16,
                                  verbosity=2,
                                  random_state=42)]
OUTER_REGRESSION_METHODS = ['tpot']
if not (len(OUTER_FEATURE_SETS) == len(OUTER_REGRESSORS) == len(OUTER_REGRESSION_METHODS)):
    # Pairing by position would otherwise silently ignore the extra entries
    raise ValueError('OUTER_FEATURE_SETS, OUTER_REGRESSORS and OUTER_REGRESSION_METHODS '
                     'must all have the same length')
OUTER_SYSTEMS = list(zip(OUTER_FEATURE_SETS, OUTER_REGRESSORS, OUTER_REGRESSION_METHODS))
# Inner regression information (parallel lists, laid out like the outer ones)
INNER_FEATURE_SETS = ['energy_fr_nncoord']
#K = 1.0*RBF(length_scale=1.0) + 1.0*WhiteKernel(noise_level=0.05**2.0)
K = None
# Seed the optimizer restarts (same seed as the TPOT regressor) so that
# repeated runs give the same fit
INNER_REGRESSORS = [GaussianProcessRegressor(kernel=K,
                                             n_restarts_optimizer=2,
                                             random_state=42)]
INNER_REGRESSION_METHODS = ['sk_regressor']
if not (len(INNER_FEATURE_SETS) == len(INNER_REGRESSORS) == len(INNER_REGRESSION_METHODS)):
    raise ValueError('INNER_FEATURE_SETS, INNER_REGRESSORS and INNER_REGRESSION_METHODS '
                     'must all have the same length')
INNER_SYSTEMS = list(zip(INNER_FEATURE_SETS, INNER_REGRESSORS, INNER_REGRESSION_METHODS))

# `FEATURE_COMBINATIONS` is a list of (outer, inner) tuples for the different
# combinations of the outer and inner regressors we want. We use it to
# initialize the dictionaries of our results.
FEATURE_COMBINATIONS = list(itertools.product(OUTER_FEATURE_SETS, INNER_FEATURE_SETS))
models = dict.fromkeys(FEATURE_COMBINATIONS)
rmses = dict.fromkeys(FEATURE_COMBINATIONS)
errors = dict.fromkeys(FEATURE_COMBINATIONS)
x = dict.fromkeys(FEATURE_COMBINATIONS)
y = dict.fromkeys(FEATURE_COMBINATIONS)
p_docs = dict.fromkeys(FEATURE_COMBINATIONS)
pp = dict.fromkeys(FEATURE_COMBINATIONS)
block_list = dict.fromkeys(FEATURE_COMBINATIONS)
# Initialize other output dictionaries. `norm` is keyed by both the outer
# feature sets (strings) and the (outer, inner) combinations (tuples).
RPs = dict.fromkeys(OUTER_FEATURE_SETS)
norm = dict.fromkeys(OUTER_FEATURE_SETS+FEATURE_COMBINATIONS)

# Perform the regressions for each combination of feature sets
for o_feature_set, o_regressor, o_regression_method in OUTER_SYSTEMS:
    # Initialize `RegressionProcessor` to pull the data
    outer_rp = RegressionProcessor(o_feature_set,
                                   blocks=blocks,
                                   vasp_settings=VASP_SETTINGS)
    RPs[o_feature_set] = outer_rp
    # Perform the outer regressions by calling the method named by `o_regression_method`
    outer_models, outer_rmses, outer_errors = \
            getattr(outer_rp, o_regression_method)(o_regressor)
    # Perform the inner regressions
    for i_feature_set, i_regressor, i_regression_method in INNER_SYSTEMS:
        combo = (o_feature_set, i_feature_set)
        # The fourth returned item is not used here. It is bound to `_unused`
        # rather than `_` so that IPython's last-output variable is left alone.
        models[combo], rmses[combo], errors[combo], _unused, inner_norm \
                = outer_rp.hierarchical(outer_models, outer_rmses, outer_errors,
                                        i_feature_set,
                                        i_regression_method,
                                        i_regressor)
        # Store some of the RegressionProcessor attributes for later use
        x[combo] = outer_rp.x
        y[combo] = outer_rp.y
        p_docs[combo] = outer_rp.p_docs
        pp[combo] = outer_rp.pp
        block_list[combo] = outer_rp.block_list
        norm[combo] = inner_norm
    # Read after the inner regressions, as in the original ordering
    norm[o_feature_set] = outer_rp.norm

# Package the data that'll be used for plotting
DATA['GPinTPOT'] = {'models': models,
                    'rmses': rmses,
                    'errors': errors,
                    'x': x,
                    'y': y,
                    'p_docs': p_docs,
                    'blocks': blocks,
                    'block_list': block_list,
                    'pp': pp,
                    'norm': norm}


This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.


Data with input dtype int64 was converted to float64 by the normalize function.

Optimization Progress:  38%|███▊      | 30/80 [00:28<00:51,  1.03s/pipeline]

Generation 1 - Current best internal CV score: 0.550046790163


Optimization Progress:  57%|█████▊    | 46/80 [00:33<00:17,  1.91pipeline/s]

Generation 2 - Current best internal CV score: 0.550046790163


Optimization Progress:  78%|███████▊  | 62/80 [00:40<00:08,  2.18pipeline/s]

Generation 3 - Current best internal CV score: 0.538598820552


                                                                            

Generation 4 - Current best internal CV score: 0.528823486221

Best pipeline: GradientBoostingRegressor(ZeroCount(input_matrix), GradientBoostingRegressor__alpha=0.85, GradientBoostingRegressor__learning_rate=DEFAULT, GradientBoostingRegressor__loss=DEFAULT, GradientBoostingRegressor__max_depth=5, GradientBoostingRegressor__max_features=DEFAULT, GradientBoostingRegressor__min_samples_leaf=2, GradientBoostingRegressor__min_samples_split=19, GradientBoostingRegressor__n_estimators=100, GradientBoostingRegressor__subsample=0.45)


Optimization Progress:  38%|███▊      | 30/80 [00:37<01:09,  1.38s/pipeline]

Generation 1 - Current best internal CV score: 0.278178653563


Optimization Progress:  56%|█████▋    | 45/80 [00:46<00:25,  1.40pipeline/s]

Generation 2 - Current best internal CV score: 0.271652824379


Optimization Progress:  76%|███████▋  | 61/80 [01:01<00:19,  1.00s/pipeline]

Generation 3 - Current best internal CV score: 0.261660511455


                                                                            

Generation 4 - Current best internal CV score: 0.261660511455

Best pipeline: XGBRegressor(XGBRegressor(input_matrix, XGBRegressor__learning_rate=0.01, XGBRegressor__max_depth=8, XGBRegressor__min_child_weight=4, XGBRegressor__n_estimators=DEFAULT, XGBRegressor__nthread=1, XGBRegressor__subsample=0.3), XGBRegressor__learning_rate=DEFAULT, XGBRegressor__max_depth=7, XGBRegressor__min_child_weight=14, XGBRegressor__n_estimators=DEFAULT, XGBRegressor__nthread=1, XGBRegressor__subsample=0.3)


Optimization Progress:  38%|███▊      | 30/80 [00:36<01:07,  1.35s/pipeline]

Generation 1 - Current best internal CV score: 0.216563344753


Optimization Progress:  57%|█████▊    | 46/80 [00:52<00:44,  1.32s/pipeline]

Generation 2 - Current best internal CV score: 0.216563344753


Optimization Progress:  78%|███████▊  | 62/80 [01:25<00:29,  1.66s/pipeline]

Generation 3 - Current best internal CV score: 0.213609100633


                                                                            

Generation 4 - Current best internal CV score: 0.213429346313

Best pipeline: RandomForestRegressor(RandomForestRegressor(input_matrix, RandomForestRegressor__bootstrap=DEFAULT, RandomForestRegressor__max_features=0.35, RandomForestRegressor__min_samples_leaf=14, RandomForestRegressor__min_samples_split=9, RandomForestRegressor__n_estimators=DEFAULT), RandomForestRegressor__bootstrap=True, RandomForestRegressor__max_features=0.8, RandomForestRegressor__min_samples_leaf=5, RandomForestRegressor__min_samples_split=9, RandomForestRegressor__n_estimators=100)


Optimization Progress:  38%|███▊      | 30/80 [00:26<00:47,  1.06pipeline/s]

Generation 1 - Current best internal CV score: 0.450053288054


Optimization Progress:  55%|█████▌    | 44/80 [00:57<00:51,  1.44s/pipeline]

Generation 2 - Current best internal CV score: 0.450053288054


Optimization Progress:  75%|███████▌  | 60/80 [01:12<00:23,  1.16s/pipeline]

Generation 3 - Current best internal CV score: 0.446530809428


                                                                            

Generation 4 - Current best internal CV score: 0.446530809428

Best pipeline: RandomForestRegressor(KNeighborsRegressor(input_matrix, KNeighborsRegressor__n_neighbors=41, KNeighborsRegressor__p=1, KNeighborsRegressor__weights=DEFAULT), RandomForestRegressor__bootstrap=True, RandomForestRegressor__max_features=0.75, RandomForestRegressor__min_samples_leaf=5, RandomForestRegressor__min_samples_split=9, RandomForestRegressor__n_estimators=100)


Optimization Progress:  38%|███▊      | 30/80 [00:24<00:44,  1.13pipeline/s]

Generation 1 - Current best internal CV score: 0.281950356301


Optimization Progress:  57%|█████▊    | 46/80 [00:41<00:29,  1.17pipeline/s]

Generation 2 - Current best internal CV score: 0.281950356301


Optimization Progress:  75%|███████▌  | 60/80 [00:54<00:15,  1.25pipeline/s]

Generation 3 - Current best internal CV score: 0.281950356301


                                                                            

Generation 4 - Current best internal CV score: 0.281950356301

Best pipeline: ElasticNetCV(Normalizer(input_matrix, Normalizer__norm=l1), ElasticNetCV__l1_ratio=0.4, ElasticNetCV__tol=1e-05)


Optimization Progress:  38%|███▊      | 30/80 [00:24<00:44,  1.12pipeline/s]

Generation 1 - Current best internal CV score: 0.612240236178


Optimization Progress:  57%|█████▊    | 46/80 [00:51<00:41,  1.22s/pipeline]

Generation 2 - Current best internal CV score: 0.612240236178


Optimization Progress:  75%|███████▌  | 60/80 [00:59<00:17,  1.12pipeline/s]

Generation 3 - Current best internal CV score: 0.612240236178


                                                                            

Generation 4 - Current best internal CV score: 0.612240236178

Best pipeline: ElasticNetCV(XGBRegressor(input_matrix, XGBRegressor__learning_rate=0.5, XGBRegressor__max_depth=2, XGBRegressor__min_child_weight=9, XGBRegressor__n_estimators=100, XGBRegressor__nthread=1, XGBRegressor__subsample=0.7), ElasticNetCV__l1_ratio=0.4, ElasticNetCV__tol=1e-05)


### Save

In [6]:
# Write two pickles per (outer, inner) feature set pair: a slim one for
# predictions and a full "data ball" that the Load cell reads back.
for o_feature_set, i_feature_set in itertools.product(OUTER_FEATURE_SETS,
                                                      INNER_FEATURE_SETS):
    combo = (o_feature_set, i_feature_set)
    # Start of the file-name tail shared by both pickles: <inner>-inside-<outer>_
    nesting_tag = i_feature_set + '-inside-' + o_feature_set + '_'

    # Save the models alone for GASpy_predict to use. The normalization is
    # stored separately for the outer and the inner regression.
    with open('pkls/models/GPinTPOT_model_' + nesting_tag
              + '-'.join(DATA['GPinTPOT']['blocks']) + '.pkl', 'wb') as f:
        hierarchical = DATA['GPinTPOT']
        norms = {'outer': hierarchical['norm'][o_feature_set],
                 'inner': hierarchical['norm'][combo]}
        pkl = {'model': hierarchical['models'][combo],
               'pp': hierarchical['pp'][combo],
               'norm': norms}
        pickle.dump(pkl, f)

    # Save the entire package to use later in this notebook
    data = {}
    for datum in ['models', 'rmses', 'errors', 'x', 'y', 'p_docs', 'block_list', 'pp']:
        data[datum] = DATA['GPinTPOT'][datum][combo]
    with open('pkls/data/GPinTPOT_data_' + nesting_tag
              + '-'.join(DATA['GPinTPOT']['blocks']) + '.pkl', 'wb') as f:
        pickle.dump(data, f)

### Load

In [None]:
# What blocking-types do we want to open?
blocks = ['adsorbate']
#blocks = []

# Which (outer, inner) feature set combinations do we want to open?
INNER_FEATURE_SETS = ['energy_fr_nncoord']
OUTER_FEATURE_SETS = ['energy_fr_coordcount']
FEATURE_COMBINATIONS = list(itertools.product(OUTER_FEATURE_SETS, INNER_FEATURE_SETS))

# Initialize the data ball: one dictionary per stored quantity, keyed by
# (outer, inner) feature set tuple
DATA['GPinTPOT'] = {}
for datum in ['models', 'rmses', 'errors', 'x', 'y', 'p_docs', 'block_list', 'pp']:
    DATA['GPinTPOT'][datum] = dict.fromkeys(FEATURE_COMBINATIONS)

# Open all the databalls and put them into DATA.
# NOTE: unpickling can execute arbitrary code, so only load pickle files that
# you trust (e.g., ones written by this notebook's Save cell).
for o_feature_set in OUTER_FEATURE_SETS:
    for i_feature_set in INNER_FEATURE_SETS:
        with open('pkls/data/GPinTPOT_data_' \
                  + i_feature_set + '-inside-' + o_feature_set + '_' \
                  + '-'.join(blocks) + '.pkl', 'rb') as f:
            data = pickle.load(f)
        # `.items()` works on both Python 2 and 3; `.iteritems()` is Python 2 only
        for key, value in data.items():
            DATA['GPinTPOT'][key][(o_feature_set, i_feature_set)] = value

# Plot

In [7]:
# (label, p_docs key) pairs used to build the hover text of each data point
_SITE = ('Site', 'coordination')
_ADS = ('Ads', 'adsorbate')
_NNC = ('NNC', 'nextnearestcoordination')
# Which pairs to show for each supported feature set. Hierarchical models are
# keyed by an (outer feature set, inner feature set) tuple.
HOVER_FIELDS = {
    'energy_fr_coordcount': [_SITE],
    'energy_fr_coordcount_ads': [_SITE, _ADS],
    'energy_fr_coordcount_nncoord_ads': [_SITE, _ADS, _NNC],
    'energy_fr_gcn_ads': [_SITE, _ADS],
    'energy_fr_nncoord': [_NNC],
    ('energy_fr_coordcount_ads', 'energy_fr_nncoord'): [_SITE, _ADS],
    ('energy_fr_coordcount', 'energy_fr_nncoord'): [_SITE, _ADS, _NNC],
    'energy_fr_coordcount_nncoord': [_SITE, _ADS, _NNC],
}


def hover_text(feature_set, docs):
    """
    Build one hover label per data point, e.g. 'Site:  <x>, Ads:  <y>'.

    Inputs:
        feature_set     Key into `HOVER_FIELDS` (a string, or a tuple for
                        hierarchical models)
        docs            Mapping from a p_docs key (e.g., 'coordination') to a
                        sequence holding one value per data point
    Output:
        A list of strings, one per data point
    Raises:
        Exception       If `feature_set` has no entry in `HOVER_FIELDS`
    """
    if feature_set not in HOVER_FIELDS:
        # Wrap in a 1-tuple so that tuple feature sets format correctly
        # instead of being unpacked by the `%` operator
        raise Exception('You still need to hard-code the text for the %s' \
                        % (feature_set,))
    fields = HOVER_FIELDS[feature_set]
    labels = [label for label, _key in fields]
    columns = [docs[key] for _label, key in fields]
    # zip(*columns) walks the columns in lockstep, yielding one row per data point
    return [', '.join('%s:  %s' % (label, value) for label, value in zip(labels, row))
            for row in zip(*columns)]


# Look at all of the different regressor types
for regressor_name, databall in DATA.items():
    print('Regressor:  ' + str(regressor_name))

    # Create a plot for each of the regressor type/feature set combinations
    for feature_set in databall['models']:
        print('    Features:  ' + str(feature_set))
        traces = []
        # Within each plot, create a separate data set for each block
        for block in databall['models'][feature_set]:
            # Ignore sub-structures that come from hierarchical models
            if block == 'outer_model' or block == 'inner_model':
                continue
            print('        Block:  ' + str(block))
            # Pull out the data for this feature set/block combination
            _y = databall['y'][feature_set][block]['train+test']
            _p_docs = databall['p_docs'][feature_set][block]['train+test']
            _errors = databall['errors'][feature_set][block]['train+test']
            _rmses = databall['rmses'][feature_set][block]

            # Print the RMSE values
            for dataset, rmse in _rmses.items():
                print('                ' + str(dataset) + ':  ' + str(rmse))
            # Label each data point depending on what type of feature set we're using
            text = hover_text(feature_set, _p_docs)
            # Add the data to the trace. The regressed value is plotted as
            # the DFT value plus the stored error.
            traces.append(go.Scatter(x=_y, y=_y+_errors,
                                     name=str(block), text=text, mode='markers'))

        # Create a diagonal line for the parity plot (hard-coded from -4 to 6)
        lims = [-4, 6]
        traces.append(go.Scatter(x=lims, y=lims,
                                 line=dict(color=('black'), dash='dash'),
                                 name='Parity line'))
        # Format and plot
        layout = go.Layout(xaxis=dict(title='DFT (eV)'),
                           yaxis=dict(title='Regressed (eV)'),
                           title='Predicting %s using a %s model' \
                                 % (feature_set, regressor_name))
        iplot(go.Figure(data=traces, layout=layout))

Regressor:  GPinTPOT
    Features:  ('energy_fr_coordcount', 'energy_fr_nncoord')
        Block:  (u'C',)
                test:  0.913476909267
                train:  0.448120961056
                train+test:  0.588675384407
        Block:  (u'H',)
                test:  0.572566234939
                train:  0.373068503895
                train+test:  0.431303879601
        Block:  (u'CO',)
                test:  0.539492772059
                train:  0.42515842633
                train+test:  0.456362526535
        Block:  (u'OH',)
                test:  1.42165429116
                train:  0.316859983888
                train+test:  0.790778780993
        Block:  (u'OOH',)
                test:  3.96319529025
                train:  0.279345272537
                train+test:  1.82111257364
        Block:  (u'O',)
                test:  1.06685670581
                train:  0.437528708605
                train+test:  0.667314741757


Regressor:  TPOT
    Features:  energy_fr_coordcount_nncoord
        Block:  (u'C',)
                test:  0.731396951476
                train:  0.539258059014
                train+test:  0.591913233683
        Block:  (u'H',)
                test:  0.520270504238
                train:  0.412591923883
                train+test:  0.441376372807
        Block:  (u'CO',)
                test:  0.463306912208
                train:  0.46177466344
                train+test:  0.462173830995
        Block:  (u'OH',)
                test:  0.553444393336
                train:  0.480936594407
                train+test:  0.500139024057
        Block:  (u'OOH',)
                test:  0.88188692001
                train:  0.644799767674
                train+test:  0.707495334497
        Block:  (u'O',)
                test:  0.678568636476
                train:  0.473442309799
                train+test:  0.531008866904


Regressor:  GP
    Features:  energy_fr_coordcount_nncoord
        Block:  (u'C',)
                test:  0.75506521354
                train:  0.307643274994
                train+test:  0.458960092432
        Block:  (u'H',)
                test:  0.536021182599
                train:  0.421277766096
                train+test:  0.45205628448
        Block:  (u'CO',)
                test:  0.489013443729
                train:  0.426886775633
                train+test:  0.44388930056
        Block:  (u'OH',)
                test:  0.550195456224
                train:  0.411867328326
                train+test:  0.450624522471
        Block:  (u'OOH',)
                test:  0.953940219529
                train:  0.0237663732977
                train+test:  0.462153809522
        Block:  (u'O',)
                test:  0.650451139555
                train:  0.168674055317
                train+test:  0.353575223122
