# regress.ipynb
Author:  Kevin Tran <ktran@andrew.cmu.edu>

This Python notebook performs regressions on data pulled from a processed MongoDB database created by GASpy. It then saves these regressions into pickles (for later use) and creates parity plots of the regression fits.

# Initialize

## Importing

In [1]:
# Debugging & other Python tools
import pdb
import sys
from pprint import pprint as pp
import itertools
# Saving/loading
# NOTE: `dill` is aliased to `pickle`, so every `pickle.dump`/`pickle.load`
# call in this notebook actually goes through dill.
import dill as pickle
pickle.settings['recurse'] = True     # required to pickle lambdify functions (for alamopy)
# Regression
from sklearn.gaussian_process import GaussianProcessRegressor
from tpot import TPOTRegressor
#from sklearn.gaussian_process.kernels import RBF, WhiteKernel, RationalQuadratic, ExpSineSquared
# Plotting
import matplotlib.pyplot as plt
from plotly.offline import init_notebook_mode, plot, iplot
# Initialize plotly's offline notebook mode; `iplot` is used by the plotting cell
init_notebook_mode(connected=True)
import plotly.plotly as py
import plotly.graph_objs as go
# GASpy
from regression_processor import RegressionProcessor
from pull_features import PullFeatures
# The parent directory is added to the import path before importing `gaspy`
# (presumably that is where the gaspy package lives -- confirm)
sys.path.append('..')
from gaspy.utils import vasp_settings_to_str

## Scope

In [2]:
# Feature sets to investigate. Every entry has to be the string name of the
# PullFeatures method that should be used. Options that have been used so far:
#   'energy_fr_coordcount'
#   'energy_fr_coordcount_ads'
#   'energy_fr_coordcount_nncoord_ads'
#   'energy_fr_nncoord'
#   'energy_fr_gcn_ads'
FEATURE_SETS = ['energy_fr_coordcount_ads']

# Restrict the pulled data to calculations that used these vasp settings
VASP_SETTINGS = vasp_settings_to_str({
    'gga': 'RP',
    'pp_version': '5.4',
    'encut': 350,
})
#VASP_SETTINGS = None

# Holds all of the data needed for plotting; one entry per regressor type
DATA = dict()

# Regress

## SKLearn Gaussian Process

### Execute

In [3]:
# Gaussian process kernel. `None` makes SKLearn use its default RBF kernel.
#K = 1.0*RBF(length_scale=1.0) + 1.0*WhiteKernel(noise_level=0.05**2.0)
K = None
n_restarts = 0
# The model that performs the regression
regressor = GaussianProcessRegressor(n_restarts_optimizer=n_restarts, kernel=K)
# Model blocking, e.g. ['adsorbate']. Use [] for no blocking (this will help with saving)
blocks = []

# One results dictionary per quantity, each keyed by feature set
models = {name: None for name in FEATURE_SETS}
rmses = {name: None for name in FEATURE_SETS}
errors = {name: None for name in FEATURE_SETS}
x = {name: None for name in FEATURE_SETS}
y = {name: None for name in FEATURE_SETS}
p_docs = {name: None for name in FEATURE_SETS}
block_list = {name: None for name in FEATURE_SETS}

for feature_set in FEATURE_SETS:
    # Pull the data and keep the processing information that the plots need
    rp = RegressionProcessor(feature_set,
                             blocks=blocks,
                             vasp_settings=VASP_SETTINGS)
    x[feature_set], y[feature_set] = rp.x, rp.y
    p_docs[feature_set], block_list[feature_set] = rp.p_docs, rp.block_list
    # Regress; the processor hands back (models, rmses, errors)
    fit = rp.sk_regressor(regressor)
    models[feature_set], rmses[feature_set], errors[feature_set] = fit

# Bundle everything the plotting cell reads
DATA['GP'] = dict(models=models,
                  rmses=rmses,
                  errors=errors,
                  x=x,
                  y=y,
                  p_docs=p_docs,
                  blocks=blocks,
                  block_list=block_list)

### Save

In [4]:
# Save the Gaussian process regressions: one pickle with the bare model and one
# pickle with the full data package, per feature set.
for feature_set in FEATURE_SETS:
    # Suffix that encodes the blocking used (empty string when there is no blocking)
    block_tag = '-'.join(DATA['GP']['blocks'])

    # Save the models alone for GASpy_predict to use
    with open('pkls/models/GP_model_' + feature_set + '_' + block_tag + '.pkl', 'wb') as f:
        pickle.dump(DATA['GP']['models'][feature_set], f)

    # Save the entire package to use later in this notebook.
    # This must read from DATA['GP'] (not DATA['TPOT']) so that the GP pickle
    # actually contains the GP results.
    data = {}
    for datum in ['models', 'rmses', 'errors', 'x', 'y', 'p_docs', 'block_list']:
        data[datum] = DATA['GP'][datum][feature_set]
    with open('pkls/data/GP_data_' + feature_set + '_' + block_tag + '.pkl', 'wb') as f:
        pickle.dump(data, f)

### Load

In [3]:
# What blocking-types do we want to open?
#blocks = ['adsorbate']
blocks = []

# Initialize the data ball. 'blocks' is stored as well so that the loaded data
# ball has the same layout as the one built by the Execute cell (the Save cell
# reads DATA['GP']['blocks']).
DATA['GP'] = {'blocks': blocks}
for datum in ['models', 'rmses', 'errors', 'x', 'y', 'p_docs', 'block_list']:
    DATA['GP'][datum] = dict.fromkeys(FEATURE_SETS)

# Open all the databalls and put them into DATA
for feature_set in FEATURE_SETS:
    # NOTE(review): unpickling can execute arbitrary code; only load pickles
    # that were written by this notebook.
    with open('pkls/data/GP_data_' + feature_set + '_' + '-'.join(blocks) + '.pkl', 'rb') as f:
        data = pickle.load(f)
    # `.items()` works on both Python 2 and Python 3 dictionaries
    for key, value in data.items():
        DATA['GP'][key][feature_set] = value

## TPOT

### Execute

In [5]:
# The TPOT model that performs the regression
regressor = TPOTRegressor(generations=10, population_size=10,
                          verbosity=2, random_state=42)
# Model blocking, e.g. ['adsorbate']. Use [] for no blocking (this will help with saving)
blocks = []

# One results dictionary per quantity, each keyed by feature set
models = {name: None for name in FEATURE_SETS}
rmses = {name: None for name in FEATURE_SETS}
errors = {name: None for name in FEATURE_SETS}
x = {name: None for name in FEATURE_SETS}
y = {name: None for name in FEATURE_SETS}
p_docs = {name: None for name in FEATURE_SETS}
block_list = {name: None for name in FEATURE_SETS}

for feature_set in FEATURE_SETS:
    # Pull the data and keep the processing information that the plots need
    rp = RegressionProcessor(feature_set,
                             blocks=blocks,
                             vasp_settings=VASP_SETTINGS)
    x[feature_set], y[feature_set] = rp.x, rp.y
    p_docs[feature_set], block_list[feature_set] = rp.p_docs, rp.block_list
    # Regress; the processor hands back (models, rmses, errors)
    fit = rp.tpot(regressor)
    models[feature_set], rmses[feature_set], errors[feature_set] = fit

# Bundle everything the plotting cell reads
DATA['TPOT'] = dict(models=models,
                    rmses=rmses,
                    errors=errors,
                    x=x,
                    y=y,
                    p_docs=p_docs,
                    blocks=blocks,
                    block_list=block_list)


This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.

Optimization Progress:  18%|█▊        | 20/110 [00:26<03:23,  2.26s/pipeline]

Generation 1 - Current best internal CV score: 0.846430263023


Optimization Progress:  27%|██▋       | 30/110 [00:37<01:47,  1.35s/pipeline]

Generation 2 - Current best internal CV score: 0.846430263023


Optimization Progress:  35%|███▌      | 39/110 [00:45<02:05,  1.77s/pipeline]

Generation 3 - Current best internal CV score: 0.846430263023


Optimization Progress:  45%|████▍     | 49/110 [00:49<00:52,  1.17pipeline/s]

Generation 4 - Current best internal CV score: 0.846430263023


Optimization Progress:  53%|█████▎    | 58/110 [00:53<00:27,  1.91pipeline/s]

Generation 5 - Current best internal CV score: 0.843276763709


Optimization Progress:  59%|█████▉    | 65/110 [00:59<00:34,  1.30pipeline/s]

Generation 6 - Current best internal CV score: 0.843276763709


Optimization Progress:  65%|██████▌   | 72/110 [01:02<00:25,  1.46pipeline/s]

Generation 7 - Current best internal CV score: 0.843276763709


Optimization Progress:  73%|███████▎  | 80/110 [01:09<00:23,  1.30pipeline/s]

Generation 8 - Current best internal CV score: 0.843276763709


Optimization Progress:  81%|████████  | 89/110 [01:16<00:17,  1.20pipeline/s]

Generation 9 - Current best internal CV score: 0.843276763709


                                                                             

Generation 10 - Current best internal CV score: 0.843276763709

Best pipeline: XGBRegressor(LassoLarsCV(LassoLarsCV(input_matrix, LassoLarsCV__normalize=True), LassoLarsCV__normalize=True), XGBRegressor__learning_rate=DEFAULT, XGBRegressor__max_depth=7, XGBRegressor__min_child_weight=13, XGBRegressor__n_estimators=DEFAULT, XGBRegressor__nthread=1, XGBRegressor__subsample=0.3)


Optimization Progress:  18%|█▊        | 20/110 [00:44<05:34,  3.72s/pipeline]

Generation 1 - Current best internal CV score: 0.767777575435


Optimization Progress:  27%|██▋       | 30/110 [01:09<04:27,  3.34s/pipeline]

Generation 2 - Current best internal CV score: 0.767777575435


Optimization Progress:  35%|███▌      | 39/110 [01:25<03:21,  2.83s/pipeline]

Generation 3 - Current best internal CV score: 0.767777575435


Optimization Progress:  45%|████▍     | 49/110 [02:07<03:04,  3.02s/pipeline]

Generation 4 - Current best internal CV score: 0.767777575435


Optimization Progress:  53%|█████▎    | 58/110 [02:14<01:12,  1.39s/pipeline]

Generation 5 - Current best internal CV score: 0.767777575435


Optimization Progress:  61%|██████    | 67/110 [02:20<00:44,  1.03s/pipeline]

Generation 6 - Current best internal CV score: 0.767777575435


Optimization Progress:  67%|██████▋   | 74/110 [02:30<00:49,  1.37s/pipeline]

Generation 7 - Current best internal CV score: 0.767777575435


Optimization Progress:  75%|███████▌  | 83/110 [02:37<00:28,  1.07s/pipeline]

Generation 8 - Current best internal CV score: 0.767777575435


Optimization Progress:  85%|████████▍ | 93/110 [02:43<00:09,  1.71pipeline/s]

Generation 9 - Current best internal CV score: 0.767777575435


                                                                             ]

Generation 10 - Current best internal CV score: 0.767777575435

Best pipeline: XGBRegressor(LassoLarsCV(input_matrix, LassoLarsCV__normalize=True), XGBRegressor__learning_rate=DEFAULT, XGBRegressor__max_depth=7, XGBRegressor__min_child_weight=14, XGBRegressor__n_estimators=DEFAULT, XGBRegressor__nthread=1, XGBRegressor__subsample=0.3)


Optimization Progress:  18%|█▊        | 20/110 [00:30<03:49,  2.55s/pipeline]

Generation 1 - Current best internal CV score: 0.805610123361


Optimization Progress:  27%|██▋       | 30/110 [00:47<03:04,  2.30s/pipeline]

Generation 2 - Current best internal CV score: 0.805610123361


Optimization Progress:  36%|███▋      | 40/110 [01:03<02:36,  2.24s/pipeline]

Generation 3 - Current best internal CV score: 0.805610123361


Optimization Progress:  45%|████▌     | 50/110 [01:38<03:19,  3.33s/pipeline]

Generation 4 - Current best internal CV score: 0.796519389482


Optimization Progress:  54%|█████▎    | 59/110 [01:53<01:45,  2.06s/pipeline]

Generation 5 - Current best internal CV score: 0.796519389482


Optimization Progress:  61%|██████    | 67/110 [02:05<01:30,  2.10s/pipeline]

Generation 6 - Current best internal CV score: 0.796519389482


Optimization Progress:  70%|███████   | 77/110 [02:18<01:00,  1.84s/pipeline]

Generation 7 - Current best internal CV score: 0.796519389482


Optimization Progress:  78%|███████▊  | 86/110 [02:47<01:45,  4.40s/pipeline]

Generation 8 - Current best internal CV score: 0.796519389482


Optimization Progress:  86%|████████▋ | 95/110 [02:57<00:37,  2.48s/pipeline]

Generation 9 - Current best internal CV score: 0.796519389482


                                                                              

Generation 10 - Current best internal CV score: 0.795910765548

Best pipeline: ExtraTreesRegressor(RidgeCV(LassoLarsCV(input_matrix, LassoLarsCV__normalize=True)), ExtraTreesRegressor__bootstrap=False, ExtraTreesRegressor__max_features=0.9, ExtraTreesRegressor__min_samples_leaf=13, ExtraTreesRegressor__min_samples_split=13, ExtraTreesRegressor__n_estimators=100)


Optimization Progress:  18%|█▊        | 20/110 [00:23<02:58,  1.98s/pipeline]

Generation 1 - Current best internal CV score: 0.930652077886


Optimization Progress:  27%|██▋       | 30/110 [00:52<04:20,  3.26s/pipeline]

Generation 2 - Current best internal CV score: 0.909602396291


Optimization Progress:  35%|███▌      | 39/110 [01:01<01:56,  1.64s/pipeline]

Generation 3 - Current best internal CV score: 0.909602396291


Optimization Progress:  44%|████▎     | 48/110 [01:10<01:23,  1.35s/pipeline]

Generation 4 - Current best internal CV score: 0.909602396291


Optimization Progress:  53%|█████▎    | 58/110 [01:23<00:51,  1.01pipeline/s]

Generation 5 - Current best internal CV score: 0.895692591907


Optimization Progress:  62%|██████▏   | 68/110 [01:37<01:15,  1.81s/pipeline]

Generation 6 - Current best internal CV score: 0.894803445902


Optimization Progress:  71%|███████   | 78/110 [02:18<02:00,  3.77s/pipeline]

Generation 7 - Current best internal CV score: 0.894803445902


Optimization Progress:  80%|████████  | 88/110 [02:33<00:36,  1.64s/pipeline]

Generation 8 - Current best internal CV score: 0.894803445902


Optimization Progress:  89%|████████▉ | 98/110 [02:50<00:29,  2.46s/pipeline]

Generation 9 - Current best internal CV score: 0.894803445902


                                                                              

Generation 10 - Current best internal CV score: 0.894803445902

Best pipeline: RandomForestRegressor(input_matrix, RandomForestRegressor__bootstrap=DEFAULT, RandomForestRegressor__max_features=0.4, RandomForestRegressor__min_samples_leaf=4, RandomForestRegressor__min_samples_split=DEFAULT, RandomForestRegressor__n_estimators=100)


Optimization Progress:  18%|█▊        | 20/110 [00:23<02:52,  1.92s/pipeline]

Generation 1 - Current best internal CV score: 0.920146061615


Optimization Progress:  25%|██▍       | 27/110 [00:30<02:02,  1.48s/pipeline]

Generation 2 - Current best internal CV score: 0.920146061615


Optimization Progress:  33%|███▎      | 36/110 [00:34<01:00,  1.23pipeline/s]

Generation 3 - Current best internal CV score: 0.907075322391


Optimization Progress:  41%|████      | 45/110 [00:37<00:39,  1.65pipeline/s]

Generation 4 - Current best internal CV score: 0.907075322391


Optimization Progress:  50%|█████     | 55/110 [00:50<01:05,  1.19s/pipeline]

Generation 5 - Current best internal CV score: 0.907075322391


Optimization Progress:  59%|█████▉    | 65/110 [01:05<00:46,  1.04s/pipeline]

Generation 6 - Current best internal CV score: 0.907075322391


Optimization Progress:  67%|██████▋   | 74/110 [01:16<00:52,  1.46s/pipeline]

Generation 7 - Current best internal CV score: 0.907075322391


Optimization Progress:  73%|███████▎  | 80/110 [01:24<00:52,  1.76s/pipeline]

Generation 8 - Current best internal CV score: 0.907075322391


Optimization Progress:  82%|████████▏ | 90/110 [01:36<00:22,  1.14s/pipeline]

Generation 9 - Current best internal CV score: 0.898287918657


                                                                             

Generation 10 - Current best internal CV score: 0.898287918657

Best pipeline: RidgeCV(ExtraTreesRegressor(input_matrix, ExtraTreesRegressor__bootstrap=DEFAULT, ExtraTreesRegressor__max_features=0.7, ExtraTreesRegressor__min_samples_leaf=6, ExtraTreesRegressor__min_samples_split=15, ExtraTreesRegressor__n_estimators=100))


Optimization Progress:  18%|█▊        | 20/110 [00:21<02:42,  1.81s/pipeline]

Generation 1 - Current best internal CV score: 1.04925043944


Optimization Progress:  26%|██▋       | 29/110 [00:26<01:26,  1.07s/pipeline]

Generation 2 - Current best internal CV score: 1.03648822221


Optimization Progress:  35%|███▌      | 39/110 [00:36<01:01,  1.15pipeline/s]

Generation 3 - Current best internal CV score: 1.03648822221


Optimization Progress:  45%|████▍     | 49/110 [00:38<00:30,  1.98pipeline/s]

Generation 4 - Current best internal CV score: 1.03648822221


Optimization Progress:  54%|█████▎    | 59/110 [00:42<00:20,  2.50pipeline/s]

Generation 5 - Current best internal CV score: 1.03150411852


Optimization Progress:  61%|██████    | 67/110 [00:44<00:12,  3.55pipeline/s]

Generation 6 - Current best internal CV score: 1.03150411852


Optimization Progress:  70%|███████   | 77/110 [00:48<00:15,  2.10pipeline/s]

Generation 7 - Current best internal CV score: 1.03150411852


Optimization Progress:  79%|███████▉  | 87/110 [00:51<00:08,  2.59pipeline/s]

Generation 8 - Current best internal CV score: 1.01209789232


Optimization Progress:  88%|████████▊ | 97/110 [00:56<00:04,  2.82pipeline/s]

Generation 9 - Current best internal CV score: 1.00485827353


                                                                              

Generation 10 - Current best internal CV score: 1.00485827353

Best pipeline: ElasticNetCV(Normalizer(ElasticNetCV(input_matrix, ElasticNetCV__l1_ratio=0.15, ElasticNetCV__tol=0.0001), Normalizer__norm=l1), ElasticNetCV__l1_ratio=0.15, ElasticNetCV__tol=0.1)


### Save

In [6]:
# Write the TPOT regressions to disk: two pickles per feature set
for feature_set in FEATURE_SETS:
    # The bare model, which GASpy_predict uses
    model_path = 'pkls/models/TPOT_model_%s_%s.pkl' \
                 % (feature_set, '-'.join(DATA['TPOT']['blocks']))
    with open(model_path, 'wb') as f:
        pickle.dump(DATA['TPOT']['models'][feature_set], f)

    # The whole package for this feature set, for later use in this notebook
    data = {}
    for datum in ('models', 'rmses', 'errors', 'x', 'y', 'p_docs', 'block_list'):
        data[datum] = DATA['TPOT'][datum][feature_set]
    data_path = 'pkls/data/TPOT_data_%s_%s.pkl' \
                % (feature_set, '-'.join(DATA['TPOT']['blocks']))
    with open(data_path, 'wb') as f:
        pickle.dump(data, f)

### Load

In [None]:
# What blocking-types do we want to open?
#blocks = ['adsorbate']
blocks = []

# Initialize the data ball. 'blocks' is stored as well so that the loaded data
# ball has the same layout as the one built by the Execute cell (the Save cell
# reads DATA['TPOT']['blocks']).
DATA['TPOT'] = {'blocks': blocks}
for datum in ['models', 'rmses', 'errors', 'x', 'y', 'p_docs', 'block_list']:
    DATA['TPOT'][datum] = dict.fromkeys(FEATURE_SETS)

# Open all the databalls and put them into DATA
for feature_set in FEATURE_SETS:
    # NOTE(review): unpickling can execute arbitrary code; only load pickles
    # that were written by this notebook.
    with open('pkls/data/TPOT_data_' + feature_set + '_' + '-'.join(blocks) + '.pkl', 'rb') as f:
        data = pickle.load(f)
    # `.items()` works on both Python 2 and Python 3 dictionaries
    for key, value in data.items():
        DATA['TPOT'][key][feature_set] = value

## Hierarchical
TODO:  Test the iterable nature of these cells (i.e., use more than one outer and inner combo)

### Execute

In [None]:
# Model blocking, e.g. ['adsorbate']. Use [] for no blocking (this will help with saving)
blocks = []

# TODO:  Turn the outer and inner information into tuples
# Outer regression: feature sets, regressors, and the RegressionProcessor
# method name to call for each (parallel lists, matched by index)
OUTER_FEATURE_SETS = ['energy_fr_coordcount_ads']
OUTER_REGRESSORS = [TPOTRegressor(generations=10, population_size=10,
                                  verbosity=2, random_state=42)]
OUTER_REGRESSION_METHODS = ['tpot']
# Inner regression: same layout as the outer lists
INNER_FEATURE_SETS = ['energy_fr_nncoord']
#K = 1.0*RBF(length_scale=1.0) + 1.0*WhiteKernel(noise_level=0.05**2.0)
K = None
INNER_REGRESSORS = [GaussianProcessRegressor(kernel=K, n_restarts_optimizer=20)]
INNER_REGRESSION_METHODS = ['sk_regressor']

# Every (outer, inner) pairing of feature sets; these tuples key the result
# dictionaries below.
FEATURE_COMBINATIONS = list(itertools.product(OUTER_FEATURE_SETS, INNER_FEATURE_SETS))
models = {pair: None for pair in FEATURE_COMBINATIONS}
rmses = {pair: None for pair in FEATURE_COMBINATIONS}
errors = {pair: None for pair in FEATURE_COMBINATIONS}
x = {pair: None for pair in FEATURE_COMBINATIONS}
y = {pair: None for pair in FEATURE_COMBINATIONS}
p_docs = {pair: None for pair in FEATURE_COMBINATIONS}
block_list = {pair: None for pair in FEATURE_COMBINATIONS}
# RegressionProcessor instances for the outer and the inner regressions
O_RPs = {name: None for name in OUTER_FEATURE_SETS}
I_RPs = {pair: None for pair in FEATURE_COMBINATIONS}

# Pull the data. The outer and inner processors are created right next to each
# other so that they pull the same amount of data (and can therefore be
# regressed together).
for o_feature_set in OUTER_FEATURE_SETS:
    O_RPs[o_feature_set] = RegressionProcessor(
        o_feature_set, blocks=blocks, vasp_settings=VASP_SETTINGS)
    for i_feature_set in INNER_FEATURE_SETS:
        I_RPs[(o_feature_set, i_feature_set)] = RegressionProcessor(
            i_feature_set, blocks=blocks, vasp_settings=VASP_SETTINGS)

# Regress every combination of feature sets
for o, o_feature_set in enumerate(OUTER_FEATURE_SETS):
    # Outer regression: look the method up by name, then call it on the regressor
    outer_fit = getattr(O_RPs[o_feature_set], OUTER_REGRESSION_METHODS[o])
    outer_models, outer_rmses, outer_errors = outer_fit(OUTER_REGRESSORS[o])
    # Inner regressions, each stacked on the outer results
    for i, i_feature_set in enumerate(INNER_FEATURE_SETS):
        pair = (o_feature_set, i_feature_set)
        inner_rp = I_RPs[pair]
        models[pair], rmses[pair], errors[pair] = inner_rp.hierarchical(
            outer_models,
            outer_rmses,
            outer_errors,
            INNER_REGRESSION_METHODS[i],
            INNER_REGRESSORS[i])
        x[pair] = inner_rp.x
        y[pair] = inner_rp.y
        p_docs[pair] = inner_rp.p_docs
        block_list[pair] = inner_rp.block_list

# Bundle everything the plotting cell reads
DATA['GPinTPOT'] = dict(models=models,
                        rmses=rmses,
                        errors=errors,
                        x=x,
                        y=y,
                        p_docs=p_docs,
                        blocks=blocks,
                        block_list=block_list)

### Save

In [5]:
# Write the hierarchical regressions to disk: two pickles per (outer, inner) pair
for o_feature_set, i_feature_set in itertools.product(OUTER_FEATURE_SETS,
                                                      INNER_FEATURE_SETS):
    pair = (o_feature_set, i_feature_set)

    # The bare model, which GASpy_predict uses
    model_path = 'pkls/models/GPinTPOT_model_%s-inside-%s_%s.pkl' \
                 % (i_feature_set, o_feature_set, '-'.join(DATA['GPinTPOT']['blocks']))
    with open(model_path, 'wb') as f:
        pickle.dump(DATA['GPinTPOT']['models'][pair], f)

    # The whole package for this pair, for later use in this notebook
    data = {}
    for datum in ('models', 'rmses', 'errors', 'x', 'y', 'p_docs', 'block_list'):
        data[datum] = DATA['GPinTPOT'][datum][pair]
    data_path = 'pkls/data/GPinTPOT_data_%s-inside-%s_%s.pkl' \
                % (i_feature_set, o_feature_set, '-'.join(DATA['GPinTPOT']['blocks']))
    with open(data_path, 'wb') as f:
        pickle.dump(data, f)

### Load

In [3]:
# What blocking-types do we want to open?
#blocks = ['adsorbate']
blocks = []

# Which (outer, inner) feature set combinations do we want to open?
INNER_FEATURE_SETS = ['energy_fr_nncoord']
OUTER_FEATURE_SETS = ['energy_fr_coordcount_ads']
FEATURE_COMBINATIONS = list(itertools.product(OUTER_FEATURE_SETS, INNER_FEATURE_SETS))

# Initialize the data ball. 'blocks' is stored as well so that the loaded data
# ball has the same layout as the one built by the Execute cell (the Save cell
# reads DATA['GPinTPOT']['blocks']).
DATA['GPinTPOT'] = {'blocks': blocks}
for datum in ['models', 'rmses', 'errors', 'x', 'y', 'p_docs', 'block_list']:
    DATA['GPinTPOT'][datum] = dict.fromkeys(FEATURE_COMBINATIONS)

# Open all the databalls and put them into DATA
for o_feature_set in OUTER_FEATURE_SETS:
    for i_feature_set in INNER_FEATURE_SETS:
        # NOTE(review): unpickling can execute arbitrary code; only load
        # pickles that were written by this notebook.
        with open('pkls/data/GPinTPOT_data_' \
                  + i_feature_set + '-inside-' + o_feature_set + '_' \
                  + '-'.join(blocks) + '.pkl', 'rb') as f:
            data = pickle.load(f)
        # `.items()` works on both Python 2 and Python 3 dictionaries
        for key, value in data.items():
            DATA['GPinTPOT'][key][(o_feature_set, i_feature_set)] = value

In [7]:
# Pull the hierarchical (GP-inside-TPOT) data ball apart for inspection
data = DATA['GPinTPOT']
models, rmses, x, y, errors, p_docs = (
    data[name] for name in ('models', 'rmses', 'x', 'y', 'errors', 'p_docs'))

# Pick the first key of the models dictionary
feature_set = list(models)[0]

In [8]:
# Show the `feature_set` key that is currently in scope
print(feature_set)

('energy_fr_coordcount_ads', 'energy_fr_nncoord')


# Plot

In [10]:
def _hover_text(feature_set, docs):
    """
    Build the hover label of every data point in a parity plot.

    Args:
        feature_set:    Name of the feature set (a string), or an
                        (outer, inner) tuple of names for a hierarchical model.
        docs:           The `p_docs` entry of one block/dataset; it is indexed
                        by key, e.g. docs['coordination'] and docs['adsorbate'].
    Returns:
        A list with one label string per data point, or `None` for
        'energy_fr_nncoord', which has no labels defined.
    Raises:
        Exception:  If no labels have been hard-coded for `feature_set` yet.
    """
    if feature_set == 'energy_fr_nncoord':
        return None
    if feature_set == 'energy_fr_coordcount':
        return ['Site:  %s' % coord for coord in docs['coordination']]
    if feature_set in ('energy_fr_coordcount_ads',
                       'energy_fr_gcn_ads',
                       ('energy_fr_coordcount_ads', 'energy_fr_nncoord')):
        return ['Site:  %s, Ads:  %s' % (coord, ads)
                for coord, ads in zip(docs['coordination'], docs['adsorbate'])]
    if feature_set == 'energy_fr_coordcount_nncoord_ads':
        return ['Site:  %s, Ads:  %s, NNC:  %s' % (coord, ads, nncoord)
                for coord, ads, nncoord in zip(docs['coordination'],
                                               docs['adsorbate'],
                                               docs['nextnearestcoordination'])]
    # Wrap in a 1-tuple so that tuple-valued feature sets format correctly
    raise Exception('You still need to hard-code the text for the %s'
                    % (feature_set,))


# Look at all of the different regressor types
for regressor, data in DATA.items():
    print('Regressor:  ' + str(regressor))
    # Pull data out of the data ball
    models = data['models']
    rmses = data['rmses']
    x = data['x']
    y = data['y']
    errors = data['errors']
    p_docs = data['p_docs']

    # Create one plot for each of the regressor type/feature set combinations
    for feature_set in models:
        print('    Features:  ' + str(feature_set))
        traces = []
        # Within each plot, create a separate data set for each block
        for block, model in models[feature_set].items():
            # Ignore sub-structures that come from hierarchical models
            if block in ('outer_model', 'inner_model'):
                continue
            print('        Block:  ' + str(block))
            # Pull out the data for this feature set/block combination
            _y = y[feature_set][block]['train+test']
            _p_docs = p_docs[feature_set][block]['train+test']
            _errors = errors[feature_set][block]['train+test']
            _rmses = rmses[feature_set][block]

            # Print the RMSE values
            for dataset, rmse in _rmses.items():
                print('                ' + str(dataset) + ':  ' + str(rmse))
            # Label each data point depending on what type of feature set we're using.
            # `text` is always (re)assigned here so that labels never leak over
            # from a previous block or feature set.
            text = _hover_text(feature_set, _p_docs)
            # Add the data to the trace; the regressed value is the DFT value plus the error
            traces.append(go.Scatter(x=_y, y=_y+_errors,
                                     name=str(block), text=text, mode='markers'))

        # Nothing to show if this feature set had no plottable blocks
        if not traces:
            continue

        # Create a single diagonal line for the parity plot.
        # The limits are a hard-coded axis range, in eV.
        lims = [-4, 6]
        traces.append(go.Scatter(x=lims, y=lims,
                                 line=dict(color=('black'), dash='dash'),
                                 name='Parity line'))
        # Format and plot once per feature set, after all blocks have been added
        layout = go.Layout(xaxis=dict(title='DFT (eV)'),
                           yaxis=dict(title='Regressed (eV)'),
                           title='Predicting %s using a %s model' \
                                 % (feature_set, regressor))
        iplot(go.Figure(data=traces, layout=layout))

Regressor:  GPinTPOT
    Features:  ('energy_fr_coordcount_ads', 'energy_fr_nncoord')
        Block:  no_block
                test:  0.586301401239
                train:  0.443420003014
                train+test:  0.483128702932
