# regress.ipynb
Author:  Kevin Tran <ktran@andrew.cmu.edu>

This Python notebook performs regressions on data pulled from a processed Mongo database created by GASpy. It then saves the regression results as pickles (for later use) and creates parity plots of the regression fits.

# Initialize

## Importing

In [1]:
# Debugging & other Python tools
import pdb
import sys
from pprint import pprint as pp
import itertools
# Saving/loading
import dill as pickle
pickle.settings['recurse'] = True     # required to pickle lambdify functions (for alamopy)
# Regression
from sklearn.gaussian_process import GaussianProcessRegressor
from tpot import TPOTRegressor
#from sklearn.gaussian_process.kernels import RBF, WhiteKernel, RationalQuadratic, ExpSineSquared
# Plotting
import matplotlib.pyplot as plt
from plotly.offline import init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)
import plotly.plotly as py
import plotly.graph_objs as go
# GASpy
from regression_processor import RegressionProcessor
from pull_features import PullFeatures
sys.path.append('..')
from gaspy.utils import vasp_settings_to_str

## Scope

In [2]:
# Feature sets to investigate. Each entry is the string name of the
# PullFeatures method to use; un-comment an entry to include it.
FEATURE_SETS = [
    #'energy_fr_coordcount',
    'energy_fr_coordcount_ads',
    #'energy_fr_coordcount_nncoord_ads',
    #'energy_fr_nncoord',
    #'energy_fr_gcn_ads',
]

# Only data calculated with these vasp settings is pulled
VASP_SETTINGS = vasp_settings_to_str({
    'gga': 'RP',
    'pp_version': '5.4',
    'encut': 350,
})
#VASP_SETTINGS = None

# Holds all of the data needed for plotting; the cells below add one
# entry per regressor type (e.g. DATA['GP'], DATA['TPOT']).
DATA = dict()

# Regress

## SKLearn Gaussian Process

### Execute

In [3]:
# Kernel for the Gaussian process. `None` means SKLearn's default RBF is used.
#K = 1.0*RBF(length_scale=1.0) + 1.0*WhiteKernel(noise_level=0.05**2.0)
K = None
n_restarts = 0
# Model blocking. Use [] if you don't want blocking (this will help with saving)
#blocks = ['adsorbate']
blocks = []
# The model used to perform the regression on every feature set below
regressor = GaussianProcessRegressor(kernel=K, n_restarts_optimizer=n_restarts)

# Result containers: one slot per feature set, filled in by the loop below.
# NOTE: `pp` here rebinds the name that the import cell bound to pprint.
models = {name: None for name in FEATURE_SETS}
rmses = {name: None for name in FEATURE_SETS}
errors = {name: None for name in FEATURE_SETS}
x = {name: None for name in FEATURE_SETS}
y = {name: None for name in FEATURE_SETS}
p_docs = {name: None for name in FEATURE_SETS}
block_list = {name: None for name in FEATURE_SETS}
pp = {name: None for name in FEATURE_SETS}

for feature_set in FEATURE_SETS:
    rp = RegressionProcessor(feature_set, blocks=blocks, vasp_settings=VASP_SETTINGS)
    # Keep the processor's data and processing information for plotting
    x[feature_set], y[feature_set] = rp.x, rp.y
    p_docs[feature_set], block_list[feature_set] = rp.p_docs, rp.block_list
    pp[feature_set] = rp.pp
    # Run the regression and unpack its (models, rmses, errors) result
    fit = rp.sk_regressor(regressor)
    models[feature_set], rmses[feature_set], errors[feature_set] = fit

# Bundle everything the Save and Plot cells read under the 'GP' label
DATA['GP'] = dict(models=models,
                  rmses=rmses,
                  errors=errors,
                  x=x,
                  y=y,
                  p_docs=p_docs,
                  blocks=blocks,
                  block_list=block_list,
                  pp=pp)

### Save

In [5]:
# Write two pickles per feature set: the bare model and the full data package
for feature_set in FEATURE_SETS:
    # Model and pre-processor only, for GASpy_predict to use
    pkl = dict(model=DATA['GP']['models'][feature_set],
               pp=DATA['GP']['pp'][feature_set])
    model_path = 'pkls/models/GP_model_' + feature_set + '_' \
                 + '-'.join(DATA['GP']['blocks']) + '.pkl'
    with open(model_path, 'wb') as f:
        pickle.dump(pkl, f)

    # Everything stored for this feature set, to reload later in this notebook
    data = {}
    for datum in ('models', 'rmses', 'errors', 'x', 'y', 'p_docs', 'block_list', 'pp'):
        data[datum] = DATA['GP'][datum][feature_set]
    data_path = 'pkls/data/GP_data_' + feature_set + '_' \
                + '-'.join(DATA['GP']['blocks']) + '.pkl'
    with open(data_path, 'wb') as f:
        pickle.dump(data, f)

### Load

In [6]:
# What blocking-types do we want to open?
#blocks = ['adsorbate']
blocks = []

# Initialize the data ball. 'blocks' is stored as well because the GP Save
# cell reads DATA['GP']['blocks'] to build its file names; without it,
# running Save after Load raises a KeyError.
DATA['GP'] = {'blocks': blocks}
for datum in ['models', 'rmses', 'errors', 'x', 'y', 'p_docs', 'block_list', 'pp']:
    DATA['GP'][datum] = dict.fromkeys(FEATURE_SETS)

# Open all the databalls and put them into DATA
for feature_set in FEATURE_SETS:
    # NOTE: unpickling can execute arbitrary code. Only load pickles that
    # were written by this notebook's Save cell.
    with open('pkls/data/GP_data_' + feature_set + '_' + '-'.join(blocks) + '.pkl', 'rb') as f:
        data = pickle.load(f)
    # `.items()` (rather than `.iteritems()`) works under Python 2 and 3
    for key, value in data.items():
        DATA['GP'][key][feature_set] = value

## TPOT

### Execute

In [7]:
# Model blocking. Use [] if you don't want blocking (this will help with saving)
#blocks = ['adsorbate']
blocks = []
# The model used to perform the regression on every feature set below
regressor = TPOTRegressor(generations=10,
                          population_size=10,
                          verbosity=2,
                          random_state=42)

# Result containers: one slot per feature set, filled in by the loop below.
# NOTE: `pp` here rebinds the name that the import cell bound to pprint.
models = {name: None for name in FEATURE_SETS}
rmses = {name: None for name in FEATURE_SETS}
errors = {name: None for name in FEATURE_SETS}
x = {name: None for name in FEATURE_SETS}
y = {name: None for name in FEATURE_SETS}
p_docs = {name: None for name in FEATURE_SETS}
block_list = {name: None for name in FEATURE_SETS}
pp = {name: None for name in FEATURE_SETS}

for feature_set in FEATURE_SETS:
    rp = RegressionProcessor(feature_set, blocks=blocks, vasp_settings=VASP_SETTINGS)
    # Keep the processor's data and processing information for plotting
    x[feature_set], y[feature_set] = rp.x, rp.y
    p_docs[feature_set], block_list[feature_set] = rp.p_docs, rp.block_list
    pp[feature_set] = rp.pp
    # Run the regression and unpack its (models, rmses, errors) result
    fit = rp.tpot(regressor)
    models[feature_set], rmses[feature_set], errors[feature_set] = fit

# Bundle everything the Save and Plot cells read under the 'TPOT' label
DATA['TPOT'] = dict(models=models,
                    rmses=rmses,
                    errors=errors,
                    x=x,
                    y=y,
                    p_docs=p_docs,
                    blocks=blocks,
                    block_list=block_list,
                    pp=pp)


This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.

Optimization Progress:  18%|█▊        | 20/110 [01:06<08:25,  5.62s/pipeline]

Generation 1 - Current best internal CV score: 0.328439208529


Optimization Progress:  27%|██▋       | 30/110 [01:48<07:46,  5.83s/pipeline]

Generation 2 - Current best internal CV score: 0.326912525058


Optimization Progress:  35%|███▍      | 38/110 [02:19<06:13,  5.18s/pipeline]

Generation 3 - Current best internal CV score: 0.320639304584


Optimization Progress:  44%|████▎     | 48/110 [02:53<04:03,  3.92s/pipeline]

Generation 4 - Current best internal CV score: 0.320134859608


Optimization Progress:  53%|█████▎    | 58/110 [03:25<03:12,  3.70s/pipeline]

Generation 5 - Current best internal CV score: 0.319098678138


Optimization Progress:  62%|██████▏   | 68/110 [04:08<02:55,  4.17s/pipeline]

Generation 6 - Current best internal CV score: 0.319098678138


Optimization Progress:  71%|███████   | 78/110 [04:48<01:41,  3.18s/pipeline]

Generation 7 - Current best internal CV score: 0.319098678138


Optimization Progress:  80%|████████  | 88/110 [05:26<01:40,  4.59s/pipeline]

Generation 8 - Current best internal CV score: 0.315728733537


Optimization Progress:  87%|████████▋ | 96/110 [06:06<01:09,  4.94s/pipeline]

Generation 9 - Current best internal CV score: 0.315728733537


                                                                              

Generation 10 - Current best internal CV score: 0.313718974201

Best pipeline: RandomForestRegressor(LassoLarsCV(input_matrix, LassoLarsCV__normalize=False), RandomForestRegressor__bootstrap=DEFAULT, RandomForestRegressor__max_features=0.55, RandomForestRegressor__min_samples_leaf=3, RandomForestRegressor__min_samples_split=11, RandomForestRegressor__n_estimators=100)


### Save

In [8]:
# Write two pickles per feature set: the bare model and the full data package
for feature_set in FEATURE_SETS:
    # Model and pre-processor only, for GASpy_predict to use
    pkl = dict(model=DATA['TPOT']['models'][feature_set],
               pp=DATA['TPOT']['pp'][feature_set])
    model_path = 'pkls/models/TPOT_model_' + feature_set + '_' \
                 + '-'.join(DATA['TPOT']['blocks']) + '.pkl'
    with open(model_path, 'wb') as f:
        pickle.dump(pkl, f)

    # Everything stored for this feature set, to reload later in this notebook
    data = {}
    for datum in ('models', 'rmses', 'errors', 'x', 'y', 'p_docs', 'block_list', 'pp'):
        data[datum] = DATA['TPOT'][datum][feature_set]
    data_path = 'pkls/data/TPOT_data_' + feature_set + '_' \
                + '-'.join(DATA['TPOT']['blocks']) + '.pkl'
    with open(data_path, 'wb') as f:
        pickle.dump(data, f)

### Load

In [9]:
# What blocking-types do we want to open?
#blocks = ['adsorbate']
blocks = []

# Initialize the data ball. 'blocks' is stored as well because the TPOT Save
# cell reads DATA['TPOT']['blocks'] to build its file names; without it,
# running Save after Load raises a KeyError.
DATA['TPOT'] = {'blocks': blocks}
for datum in ['models', 'rmses', 'errors', 'x', 'y', 'p_docs', 'block_list', 'pp']:
    DATA['TPOT'][datum] = dict.fromkeys(FEATURE_SETS)

# Open all the databalls and put them into DATA
for feature_set in FEATURE_SETS:
    # NOTE: unpickling can execute arbitrary code. Only load pickles that
    # were written by this notebook's Save cell.
    with open('pkls/data/TPOT_data_' + feature_set + '_' + '-'.join(blocks) + '.pkl', 'rb') as f:
        data = pickle.load(f)
    # `.items()` (rather than `.iteritems()`) works under Python 2 and 3
    for key, value in data.items():
        DATA['TPOT'][key][feature_set] = value

## Hierarchical
TODO:  Test that these cells iterate correctly when given more than one outer/inner feature set combination.

### Execute

In [10]:
# Specify the model blocking. Use [] if you don't want blocking (this will help with saving)
#blocks = ['adsorbate']
blocks = []

# TODO:  Turn the outer and inner information into tuples
# Outer regression information. These three lists are parallel: entry `o` of
# each one describes the same outer regression.
OUTER_FEATURE_SETS = ['energy_fr_coordcount_ads']
OUTER_REGRESSORS = [TPOTRegressor(generations=10,
                                  population_size=10,
                                  verbosity=2,
                                  random_state=42)]
OUTER_REGRESSION_METHODS = ['tpot']
# Inner regression information (parallel lists, indexed by `i` below)
INNER_FEATURE_SETS = ['energy_fr_nncoord']
#K = 1.0*RBF(length_scale=1.0) + 1.0*WhiteKernel(noise_level=0.05**2.0)
K = None
INNER_REGRESSORS = [GaussianProcessRegressor(kernel=K, n_restarts_optimizer=20)]
INNER_REGRESSION_METHODS = ['sk_regressor']

# `FEATURE_COMBINATIONS` is a list of (outer, inner) tuples for the different
# combinations of the outer and inner regressors we want. We use it to
# initialize the dictionaries of our results.
FEATURE_COMBINATIONS = list(itertools.product(OUTER_FEATURE_SETS, INNER_FEATURE_SETS))
models = dict.fromkeys(FEATURE_COMBINATIONS)
rmses = dict.fromkeys(FEATURE_COMBINATIONS)
errors = dict.fromkeys(FEATURE_COMBINATIONS)
x = dict.fromkeys(FEATURE_COMBINATIONS)
y = dict.fromkeys(FEATURE_COMBINATIONS)
p_docs = dict.fromkeys(FEATURE_COMBINATIONS)
block_list = dict.fromkeys(FEATURE_COMBINATIONS)
# `pp` must be created here. Re-using the `pp` dict left over from an earlier
# Execute cell would fail on a fresh kernel and would otherwise write the
# combination keys into that earlier cell's DATA entry (same dict object).
pp = dict.fromkeys(FEATURE_COMBINATIONS)
# Dictionaries containing the outer and inner RegressionProcessor class instances,
# along with the pre-processors
O_RPs = dict.fromkeys(OUTER_FEATURE_SETS)
O_PPs = dict.fromkeys(OUTER_FEATURE_SETS)
I_RPs = dict.fromkeys(FEATURE_COMBINATIONS)
I_PPs = dict.fromkeys(FEATURE_COMBINATIONS)

# Pull the data. They need to be right next to each other in order for them to pull
# the same amount of data (and therefore be able to regress together).
for o_feature_set in OUTER_FEATURE_SETS:
    O_RPs[o_feature_set] = RegressionProcessor(o_feature_set,
                                               blocks=blocks,
                                               vasp_settings=VASP_SETTINGS)
    O_PPs[o_feature_set] = O_RPs[o_feature_set].pp
    for i_feature_set in INNER_FEATURE_SETS:
        combo = (o_feature_set, i_feature_set)
        I_RPs[combo] = RegressionProcessor(i_feature_set,
                                           blocks=blocks,
                                           vasp_settings=VASP_SETTINGS)
        # Stored inside the inner loop so that every combination gets its
        # pre-processor, not only the last inner feature set
        I_PPs[combo] = I_RPs[combo].pp

# Perform the regressions for each combination of feature sets
for o, o_feature_set in enumerate(OUTER_FEATURE_SETS):
    # Perform the outer regressions; the method is looked up by name
    outer_models, outer_rmses, outer_errors = \
            getattr(O_RPs[o_feature_set], OUTER_REGRESSION_METHODS[o])(OUTER_REGRESSORS[o])
    # Perform the inner regressions
    for i, i_feature_set in enumerate(INNER_FEATURE_SETS):
        combo = (o_feature_set, i_feature_set)
        inner_rp = I_RPs[combo]
        models[combo], rmses[combo], errors[combo] = \
                inner_rp.hierarchical(outer_models,
                                      outer_rmses,
                                      outer_errors,
                                      INNER_REGRESSION_METHODS[i],
                                      INNER_REGRESSORS[i])
        x[combo] = inner_rp.x
        y[combo] = inner_rp.y
        p_docs[combo] = inner_rp.p_docs
        block_list[combo] = inner_rp.block_list
        # Merge the outer and inner pre-processors into a fresh dict. Updating
        # the outer processor's own `pp` in place would leak one combination's
        # inner pre-processors into the next combination.
        # NOTE(review): assumes `pp` is a plain mapping (it is merged with
        # `.update`) -- confirm against RegressionProcessor.
        pp[combo] = dict(O_PPs[o_feature_set])
        pp[combo].update(I_PPs[combo])

# Package the data that'll be used for plotting
DATA['GPinTPOT'] = {'models': models,
                    'rmses': rmses,
                    'errors': errors,
                    'x': x,
                    'y': y,
                    'p_docs': p_docs,
                    'blocks': blocks,
                    'block_list': block_list,
                    'pp': pp}

Optimization Progress:  18%|█▊        | 20/110 [01:04<08:09,  5.44s/pipeline]

Generation 1 - Current best internal CV score: 0.328439208529


Optimization Progress:  27%|██▋       | 30/110 [01:45<07:32,  5.66s/pipeline]

Generation 2 - Current best internal CV score: 0.326912525058


Optimization Progress:  35%|███▍      | 38/110 [02:16<06:05,  5.08s/pipeline]

Generation 3 - Current best internal CV score: 0.320639304584


Optimization Progress:  44%|████▎     | 48/110 [02:51<04:04,  3.95s/pipeline]

Generation 4 - Current best internal CV score: 0.320134859608


Optimization Progress:  53%|█████▎    | 58/110 [03:23<03:11,  3.69s/pipeline]

Generation 5 - Current best internal CV score: 0.319098678138


Optimization Progress:  62%|██████▏   | 68/110 [04:06<02:55,  4.18s/pipeline]

Generation 6 - Current best internal CV score: 0.319098678138


Optimization Progress:  71%|███████   | 78/110 [04:45<01:42,  3.20s/pipeline]

Generation 7 - Current best internal CV score: 0.319098678138


Optimization Progress:  80%|████████  | 88/110 [05:24<01:39,  4.53s/pipeline]

Generation 8 - Current best internal CV score: 0.315728733537


Optimization Progress:  87%|████████▋ | 96/110 [06:02<01:08,  4.87s/pipeline]

Generation 9 - Current best internal CV score: 0.315728733537


                                                                              

Generation 10 - Current best internal CV score: 0.313718974201

Best pipeline: RandomForestRegressor(LassoLarsCV(input_matrix, LassoLarsCV__normalize=False), RandomForestRegressor__bootstrap=DEFAULT, RandomForestRegressor__max_features=0.55, RandomForestRegressor__min_samples_leaf=3, RandomForestRegressor__min_samples_split=11, RandomForestRegressor__n_estimators=100)


### Save

In [11]:
# Write two pickles per (outer, inner) feature set combination
for o_feature_set in OUTER_FEATURE_SETS:
    for i_feature_set in INNER_FEATURE_SETS:
        pair = (o_feature_set, i_feature_set)

        # Model and pre-processor only, for GASpy_predict to use
        model_path = 'pkls/models/GPinTPOT_model_' \
                     + i_feature_set + '-inside-' + o_feature_set + '_' \
                     + '-'.join(DATA['GPinTPOT']['blocks']) + '.pkl'
        with open(model_path, 'wb') as f:
            pkl = dict(model=DATA['GPinTPOT']['models'][pair],
                       pp=DATA['GPinTPOT']['pp'][pair])
            pickle.dump(pkl, f)

        # Everything stored for this combination, to reload later in this notebook
        data = {}
        for datum in ('models', 'rmses', 'errors', 'x', 'y', 'p_docs', 'block_list', 'pp'):
            data[datum] = DATA['GPinTPOT'][datum][pair]
        data_path = 'pkls/data/GPinTPOT_data_' \
                    + i_feature_set + '-inside-' + o_feature_set + '_' \
                    + '-'.join(DATA['GPinTPOT']['blocks']) + '.pkl'
        with open(data_path, 'wb') as f:
            pickle.dump(data, f)

### Load

In [12]:
# What blocking-types do we want to open?
#blocks = ['adsorbate']
blocks = []

# Which outer/inner feature set combinations do we want to open?
INNER_FEATURE_SETS = ['energy_fr_nncoord']
OUTER_FEATURE_SETS = ['energy_fr_coordcount_ads']
FEATURE_COMBINATIONS = list(itertools.product(OUTER_FEATURE_SETS, INNER_FEATURE_SETS))

# Initialize the data ball. 'blocks' is stored as well because the GPinTPOT
# Save cell reads DATA['GPinTPOT']['blocks'] to build its file names; without
# it, running Save after Load raises a KeyError.
DATA['GPinTPOT'] = {'blocks': blocks}
for datum in ['models', 'rmses', 'errors', 'x', 'y', 'p_docs', 'block_list', 'pp']:
    DATA['GPinTPOT'][datum] = dict.fromkeys(FEATURE_COMBINATIONS)

# Open all the databalls and put them into DATA
for o_feature_set in OUTER_FEATURE_SETS:
    for i_feature_set in INNER_FEATURE_SETS:
        # NOTE: unpickling can execute arbitrary code. Only load pickles that
        # were written by this notebook's Save cell.
        with open('pkls/data/GPinTPOT_data_' \
                  + i_feature_set + '-inside-' + o_feature_set + '_' \
                  + '-'.join(blocks) + '.pkl', 'rb') as f:
            data = pickle.load(f)
        # `.items()` (rather than `.iteritems()`) works under Python 2 and 3
        for key, value in data.items():
            DATA['GPinTPOT'][key][(o_feature_set, i_feature_set)] = value

# Plot

In [13]:
# Look at all of the different regressor types.
# NOTE: this loop rebinds the notebook-level names `regressor`, `models`,
# `rmses`, `x`, `y`, `errors` and `p_docs` that the Execute cells also use.
for regressor, data in DATA.items():
    print('Regressor:  ' + str(regressor))
    # Pull data out of the data ball
    models = data['models']
    rmses = data['rmses']
    x = data['x']
    y = data['y']
    errors = data['errors']
    p_docs = data['p_docs']

    # Create a plot for each of the regressor type/feature set combinations
    for feature_set in models:
        print('    Features:  ' + str(feature_set))
        traces = []
        # Within each plot, create a separate data set for each block
        for block, model in models[feature_set].items():
            # Ignore sub-structures that come from hierarchical models
            if block in ('outer_model', 'inner_model'):
                continue
            print('        Block:  ' + str(block))
            # Pull out the data for this feature set/block combination
            _y = y[feature_set][block]['train+test']
            _p_docs = p_docs[feature_set][block]['train+test']
            _errors = errors[feature_set][block]['train+test']
            _rmses = rmses[feature_set][block]

            # Print the RMSE values
            for dataset, rmse in _rmses.items():
                print('                ' + str(dataset) + ':  ' + str(rmse))

            # Label each data point depending on what type of feature set we're using
            if feature_set == 'energy_fr_coordcount':
                text = ['Site:  %s' % coord for coord in _p_docs['coordination']]
            elif feature_set in ('energy_fr_coordcount_ads',
                                 'energy_fr_gcn_ads',
                                 ('energy_fr_coordcount_ads', 'energy_fr_nncoord')):
                text = ['Site:  %s, Ads:  %s' % (coord, ads)
                        for coord, ads in zip(_p_docs['coordination'],
                                              _p_docs['adsorbate'])]
            elif feature_set == 'energy_fr_coordcount_nncoord_ads':
                text = ['Site:  %s, Ads:  %s, NNC:  %s' % (coord, ads, nncoord)
                        for coord, ads, nncoord in zip(_p_docs['coordination'],
                                                       _p_docs['adsorbate'],
                                                       _p_docs['nextnearestcoordination'])]
            elif feature_set == 'energy_fr_nncoord':
                # No labels are defined for this feature set. Set `text`
                # explicitly so the trace below neither hits an undefined name
                # nor re-uses labels left over from a previous iteration.
                text = None
            else:
                # Wrap in a 1-tuple so that a tuple-valued feature set is
                # formatted as a whole instead of breaking the `%` operator
                raise Exception('You still need to hard-code the text for the %s' \
                                % (feature_set,))
            # Add the data to the trace: DFT value on x, regressed value on y
            traces.append(go.Scatter(x=_y, y=_y+_errors,
                                     name=str(block), text=text, mode='markers'))

        # The parity line and the figure are created once per feature set,
        # after all blocks have been added, so each plot shows every block
        # and exactly one parity line.
        lims = [-4, 6]
        traces.append(go.Scatter(x=lims, y=lims,
                                 line=dict(color=('black'), dash='dash'),
                                 name='Parity line'))
        # Format and plot
        layout = go.Layout(xaxis=dict(title='DFT (eV)'),
                           yaxis=dict(title='Regressed (eV)'),
                           title='Predicting %s using a %s model' \
                                 % (feature_set, regressor))
        iplot(go.Figure(data=traces, layout=layout))

Regressor:  GPinTPOT
    Features:  ('energy_fr_coordcount_ads', 'energy_fr_nncoord')
        Block:  no_block
                test:  0.576897328195
                train:  0.423817205392
                train+test:  0.466828421789


Regressor:  TPOT
    Features:  energy_fr_coordcount_ads
        Block:  no_block
                test:  0.527769318056
                train:  0.488477278245
                train+test:  0.498593377784


Regressor:  GP
    Features:  energy_fr_coordcount_ads
        Block:  no_block
                test:  0.61112359826
                train:  0.41955757939
                train+test:  0.474766056296
