# regress.ipynb
Author:  Kevin Tran <ktran@andrew.cmu.edu>

This Python notebook performs regressions on data pulled from a processed MongoDB database created by GASpy. It then saves the fitted regressions as pickles (for later use) and creates parity plots of the regression fits.

# Initialize

## Importing

In [1]:
# Debugging
import pdb
import sys
from pprint import pprint
# Saving/loading
import dill as pickle
pickle.settings['recurse'] = True     # required to pickle lambdify functions (for alamopy)
# Regression
from sklearn.gaussian_process import GaussianProcessRegressor
from tpot import TPOTRegressor
#from sklearn.gaussian_process.kernels import RBF, WhiteKernel, RationalQuadratic, ExpSineSquared
# Plotting
import matplotlib.pyplot as plt
from plotly.offline import init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)
import plotly.plotly as py
import plotly.graph_objs as go
# GASpy
from regression_processor import RegressionProcessor
from pull_features import PullFeatures
sys.path.append('..')
from gaspy.utils import vasp_settings_to_str

## Scope

In [2]:
# Feature sets to investigate. Every entry has to be the string name of
# the PullFeatures method that should be used.
FEATURE_SETS = [
    'energy_fr_coordcount',
    'energy_fr_coordcount_ads',
    'energy_fr_coordcount_nncoord_ads',
    'energy_fr_nncoord',
    #'energy_fr_gcn_ads',
]

# Restrict the pulled data to calculations that used these vasp settings
VASP_SETTINGS = vasp_settings_to_str({
    'gga': 'RP',
    'pp_version': '5.4',
    'encut': 350,
})
#VASP_SETTINGS = None

# Holds everything the plotting cell needs, keyed by regressor name
DATA = dict()

# Regress

## SKLearn Gaussian Process Regressor

### Execute regression

In [3]:
# Kernel for the Gaussian process. With `None`, SKLearn uses its default RBF
#K = 1.0*RBF(length_scale=1.0) + 1.0*WhiteKernel(noise_level=0.05**2.0) 
K = None
n_restarts = 0
# The regressor that is handed to the regression processor for each feature set
model = GaussianProcessRegressor(n_restarts_optimizer=n_restarts, kernel=K)
# Model blocking. Leave as [] if you don't want blocking (this will help with saving)
blocks = []

# One result slot per feature set for each of the seven quantities we keep
models, rmses, errors, x, y, p_docs, block_list = (
    dict.fromkeys(FEATURE_SETS) for _ in range(7)
)

for feature_set in FEATURE_SETS:
    # Pull the data and remember the processing information needed for plotting
    rp = RegressionProcessor(feature_set,
                             vasp_settings=VASP_SETTINGS,
                             blocks=blocks)
    x[feature_set] = rp.x
    y[feature_set] = rp.y
    p_docs[feature_set] = rp.p_docs
    block_list[feature_set] = rp.block_list
    # Run the regression and file away its three results
    fitted, rmse, error = rp.sk_regressor(model)
    models[feature_set] = fitted
    rmses[feature_set] = rmse
    errors[feature_set] = error

# Bundle everything the plotting cell will need
DATA['GP'] = {
    'models': models,
    'rmses': rmses,
    'errors': errors,
    'x': x,
    'y': y,
    'p_docs': p_docs,
    'blocks': blocks,
    'block_list': block_list,
}

### Save regression

In [4]:
# Write the GP regressions to disk: two pickles per feature set.
for feature_set in FEATURE_SETS:
    # Both file names end in the feature set followed by the dash-joined blocks
    suffix = feature_set + '_' + '-'.join(DATA['GP']['blocks']) + '.pkl'

    # The model by itself, for GASpy_predict to use
    with open('pkls/models/GP_model_' + suffix, 'wb') as model_file:
        pickle.dump(DATA['GP']['models'][feature_set], model_file)

    # The whole package for this feature set, to use later in this notebook
    data = {key: DATA['GP'][key][feature_set]
            for key in ('models', 'rmses', 'errors',
                        'x', 'y', 'p_docs', 'block_list')}
    with open('pkls/data/GP_data_' + suffix, 'wb') as data_file:
        pickle.dump(data, data_file)

### Load regression

In [5]:
# What blocking-types do we want to open?
blocks = []

# Initialize the data ball. 'blocks' is stored next to the per-feature-set
# results (the same layout the "Execute regression" cell produces) so that the
# "Save regression" cell, which reads DATA['GP']['blocks'], also works on a
# data ball that was loaded from disk instead of raising a KeyError.
DATA['GP'] = {
              'models': dict.fromkeys(FEATURE_SETS),
              'rmses': dict.fromkeys(FEATURE_SETS),
              'errors': dict.fromkeys(FEATURE_SETS),
              'x': dict.fromkeys(FEATURE_SETS),
              'y': dict.fromkeys(FEATURE_SETS),
              'p_docs': dict.fromkeys(FEATURE_SETS),
              'blocks': blocks,
              'block_list': dict.fromkeys(FEATURE_SETS),
             }
# Open all the databalls and put them into DATA
for feature_set in FEATURE_SETS:
    # NOTE: unpickling executes code stored in the file. Only load pickles
    # that were written by the "Save regression" cell of this notebook.
    with open('pkls/data/GP_data_' + feature_set + '_' + '-'.join(blocks) + '.pkl', 'rb') as f:
        data = pickle.load(f)
    # `.items()` (rather than `.iteritems()`) runs on both Python 2 and Python 3
    for key, value in data.items():
        DATA['GP'][key][feature_set] = value

## TPOT Regressor

### Execute regression

In [6]:
# The regressor that is handed to the regression processor for each feature set
model = TPOTRegressor(random_state=42,
                      verbosity=2,
                      generations=10,
                      population_size=10)
# Model blocking. Leave as [] if you don't want blocking (this will help with saving)
blocks = []

# One result slot per feature set for each of the seven quantities we keep
models, rmses, errors, x, y, p_docs, block_list = (
    dict.fromkeys(FEATURE_SETS) for _ in range(7)
)

for feature_set in FEATURE_SETS:
    # Pull the data and remember the processing information needed for plotting
    rp = RegressionProcessor(feature_set,
                             vasp_settings=VASP_SETTINGS,
                             blocks=blocks)
    x[feature_set] = rp.x
    y[feature_set] = rp.y
    p_docs[feature_set] = rp.p_docs
    block_list[feature_set] = rp.block_list
    # Run the regression and file away its three results
    fitted, rmse, error = rp.tpot(model)
    models[feature_set] = fitted
    rmses[feature_set] = rmse
    errors[feature_set] = error

# Bundle everything the plotting cell will need
DATA['TPOT'] = {
    'models': models,
    'rmses': rmses,
    'errors': errors,
    'x': x,
    'y': y,
    'p_docs': p_docs,
    'blocks': blocks,
    'block_list': block_list,
}


This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.

Optimization Progress:  18%|█▊        | 20/110 [00:48<06:03,  4.04s/pipeline]

Generation 1 - Current best internal CV score: 0.807304749529


Optimization Progress:  26%|██▋       | 29/110 [01:13<04:52,  3.61s/pipeline]

Generation 2 - Current best internal CV score: 0.807304749529


Optimization Progress:  35%|███▌      | 39/110 [01:37<04:10,  3.53s/pipeline]

Generation 3 - Current best internal CV score: 0.807304749529


Optimization Progress:  45%|████▍     | 49/110 [01:49<02:08,  2.10s/pipeline]

Generation 4 - Current best internal CV score: 0.80715638538


Optimization Progress:  54%|█████▎    | 59/110 [02:11<01:59,  2.34s/pipeline]

Generation 5 - Current best internal CV score: 0.798344424946


Optimization Progress:  63%|██████▎   | 69/110 [02:33<01:35,  2.33s/pipeline]

Generation 6 - Current best internal CV score: 0.798344424946


Optimization Progress:  71%|███████   | 78/110 [02:52<01:05,  2.06s/pipeline]

Generation 7 - Current best internal CV score: 0.798344424946


Optimization Progress:  76%|███████▋  | 84/110 [03:02<00:52,  2.00s/pipeline]

Generation 8 - Current best internal CV score: 0.798344424946


Optimization Progress:  85%|████████▌ | 94/110 [03:30<00:43,  2.70s/pipeline]

Generation 9 - Current best internal CV score: 0.798344424946


                                                                              

Generation 10 - Current best internal CV score: 0.798344424946

Best pipeline: ExtraTreesRegressor(PCA(LassoLarsCV(MaxAbsScaler(input_matrix), LassoLarsCV__normalize=DEFAULT), PCA__iterated_power=7, PCA__svd_solver=randomized), ExtraTreesRegressor__bootstrap=True, ExtraTreesRegressor__max_features=0.9, ExtraTreesRegressor__min_samples_leaf=13, ExtraTreesRegressor__min_samples_split=3, ExtraTreesRegressor__n_estimators=100)


Optimization Progress:  18%|█▊        | 20/110 [00:55<07:02,  4.69s/pipeline]

Generation 1 - Current best internal CV score: 0.33097473379


Optimization Progress:  26%|██▋       | 29/110 [01:36<08:32,  6.33s/pipeline]

Generation 2 - Current best internal CV score: 0.318363523411


Optimization Progress:  35%|███▌      | 39/110 [02:01<04:38,  3.93s/pipeline]

Generation 3 - Current best internal CV score: 0.318363523411


Optimization Progress:  43%|████▎     | 47/110 [02:34<04:28,  4.26s/pipeline]

Generation 4 - Current best internal CV score: 0.318363523411


Optimization Progress:  51%|█████     | 56/110 [03:29<04:18,  4.79s/pipeline]

Generation 5 - Current best internal CV score: 0.318363523411


Optimization Progress:  58%|█████▊    | 64/110 [04:06<04:18,  5.61s/pipeline]

Generation 6 - Current best internal CV score: 0.318363523411


Optimization Progress:  66%|██████▋   | 73/110 [04:42<02:24,  3.91s/pipeline]

Generation 7 - Current best internal CV score: 0.318363523411


Optimization Progress:  75%|███████▌  | 83/110 [05:45<03:34,  7.94s/pipeline]

Generation 8 - Current best internal CV score: 0.318363523411


Optimization Progress:  84%|████████▎ | 92/110 [06:19<01:36,  5.35s/pipeline]

Generation 9 - Current best internal CV score: 0.315480738217


                                                                              

Generation 10 - Current best internal CV score: 0.313157412376

Best pipeline: XGBRegressor(LassoLarsCV(PCA(input_matrix, PCA__iterated_power=10, PCA__svd_solver=DEFAULT), LassoLarsCV__normalize=True), XGBRegressor__learning_rate=DEFAULT, XGBRegressor__max_depth=7, XGBRegressor__min_child_weight=14, XGBRegressor__n_estimators=DEFAULT, XGBRegressor__nthread=1, XGBRegressor__subsample=0.3)


Optimization Progress:  18%|█▊        | 20/110 [01:33<12:01,  8.02s/pipeline]

Generation 1 - Current best internal CV score: 0.332270359396


Optimization Progress:  27%|██▋       | 30/110 [02:20<06:45,  5.07s/pipeline]

Generation 2 - Current best internal CV score: 0.319859138396


Optimization Progress:  35%|███▌      | 39/110 [02:47<06:31,  5.51s/pipeline]

Generation 3 - Current best internal CV score: 0.31543871079


Optimization Progress:  45%|████▍     | 49/110 [03:17<03:02,  2.99s/pipeline]

Generation 4 - Current best internal CV score: 0.315136654525


Optimization Progress:  54%|█████▎    | 59/110 [03:56<04:05,  4.81s/pipeline]

Generation 5 - Current best internal CV score: 0.308160721313


Optimization Progress:  63%|██████▎   | 69/110 [04:28<02:13,  3.26s/pipeline]

Generation 6 - Current best internal CV score: 0.305037112906


Optimization Progress:  72%|███████▏  | 79/110 [05:09<02:39,  5.14s/pipeline]

Generation 7 - Current best internal CV score: 0.305037112906


Optimization Progress:  81%|████████  | 89/110 [05:56<01:27,  4.18s/pipeline]

Generation 8 - Current best internal CV score: 0.30336870789


Optimization Progress:  90%|█████████ | 99/110 [06:31<00:54,  4.93s/pipeline]

Generation 9 - Current best internal CV score: 0.30336870789


                                                                              

Generation 10 - Current best internal CV score: 0.30336870789

Best pipeline: ExtraTreesRegressor(input_matrix, ExtraTreesRegressor__bootstrap=True, ExtraTreesRegressor__max_features=0.6, ExtraTreesRegressor__min_samples_leaf=1, ExtraTreesRegressor__min_samples_split=10, ExtraTreesRegressor__n_estimators=100)


Optimization Progress:  18%|█▊        | 20/110 [00:56<07:09,  4.78s/pipeline]

Generation 1 - Current best internal CV score: 1.01544150459


Optimization Progress:  27%|██▋       | 30/110 [01:23<05:28,  4.11s/pipeline]

Generation 2 - Current best internal CV score: 1.01544150459


Optimization Progress:  35%|███▌      | 39/110 [02:21<06:12,  5.24s/pipeline]

Generation 3 - Current best internal CV score: 1.01544150459


Optimization Progress:  45%|████▍     | 49/110 [02:48<04:03,  3.99s/pipeline]

Generation 4 - Current best internal CV score: 1.01544150459


Optimization Progress:  53%|█████▎    | 58/110 [03:12<02:14,  2.60s/pipeline]

Generation 5 - Current best internal CV score: 1.01544150459


Optimization Progress:  62%|██████▏   | 68/110 [03:47<02:32,  3.62s/pipeline]

Generation 6 - Current best internal CV score: 1.01406597162


Optimization Progress:  70%|███████   | 77/110 [04:05<01:21,  2.47s/pipeline]

Generation 7 - Current best internal CV score: 1.00915099116


Optimization Progress:  78%|███████▊  | 86/110 [04:31<01:05,  2.73s/pipeline]

Generation 8 - Current best internal CV score: 1.00915099116


Optimization Progress:  87%|████████▋ | 96/110 [04:59<00:33,  2.37s/pipeline]

Generation 9 - Current best internal CV score: 1.00915099116


                                                                              

Generation 10 - Current best internal CV score: 1.00915099116

Best pipeline: ExtraTreesRegressor(LassoLarsCV(input_matrix, LassoLarsCV__normalize=DEFAULT), ExtraTreesRegressor__bootstrap=False, ExtraTreesRegressor__max_features=0.5, ExtraTreesRegressor__min_samples_leaf=13, ExtraTreesRegressor__min_samples_split=3, ExtraTreesRegressor__n_estimators=100)


### Save regression

In [7]:
# Write the TPOT regressions to disk: two pickles per feature set.
for feature_set in FEATURE_SETS:
    # Both file names end in the feature set followed by the dash-joined blocks
    suffix = feature_set + '_' + '-'.join(DATA['TPOT']['blocks']) + '.pkl'

    # The model by itself, for GASpy_predict to use
    with open('pkls/models/TPOT_model_' + suffix, 'wb') as model_file:
        pickle.dump(DATA['TPOT']['models'][feature_set], model_file)

    # The whole package for this feature set, to use later in this notebook
    data = {key: DATA['TPOT'][key][feature_set]
            for key in ('models', 'rmses', 'errors',
                        'x', 'y', 'p_docs', 'block_list')}
    with open('pkls/data/TPOT_data_' + suffix, 'wb') as data_file:
        pickle.dump(data, data_file)

### Load regression

In [8]:
# What blocking-types do we want to open?
blocks = []

# Initialize the data ball. 'blocks' is stored next to the per-feature-set
# results (the same layout the "Execute regression" cell produces) so that the
# "Save regression" cell, which reads DATA['TPOT']['blocks'], also works on a
# data ball that was loaded from disk instead of raising a KeyError.
DATA['TPOT'] = {
              'models': dict.fromkeys(FEATURE_SETS),
              'rmses': dict.fromkeys(FEATURE_SETS),
              'errors': dict.fromkeys(FEATURE_SETS),
              'x': dict.fromkeys(FEATURE_SETS),
              'y': dict.fromkeys(FEATURE_SETS),
              'p_docs': dict.fromkeys(FEATURE_SETS),
              'blocks': blocks,
              'block_list': dict.fromkeys(FEATURE_SETS),
             }
# Open all the databalls and put them into DATA
for feature_set in FEATURE_SETS:
    # NOTE: unpickling executes code stored in the file. Only load pickles
    # that were written by the "Save regression" cell of this notebook.
    with open('pkls/data/TPOT_data_' + feature_set + '_' + '-'.join(blocks) + '.pkl', 'rb') as f:
        data = pickle.load(f)
    # `.items()` (rather than `.iteritems()`) runs on both Python 2 and Python 3
    for key, value in data.items():
        DATA['TPOT'][key][feature_set] = value

# Plot

In [13]:
def _hover_text(feature_set, docs):
    """
    Build the hover label of every data point for one feature set.

    Args:
        feature_set: Name of the feature set that is being plotted.
        docs:        The 'train+test' entry of the parsed docs for one block.
                     It is indexed with 'coordination', 'adsorbate' and
                     'nextnearestcoordination', depending on the feature set.
    Returns:
        A list with one label string per data point, or `None` for
        'energy_fr_nncoord', which has no labels defined.
    Raises:
        Exception: If no labels have been hard-coded for `feature_set`.
    """
    if feature_set == 'energy_fr_coordcount':
        return ['Site:  %s' % coord for coord in docs['coordination']]
    if feature_set in ('energy_fr_coordcount_ads', 'energy_fr_gcn_ads'):
        coords = list(docs['coordination'])
        ads = list(docs['adsorbate'])
        return ['Site:  %s, Ads:  %s' % (coord, ads[i])
                for i, coord in enumerate(coords)]
    if feature_set == 'energy_fr_coordcount_nncoord_ads':
        coords = list(docs['coordination'])
        ads = list(docs['adsorbate'])
        nncoords = list(docs['nextnearestcoordination'])
        return ['Site:  %s, Ads:  %s, NNC:  %s' % (coord, ads[i], nncoords[i])
                for i, coord in enumerate(coords)]
    if feature_set == 'energy_fr_nncoord':
        # No labels are defined for this feature set. Returning `None`
        # explicitly keeps the plot from failing with a NameError or from
        # silently re-using the labels of the previously plotted feature set.
        return None
    raise Exception('You still need to hard-code the text for the %s' \
                    % feature_set)


# Look at all of the different regressor types
# (`.items()` rather than `.iteritems()` so this runs on Python 2 and Python 3)
for regressor, data in DATA.items():
    print('Regressor:  ' + str(regressor))
    # Pull data out of the data ball
    models = data['models']
    rmses = data['rmses']
    x = data['x']
    y = data['y']
    errors = data['errors']
    p_docs = data['p_docs']

    # Create a plot for each of the regressor type/feature set combinations.
    # NOTE: every figure embeds all of its data points in the cell output; the
    # recorded run of this cell hit the notebook's IOPub data rate limit.
    for feature_set in models:
        print('    Features:  ' + str(feature_set))
        traces = []
        # Within each plot, create a separate data set for each block
        for block, model in models[feature_set].items():
            print('        Block:  ' + str(block))
            # Pull out the data for this feature set/block combination
            _y = y[feature_set][block]['train+test']
            _p_docs = p_docs[feature_set][block]['train+test']
            _errors = errors[feature_set][block]['train+test']
            _rmses = rmses[feature_set][block]

            # Print the RMSE values
            for dataset, rmse in _rmses.items():
                print('                ' + str(dataset) + ':  ' + str(rmse))

            # Label each data point depending on what type of feature set we're using
            text = _hover_text(feature_set, _p_docs)

            # Add the data to the trace. The regressed value is the DFT value
            # plus the stored error. `text` is only passed when labels exist.
            scatter_kwargs = dict(x=_y, y=_y+_errors, name=block, mode='markers')
            if text is not None:
                scatter_kwargs['text'] = text
            traces.append(go.Scatter(**scatter_kwargs))

        # Create a diagonal line for the parity plot
        lims = [-4, 6]
        traces.append(go.Scatter(x=lims, y=lims,
                                 line=dict(color=('black'), dash='dash'),
                                 name='Parity line'))
        # Format and plot
        layout = go.Layout(xaxis=dict(title='DFT (eV)'),
                           yaxis=dict(title='Regressed (eV)'),
                           title='Predicting %s using a %s model' \
                                 % (feature_set, regressor))
        iplot(go.Figure(data=traces, layout=layout))

Regressor:  TPOT
    Features:  energy_fr_coordcount_ads
        Block:  no_block
                test:  0.552547955073
                train:  0.469347699352
                train+test:  0.49147918433


    Features:  energy_fr_coordcount
        Block:  no_block
                test:  0.919360457934
                train:  0.861722143645
                train+test:  0.87649335802


    Features:  energy_fr_nncoord
        Block:  no_block
                test:  0.95673310799
                train:  0.961994543258
                train+test:  0.960681327826


    Features:  energy_fr_coordcount_nncoord_ads
        Block:  no_block
                test:  0.568330122792
                train:  0.379657642759
                train+test:  0.434596047301


Regressor:  GP
    Features:  energy_fr_coordcount_ads
        Block:  no_block
                test:  0.60480211908
                train:  0.427649779498
                train+test:  0.478144973418


    Features:  energy_fr_coordcount
        Block:  no_block
                test:  0.990632731059
                train:  0.821621372494
                train+test:  0.866968605007


    Features:  energy_fr_nncoord
        Block:  no_block
                test:  1.02692750495
                train:  0.873394021555
                train+test:  0.914214883401


IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.
