# regress.ipynb
Author:  Kevin Tran <ktran@andrew.cmu.edu>

This python notebook performs regressions on data pulled from a processed mongo DB created by GASpy. It then saves these regressions into pickles (for later use) and creates parity plots of the regression fits.

# Initialize

## Importing

In [1]:
# Debugging
import pdb
import sys
from pprint import pprint
# Saving/loading
import dill as pickle
pickle.settings['recurse'] = True     # required to pickle lambdify functions (for alamopy)
# Regression
from sklearn.gaussian_process import GaussianProcessRegressor
from tpot import TPOTRegressor
#from sklearn.gaussian_process.kernels import RBF, WhiteKernel, RationalQuadratic, ExpSineSquared
# Plotting
import matplotlib.pyplot as plt
from plotly.offline import init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)
import plotly.plotly as py
import plotly.graph_objs as go
# GASpy
from regression_processor import RegressionProcessor
from pull_features import PullFeatures
sys.path.append('..')
from gaspy.utils import vasp_settings_to_str

## Scope

In [2]:
# Define the feature sets that you want to investigate. They should be
# string names of the PullFeatures methods that you want to use.
FEATURE_SETS = [
                'energy_fr_coordcount',
                'energy_fr_coordcount_ads',
                'energy_fr_coordcount_nncoord_ads',
                'energy_fr_nncoord',
                #'energy_fr_gcn_ads',
               ]

# Only pull data that used the following vasp settings
VASP_SETTINGS = vasp_settings_to_str({'gga': 'RP',
                                      'pp_version': '5.4',
                                      'encut': 350})
#VASP_SETTINGS = None

# This is a dictionary that will hold all of the data we need for plotting
DATA = {}

# Regress

## SKLearn Gaussian Process Regressor

### Execute regression

In [21]:
# Specify the kernel to use. If it's `None`, then it uses SKLearn's default RBF
#K = 1.0*RBF(length_scale=1.0) + 1.0*WhiteKernel(noise_level=0.05**2.0) 
K = None
n_restarts = 0
# Create the model that you want to use to perform the regression
model = GaussianProcessRegressor(kernel=K, n_restarts_optimizer=n_restarts)
# Specify the model blocking. Use [] if you don't want blocking (this will help with saving)
blocks = []

# Initialize the results
models = dict.fromkeys(FEATURE_SETS)
rmses = dict.fromkeys(FEATURE_SETS)
errors = dict.fromkeys(FEATURE_SETS)
x = dict.fromkeys(FEATURE_SETS)
y = dict.fromkeys(FEATURE_SETS)
p_docs = dict.fromkeys(FEATURE_SETS)
block_list = dict.fromkeys(FEATURE_SETS)

for feature_set in FEATURE_SETS:
    # Pull the data out and store some of the processing information for plotting purposes
    rp = RegressionProcessor(feature_set, blocks=blocks, vasp_settings = VASP_SETTINGS)
    x[feature_set] = rp.x
    y[feature_set] = rp.y
    p_docs[feature_set] = rp.p_docs
    block_list[feature_set] = rp.block_list
    # Perform the regression
    models[feature_set], rmses[feature_set], errors[feature_set] = \
            rp.sk_regressor(model)

# Package the data that'll be used for plotting
DATA['GP'] = {'models': models,
              'rmses': rmses,
              'errors': errors,
              'x': x,
              'y': y,
              'p_docs': p_docs,
              'blocks': blocks,
              'block_list': block_list}

### Save regression

In [22]:
# Save the GP regressions. 
for feature_set in FEATURE_SETS:
    # Save the models alone for GASpy_predict to use
    with open('pkls/models/GP_model_' + feature_set + '_' \
              + '-'.join(DATA['GP']['blocks']) + '.pkl', 'wb') as f:
        pickle.dump(DATA['GP']['models'][feature_set], f)
        
    # Save the entire package to use later in this notebook
    data = {
            'models': DATA['GP']['models'][feature_set],
            'rmses': DATA['GP']['rmses'][feature_set],
            'errors': DATA['GP']['errors'][feature_set],
            'x': DATA['GP']['x'][feature_set],
            'y': DATA['GP']['y'][feature_set],
            'p_docs': DATA['GP']['p_docs'][feature_set],
            'block_list': DATA['GP']['block_list'][feature_set],
            }
    with open('pkls/data/GP_data_' + feature_set + '_' + \
              '-'.join(DATA['GP']['blocks']) + '.pkl', 'wb') as f:
        pickle.dump(data, f)

### Load regression

In [23]:
# What blocking-types do we want to open?
blocks = []

# Initialize the data ball
DATA['GP'] = {
              'models': dict.fromkeys(FEATURE_SETS),
              'rmses': dict.fromkeys(FEATURE_SETS),
              'errors': dict.fromkeys(FEATURE_SETS),
              'x': dict.fromkeys(FEATURE_SETS),
              'y': dict.fromkeys(FEATURE_SETS),
              'p_docs': dict.fromkeys(FEATURE_SETS),
              'block_list': dict.fromkeys(FEATURE_SETS),
             }
# Open all the databalls and put them into DATA
for feature_set in FEATURE_SETS:
    with open('pkls/data/GP_data_' + feature_set + '_' + '-'.join(blocks) + '.pkl', 'rb') as f:
        data = pickle.load(f)
    for key, value in data.iteritems():
        DATA['GP'][key][feature_set] = value

## TPOT Regressor

### Execute regression

In [3]:
# Create the model that you want to use to perform the regression
model = TPOTRegressor(generations=1,
                      population_size=4,
                      verbosity=2,
                      random_state=42)
# Specify the model blocking. Use [] if you don't want blocking (this will help with saving)
blocks = []

# Initialize the results
models = dict.fromkeys(FEATURE_SETS)
rmses = dict.fromkeys(FEATURE_SETS)
errors = dict.fromkeys(FEATURE_SETS)
x = dict.fromkeys(FEATURE_SETS)
y = dict.fromkeys(FEATURE_SETS)
p_docs = dict.fromkeys(FEATURE_SETS)
block_list = dict.fromkeys(FEATURE_SETS)

for feature_set in FEATURE_SETS:
    # Pull the data out and store some of the processing information for plotting purposes
    rp = RegressionProcessor(feature_set, blocks=blocks, vasp_settings = VASP_SETTINGS)
    x[feature_set] = rp.x
    y[feature_set] = rp.y
    p_docs[feature_set] = rp.p_docs
    block_list[feature_set] = rp.block_list
    # Perform the regression
    models[feature_set], rmses[feature_set], errors[feature_set] = \
            rp.tpot(model)

# Package the data that'll be used for plotting
DATA['TPOT'] = {'models': models,
                'rmses': rmses,
                'errors': errors,
                'x': x,
                'y': y,
                'p_docs': p_docs,
                'blocks': blocks,
                'block_list': block_list}


This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.

                                                                          

Generation 1 - Current best internal CV score: 0.812526784571

Best pipeline: XGBRegressor(MaxAbsScaler(input_matrix), XGBRegressor__learning_rate=DEFAULT, XGBRegressor__max_depth=7, XGBRegressor__min_child_weight=14, XGBRegressor__n_estimators=DEFAULT, XGBRegressor__nthread=1, XGBRegressor__subsample=0.3)


                                                                          

Generation 1 - Current best internal CV score: 0.309514890112

Best pipeline: RandomForestRegressor(input_matrix, RandomForestRegressor__bootstrap=True, RandomForestRegressor__max_features=0.75, RandomForestRegressor__min_samples_leaf=1, RandomForestRegressor__min_samples_split=9, RandomForestRegressor__n_estimators=100)


                                                                  


TPOT closed prematurely. Will use the current best pipeline.


RuntimeError: A pipeline has not yet been optimized. Please call fit() first.

### Save regression

In [48]:
# Save the TPOT regressions. 
for feature_set in FEATURE_SETS:
    # Save the models alone for GASpy_predict to use
    with open('pkls/models/TPOT_model_' + feature_set + '_' \
              + '-'.join(DATA['TPOT']['blocks']) + '.pkl', 'wb') as f:
        pickle.dump(DATA['TPOT']['models'][feature_set], f)
        
    # Save the entire package to use later in this notebook
    data = {
            'models': DATA['TPOT']['models'][feature_set],
            'rmses': DATA['TPOT']['rmses'][feature_set],
            'errors': DATA['TPOT']['errors'][feature_set],
            'x': DATA['TPOT']['x'][feature_set],
            'y': DATA['TPOT']['y'][feature_set],
            'p_docs': DATA['TPOT']['p_docs'][feature_set],
            'block_list': DATA['TPOT']['block_list'][feature_set],
            }
    with open('pkls/data/TPOT_data_' + feature_set + '_' + \
              '-'.join(DATA['TPOT']['blocks']) + '.pkl', 'wb') as f:
        pickle.dump(data, f)

PicklingError: Can't pickle <class 'tpot.operator_utils.RandomForestRegressor__bootstrap'>: it's not found as tpot.operator_utils.RandomForestRegressor__bootstrap

### Load regression

In [49]:
# What blocking-types do we want to open?
blocks = []

# Initialize the data ball
DATA['TPOT'] = {
              'models': dict.fromkeys(FEATURE_SETS),
              'rmses': dict.fromkeys(FEATURE_SETS),
              'errors': dict.fromkeys(FEATURE_SETS),
              'x': dict.fromkeys(FEATURE_SETS),
              'y': dict.fromkeys(FEATURE_SETS),
              'p_docs': dict.fromkeys(FEATURE_SETS),
              'block_list': dict.fromkeys(FEATURE_SETS),
             }
# Open all the databalls and put them into DATA
for feature_set in FEATURE_SETS:
    with open('pkls/data/TPOT_data_' + feature_set + '_' + '-'.join(blocks) + '.pkl', 'rb') as f:
        data = pickle.load(f)
    for key, value in data.iteritems():
        DATA['TPOT'][key][feature_set] = value

IOError: [Errno 2] No such file or directory: 'pkls/data/TPOT_data_energy_fr_coordcount_.pkl'

# Plot

In [44]:
# Look at all of the different regressor types
for regressor, data in DATA.iteritems():
    print('Regressor:  ' + str(regressor))
    # Pull data out of the data ball
    models = data['models']
    rmses = data['rmses']
    x = data['x']
    y = data['y']
    p_docs = data['p_docs']
    
    # Create a plot for each of the regressor type/feature set combinations
    for feature_set in models:
        print('    Features:  ' + str(feature_set))
        traces = []
        # Within each plot, create a separate data set for each block
        for block, model in models[feature_set].iteritems():
            print('        Block:  ' + str(block))
            # Pull out the data for this feature set/block combination
            _y = y[feature_set][block]['train+test']
            _p_docs = p_docs[feature_set][block]['train+test']
            _rmses = rmses[feature_set][block]
            
            # Print the RMSE values
            for dataset, rmse in _rmses.iteritems():
                print('                ' + str(dataset) + ':  ' + str(rmse))
            
            # Label each data point depending on what type of feature set we're using
            if feature_set == 'energy_fr_coordcount':
                coords = [coord for coord in _p_docs['coordination']]
                text = ['Site:  %s' % coord for coord in coords]
            elif feature_set == 'energy_fr_coordcount_ads':
                coords = [coord for coord in _p_docs['coordination']]
                ads = [ads for ads in _p_docs['adsorbate']]
                text = ['Site:  %s, Ads:  %s' % (coord, ads[i])
                        for i, coord in enumerate(coords)]
            elif feature_set == 'energy_fr_coordcount_nncoord_ads':
                coords = [coord for coord in _p_docs['coordination']]
                ads = [ads for ads in _p_docs['adsorbate']]
                nncoords = [nncoord for nncoord in _p_docs['nextnearestcoordination']]
                text = ['Site:  %s, Ads:  %s, NNC:  %s' % (coord, ads[i], nncoords[i])
                        for i, coord in enumerate(coords)]
            elif feature_set == 'energy_fr_gcn_ads':
                coords = [coord for coord in _p_docs['coordination']]
                ads = [ads for ads in _p_docs['adsorbate']]
                text = ['Site:  %s, Ads:  %s' % (coord, ads[i])
                        for i, coord in enumerate(coords)]
            elif feature_set == 'energy_fr_nncoord':
                pass
            else:
                raise Exception('You still need to hard-code the text for the %s' \
                                % feature_set)
            # Add the data to the trace
            traces.append(go.Scatter(x=_y, y=_y+_errors,
                                     name=block, text=text, mode='markers'))

        # Create a diagonal line for the parity plot
        lims = [-4, 6]
        traces.append(go.Scatter(x=lims, y=lims,
                                 line=dict(color=('black'), dash='dash'),
                                 name='Parity line'))
        # Format and plot
        layout = go.Layout(xaxis=dict(title='DFT (eV)'),
                           yaxis=dict(title='Regressed (eV)'),
                           title='Predicting %s using a %s model' \
                                 % (feature_set, regressor))
        iplot(go.Figure(data=traces, layout=layout))

Regressor:  GP
    Features:  energy_fr_coordcount_ads
        Block:  no_block
                test:  0.588595881395
                train:  0.418852611099
                train+test:  0.46711399818


    Features:  energy_fr_coordcount
        Block:  no_block
                test:  0.915575713681
                train:  0.839934374795
                train+test:  0.85947179727


    Features:  energy_fr_nncoord
        Block:  no_block
                test:  1.06365990592
                train:  0.859434699121
                train+test:  0.914783065653


    Features:  energy_fr_coordcount_nncoord_ads
        Block:  no_block
                test:  0.861167992392
                train:  0.183997031579
                train+test:  0.459150203882
