# regress.ipynb
Author:  Kevin Tran <ktran@andrew.cmu.edu>

This python notebook performs regressions on data pulled from a local GASdb. It then saves these regressions into pickles (for later use) and creates parity plots of the regression fits.

## Initializations

###### Imports

In [2]:
from pprint import pprint   # for debugging
import sys
import math
import numpy as np
sys.path.append('..')
from gaspy.utils import vasp_settings_to_str
from gas_pull import GASPull
from sklearn import metrics
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels \
    import RBF, WhiteKernel, RationalQuadratic, ExpSineSquared
from tpot import TPOTRegressor
import alamopy
import dill as pickle
pickle.settings['recurse'] = True     # required to pickle lambdify functions
import matplotlib.pyplot as plt
from plotly.offline import init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)
import plotly.plotly as py
import plotly.graph_objs as go

###### Load data

In [3]:
# Location of the *.db file
DB_LOC = '/global/cscratch1/sd/zulissi/GASpy_DB/'  # Cori
#DB_LOC = '/Users/KTran/Nerd/GASpy'                 # Local

# Calculation settings we want to look at
VASP_SETTINGS = vasp_settings_to_str({'gga': 'BF',
                                      'pp_version': '5.4.',
                                      'encut': 350})

# Pull the data from the database. We do it once for each set of features, since each
# set of features will create a different shape of `X`
GAS_PULL = GASPull(DB_LOC, VASP_SETTINGS, split=True)
FEATURE_SETS = ['energy_fr_coordcount_ads',
                'energy_fr_coordcount_neighborcount_ads']
X = dict.fromkeys(FEATURE_SETS)
Y = dict.fromkeys(FEATURE_SETS)
DATA = dict.fromkeys(FEATURE_SETS)
X_TRAIN = dict.fromkeys(FEATURE_SETS)
X_TEST = dict.fromkeys(FEATURE_SETS)
Y_TRAIN = dict.fromkeys(FEATURE_SETS)
Y_TEST = dict.fromkeys(FEATURE_SETS)
lb_ads = dict.fromkeys(FEATURE_SETS)
lb_coord = dict.fromkeys(FEATURE_SETS)
for feature_set in FEATURE_SETS:
    X[feature_set], Y[feature_set], DATA[feature_set], \
    X_TRAIN[feature_set], X_TEST[feature_set], \
    Y_TRAIN[feature_set], Y_TEST[feature_set], \
    lb_ads[feature_set], lb_coord[feature_set] = \
            getattr(GAS_PULL, feature_set)()

## Regressions
Create surrogate models using different methods

###### SKLearn Linear Regression

In [4]:
LR = dict.fromkeys(FEATURE_SETS)
for feature_set in FEATURE_SETS:
    LR[feature_set] = LinearRegression()
    LR[feature_set].fit(X_TRAIN[feature_set], Y_TRAIN[feature_set])
    LR[feature_set].name = 'Linear'
    pickle.dump({'model': LR[feature_set],
                 'pre_processors': {'coordination': lb_coord[feature_set],
                                    'adsorbate': lb_ads[feature_set]}},
                open('pkls/' + LR[feature_set].name + '_' + feature_set + '.pkl', 'w'))

###### SKLearn Gradient Boosting Ensemble Regression

In [5]:
GBE = dict.fromkeys(FEATURE_SETS)
for feature_set in FEATURE_SETS:
    GBE[feature_set] = GradientBoostingRegressor()
    GBE[feature_set].fit(X_TRAIN[feature_set], Y_TRAIN[feature_set])
    GBE[feature_set].name = 'GBE'
    pickle.dump({'model': GBE[feature_set],
                 'pre_processors': {'coordination': lb_coord[feature_set],
                                    'adsorbate': lb_ads[feature_set]}},
                open('pkls/' + GBE[feature_set].name + '_' + feature_set + '.pkl', 'w'))

###### SKLearn Gaussian Process Regressor

In [6]:
K = 1.0*RBF(length_scale=1.0) + 1.0*WhiteKernel(noise_level=0.05**2.0) 
GP = dict.fromkeys(FEATURE_SETS)
for feature_set in FEATURE_SETS:
    GP[feature_set] = GaussianProcessRegressor(kernel=K, n_restarts_optimizer=10)
    GP[feature_set].fit(X_TRAIN[feature_set], Y_TRAIN[feature_set])
    GP[feature_set].name = 'GP'
    pickle.dump({'model': GP[feature_set],
                 'pre_processors': {'coordination': lb_coord[feature_set],
                                    'adsorbate': lb_ads[feature_set]}},
                open('pkls/' + GP[feature_set].name + '_' + feature_set + '.pkl', 'w'))

###### TPOT Regression

In [16]:
# Run the TPOT Regression
#TPOT = dict.fromkeys(FEATURE_SETS)
for feature_set in [FEATURE_SETS[1]]:
    TPOT[feature_set] = TPOTRegressor(generations=10,
                                      population_size=10,
                                      verbosity=2,
                                      random_state=42)
    TPOT[feature_set].fit(X_TRAIN[feature_set], Y_TRAIN[feature_set])
    TPOT[feature_set].name = 'TPOT'

Optimization Progress:  18%|█▊        | 20/110 [00:41<05:14,  3.49s/pipeline]

Generation 1 - Current best internal CV score: 0.244175236446


Optimization Progress:  27%|██▋       | 30/110 [01:06<03:37,  2.72s/pipeline]

Generation 2 - Current best internal CV score: 0.238037997434


Optimization Progress:  35%|███▌      | 39/110 [01:25<04:22,  3.70s/pipeline]

Generation 3 - Current best internal CV score: 0.238037997434


Optimization Progress:  45%|████▍     | 49/110 [01:56<02:58,  2.92s/pipeline]

Generation 4 - Current best internal CV score: 0.221567313163


Optimization Progress:  54%|█████▎    | 59/110 [02:23<01:58,  2.33s/pipeline]

Generation 5 - Current best internal CV score: 0.221567313163


Optimization Progress:  63%|██████▎   | 69/110 [02:54<02:27,  3.61s/pipeline]

Generation 6 - Current best internal CV score: 0.216864350028


Optimization Progress:  72%|███████▏  | 79/110 [03:16<01:48,  3.52s/pipeline]

Generation 7 - Current best internal CV score: 0.216864350028


Optimization Progress:  81%|████████  | 89/110 [03:38<00:44,  2.11s/pipeline]

Generation 8 - Current best internal CV score: 0.216864350028


Optimization Progress:  87%|████████▋ | 96/110 [03:54<00:46,  3.32s/pipeline]

Generation 9 - Current best internal CV score: 0.213750902162


                                                                              

Generation 10 - Current best internal CV score: 0.213750902162

Best pipeline: ExtraTreesRegressor(LassoLarsCV(input_matrix, LassoLarsCV__normalize=DEFAULT), ExtraTreesRegressor__bootstrap=False, ExtraTreesRegressor__max_features=0.55, ExtraTreesRegressor__min_samples_leaf=3, ExtraTreesRegressor__min_samples_split=3, ExtraTreesRegressor__n_estimators=100)


In [17]:
# Save the regressions
for feature_set in FEATURE_SETS:
    pickle.dump({'model': TPOT[feature_set].fitted_pipeline_,
                 'pre_processors': {'coordination': lb_coord[feature_set],
                                    'adsorbate': lb_ads[feature_set]}},
                open('pkls/' + TPOT[feature_set].name + '_' + feature_set + '.pkl', 'w'))

In [None]:
# Open the TPOT models
for feature_set in FEATURE_SETS:
    pkl = pickle.load(open('pkls/TPOT_' + feature_set + '.pkl', 'w'))
    TPOT[feature_set] = pkl['model']

###### Alamo Regression

In [25]:
# Since Alamo can take awhile, we actually try to load a pickle of the previous run
# before calling alamopy. Simply delete the pickle if you want to re-run.
ALA = dict.fromkeys(FEATURE_SETS)
for feature_set in FEATURE_SETS:
    try:
        ALA[feature_set] = pickle.load(open('pkls/Alamo_' + feature_set + '.pkl', 'r'))['model']
    except IOError:
        ALA[feature_set] = alamopy.doalamo(X_TRAIN[feature_set],
                                           Y_TRAIN[feature_set].reshape(len(Y_TRAIN[feature_set]), 1),
                                           X_TEST[feature_set],
                                           Y_TEST[feature_set].reshape(len(Y_TEST[feature_set]), 1),
                                           showalm=1,
                                           linfcns=1,
                                           expfcns=1,
                                           logfcns=1,
                                           monomialpower=(1, 2, 3),
                                           multi2power=(1, 2, 3),
                                           ratiopower=(1, 2, 3)
                                          )
        ALA[feature_set]['name'] = 'Alamo'
        pickle.dump({'model': ALA[feature_set],
                     'pre_processors': {'coordination': lb_coord,
                                        'adsorbate': lb_ads}},
                    open('pkls/Alamo_' + feature_set + '.pkl', 'w'))
    pprint(ALA[feature_set]['model'])

debug : 
monomialpower 1 2 3 
debug : 
ratiopower 1 2 3 
debug : 
multi2power 1 2 3 


IOError: [Errno 2] No such file or directory: 'temptrace.trc'

## Plotting

###### SKLearn-types

In [18]:
# For each feature set and model combination...
for feature_set in FEATURE_SETS:
    for model in [LR, GBE, GP, TPOT]:
        traces = []
        # Create a parity plot where each adsorbate is shown. We do that by pulling out
        # data for each adsorbate and then plotting them.
        for ads in np.unique(DATA[feature_set]['adsorbate']):
            # We loop through all of our data and pull out the transformed features (x),
            # the DFT energy (y), and the user-readable features (text).
            x = []
            y = []
            text = []
            for i, _ads in enumerate(DATA[feature_set]['adsorbate']):
                if _ads == ads:
                    x.append(X[feature_set][i])
                    y.append(Y[feature_set][i])
                    if feature_set == 'energy_fr_coordcount_ads':
                        text.append('Site:  %s' \
                                    % DATA[feature_set]['coordination'][i])
                    elif feature_set == 'energy_fr_coordcount_neighborcount_ads':
                        text.append('Site:  %s\rNeighbor:  %s' \
                                    % (DATA[feature_set]['coordination'][i],
                                       DATA[feature_set]['nextnearestcoordination'][i]))
                    else:
                        raise Exception('You still need to hard-code the text for the %s' \
                                        % feature_set)
            # Use the transformed features (x) to calculate a predicted energy (y_predicted).
            # Then add it to `traces` for plotting.
            y_predicted = model[feature_set].predict(np.array(x))
            traces.append(go.Scatter(x=y_predicted,
                                     y=y,
                                     mode='markers',
                                     text=text,
                                     name=ads))
        # Create a diagonal line for the parity plot
        lims = [-4, 6]
        traces.append(go.Scatter(x=lims, y=lims,
                                 line=dict(color=('black'), dash='dash'), name='Parity line'))
        # Format and plot
        layout = go.Layout(xaxis=dict(title='Regressed (eV)'),
                           yaxis=dict(title='DFT (eV)'),
                           title='Predicting %s using a %s model; RMSE = %0.3f eV' \
                                 % (feature_set,
                                    model[feature_set].name,
                                    math.sqrt(metrics.mean_squared_error(Y_TEST[feature_set],
                                                                         model[feature_set].predict(X_TEST[feature_set])))))
        iplot(go.Figure(data=traces, layout=layout))

###### Alamo

In [None]:
# Create Plotly plots for each dictionary-type model
for feature_set in FEATURE_SETS:
    for model in [ALA[feature_set]]:
        traces = []
        # Create a parity plot where each adsorbate is shown. We do that by pulling out
        # data for each adsorbate and then plotting them.
        for ads in np.unique(DATA[feature_set]['adsorbate']):
            # We loop through all of our data and pull out the vectorized coordination (x),
            # the DFT energy (y), and the coordination site (text).
            x = []
            y = []
            text = []
            for i, _ads in enumerate(DATA[feature_set]['adsorbate']):
                if _ads == ads:
                    x.append(X[i][feature_set])
                    y.append(Y[i][feature_set])
                    if feature_set == 'energy_fr_coordcount_ads':
                        text.append('Site:  %s' \
                                    % DATA[feature_set]['coordination'][i])
                    elif feature_set == 'energy_fr_coordcount_neighborcount_ads':
                        text.append('Site:  %s\rNeighbor:  %s' \
                                    % (DATA[feature_set]['coordination'][i],
                                       DATA[feature_set]['nextnearestcoordination'][i]))
                    else:
                        raise Exception('You still need to hard-code the text for the %s' \
                                        % feature_set)

            # Do some footwork because Alamo returns a lambda function that doesn't accept np arrays
            def model_predict(factors):
                '''
                Turn a vector of input data, `factors`, into the model's guessed output. We use
                this function to do so because lambda functions suck. We should address this by
                making alamopy output a better lambda function.
                '''
                args = dict.fromkeys(range(0, len(factors)-1), None)
                for j, factor in enumerate(factors):
                    args[j] = factor
                return model['f(model)'](args[0], args[1], args[2], args[3], args[4], args[5], args[6], args[7], args[8], args[9], args[10], args[11], args[12], args[13], args[14], args[15], args[16], args[17], args[18], args[19], args[20], args[21], args[22], args[23], args[24], args[25])
            y_predicted = map(model_predict, x)

            # Plot
            traces.append(go.Scatter(x=y_predicted,
                                     y=y,
                                     mode='markers',
                                     text=text,
                                     name=ads))
        # Create a diagonal line for the parity plot
        lims = [-4, 6]
        traces.append(go.Scatter(x=lims, y=lims,
                                 line=dict(color=('black'), dash='dash'), name='Parity line'))
        # Format and plot
        layout = go.Layout(xaxis=dict(title='Regressed (eV)'),
                           yaxis=dict(title='DFT (eV)'),
                           title='Predicting %s using a %s model; RMSE = %0.3f eV' \
                                 % (feature_set,
                                    model['name'],
                                    math.sqrt(metrics.mean_squared_error(Y_TEST, map(model_predict, X_TEST)))))
        iplot(go.Figure(data=traces, layout=layout))