# regress.ipynb
Author:  Kevin Tran <ktran@andrew.cmu.edu>

This Python notebook fits regression models to data pulled from a local GASdb. It then saves the fitted models as pickles (for later use) and creates parity plots of the regression fits.

## Initializations

###### Imports

In [1]:
# --- Standard library / numerics ---
from pprint import pprint   # for debugging
import sys
import math
import numpy as np
# Put the parent directory on the import path. This must run before the
# `gaspy` / `gas_pull` imports below (presumably they live one level up or in
# this folder -- TODO confirm the repository layout).
sys.path.append('..')
from gaspy.utils import vasp_settings_to_str
#from gas_pull import GASPull
# Import the module itself (rather than just the class) so that it can be
# reloaded: re-running this cell picks up edits made to gas_pull.py without
# restarting the kernel. `reload` is the Python 2 builtin.
import gas_pull
gas_pull = reload(gas_pull)
GASPull = gas_pull.GASPull
from pymatgen.matproj.rest import MPRester
# --- Regression libraries ---
from sklearn import metrics
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels \
    import RBF, WhiteKernel, RationalQuadratic, ExpSineSquared
from tpot import TPOTRegressor
import alamopy
# NOTE: every `pickle.*` call in this notebook is actually dill, because dill
# is imported under the name `pickle`.
import dill as pickle
pickle.settings['recurse'] = True     # required to pickle lambdify functions
# --- Plotting ---
import matplotlib.pyplot as plt
from plotly.offline import init_notebook_mode, plot, iplot
# Initialise Plotly's offline notebook mode so `iplot` can render figures inline
init_notebook_mode(connected=True)
import plotly.plotly as py
import plotly.graph_objs as go

###### Load data

In [4]:
## Location of the *.db file
DB_LOC = '/global/cscratch1/sd/zulissi/GASpy_DB/'  # Cori
#DB_LOC = '/Users/KTran/Nerd/GASpy'                 # Local

# Calculation settings we want to look at
VASP_SETTINGS = vasp_settings_to_str({
                                      #'gga': 'BF',
                                      #'pp_version': '5.4.',
                                      #'encut': 350
                                     })

# Pull the data from the database. We do it once for each set of features, since each
# set of features will create a different shape of `X`
GAS_PULL = GASPull(DB_LOC, VASP_SETTINGS, split=True)
FEATURE_SETS = [
                'energy_fr_coordcount_ads',
                'energy_fr_coordcount_nncoord_ads',
                'energy_fr_gcn_ads'
                ]
X = dict.fromkeys(FEATURE_SETS)
Y = dict.fromkeys(FEATURE_SETS)
DATA = dict.fromkeys(FEATURE_SETS)
X_TRAIN = dict.fromkeys(FEATURE_SETS)
X_TEST = dict.fromkeys(FEATURE_SETS)
Y_TRAIN = dict.fromkeys(FEATURE_SETS)
Y_TEST = dict.fromkeys(FEATURE_SETS)
lb_ads = dict.fromkeys(FEATURE_SETS)
lb_coord = dict.fromkeys(FEATURE_SETS)
for feature_set in FEATURE_SETS:
    X[feature_set], Y[feature_set], DATA[feature_set], \
    X_TRAIN[feature_set], X_TEST[feature_set], \
    Y_TRAIN[feature_set], Y_TEST[feature_set], \
    lb_ads[feature_set], lb_coord[feature_set] = \
            getattr(GAS_PULL, feature_set)()
%store X
%store Y
%store DATA
%store X_TRAIN
%store X_TEST
%store Y_TRAIN
%store Y_TEST
%store lb_ads
%store lb_coord

Pulling out data point #0 for GCN
Pulling out data point #100 for GCN
Pulling out data point #200 for GCN
Pulling out data point #300 for GCN
Pulling out data point #400 for GCN
Pulling out data point #500 for GCN
Pulling out data point #600 for GCN
Pulling out data point #700 for GCN
Pulling out data point #800 for GCN
Pulling out data point #900 for GCN
Pulling out data point #1000 for GCN
Pulling out data point #1100 for GCN
Pulling out data point #1200 for GCN
Pulling out data point #1300 for GCN
Pulling out data point #1400 for GCN
Pulling out data point #1500 for GCN
Pulling out data point #1600 for GCN
Pulling out data point #1700 for GCN
Pulling out data point #1800 for GCN
Pulling out data point #1900 for GCN
Pulling out data point #2000 for GCN
Pulling out data point #2100 for GCN
Pulling out data point #2200 for GCN
Pulling out data point #2300 for GCN
Pulling out data point #2400 for GCN
Pulling out data point #2500 for GCN
Pulling out data point #2600 for GCN
Pulling out d

In [2]:
# Restore the nine data containers from IPython's `%store` storage, where the
# data-loading cell saves them (presumably so the database pull can be skipped
# in a new session -- note that FEATURE_SETS is not stored and must still be
# defined before the cells below are run).
%store -r X
%store -r Y
%store -r DATA
%store -r X_TRAIN
%store -r X_TEST
%store -r Y_TRAIN
%store -r Y_TEST
%store -r lb_ads
%store -r lb_coord

no stored variable Y DATA X_TRAIN X_TEST Y_TRAIN Y_TEST lb_ads lb_coord


## Regressions
Create surrogate models using different methods

###### SKLearn Linear Regression

In [5]:
# Fit one linear regression per feature set, report its RMSE on
# X_TEST/Y_TEST, and pickle the model together with its pre-processors.
LR = dict.fromkeys(FEATURE_SETS)
for feature_set in FEATURE_SETS:
    LR[feature_set] = LinearRegression()
    LR[feature_set].fit(X_TRAIN[feature_set], Y_TRAIN[feature_set])
    # `name` and `rmse` are ad-hoc attributes; the plotting cell reads `name`
    LR[feature_set].name = 'Linear'
    LR[feature_set].rmse = \
            math.sqrt(metrics.mean_squared_error(Y_TEST[feature_set],
                                                 LR[feature_set].predict(X_TEST[feature_set])))
    print('RMSE of LR on %s = %s' % (feature_set, LR[feature_set].rmse))
    # Pickle streams are binary, so write in 'wb' mode; the `with` block makes
    # sure the file is flushed and closed even if the dump fails.
    with open('pkls/' + LR[feature_set].name + '_' + feature_set + '.pkl', 'wb') as pkl_file:
        pickle.dump({'model': LR[feature_set],
                     'pre_processors': {'coordination': lb_coord[feature_set],
                                        'adsorbate': lb_ads[feature_set]}},
                    pkl_file)


RMSE of LR on energy_fr_coordcount_ads = 0.653951728688
RMSE of LR on energy_fr_coordcount_nncoord_ads = 0.65059873891
RMSE of LR on energy_fr_gcn_ads = 0.670309869471


###### SKLearn Gradient Boosting Ensemble Regression

In [6]:
# Fit one gradient-boosting regressor (default hyper-parameters) per feature
# set, report its RMSE on X_TEST/Y_TEST, and pickle it with its pre-processors.
GBE = dict.fromkeys(FEATURE_SETS)
for feature_set in FEATURE_SETS:
    GBE[feature_set] = GradientBoostingRegressor()
    GBE[feature_set].fit(X_TRAIN[feature_set], Y_TRAIN[feature_set])
    # `name` and `rmse` are ad-hoc attributes; the plotting cell reads `name`
    GBE[feature_set].name = 'GBE'
    GBE[feature_set].rmse = \
            math.sqrt(metrics.mean_squared_error(Y_TEST[feature_set],
                                                 GBE[feature_set].predict(X_TEST[feature_set])))
    print('RMSE of GBE on %s = %s' % (feature_set, GBE[feature_set].rmse))
    # Pickle streams are binary, so write in 'wb' mode; the `with` block makes
    # sure the file is flushed and closed even if the dump fails.
    with open('pkls/' + GBE[feature_set].name + '_' + feature_set + '.pkl', 'wb') as pkl_file:
        pickle.dump({'model': GBE[feature_set],
                     'pre_processors': {'coordination': lb_coord[feature_set],
                                        'adsorbate': lb_ads[feature_set]}},
                    pkl_file)

RMSE of GBE on energy_fr_coordcount_ads = 0.607421210031
RMSE of GBE on energy_fr_coordcount_nncoord_ads = 0.608158929579
RMSE of GBE on energy_fr_gcn_ads = 0.606895186581


###### SKLearn Gaussian Process Regressor

In [6]:
# Kernel shared by every GP below: a scaled RBF term plus a scaled white-noise
# term (initial noise level 0.05**2).
K = 1.0*RBF(length_scale=1.0) + 1.0*WhiteKernel(noise_level=0.05**2.0)
# Fit one Gaussian process per feature set, report its RMSE on X_TEST/Y_TEST,
# and pickle it with its pre-processors.
GP = dict.fromkeys(FEATURE_SETS)
for feature_set in FEATURE_SETS:
    GP[feature_set] = GaussianProcessRegressor(kernel=K, n_restarts_optimizer=0)
    GP[feature_set].fit(X_TRAIN[feature_set], Y_TRAIN[feature_set])
    # `name` and `rmse` are ad-hoc attributes; the plotting cell reads `name`
    GP[feature_set].name = 'GP'
    GP[feature_set].rmse = \
            math.sqrt(metrics.mean_squared_error(Y_TEST[feature_set],
                                                 GP[feature_set].predict(X_TEST[feature_set])))
    print('RMSE of GP on %s = %s' % (feature_set, GP[feature_set].rmse))
    # Pickle streams are binary, so write in 'wb' mode; the `with` block makes
    # sure the file is flushed and closed even if the dump fails.
    with open('pkls/' + GP[feature_set].name + '_' + feature_set + '.pkl', 'wb') as pkl_file:
        pickle.dump({'model': GP[feature_set],
                     'pre_processors': {'coordination': lb_coord[feature_set],
                                        'adsorbate': lb_ads[feature_set]}},
                    pkl_file)

RMSE of GP on energy_fr_coordcount_ads = 0.646968965459
RMSE of GP on energy_fr_coordcount_nncoord_ads = 0.593565559035
RMSE of GP on energy_fr_gcn_ads = 0.769528376788


In [7]:
# Open the GP models from the pickles written by the GP fitting cell, so the
# (slow) fit does not have to be repeated.
# NOTE: unpickling can execute arbitrary code -- only load pickles you created.
GP = dict.fromkeys(FEATURE_SETS)
for feature_set in FEATURE_SETS:
    # Pickle streams are binary, so read them back in 'rb' mode
    with open('pkls/GP_' + feature_set + '.pkl', 'rb') as pkl_file:
        pkl = pickle.load(pkl_file)
    # Only the model is needed here; the pre-processors stay in the pickle
    GP[feature_set] = pkl['model']

###### TPOT Regression

In [8]:
# Evolve a TPOT pipeline for every feature set and score it on X_TEST/Y_TEST.
# The models are not saved here; a separate cell pickles them.
TPOT = dict.fromkeys(FEATURE_SETS, None)
for feature_set in FEATURE_SETS:
    TPOT[feature_set] = TPOTRegressor(random_state=42,
                                      verbosity=2,
                                      population_size=10,
                                      generations=10)
    regressor = TPOT[feature_set]
    regressor.fit(X_TRAIN[feature_set], Y_TRAIN[feature_set])
    # Ad-hoc attributes used for reporting and plot titles
    regressor.name = 'TPOT'
    test_mse = metrics.mean_squared_error(Y_TEST[feature_set],
                                          regressor.predict(X_TEST[feature_set]))
    regressor.rmse = math.sqrt(test_mse)
    print('RMSE of TPOT on %s = %s' % (feature_set, regressor.rmse))


This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.

Optimization Progress:  18%|█▊        | 20/110 [01:03<08:01,  5.35s/pipeline]

Generation 1 - Current best internal CV score: 0.298510497563


Optimization Progress:  27%|██▋       | 30/110 [01:32<04:42,  3.53s/pipeline]

Generation 2 - Current best internal CV score: 0.298510497563


Optimization Progress:  35%|███▌      | 39/110 [02:08<05:44,  4.85s/pipeline]

Generation 3 - Current best internal CV score: 0.295313393635


Optimization Progress:  45%|████▍     | 49/110 [02:41<03:53,  3.82s/pipeline]

Generation 4 - Current best internal CV score: 0.295313393635


Optimization Progress:  54%|█████▎    | 59/110 [03:24<04:18,  5.06s/pipeline]

Generation 5 - Current best internal CV score: 0.287472966487


Optimization Progress:  60%|██████    | 66/110 [04:03<05:45,  7.85s/pipeline]

Generation 6 - Current best internal CV score: 0.286772433652


Optimization Progress:  69%|██████▉   | 76/110 [04:38<02:35,  4.56s/pipeline]

Generation 7 - Current best internal CV score: 0.28342253174


Optimization Progress:  76%|███████▋  | 84/110 [05:58<03:17,  7.61s/pipeline]

Generation 8 - Current best internal CV score: 0.28342253174


          on Progress:  84%|████████▎ | 92/110 [07:03<02:12,  7.38s/pipeline]

Generation 9 - Current best internal CV score: 0.28342253174


                                                                              

Generation 10 - Current best internal CV score: 0.28342253174

Best pipeline: RandomForestRegressor(PCA(input_matrix, PCA__iterated_power=5, PCA__svd_solver=randomized), RandomForestRegressor__bootstrap=True, RandomForestRegressor__max_features=0.55, RandomForestRegressor__min_samples_leaf=4, RandomForestRegressor__min_samples_split=11, RandomForestRegressor__n_estimators=100)
RMSE of TPOT on energy_fr_coordcount_ads = 0.542669929192


Optimization Progress:  18%|█▊        | 20/110 [01:55<15:17, 10.19s/pipeline]

Generation 1 - Current best internal CV score: 0.311355696602


Optimization Progress:  27%|██▋       | 30/110 [03:25<15:58, 11.98s/pipeline]

Generation 2 - Current best internal CV score: 0.290183028918


Optimization Progress:  36%|███▋      | 40/110 [04:34<08:49,  7.56s/pipeline]

Generation 3 - Current best internal CV score: 0.287639894865


Optimization Progress:  45%|████▌     | 50/110 [06:42<11:28, 11.48s/pipeline]

Generation 4 - Current best internal CV score: 0.287639894865


Optimization Progress:  54%|█████▎    | 59/110 [07:45<08:51, 10.41s/pipeline]

Generation 5 - Current best internal CV score: 0.285076734832


          on Progress:  63%|██████▎   | 69/110 [08:55<06:05,  8.92s/pipeline]

Generation 6 - Current best internal CV score: 0.285076734832


          on Progress:  69%|██████▉   | 76/110 [10:11<05:52, 10.37s/pipeline]

Generation 7 - Current best internal CV score: 0.282223264668


Optimization Progress:  80%|████████  | 88/110 [10:54<02:07,  5.77s/pipeline]

Generation 8 - Current best internal CV score: 0.282223264668


Optimization Progress:  89%|████████▉ | 98/110 [11:59<01:27,  7.32s/pipeline]

Generation 9 - Current best internal CV score: 0.282223264668


                                                                              

Generation 10 - Current best internal CV score: 0.278412150428

Best pipeline: ExtraTreesRegressor(LassoLarsCV(input_matrix, LassoLarsCV__normalize=False), ExtraTreesRegressor__bootstrap=False, ExtraTreesRegressor__max_features=0.65, ExtraTreesRegressor__min_samples_leaf=DEFAULT, ExtraTreesRegressor__min_samples_split=17, ExtraTreesRegressor__n_estimators=100)
RMSE of TPOT on energy_fr_coordcount_nncoord_ads = 0.536699499429


          on Progress:  17%|█▋        | 19/110 [01:16<09:55,  6.55s/pipeline]

Generation 1 - Current best internal CV score: 0.31646640521


Optimization Progress:  26%|██▋       | 29/110 [01:57<09:04,  6.72s/pipeline]

Generation 2 - Current best internal CV score: 0.31646640521


Optimization Progress:  35%|███▍      | 38/110 [02:27<05:39,  4.72s/pipeline]

Generation 3 - Current best internal CV score: 0.305098320635


Optimization Progress:  43%|████▎     | 47/110 [03:09<05:13,  4.98s/pipeline]

Generation 4 - Current best internal CV score: 0.305098320635


          on Progress:  49%|████▉     | 54/110 [03:45<03:56,  4.23s/pipeline]

Generation 5 - Current best internal CV score: 0.298095686772


Optimization Progress:  60%|██████    | 66/110 [04:16<02:55,  4.00s/pipeline]

Generation 6 - Current best internal CV score: 0.297184537999


Optimization Progress:  69%|██████▉   | 76/110 [05:19<03:29,  6.17s/pipeline]

Generation 7 - Current best internal CV score: 0.287668408248


Optimization Progress:  78%|███████▊  | 86/110 [06:06<02:17,  5.72s/pipeline]

Generation 8 - Current best internal CV score: 0.287668408248


          on Progress:  86%|████████▋ | 95/110 [07:13<01:42,  6.82s/pipeline]

Generation 9 - Current best internal CV score: 0.287668408248


                                                                              

Generation 10 - Current best internal CV score: 0.287668408248

Best pipeline: RandomForestRegressor(LassoLarsCV(PCA(MaxAbsScaler(input_matrix), PCA__iterated_power=3, PCA__svd_solver=DEFAULT), LassoLarsCV__normalize=DEFAULT), RandomForestRegressor__bootstrap=DEFAULT, RandomForestRegressor__max_features=0.5, RandomForestRegressor__min_samples_leaf=5, RandomForestRegressor__min_samples_split=11, RandomForestRegressor__n_estimators=100)
RMSE of TPOT on energy_fr_gcn_ads = 0.546040998832


In [9]:
# Save the regressions. Only the fitted pipeline found by TPOT is pickled
# (not the TPOTRegressor wrapper), together with the pre-processors.
for feature_set in FEATURE_SETS:
    # The `with` block makes sure the file is flushed and closed even if the
    # dump fails.
    with open('pkls/' + TPOT[feature_set].name + '_' + feature_set + '.pkl', 'wb') as pkl_file:
        pickle.dump({'model': TPOT[feature_set].fitted_pipeline_,
                     'pre_processors': {'coordination': lb_coord[feature_set],
                                        'adsorbate': lb_ads[feature_set]}},
                    pkl_file)

In [6]:
# Open the TPOT models from the pickles written by the TPOT save cell.
# NOTE: unpickling can execute arbitrary code -- only load pickles you created.
TPOT = dict.fromkeys(FEATURE_SETS)
for feature_set in FEATURE_SETS:
    # The files were written in 'wb' mode, so read them back in 'rb' mode
    with open('pkls/TPOT_' + feature_set + '.pkl', 'rb') as pkl_file:
        pkl = pickle.load(pkl_file)
    TPOT[feature_set] = pkl['model']
    # The save cell pickles the fitted pipeline, not the object that `name` was
    # set on, so the label used in plot titles is re-attached here.
    TPOT[feature_set].name = 'TPOT'


This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.



###### Alamo Regression

In [None]:
# Since Alamo can take awhile, we actually try to load a pickle of the previous run
# before calling alamopy. Simply delete the pickle if you want to re-run.
# NOTE: unpickling can execute arbitrary code -- only load pickles you created.
ALA = dict.fromkeys(FEATURE_SETS)
for feature_set in FEATURE_SETS:
    alamo_pkl = 'pkls/Alamo_' + feature_set + '.pkl'
    try:
        # Pickle streams are binary; `with` closes the handle after loading
        with open(alamo_pkl, 'rb') as pkl_file:
            ALA[feature_set] = pickle.load(pkl_file)['model']
    except IOError:
        # No cached result: run Alamo. The targets are reshaped into column
        # vectors before being handed to alamopy.
        ALA[feature_set] = alamopy.doalamo(X_TRAIN[feature_set],
                                           Y_TRAIN[feature_set].reshape(len(Y_TRAIN[feature_set]), 1),
                                           X_TEST[feature_set],
                                           Y_TEST[feature_set].reshape(len(Y_TEST[feature_set]), 1),
                                           showalm=1,
                                           linfcns=1,
                                           expfcns=1,
                                           logfcns=1,
                                           monomialpower=(1, 2, 3),
                                           multi2power=(1, 2, 3),
                                           ratiopower=(1, 2, 3)
                                          )
        # The Alamo result is used as a dictionary throughout this notebook, so
        # metadata is stored as keys rather than attributes.
        ALA[feature_set]['name'] = 'Alamo'
        # The result has no `predict` method. Evaluate the fitted function row
        # by row instead.
        # NOTE(review): assumes `f(model)` takes one positional scalar per
        # feature column, as the Alamo plotting cell calls it -- confirm.
        alamo_function = ALA[feature_set]['f(model)']
        alamo_test_predictions = [alamo_function(*factors)
                                  for factors in X_TEST[feature_set]]
        ALA[feature_set]['rmse'] = \
                math.sqrt(metrics.mean_squared_error(Y_TEST[feature_set],
                                                     alamo_test_predictions))
        print('RMSE of ALA on %s = %s' % (feature_set, ALA[feature_set]['rmse']))
        # Store only this feature set's pre-processors, like the other models do
        with open(alamo_pkl, 'wb') as pkl_file:
            pickle.dump({'model': ALA[feature_set],
                         'pre_processors': {'coordination': lb_coord[feature_set],
                                            'adsorbate': lb_ads[feature_set]}},
                        pkl_file)
    pprint(ALA[feature_set]['model'])

## Plotting

###### SKLearn-types

In [10]:
# For each feature set and model combination...
for feature_set in FEATURE_SETS:
    data = DATA[feature_set]
    #for model in [LR, GBE, GP, TPOT]:
    for model in [GP, TPOT]:
        traces = []
        # Create a parity plot where each adsorbate is shown. We do that by pulling out
        # data for each adsorbate and then plotting them.
        for ads in np.unique(data['adsorbate']):
            # We loop through all of our data and pull out the transformed features (x),
            # the DFT energy (y), and the user-readable features (text).
            x = []
            y = []
            text = []
            for i, _ads in enumerate(data['adsorbate']):
                if _ads == ads:
                    x.append(X[feature_set][i])
                    y.append(Y[feature_set][i])
                    if feature_set == 'energy_fr_coordcount_ads':
                        text.append('Site:  %s' \
                                    % (data['coordination'][i]))
                    elif feature_set == 'energy_fr_coordcount_nncoord_ads':
                        text.append('Site:  %s\rNNNeighbor:  %s' \
                                    % (data['coordination'][i],
                                       data['nextnearestcoordination'][i]))
                    elif feature_set == 'energy_fr_gcn_ads':
                        text.append('Site:  %s' \
                                    % (data['coordination'][i]))
                    else:
                        raise Exception('You still need to hard-code the text for the %s' \
                                        % feature_set)
            # Use the transformed features (x) to calculate a predicted energy (y_predicted).
            # Then add it to `traces` for plotting.
            y_predicted = model[feature_set].predict(np.array(x))
            traces.append(go.Scatter(x=y_predicted,
                                     y=y,
                                     mode='markers',
                                     text=text,
                                     name=ads))
        # Create a diagonal line for the parity plot
        lims = [-4, 6]
        traces.append(go.Scatter(x=lims, y=lims,
                                 line=dict(color=('black'), dash='dash'), name='Parity line'))
        # Format and plot
        layout = go.Layout(xaxis=dict(title='Regressed (eV)'),
                           yaxis=dict(title='DFT (eV)'),
                           title='Predicting %s using a %s model; RMSE = %0.3f eV' \
                                 % (feature_set,
                                    model[feature_set].name,
                                    math.sqrt(metrics.mean_squared_error(Y_TEST[feature_set],
                                                                         model[feature_set].predict(X_TEST[feature_set])))))
        iplot(go.Figure(data=traces, layout=layout))

###### Alamo

In [None]:
# Create Plotly plots for each dictionary-type model
for feature_set in FEATURE_SETS:
    for model in [ALA[feature_set]]:
        traces = []
        # Create a parity plot where each adsorbate is shown. We do that by pulling out
        # data for each adsorbate and then plotting them.
        for ads in np.unique(DATA[feature_set]['adsorbate']):
            # We loop through all of our data and pull out the vectorized coordination (x),
            # the DFT energy (y), and the coordination site (text).
            x = []
            y = []
            text = []
            for i, _ads in enumerate(DATA[feature_set]['adsorbate']):
                if _ads == ads:
                    x.append(X[i][feature_set])
                    y.append(Y[i][feature_set])
                    if feature_set == 'energy_fr_coordcount_ads':
                        text.append('Site:  %s' \
                                    % DATA[feature_set]['coordination'][i])
                    elif feature_set == 'energy_fr_coordcount_neighborcount_ads':
                        text.append('Site:  %s\rNeighbor:  %s' \
                                    % (DATA[feature_set]['coordination'][i],
                                       DATA[feature_set]['nextnearestcoordination'][i]))
                    else:
                        raise Exception('You still need to hard-code the text for the %s' \
                                        % feature_set)

            # Do some footwork because Alamo returns a lambda function that doesn't accept np arrays
            def model_predict(factors):
                '''
                Turn a vector of input data, `factors`, into the model's guessed output. We use
                this function to do so because lambda functions suck. We should address this by
                making alamopy output a better lambda function.
                '''
                args = dict.fromkeys(range(0, len(factors)-1), None)
                for j, factor in enumerate(factors):
                    args[j] = factor
                return model['f(model)'](args[0], args[1], args[2], args[3], args[4], args[5], args[6], args[7], args[8], args[9], args[10], args[11], args[12], args[13], args[14], args[15], args[16], args[17], args[18], args[19], args[20], args[21], args[22], args[23], args[24], args[25])
            y_predicted = map(model_predict, x)

            # Plot
            traces.append(go.Scatter(x=y_predicted,
                                     y=y,
                                     mode='markers',
                                     text=text,
                                     name=ads))
        # Create a diagonal line for the parity plot
        lims = [-4, 6]
        traces.append(go.Scatter(x=lims, y=lims,
                                 line=dict(color=('black'), dash='dash'), name='Parity line'))
        # Format and plot
        layout = go.Layout(xaxis=dict(title='Regressed (eV)'),
                           yaxis=dict(title='DFT (eV)'),
                           title='Predicting %s using a %s model; RMSE = %0.3f eV' \
                                 % (feature_set,
                                    model['name'],
                                    math.sqrt(metrics.mean_squared_error(Y_TEST, map(model_predict, X_TEST)))))
        iplot(go.Figure(data=traces, layout=layout))