# regress.ipynb
Author:  Kevin Tran <ktran@andrew.cmu.edu>

This Python notebook performs regressions on data pulled from a local GASdb. It then saves the fitted regression models as pickles (for later use) and creates parity plots of the regression fits.

## Initializations

###### Imports

In [None]:
from pprint import pprint   # for debugging
import sys
import math
import numpy as np
sys.path.append('..')
from gaspy.utils import vasp_settings_to_str
from gas_pull import GASPull
from sklearn import metrics
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from tpot import TPOTRegressor
import alamopy
import dill as pickle
pickle.settings['recurse'] = True     # required to pickle lambdify functions
import matplotlib.pyplot as plt
from plotly.offline import init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)
import plotly.plotly as py
import plotly.graph_objs as go

###### Load data

In [None]:
# Location of the *.db file
#DB_LOC = '/global/cscratch1/sd/zulissi/GASpy_DB/'  # Cori
DB_LOC = '/Users/KTran/Nerd/GASpy'                 # Local

# Calculation settings we want to look at. `vasp_settings_to_str` is imported
# by name from `gaspy.utils`, so it is called directly; there is no `utils`
# name in scope in this notebook.
VASP_SETTINGS = vasp_settings_to_str({'gga': 'BF',
                                      'pp_version': '5.4.',
                                      'encut': 350})

# Pull the data from the local database. The call returns the full data set
# (X, Y, DATA), the train/test pieces used by the regression and plotting
# cells, and the two pre-processors (lb_ads, lb_coord) that are pickled
# alongside every model.
# NOTE(review): `split=True` presumably requests the train/test split -- confirm
# against GASPull.
GAS_PULL = GASPull(DB_LOC, VASP_SETTINGS, split=True)
X, Y, DATA, X_TRAIN, X_TEST, Y_TRAIN, Y_TEST, lb_ads, lb_coord = \
        GAS_PULL.energy_fr_coordcount_neighborcount_ads()

## Regressions
Create surrogate models using different methods

###### SKLearn Linear Regression

In [None]:
# Fit a linear model on the training data
LR = LinearRegression()
LR.fit(X_TRAIN, Y_TRAIN)
# Display name; the plotting cell reads `model.name` for the figure title
LR.name = 'Linear'
# Save the model together with the pre-processors needed to rebuild its inputs.
# The context manager guarantees the file is flushed and closed, and binary
# mode is used because a pickle is a byte stream.
with open('pkls/CoordcountNeighborcountAds_Energy_LR.pkl', 'wb') as pkl_file:
    pickle.dump({'model': LR,
                 'pre_processors': {'coordination': lb_coord,
                                    'adsorbate': lb_ads}},
                pkl_file)

###### SKLearn Gradient Boosting Ensemble Regression

In [None]:
# Fit a gradient boosting ensemble (default settings) on the training data
GBE = GradientBoostingRegressor()
GBE.fit(X_TRAIN, Y_TRAIN)
# Display name; the plotting cell reads `model.name` for the figure title
GBE.name = 'GBE'
# Save the model together with the pre-processors needed to rebuild its inputs.
# The context manager guarantees the file is flushed and closed, and binary
# mode is used because a pickle is a byte stream.
with open('pkls/CoordcountNeighborcountAds_Energy_GBE.pkl', 'wb') as pkl_file:
    pickle.dump({'model': GBE,
                 'pre_processors': {'coordination': lb_coord,
                                    'adsorbate': lb_ads}},
                pkl_file)

###### SKLearn Gaussian Process Regressor

In [None]:
# Fit a Gaussian process regressor (default settings) on the training data
GP = GaussianProcessRegressor()
GP.fit(X_TRAIN, Y_TRAIN)
# Display name; the plotting cell reads `model.name` for the figure title
GP.name = 'GP'
# Save the model together with the pre-processors needed to rebuild its inputs.
# The context manager guarantees the file is flushed and closed, and binary
# mode is used because a pickle is a byte stream.
with open('pkls/CoordcountNeighborcountAds_Energy_GP.pkl', 'wb') as pkl_file:
    pickle.dump({'model': GP,
                 'pre_processors': {'coordination': lb_coord,
                                    'adsorbate': lb_ads}},
                pkl_file)

###### TPOT Regression

In [None]:
# Settings for the TPOT pipeline search, gathered in one place so they are
# easy to tweak. The fixed `random_state` keeps the search reproducible.
tpot_kwargs = {
    'generations': 100,
    'population_size': 100,
    'verbosity': 2,
    'random_state': 42,
}
TPOT = TPOTRegressor(**tpot_kwargs)
TPOT.fit(X_TRAIN, Y_TRAIN)
# Display name; the plotting cell reads `model.name` for the figure title
TPOT.name = 'TPOT'

In [None]:
# Save only the fitted pipeline (not the whole TPOTRegressor) together with the
# pre-processors needed to rebuild its inputs. The context manager guarantees
# the file is flushed and closed, and binary mode is used because a pickle is
# a byte stream.
with open('pkls/CoordcountNeighborcountAds_Energy_TPOT.pkl', 'wb') as pkl_file:
    pickle.dump({'model': TPOT.fitted_pipeline_,
                 'pre_processors': {'coordination': lb_coord,
                                    'adsorbate': lb_ads}},
                pkl_file)

In [None]:
# Re-load the previously saved TPOT pipeline instead of re-running the search.
# NOTE: unpickling executes arbitrary code, so only load pickles you trust.
with open('pkls/CoordcountNeighborcountAds_Energy_TPOT.pkl', 'rb') as pkl_file:
    TPOT_PKL = pickle.load(pkl_file)
TPOT = TPOT_PKL['model']
# The pickle holds the bare fitted pipeline, which never had the `name`
# attribute that was set on the TPOTRegressor. Restore it here because the
# plotting cell reads `model.name` for the figure title.
TPOT.name = 'TPOT'

###### Alamo Regression

In [None]:
# Since Alamo can take a while, we first try to load a pickle of the previous
# run before calling alamopy. Simply delete the pickle if you want to re-run.
# NOTE: unpickling executes arbitrary code, so only load pickles you trust.
try:
    with open('pkls/CoordcountNeighborcountAds_Energy_Ala.pkl', 'rb') as pkl_file:
        ALA = pickle.load(pkl_file)['model']
except IOError:
    # No cached result on disk, so run the regression. The responses are
    # reshaped into single-column 2-D arrays before being handed to alamopy.
    ALA = alamopy.doalamo(X_TRAIN, Y_TRAIN.reshape(len(Y_TRAIN), 1),
                          X_TEST, Y_TEST.reshape(len(Y_TEST), 1),
                          showalm=1,
                          linfcns=1,
                          expfcns=1,
                          logfcns=1,
                          monomialpower=(1, 2, 3),
                          multi2power=(1, 2, 3),
                          ratiopower=(1, 2, 3)
                         )
    # Display name; the plotting cell reads `model['name']` for the figure title
    ALA['name'] = 'Alamo'
    # Cache the result together with the pre-processors. The context manager
    # guarantees the file is flushed and closed, and binary mode is used
    # because a pickle is a byte stream.
    with open('pkls/CoordcountNeighborcountAds_Energy_Ala.pkl', 'wb') as pkl_file:
        pickle.dump({'model': ALA,
                     'pre_processors': {'coordination': lb_coord,
                                        'adsorbate': lb_ads}},
                    pkl_file)
# Show the model entry of the Alamo result
pprint(ALA['model'])

## Plotting

###### SKLearn-types

In [None]:
# Draw one interactive parity plot (regressed vs. DFT energy) per model
for model in [LR, GBE, GP, TPOT]:
    traces = []
    # Each adsorbate gets its own scatter trace so it can be told apart in the legend
    for ads in np.unique(DATA['adsorbate']):
        # Positions of the data points that belong to this adsorbate
        rows = [i for i, _ads in enumerate(DATA['adsorbate']) if _ads == ads]
        # Vectorized inputs (x), DFT energies (y) and hover labels (text) for those points.
        # NOTE(review): '\r' may not render as a line break in Plotly hover
        # text ('<br>' is the usual separator) -- confirm before changing.
        x = [X[i] for i in rows]
        y = [Y[i] for i in rows]
        text = ['Site:  %s\rNeighbor:  %s' % (DATA['coordination'][i],
                                               DATA['nextnearestcoordination'][i])
                for i in rows]
        # Regressed energies go on the horizontal axis, DFT energies on the vertical
        y_predicted = model.predict(np.array(x))
        scatter = go.Scatter(x=y_predicted,
                             y=y,
                             mode='markers',
                             text=text,
                             name=ads)
        traces.append(scatter)
    # Dashed diagonal marking perfect agreement
    lims = [-4, 6]
    parity_line = go.Scatter(x=lims, y=lims,
                             line=dict(color='black', dash='dash'),
                             name='Parity line')
    traces.append(parity_line)
    # The RMSE quoted in the title is evaluated on the test split only
    rmse = math.sqrt(metrics.mean_squared_error(Y_TEST, model.predict(X_TEST)))
    title = 'Adsorption Energy as a function of (Coordination Count, Neighbor Coordination Count, Adsorbate); Model = %s; RMSE = %0.3f eV' \
            % (model.name, rmse)
    layout = go.Layout(xaxis=dict(title='Regressed (eV)'),
                       yaxis=dict(title='DFT (eV)'),
                       title=title)
    iplot(go.Figure(data=traces, layout=layout))

###### Alamo

In [None]:
# Create Plotly parity plots for each dictionary-type model
for model in [ALA]:
    # Alamo returns a function that takes one positional argument per feature
    # instead of an array, so wrap it. The wrapper does not depend on the
    # adsorbate, so it is defined once per model rather than once per adsorbate.
    def model_predict(factors):
        '''
        Turn a vector of input data, `factors`, into the model's guessed output.

        Input:
            factors     A sequence holding one value per model feature
        Output:
            The value returned by the model's function, `model['f(model)']`,
            when the elements of `factors` are passed as positional arguments.

        The vector is unpacked with `*`, so this works for any number of
        features instead of being hard-coded to exactly 26 arguments.
        '''
        return model['f(model)'](*factors)

    traces = []
    # Create a parity plot where each adsorbate is shown. We do that by pulling out
    # data for each adsorbate and then plotting them.
    for ads in np.unique(DATA['adsorbate']):
        # We loop through all of our data and pull out the vectorized coordination (x),
        # the DFT energy (y), and the coordination site (text).
        x = []
        y = []
        text = []
        for i, _ads in enumerate(DATA['adsorbate']):
            if _ads == ads:
                x.append(X[i])
                y.append(Y[i])
                # NOTE(review): '\r' may not render as a line break in Plotly
                # hover text ('<br>' is the usual separator) -- confirm.
                text.append('Site:  %s\rNeighbor:  %s' \
                            % (DATA['coordination'][i],
                               DATA['nextnearestcoordination'][i]))

        # Build a real list (not a lazy `map` object) so the predictions can be
        # handed to Plotly regardless of the Python version.
        y_predicted = [model_predict(factors) for factors in x]

        # Plot
        traces.append(go.Scatter(x=y_predicted,
                                 y=y,
                                 mode='markers',
                                 text=text,
                                 name=ads))
    # Create a diagonal line for the parity plot
    lims = [-4, 6]
    traces.append(go.Scatter(x=lims, y=lims,
                             line=dict(color=('black'), dash='dash'), name='Parity line'))
    # The RMSE quoted in the title is evaluated on the test split only
    y_test_predicted = [model_predict(factors) for factors in X_TEST]
    rmse = math.sqrt(metrics.mean_squared_error(Y_TEST, y_test_predicted))
    # Format and plot
    layout = go.Layout(xaxis=dict(title='Regressed (eV)'),
                       yaxis=dict(title='DFT (eV)'),
                       title='Adsorption Energy as a function of (Coordination Count, Neighbor Coordination Count, Adsorbate); Model = %s; RMSE = %0.3f eV' \
                             % (model['name'], rmse))
    iplot(go.Figure(data=traces, layout=layout))