In [None]:
# regress.ipynb
Author:  Kevin Tran <ktran@andrew.cmu.edu>

This Python notebook fits regression models to data pulled from a local GASdb. It then saves the fitted models as pickles (for later use) and creates parity plots of the regression fits.

## Initializations

###### Imports

In [2]:
from pprint import pprint   # for debugging
import sys
import math
import numpy as np
sys.path.append('..')
from vasp_settings_to_str import vasp_settings_to_str
from gas_pull import GASPull
from sklearn import metrics
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from tpot import TPOTRegressor
import alamopy
import dill as pickle
pickle.settings['recurse'] = True     # required to pickle lambdify functions
import matplotlib.pyplot as plt
from plotly.offline import init_notebook_mode, plot, iplot
init_notebook_mode(connected=True)
import plotly.plotly as py
import plotly.graph_objs as go

###### Load data

In [3]:
# Directory that holds the *.db file; keep exactly one DB_LOC line active
#DB_LOC = '/global/cscratch1/sd/zulissi/GASpy_DB/'  # Cori
DB_LOC = '/Users/KTran/Nerd/GASpy'                 # Local

# Calculation settings we want to look at, converted to the string form
# that is handed to GASPull below.
# NOTE(review): 'pp_version' ends with a trailing period ('5.4.') -- confirm
# that this matches how the setting is stored in the database.
VASP_SETTINGS = vasp_settings_to_str({'gga': 'BF', 'pp_version': '5.4.', 'encut': 350})

# Pull the data from the local database. The call hands back the full data
# set (X, Y, DATA), a train/test split of X and Y, and the two objects
# (lb_ads, lb_coord) that are stored as 'pre_processors' next to every
# pickled model further down.
GAS_PULL = GASPull(DB_LOC, VASP_SETTINGS, split=True)
(X, Y, DATA,
 X_TRAIN, X_TEST, Y_TRAIN, Y_TEST,
 lb_ads, lb_coord) = GAS_PULL.energy_fr_coordcount_ads()

## Regressions
Create surrogate models using different methods

###### SKLearn Linear Regression

In [15]:
# Fit a linear regression on the training split
LR = LinearRegression()
LR.fit(X_TRAIN, Y_TRAIN)
LR.name = 'Linear'      # label shown in the parity-plot title

# Save the fitted model together with the pre-processors returned by the
# data pull. Pickles are binary data, so the file is opened in 'wb' mode,
# and the `with` block guarantees the handle is flushed and closed.
with open('pkls/CoordcountAds_Energy_LR.pkl', 'wb') as pkl_file:
    pickle.dump({'model': LR,
                 'pre_processors': {'coordination': lb_coord,
                                    'adsorbate': lb_ads}},
                pkl_file)

###### SKLearn Gradient Boosting Ensemble Regression

In [16]:
# Fit a gradient-boosting ensemble (default hyper-parameters) on the training split
GBE = GradientBoostingRegressor()
GBE.fit(X_TRAIN, Y_TRAIN)
GBE.name = 'GBE'        # label shown in the parity-plot title

# Save the fitted model together with the pre-processors returned by the
# data pull. Pickles are binary data, so the file is opened in 'wb' mode,
# and the `with` block guarantees the handle is flushed and closed.
with open('pkls/CoordcountAds_Energy_GBE.pkl', 'wb') as pkl_file:
    pickle.dump({'model': GBE,
                 'pre_processors': {'coordination': lb_coord,
                                    'adsorbate': lb_ads}},
                pkl_file)

###### SKLearn Gaussian Process Regressor

In [17]:
# Fit a Gaussian process regressor (default kernel) on the training split
GP = GaussianProcessRegressor()
GP.fit(X_TRAIN, Y_TRAIN)
GP.name = 'GP'          # label shown in the parity-plot title

# Save the fitted model together with the pre-processors returned by the
# data pull. Pickles are binary data, so the file is opened in 'wb' mode,
# and the `with` block guarantees the handle is flushed and closed.
with open('pkls/CoordcountAds_Energy_GP.pkl', 'wb') as pkl_file:
    pickle.dump({'model': GP,
                 'pre_processors': {'coordination': lb_coord,
                                    'adsorbate': lb_ads}},
                pkl_file)

###### TPOT Regression

In [13]:
# Search for a regression pipeline with TPOT: 100 generations with a
# population of 100 each. The seed is fixed so the search is repeatable,
# and verbosity=2 produces the per-generation progress log shown below.
# This cell is slow -- see the timings in its output.
TPOT = TPOTRegressor(random_state=42, verbosity=2,
                     population_size=100, generations=100)
TPOT.fit(X_TRAIN, Y_TRAIN)
TPOT.name = 'TPOT'      # label shown in the parity-plot title



Optimization Progress:   2%|▏         | 187/10100 [09:06<2:30:11,  1.10pipeline/s] 

Generation 1 - Current best internal CV score: 0.245020628343


Optimization Progress:   3%|▎         | 279/10100 [11:01<2:46:58,  1.02s/pipeline]

Generation 2 - Current best internal CV score: 0.245020628343


Optimization Progress:   4%|▎         | 372/10100 [13:28<3:36:47,  1.34s/pipeline]

Generation 3 - Current best internal CV score: 0.245020628343


Optimization Progress:   5%|▍         | 459/10100 [17:24<3:16:06,  1.22s/pipeline] 

Generation 4 - Current best internal CV score: 0.245020628343


Optimization Progress:   5%|▌         | 547/10100 [21:26<3:33:24,  1.34s/pipeline] 

Generation 5 - Current best internal CV score: 0.243001226959


Optimization Progress:   6%|▋         | 635/10100 [25:59<4:16:33,  1.63s/pipeline] 

Generation 6 - Current best internal CV score: 0.243001226959


Optimization Progress:   7%|▋         | 720/10100 [29:31<7:16:37,  2.79s/pipeline] 

Generation 7 - Current best internal CV score: 0.243001226959


Optimization Progress:   8%|▊         | 807/10100 [33:15<5:39:57,  2.19s/pipeline] 

Generation 8 - Current best internal CV score: 0.243001226959


Optimization Progress:   9%|▉         | 900/10100 [36:35<6:05:56,  2.39s/pipeline] 

Generation 9 - Current best internal CV score: 0.241806004636


Optimization Progress:  10%|▉         | 990/10100 [39:30<5:54:03,  2.33s/pipeline] 

Generation 10 - Current best internal CV score: 0.241806004636


Optimization Progress:  11%|█         | 1077/10100 [42:41<4:18:26,  1.72s/pipeline] 

Generation 11 - Current best internal CV score: 0.241806004636


Optimization Progress:  12%|█▏        | 1164/10100 [45:23<4:03:44,  1.64s/pipeline] 

Generation 12 - Current best internal CV score: 0.241806004636


Optimization Progress:  12%|█▏        | 1255/10100 [49:33<6:12:00,  2.52s/pipeline] 

Generation 13 - Current best internal CV score: 0.24034848458


Optimization Progress:  13%|█▎        | 1342/10100 [52:58<5:13:19,  2.15s/pipeline] 

Generation 14 - Current best internal CV score: 0.24034848458


Optimization Progress:  14%|█▍        | 1437/10100 [56:11<4:02:07,  1.68s/pipeline] 

Generation 15 - Current best internal CV score: 0.240147165413


Optimization Progress:  15%|█▌        | 1525/10100 [59:37<4:07:09,  1.73s/pipeline] 

Generation 16 - Current best internal CV score: 0.23972678294


Optimization Progress:  16%|█▌        | 1621/10100 [1:02:46<4:02:55,  1.72s/pipeline] 

Generation 17 - Current best internal CV score: 0.239569008927


Optimization Progress:  17%|█▋        | 1715/10100 [1:06:15<4:44:30,  2.04s/pipeline] 

Generation 18 - Current best internal CV score: 0.239241174879


Optimization Progress:  18%|█▊        | 1809/10100 [1:09:13<3:52:12,  1.68s/pipeline]

Generation 19 - Current best internal CV score: 0.239241174879


Optimization Progress:  19%|█▉        | 1904/10100 [1:12:34<4:17:29,  1.89s/pipeline] 

Generation 20 - Current best internal CV score: 0.239180216692


Optimization Progress:  20%|█▉        | 1999/10100 [1:15:48<4:18:37,  1.92s/pipeline] 

Generation 21 - Current best internal CV score: 0.238074458162


Optimization Progress:  21%|██        | 2088/10100 [1:18:53<4:09:27,  1.87s/pipeline] 

Generation 22 - Current best internal CV score: 0.237416827698


Optimization Progress:  22%|██▏       | 2180/10100 [1:22:03<4:42:17,  2.14s/pipeline] 

Generation 23 - Current best internal CV score: 0.236140380722


Optimization Progress:  22%|██▏       | 2271/10100 [1:24:45<3:00:34,  1.38s/pipeline]

Generation 24 - Current best internal CV score: 0.232687052012


Optimization Progress:  23%|██▎       | 2364/10100 [2:38:01<5:08:07,  2.39s/pipeline]     

Generation 25 - Current best internal CV score: 0.232687052012


Optimization Progress:  24%|██▍       | 2456/10100 [2:41:14<3:52:39,  1.83s/pipeline] 

Generation 26 - Current best internal CV score: 0.232687052012


Optimization Progress:  25%|██▌       | 2549/10100 [2:44:20<4:36:43,  2.20s/pipeline]

Generation 27 - Current best internal CV score: 0.232687052012


Optimization Progress:  26%|██▌       | 2642/10100 [2:47:20<4:22:08,  2.11s/pipeline]

Generation 28 - Current best internal CV score: 0.232687052012


Optimization Progress:  27%|██▋       | 2736/10100 [2:50:19<3:21:12,  1.64s/pipeline] 

Generation 29 - Current best internal CV score: 0.232687052012


Optimization Progress:  28%|██▊       | 2833/10100 [3:23:36<3:15:00,  1.61s/pipeline]   

Generation 30 - Current best internal CV score: 0.232687052012


Optimization Progress:  29%|██▉       | 2927/10100 [3:26:03<2:55:22,  1.47s/pipeline]

Generation 31 - Current best internal CV score: 0.232687052012


Optimization Progress:  30%|██▉       | 3021/10100 [3:28:49<3:25:39,  1.74s/pipeline]

Generation 32 - Current best internal CV score: 0.232687052012


Optimization Progress:  31%|███       | 3113/10100 [3:31:25<3:19:45,  1.72s/pipeline] 

Generation 33 - Current best internal CV score: 0.232687052012


Optimization Progress:  32%|███▏      | 3207/10100 [4:30:18<19:48:28, 10.35s/pipeline]  

Generation 34 - Current best internal CV score: 0.232687052012


Optimization Progress:  33%|███▎      | 3296/10100 [4:33:33<3:02:24,  1.61s/pipeline] 

Generation 35 - Current best internal CV score: 0.232687052012


Optimization Progress:  34%|███▎      | 3386/10100 [4:36:28<3:43:21,  2.00s/pipeline] 

Generation 36 - Current best internal CV score: 0.232687052012


Optimization Progress:  34%|███▍      | 3475/10100 [4:47:56<5:15:46,  2.86s/pipeline]   

Generation 37 - Current best internal CV score: 0.231960553099


Optimization Progress:  35%|███▌      | 3557/10100 [4:50:11<4:25:52,  2.44s/pipeline] 

Generation 38 - Current best internal CV score: 0.231960553099


Optimization Progress:  36%|███▌      | 3647/10100 [4:52:47<3:37:36,  2.02s/pipeline] 

Generation 39 - Current best internal CV score: 0.231960553099


Optimization Progress:  37%|███▋      | 3734/10100 [4:55:06<3:33:53,  2.02s/pipeline] 

Generation 40 - Current best internal CV score: 0.231960553099


Optimization Progress:  38%|███▊      | 3825/10100 [4:57:23<2:53:20,  1.66s/pipeline] 

Generation 41 - Current best internal CV score: 0.231960553099


Optimization Progress:  39%|███▉      | 3916/10100 [4:59:57<3:37:49,  2.11s/pipeline] 

Generation 42 - Current best internal CV score: 0.231960553099


Optimization Progress:  40%|███▉      | 4006/10100 [5:02:35<3:25:44,  2.03s/pipeline] 

Generation 43 - Current best internal CV score: 0.231960553099


Optimization Progress:  41%|████      | 4101/10100 [5:04:54<2:40:33,  1.61s/pipeline]

Generation 44 - Current best internal CV score: 0.231960553099


Optimization Progress:  41%|████▏     | 4185/10100 [5:07:13<3:40:07,  2.23s/pipeline] 

Generation 45 - Current best internal CV score: 0.231960553099


Optimization Progress:  42%|████▏     | 4278/10100 [5:09:48<3:19:48,  2.06s/pipeline]

Generation 46 - Current best internal CV score: 0.231960553099


Optimization Progress:  43%|████▎     | 4370/10100 [5:12:33<2:41:18,  1.69s/pipeline] 

Generation 47 - Current best internal CV score: 0.231960553099


Optimization Progress:  44%|████▍     | 4463/10100 [5:15:25<2:29:53,  1.60s/pipeline] 

Generation 48 - Current best internal CV score: 0.231960553099


Optimization Progress:  45%|████▌     | 4556/10100 [5:18:25<2:22:41,  1.54s/pipeline] 

Generation 49 - Current best internal CV score: 0.231960553099


Optimization Progress:  46%|████▌     | 4645/10100 [5:20:43<2:47:34,  1.84s/pipeline] 

Generation 50 - Current best internal CV score: 0.231960553099


Optimization Progress:  47%|████▋     | 4734/10100 [5:23:15<2:41:46,  1.81s/pipeline]

Generation 51 - Current best internal CV score: 0.231960553099


Optimization Progress:  48%|████▊     | 4822/10100 [5:25:42<2:46:13,  1.89s/pipeline] 

Generation 52 - Current best internal CV score: 0.231960553099


Optimization Progress:  49%|████▊     | 4907/10100 [5:28:00<2:33:09,  1.77s/pipeline] 

Generation 53 - Current best internal CV score: 0.231960553099


Optimization Progress:  49%|████▉     | 4999/10100 [5:31:09<2:20:28,  1.65s/pipeline]

Generation 54 - Current best internal CV score: 0.231960553099


Optimization Progress:  50%|█████     | 5087/10100 [5:33:33<2:28:20,  1.78s/pipeline] 

Generation 55 - Current best internal CV score: 0.231960553099


Optimization Progress:  51%|█████     | 5175/10100 [5:35:49<2:11:04,  1.60s/pipeline]

Generation 56 - Current best internal CV score: 0.231960553099


Optimization Progress:  52%|█████▏    | 5264/10100 [5:38:42<2:13:19,  1.65s/pipeline]

Generation 57 - Current best internal CV score: 0.231960553099


Optimization Progress:  53%|█████▎    | 5347/10100 [5:40:32<2:43:15,  2.06s/pipeline] 

Generation 58 - Current best internal CV score: 0.231960553099


Optimization Progress:  54%|█████▍    | 5438/10100 [5:42:53<2:11:32,  1.69s/pipeline]

Generation 59 - Current best internal CV score: 0.231960553099


Optimization Progress:  55%|█████▍    | 5533/10100 [5:45:07<2:21:14,  1.86s/pipeline]

Generation 60 - Current best internal CV score: 0.231960553099


Optimization Progress:  56%|█████▌    | 5618/10100 [69:43:49<1736:22:03, 1394.67s/pipeline]  

Generation 61 - Current best internal CV score: 0.231960553099


Optimization Progress:  56%|█████▋    | 5703/10100 [69:45:41<102:02:20, 83.54s/pipeline]   

Generation 62 - Current best internal CV score: 0.231960553099


Optimization Progress:  57%|█████▋    | 5795/10100 [69:47:49<2:25:13,  2.02s/pipeline]  

Generation 63 - Current best internal CV score: 0.231960553099


Optimization Progress:  58%|█████▊    | 5889/10100 [69:50:02<1:46:32,  1.52s/pipeline] 

Generation 64 - Current best internal CV score: 0.231960553099


Optimization Progress:  59%|█████▉    | 5973/10100 [69:51:51<1:59:36,  1.74s/pipeline]

Generation 65 - Current best internal CV score: 0.231960553099


Optimization Progress:  60%|██████    | 6061/10100 [69:53:36<2:19:07,  2.07s/pipeline] 

Generation 66 - Current best internal CV score: 0.231960553099


Optimization Progress:  61%|██████    | 6150/10100 [69:55:03<1:48:23,  1.65s/pipeline]

Generation 67 - Current best internal CV score: 0.231960553099


Optimization Progress:  62%|██████▏   | 6243/10100 [69:57:12<2:23:12,  2.23s/pipeline]

Generation 68 - Current best internal CV score: 0.231960553099


Optimization Progress:  63%|██████▎   | 6338/10100 [69:59:54<2:07:30,  2.03s/pipeline] 

Generation 69 - Current best internal CV score: 0.231960553099


Optimization Progress:  64%|██████▎   | 6424/10100 [70:01:40<1:30:38,  1.48s/pipeline] 

Generation 70 - Current best internal CV score: 0.231960553099


Optimization Progress:  64%|██████▍   | 6512/10100 [70:03:42<1:30:46,  1.52s/pipeline] 

Generation 71 - Current best internal CV score: 0.231960553099


Optimization Progress:  65%|██████▌   | 6604/10100 [70:05:12<1:20:17,  1.38s/pipeline] 

Generation 72 - Current best internal CV score: 0.231960553099


Optimization Progress:  66%|██████▋   | 6693/10100 [70:07:07<1:48:17,  1.91s/pipeline] 

Generation 73 - Current best internal CV score: 0.231960553099


Optimization Progress:  67%|██████▋   | 6781/10100 [70:09:10<1:06:08,  1.20s/pipeline]

Generation 74 - Current best internal CV score: 0.231960553099


Optimization Progress:  68%|██████▊   | 6869/10100 [70:10:47<1:04:29,  1.20s/pipeline] 

Generation 75 - Current best internal CV score: 0.231960553099


Optimization Progress:  69%|██████▉   | 6953/10100 [70:12:36<1:01:31,  1.17s/pipeline]

Generation 76 - Current best internal CV score: 0.231960553099


Optimization Progress:  70%|██████▉   | 7041/10100 [70:14:07<1:02:26,  1.22s/pipeline]

Generation 77 - Current best internal CV score: 0.231960553099


Optimization Progress:  71%|███████   | 7129/10100 [70:15:36<1:26:08,  1.74s/pipeline]

Generation 78 - Current best internal CV score: 0.231960553099


Optimization Progress:  71%|███████▏  | 7220/10100 [70:17:18<1:05:56,  1.37s/pipeline]

Generation 79 - Current best internal CV score: 0.231960553099


Optimization Progress:  72%|███████▏  | 7310/10100 [70:18:48<1:26:43,  1.86s/pipeline]

Generation 80 - Current best internal CV score: 0.231960553099


Optimization Progress:  73%|███████▎  | 7402/10100 [70:20:24<1:01:16,  1.36s/pipeline]

Generation 81 - Current best internal CV score: 0.231960553099


Optimization Progress:  74%|███████▍  | 7491/10100 [70:22:10<1:00:35,  1.39s/pipeline]

Generation 82 - Current best internal CV score: 0.231960553099


Optimization Progress:  75%|███████▍  | 7574/10100 [70:23:31<1:41:14,  2.40s/pipeline]

Generation 83 - Current best internal CV score: 0.231960553099


Optimization Progress:  76%|███████▌  | 7665/10100 [70:24:50<1:00:53,  1.50s/pipeline]

Generation 84 - Current best internal CV score: 0.231960553099


Optimization Progress:  77%|███████▋  | 7750/10100 [70:26:13<1:05:00,  1.66s/pipeline]

Generation 85 - Current best internal CV score: 0.231960553099


Optimization Progress:  78%|███████▊  | 7842/10100 [70:27:32<59:38,  1.59s/pipeline]  

Generation 86 - Current best internal CV score: 0.231960553099


Optimization Progress:  78%|███████▊  | 7923/10100 [70:28:47<1:11:14,  1.96s/pipeline]

Generation 87 - Current best internal CV score: 0.231960553099


Optimization Progress:  79%|███████▉  | 8010/10100 [70:30:00<55:10,  1.58s/pipeline]  

Generation 88 - Current best internal CV score: 0.231960553099


Optimization Progress:  80%|████████  | 8105/10100 [70:31:27<1:04:20,  1.94s/pipeline]

Generation 89 - Current best internal CV score: 0.231960553099


Optimization Progress:  81%|████████  | 8189/10100 [70:32:46<42:11,  1.32s/pipeline]  

Generation 90 - Current best internal CV score: 0.231960553099


Optimization Progress:  82%|████████▏ | 8279/10100 [70:34:04<48:30,  1.60s/pipeline]  

Generation 91 - Current best internal CV score: 0.231960553099


Optimization Progress:  83%|████████▎ | 8365/10100 [70:35:29<1:03:32,  2.20s/pipeline]

Generation 92 - Current best internal CV score: 0.231960553099


Optimization Progress:  84%|████████▎ | 8449/10100 [70:37:03<52:55,  1.92s/pipeline]  

Generation 93 - Current best internal CV score: 0.231960553099


Optimization Progress:  85%|████████▍ | 8538/10100 [70:38:04<48:42,  1.87s/pipeline]  

Generation 94 - Current best internal CV score: 0.231960553099


Optimization Progress:  85%|████████▌ | 8625/10100 [70:38:57<49:32,  2.02s/pipeline]  

Generation 95 - Current best internal CV score: 0.231960553099


Optimization Progress:  86%|████████▋ | 8712/10100 [70:40:09<32:24,  1.40s/pipeline]  

Generation 96 - Current best internal CV score: 0.231960553099


Optimization Progress:  87%|████████▋ | 8796/10100 [70:41:28<1:12:12,  3.32s/pipeline]

Generation 97 - Current best internal CV score: 0.231960553099


Optimization Progress:  88%|████████▊ | 8883/10100 [70:43:01<46:30,  2.29s/pipeline]  

Generation 98 - Current best internal CV score: 0.231960553099


Optimization Progress:  89%|████████▉ | 8976/10100 [70:44:07<23:44,  1.27s/pipeline]  

Generation 99 - Current best internal CV score: 0.231960553099


                                                                                      

Generation 100 - Current best internal CV score: 0.231960553099

Best pipeline: GradientBoostingRegressor(ExtraTreesRegressor(input_matrix, ExtraTreesRegressor__bootstrap=False, ExtraTreesRegressor__max_features=0.1, ExtraTreesRegressor__min_samples_leaf=5, ExtraTreesRegressor__min_samples_split=19, ExtraTreesRegressor__n_estimators=100), GradientBoostingRegressor__alpha=0.75, GradientBoostingRegressor__learning_rate=0.1, GradientBoostingRegressor__loss=huber, GradientBoostingRegressor__max_depth=6, GradientBoostingRegressor__max_features=0.45, GradientBoostingRegressor__min_samples_leaf=2, GradientBoostingRegressor__min_samples_split=8, GradientBoostingRegressor__n_estimators=100, GradientBoostingRegressor__subsample=0.9)


In [14]:
# Only the best pipeline found by TPOT is saved (not the whole TPOTRegressor),
# together with the pre-processors returned by the data pull. Pickles are
# binary data, so the file is opened in 'wb' mode, and the `with` block
# guarantees the handle is flushed and closed.
with open('pkls/CoordcountAds_Energy_TPOT.pkl', 'wb') as pkl_file:
    pickle.dump({'model': TPOT.fitted_pipeline_,
                 'pre_processors': {'coordination': lb_coord,
                                    'adsorbate': lb_ads}},
                pkl_file)

In [8]:
# Reload the previously fitted TPOT pipeline so the (very slow) search does
# not have to be repeated.
# NOTE: unpickling can execute arbitrary code -- only load pickles that were
# written by this notebook.
with open('pkls/CoordcountAds_Energy_TPOT.pkl', 'rb') as pkl_file:
    TPOT_PKL = pickle.load(pkl_file)
TPOT = TPOT_PKL['model']
# The pickle holds the bare fitted pipeline, which does not carry the `name`
# label that the plotting cell reads from every model; restore it here.
TPOT.name = 'TPOT'

###### Alamo Regression

In [11]:
# Since Alamo can take a while, we first try to load a pickle of the previous
# run before calling alamopy. Simply delete the pickle if you want to re-run.
# NOTE: unpickling can execute arbitrary code -- only load pickles that were
# written by this notebook.
try:
    # Binary mode + `with`: pickles are binary data and the handle must be closed
    with open('pkls/CoordcountAds_Energy_Ala.pkl', 'rb') as pkl_file:
        ALA = pickle.load(pkl_file)['model']
except IOError:
    # No readable pickle: run Alamo. alamopy is given the targets as column
    # vectors, hence the reshapes to (n, 1).
    ALA = alamopy.doalamo(X_TRAIN, Y_TRAIN.reshape(len(Y_TRAIN), 1),
                          X_TEST, Y_TEST.reshape(len(Y_TEST), 1),
                          showalm=1,
                          linfcns=1,
                          expfcns=1,
                          logfcns=1,
                          monomialpower=(1, 2, 3),
                          multi2power=(1, 2, 3),
                          ratiopower=(1, 2, 3)
                         )
    ALA['name'] = 'Alamo'   # label shown in the parity-plot title
    # Cache the result (plus the pre-processors from the data pull) for next time
    with open('pkls/CoordcountAds_Energy_Ala.pkl', 'wb') as pkl_file:
        pickle.dump({'model': ALA,
                     'pre_processors': {'coordination': lb_coord,
                                        'adsorbate': lb_ads}},
                    pkl_file)
# Show the algebraic form of the fitted model
pprint(ALA['model'])

'  z1 = 0.40689210671486919501660 * x1 + 0.23979449370395070073592 * x3 - 0.24026016988363380066929 * x4 - 0.24303314414319163172529 * x6 + 0.36793318358804466550183 * x7 + 0.34045386387183534937506 * x9 + 0.52626055135851601551877 * x18 + 0.30153379893248954957130 * x19 + 0.54007522653274131485546 * x20 - 0.24635185588857760885517 * x21 + 1.4588916942296219492192 * x23 - 1.0809185728635437584444 * x24 + 1.5294995359829648418071 * x26 + 0.27768071547722195102637 * x2*x24 + 0.13519052619007632110026 * (x11*x24)^3'



Trying to unpickle estimator LabelBinarizer from version 0.18.1 when using version 0.18.2. This might lead to breaking code or invalid results. Use at your own risk.



## Plotting

###### SKLearn-types

In [18]:
# Draw one parity plot (regressed vs. DFT energy) for every model that
# exposes a scikit-learn style `predict` method and a `name` attribute.
for model in [LR, GBE, GP, TPOT]:
    traces = []
    # Each adsorbate gets its own scatter trace so it can be told apart
    # in the legend.
    for ads in np.unique(DATA['adsorbate']):
        # Positions of the data points that belong to this adsorbate
        rows = [i for i, _ads in enumerate(DATA['adsorbate']) if _ads == ads]
        # Vectorized coordination (x), DFT energy (y) and hover text naming
        # the coordination site (text) for those points
        x = [X[i] for i in rows]
        y = [Y[i] for i in rows]
        text = ['Site:  %s' % DATA['coordination'][i] for i in rows]
        # Predicted energy goes on the horizontal axis, DFT energy on the vertical
        y_predicted = model.predict(np.array(x))
        trace = go.Scatter(x=y_predicted, y=y, mode='markers', text=text, name=ads)
        traces.append(trace)

    # Dashed diagonal marking perfect agreement between model and DFT
    lims = [-4, 6]
    parity_line = go.Scatter(x=lims, y=lims,
                             line={'color': 'black', 'dash': 'dash'},
                             name='Parity line')
    traces.append(parity_line)

    # The RMSE quoted in the title is evaluated on the held-out test split
    rmse = math.sqrt(metrics.mean_squared_error(Y_TEST, model.predict(X_TEST)))
    title = ('Adsorption Energy as a function of (Coordination Count, Adsorbate); Model = %s; RMSE = %0.3f eV'
             % (model.name, rmse))
    layout = go.Layout(xaxis=dict(title='Regressed (eV)'),
                       yaxis=dict(title='DFT (eV)'),
                       title=title)
    iplot(go.Figure(data=traces, layout=layout))

###### Alamo

In [15]:
def model_predict(factors):
    '''
    Turn one vector of input data, `factors`, into the model's predicted output.

    Alamo returns a lambda function that takes every input as a separate
    positional argument instead of accepting an array, so the vector is
    unpacked into the call. This works for any number of inputs rather than
    a hard-coded count. We should address this by making alamopy output a
    better lambda function.

    Reads the loop variable `model` of the plotting loop below at call time.
    '''
    return model['f(model)'](*factors)


# Create Plotly parity plots for each dictionary-type model
for model in [ALA]:
    traces = []
    # Create a parity plot where each adsorbate is shown. We do that by pulling out
    # data for each adsorbate and then plotting them.
    for ads in np.unique(DATA['adsorbate']):
        # We loop through all of our data and pull out the vectorized coordination (x),
        # the DFT energy (y), and the coordination site (text).
        x = []
        y = []
        text = []
        for i, _ads in enumerate(DATA['adsorbate']):
            if _ads == ads:
                x.append(X[i])
                y.append(Y[i])
                text.append('Site:  %s' % DATA['coordination'][i])

        # Build a real list (not a lazy `map` object) so that plotly can
        # serialize it on Python 2 and Python 3 alike.
        y_predicted = [model_predict(factors) for factors in x]

        # Plot
        traces.append(go.Scatter(x=y_predicted,
                                 y=y,
                                 mode='markers',
                                 text=text,
                                 name=ads))
    # Create a diagonal line for the parity plot
    lims = [-4, 6]
    traces.append(go.Scatter(x=lims, y=lims,
                             line=dict(color=('black'), dash='dash'), name='Parity line'))
    # The RMSE quoted in the title is evaluated on the held-out test split
    test_predictions = [model_predict(factors) for factors in X_TEST]
    # Format and plot
    layout = go.Layout(xaxis=dict(title='Regressed (eV)'),
                       yaxis=dict(title='DFT (eV)'),
                       title='Adsorption Energy as a function of (Coordination Count, Adsorbate); Model = %s; RMSE = %0.3f eV' \
                             % (model['name'], math.sqrt(metrics.mean_squared_error(Y_TEST, test_predictions))))
    iplot(go.Figure(data=traces, layout=layout))