# sandbox.ipynb

This Python notebook fits regression models to data pulled from a processed Mongo database created by GASpy. It then saves the fitted regressors to pickles (for later use) and creates parity plots of the regression fits.

# Initialize

In [1]:
# Importing
import pdb
import sys
from regressor import GASpyRegressor
import gpickle
sys.path.insert(0, '../')
from gaspy.utils import vasp_settings_to_str

# VASP calculation settings, serialized by `vasp_settings_to_str`. The result
# is handed to every GASpyRegressor in this notebook via `vasp_settings=`.
VASP_SETTINGS = vasp_settings_to_str(
    dict(gga='RP', pp_version='5.4', encut=350)
)

# Regress

## Gaussian Process Models

In [2]:
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF, WhiteKernel, RationalQuadratic, ExpSineSquared

In [3]:
# Settings for the Gaussian-process fit. The cells below pass these to
# GASpyRegressor and reuse them as the lookup arguments for gpickle.load.
model_name = 'GP'
responses = ['energy']
features = ['coordcount', 'rnnc_count']
# NOTE(review): the captured fit output contains the warning "You are trying
# to block by adsorbate, but we did not find that fingerprint" -- confirm that
# 'adsorbate' is the key GASpyRegressor expects here.
blocks = ['adsorbate']

In [4]:
# Gaussian process built with the estimator's default arguments. A custom
# kernel that was tried previously can be restored by passing
#     kernel=1.0*RBF(length_scale=0.05)
#            + 1.0*RBF(length_scale=0.2)
#            + 1.0*WhiteKernel(noise_level=0.05**2.0),
#     n_restarts_optimizer=2
gp = GaussianProcessRegressor()

# Configure the GASpy regressor with the settings chosen above, then fit it
# using the scikit-learn estimator.
GP = GASpyRegressor(
    features=features,
    responses=responses,
    blocks=blocks,
    vasp_settings=VASP_SETTINGS,
)
GP.fit_sk(gp, model_name=model_name)


Data with input dtype int64 was converted to float64 by StandardScaler.


From version 0.21, test_size will always complement train_size unless both are specified.


You are trying to block by adsorbate, but we did not find that fingerprint



In [5]:
# Save the fitted GP regressor to a pickle so it can be reloaded later
# without refitting.
gpickle.dump(GP)

In [6]:
# Reload the pickled GP regressor. NOTE(review): presumably the model name,
# features, responses and blocks together identify which pickle to open --
# confirm against gpickle.load. Only load pickles you created yourself.
GP = gpickle.load(model_name, features, responses, blocks)

In [6]:
# Draw the parity plot for the GP fit; the call also prints RMSE values for
# each block (train, test, and train+test).
# NOTE(review): `split` presumably controls whether train and test points are
# shown separately -- confirm against GASpyRegressor.parity_plot.
GP.parity_plot(split=False)

RMSE values:
	(u'C',)
		test
			0.87977898408
		train
			0.210798119921
		train+test
			0.478591113174
	(u'H',)
		test
			0.440754848009
		train
			0.245647595778
		train+test
			0.305390553253
	(u'CO',)
		test
			0.618675198109
		train
			0.208321979642
		train+test
			0.360845402342
	(u'OH',)
		test
			0.606508916088
		train
			0.183547652089
		train+test
			0.343093961973
	(u'OOH',)
		test
			1.58672461624
		train
			0.0600760161732
		train+test
			0.817417695564
	(u'O',)
		test
			0.855301158861
		train
			0.195736809499
		train+test
			0.450962290643


## TPOT Models

In [2]:
from tpot import TPOTRegressor

In [3]:
# Settings for the TPOT fit. The cells below pass these to GASpyRegressor and
# reuse them as the lookup arguments for gpickle.load.
model_name = 'TPOT'
responses = ['energy']
features = ['coordcount', 'ads']
# No blocking: a single model is fit (reported as `no_block` in the RMSE output).
blocks = None

In [4]:
# TPOT pipeline search with a very small budget (1 generation, population of
# 2) and a fixed random seed.
# NOTE(review): the budget is presumably kept tiny so this sandbox runs
# quickly; raise `generations` and `population_size` for a real fit.
tpot = TPOTRegressor(generations=1, population_size=2,
                     verbosity=2, random_state=42)

# Configure the GASpy regressor with the settings chosen above, then let TPOT
# search for and fit a pipeline.
TPOT = GASpyRegressor(
    features=features,
    responses=responses,
    blocks=blocks,
    vasp_settings=VASP_SETTINGS,
)
TPOT.fit_tpot(tpot, model_name=model_name)


This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.


Data with input dtype int64 was converted to float64 by StandardScaler.


From version 0.21, test_size will always complement train_size unless both are specified.

                                                                          

Generation 1 - Current best internal CV score: 0.29971659987

Best pipeline: KNeighborsRegressor(ElasticNetCV(input_matrix, ElasticNetCV__l1_ratio=0.35, ElasticNetCV__tol=0.1), KNeighborsRegressor__n_neighbors=80, KNeighborsRegressor__p=1, KNeighborsRegressor__weights=distance)


In [9]:
# Save the fitted TPOT regressor to a pickle so it can be reloaded later
# without rerunning the pipeline search.
gpickle.dump(TPOT)

In [3]:
# Reload the pickled TPOT regressor. NOTE(review): presumably the model name,
# features, responses and blocks together identify which pickle to open --
# confirm against gpickle.load. Only load pickles you created yourself.
TPOT = gpickle.load(model_name, features, responses, blocks)

In [5]:
# Draw the parity plot for the TPOT fit; the call also prints RMSE values
# (train, test, and train+test).
# NOTE(review): `split` presumably controls whether train and test points are
# shown separately -- confirm against GASpyRegressor.parity_plot.
TPOT.parity_plot(split=True)

RMSE values:
	no_block
		test
			0.545992961289
		train
			0.432892948686
		train+test
			0.463767079066


## Hierarchical Models

In [14]:
import copy
from tpot import TPOTRegressor
from sklearn.gaussian_process import GaussianProcessRegressor

In [15]:
# Settings for the hierarchical fit. `features` go to the first (TPOT) fit and
# `outer_features` to the later fit_hierarchical call with the GP.
model_name = 'GP_around_TPOT'
responses = ['energy']
features = ['coordcount', 'ads']
outer_features = ['rnnc_count']
# No blocking: a single model is fit (reported as `no_block` in the RMSE output).
blocks = None
# Extra fingerprint passed to GASpyRegressor via `fingerprints=`, given as a
# name -> Mongo-style field path.
# NOTE(review): presumably 'rnnc_count' is derived from this fingerprint --
# confirm against GASpyRegressor.
fingerprints = {
    'nextnearestcoordination': '$processed_data.fp_init.nextnearestcoordination',
}

In [16]:
# TPOT pipeline search with a very small budget (1 generation, population of
# 2) and a fixed random seed.
# NOTE(review): the budget is presumably kept tiny so this sandbox runs
# quickly; raise `generations` and `population_size` for a real fit.
tpot = TPOTRegressor(generations=1, population_size=2,
                     verbosity=2, random_state=42)

# Gaussian process built with the estimator's default arguments. A custom
# kernel that was tried previously can be restored by passing
#     kernel=1.0*RBF(length_scale=0.05)
#            + 1.0*RBF(length_scale=0.2)
#            + 1.0*WhiteKernel(noise_level=0.05**2.0),
#     n_restarts_optimizer=2
gp = GaussianProcessRegressor()

# Configure the GASpy regressor, including the extra fingerprint needed by the
# outer features.
H = GASpyRegressor(
    features=features,
    responses=responses,
    blocks=blocks,
    vasp_settings=VASP_SETTINGS,
    fingerprints=fingerprints,
)

# Fit the TPOT model first, then fit the GP on `outer_features` via the
# 'fit_sk' method.
# NOTE(review): presumably the outer GP is trained on the inner model's
# residuals -- confirm against GASpyRegressor.fit_hierarchical.
H.fit_tpot(tpot, model_name=model_name)
H.fit_hierarchical(gp, 'fit_sk', outer_features, model_name=model_name)

                                                                          

Generation 1 - Current best internal CV score: 0.296926218503

Best pipeline: KNeighborsRegressor(ElasticNetCV(input_matrix, ElasticNetCV__l1_ratio=0.35, ElasticNetCV__tol=0.1), KNeighborsRegressor__n_neighbors=80, KNeighborsRegressor__p=1, KNeighborsRegressor__weights=distance)


In [18]:
# Save the fitted hierarchical regressor to a pickle so it can be reloaded
# later without refitting.
gpickle.dump(H)

In [6]:
# Reload the pickled hierarchical regressor. The lookup uses the inner and
# outer feature lists concatenated. NOTE(review): presumably this matches how
# gpickle.dump names the pickle after fit_hierarchical -- confirm against
# gpickle. Only load pickles you created yourself.
H = gpickle.load(model_name, features+outer_features, responses, blocks)

In [17]:
# Draw the parity plot for the hierarchical fit; the call also prints RMSE
# values (train, test, and train+test).
# NOTE(review): `split` presumably controls whether train and test points are
# shown separately -- confirm against GASpyRegressor.parity_plot.
H.parity_plot(split=True)

RMSE values:
	no_block
		test
			0.535360330158
		train
			0.338084259176
		train+test
			0.396713118627
