# Random split using various models and data representations

### Models: ANN, RF, GP, SVR, AdaBoost, Linear Model


In [None]:
# import all required packages 
import math
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.utils import shuffle

# Linear model
from sklearn.linear_model import LinearRegression

# for RF
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

# for GP
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.gaussian_process.kernels import RBF,Matern, ConstantKernel as C
from sklearn.gaussian_process.kernels import WhiteKernel

# for adaboost
from sklearn.ensemble import AdaBoostRegressor

# for SVR
from sklearn.svm import SVR

# for the NN
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import tensorflow_docs as tfdocs
import tensorflow_docs.plots
import tensorflow_docs.modeling

#install package for excel support
#!pip install xlrd --user
import xlrd

import matplotlib.pyplot as plt
from rdkit import Chem
from rdkit.Chem import Draw
from rdkit.Chem.Draw import IPythonConsole
from rdkit.Chem import Descriptors
from rdkit.Chem import AllChem
from rdkit import DataStructs

from rdkit import rdBase
rdBase.rdkitVersion

In [None]:
# Choosing the used set of descriptors
#
# Builds the feature matrix `input` and the regression target `output` that the
# train/test split consumes. The fingerprint block is the base representation;
# un-comment any of the `np.append(..., axis=1)` lines to concatenate a further
# descriptor block column-wise.
#
# NOTE(review): `fingerprints_input`, `dataset` and the optional descriptor arrays
# are not defined in the visible cells - they must already exist in the kernel
# before this cell runs (confirm for Restart & Run All).
# NOTE(review): `input` shadows the Python builtin of the same name; the name is
# kept because the data-partitioning cell reads it.

input = fingerprints_input
#input = np.append(input,data_2_sterimol,axis=1)
#input = np.append(input,dft_data,axis=1)
#input = np.append(input,fp_pca_data,axis=1)
#input = np.append(input,dataset_one_hot,axis=1)
#input = np.append(input,NBO_data,axis=1)
#input = np.append(input,chelpg_data,axis=1)
#input = np.append(input,vol_bur,axis=1)
#input = np.append(input,n_proton,axis=1)
# input = np.append(input,data_only_R_sterimol,axis=1)
# input = np.append(input,chelpg_data,axis=1)  # duplicate of the chelpg_data toggle above

# Optional row shuffle, disabled. Enabling it as written would shuffle the
# features without shuffling `output` the same way.
#input = shuffle(input)
print('total_input shape:',input.shape)

# Regression target: the 'Yield' column of `dataset`.
output = dataset['Yield']

In [None]:
# Data partitioning
# Hold out 20 % of the samples for testing; the fixed random_state keeps the
# split identical across runs.
train_features, test_features, train_labels, test_labels = train_test_split(
    input,
    output,
    test_size=0.2,
    random_state=1,
)

# Report the shape of every partition, one per line.
print(
    f'Training Features Shape: {train_features.shape}',
    f'Training Labels Shape: {train_labels.shape}',
    f'Testing Features Shape: {test_features.shape}',
    f'Testing Labels Shape: {test_labels.shape}',
    sep='\n',
)

# Number of feature columns; read by the neural-network builder.
input_dimension = train_features.shape[1]

In [None]:
# Ordinary least-squares baseline: fit on the training split, then predict on
# both splits so train and test errors can be compared later.
linear_regressor = LinearRegression().fit(train_features, train_labels)
linear_train_predictions = linear_regressor.predict(train_features)
linear_test_predictions = linear_regressor.predict(test_features)

In [None]:
# Random forest with 200 trees; the seed is fixed so the fit is repeatable.
rf_regressor = RandomForestRegressor(random_state=0, n_estimators=200).fit(
    train_features, train_labels
)
rf_train_predictions = rf_regressor.predict(train_features)
rf_test_predictions = rf_regressor.predict(test_features)

In [None]:
# Let's train a GP regression
# (Matern and WhiteKernel come from the import cell at the top of the notebook.)

# Alternative kernel without an explicit noise term:
#kernel = 1.0 * Matern(length_scale=1.2, nu=1.5)

# Scaled Matern kernel plus a white-noise term that models observation noise.
kernel = 1.0 * Matern(length_scale=1.2, nu=1.5) + WhiteKernel(noise_level=0.5)

# n_restarts_optimizer draws random starting points for the hyper-parameter
# optimiser; random_state pins them so the fitted kernel is reproducible,
# matching the seeded RF and AdaBoost models.
gp = GaussianProcessRegressor(kernel=kernel, n_restarts_optimizer=3, random_state=0)
gp.fit(train_features, train_labels)

# NOTE: with return_std=True the second return value is the predictive standard
# deviation, not a mean squared error. The names `MSE` / `MSE_train` are kept
# for backward compatibility with any cell that already reads them.
gp_test_predictions, MSE = gp.predict(test_features, return_std=True)
gp_train_predictions, MSE_train = gp.predict(train_features, return_std=True)

In [None]:
# Support-vector regression with a linear kernel; predictions are made on both
# splits for the later parity plot.
svr = SVR(kernel='linear').fit(train_features, train_labels)
svr_train_predictions = svr.predict(train_features)
svr_test_predictions = svr.predict(test_features)

In [None]:
# AdaBoost ensemble capped at 2000 boosting rounds, seeded for repeatability.
adaboost = AdaBoostRegressor(n_estimators=2000, random_state=0).fit(
    train_features, train_labels
)
adaboost_train_predictions = adaboost.predict(train_features)
adaboost_test_predictions = adaboost.predict(test_features)

In [None]:
def build_model(n_features=None):
  """Build and compile the fully connected regression network.

  The network has six hidden ``Dense`` layers of 10 ReLU units each and a
  single linear output unit. It is compiled with an MSE loss, the RMSprop
  optimizer (learning rate 0.001) and MAE/MSE as reported metrics.

  Args:
    n_features: Number of input features per sample. Defaults to the
      notebook-level ``input_dimension`` when omitted, so existing
      ``build_model()`` calls keep working.

  Returns:
    The compiled ``keras.Sequential`` model.
  """
  if n_features is None:
    n_features = input_dimension

  model = keras.Sequential([
    # Only the first layer declares the input shape. It is the per-sample shape
    # (n_features,), without the sample count: the model is fitted on 2-D
    # (samples, features) data, so the former [148, input_dimension] described
    # a 3-D input that the data does not have.
    layers.Dense(10, activation='relu', input_shape=[n_features]),
    layers.Dense(10, activation='relu'),
    layers.Dense(10, activation='relu'),
    layers.Dense(10, activation='relu'),
    layers.Dense(10, activation='relu'),
    layers.Dense(10, activation='relu'),
    layers.Dense(1)
  ])
  optimizer = tf.keras.optimizers.RMSprop(0.001)
  model.compile(loss='mse', optimizer=optimizer, metrics=['mae', 'mse'])
  return model

# Seed TensorFlow's global random generator before the model is created so that
# weight initialisation is repeatable between runs, in line with the seeded
# RF/AdaBoost models. (Hardware-level nondeterminism may still remain.)
tf.random.set_seed(0)

# create the NN
model = build_model()

EPOCHS = 1000

# validation_split holds back 30 % of the training data for validation;
# EpochDots prints compact progress instead of per-epoch logs (verbose=0).
history = model.fit(
    train_features,
    train_labels,
    epochs=EPOCHS,
    validation_split=0.3,
    verbose=0,
    callbacks=[tfdocs.modeling.EpochDots()],
)

# predict() returns a column per output unit; flatten to 1-D so the result
# lines up with the label vectors.
nn_test_predictions = model.predict(test_features).flatten()
nn_train_predictions = model.predict(train_features).flatten()

In [None]:
# Getting the metrics for the RF: parity plot plus test/train errors.

a = plt.axes(aspect='equal')
a.set_title(label="Random Forest Regression")
a.scatter(train_labels, rf_train_predictions, label='Train_data', color='orange')
a.scatter(test_labels, rf_test_predictions, label='Test_data', color='dodgerblue')
a.set_xlabel('True Values')
a.set_ylabel('Predictions')
lims = [1e-04, 1.2]
a.set_xlim(lims)
a.set_ylim(lims)
_ = a.plot(lims, lims)  # y = x reference line: perfect predictions fall on it
a.legend()

rf_test_mse = metrics.mean_squared_error(test_labels, rf_test_predictions)
rf_train_mse = metrics.mean_squared_error(train_labels, rf_train_predictions)

# MAE and MSE are test-set values. The two RMSE lines are labelled TEST/TRAIN
# like the other model cells; previously both carried the same label.
print('Random Forest')
print('Mean Absolute Error:', metrics.mean_absolute_error(test_labels, rf_test_predictions))
print('Mean Squared Error:', rf_test_mse)
print('Root Mean Squared Error TEST_data:', np.sqrt(rf_test_mse))
print('Root Mean Squared Error TRAIN_data:', np.sqrt(rf_train_mse))

In [None]:
# Linear model: parity plot (predicted vs. true) plus test/train errors.
a = plt.axes(aspect='equal')
a.set_title(label="Linear Model evaluation")
a.scatter(train_labels, linear_train_predictions, color='orange', label='Train_data')
a.scatter(test_labels, linear_test_predictions, color='dodgerblue', label='Test_data')
a.set_xlabel('True Values')
a.set_ylabel('Predictions')

lims = [1e-04, 1.05]
a.set_xlim(lims)
a.set_ylim(lims)
# Updates matplotlib's global rc settings for the tick-label font size.
plt.rc('xtick', labelsize=11)
plt.rc('ytick', labelsize=11)
a.legend()
_ = a.plot(lims, lims)  # y = x reference line
#plt.savefig("parity-plot.png", dpi=300)

linear_test_mse = metrics.mean_squared_error(test_labels, linear_test_predictions)
linear_train_mse = metrics.mean_squared_error(train_labels, linear_train_predictions)

print('Linear Model')
print('Mean Absolute Error:', metrics.mean_absolute_error(test_labels, linear_test_predictions))
print('Mean Squared Error:', linear_test_mse)
print('Root Mean Squared Error TEST_data:', np.sqrt(linear_test_mse))
print('Root Mean Squared Error TRAIN_data:', np.sqrt(linear_train_mse))

In [None]:
# Gaussian process: parity plot (also written to disk) plus test/train errors.
a = plt.axes(aspect='equal')
a.set_title(label="Gaussian Process evaluation")
a.scatter(train_labels, gp_train_predictions, color='orange', label='Train_data')
a.scatter(test_labels, gp_test_predictions, color='dodgerblue', label='Test_data')
a.set_xlabel('True Values')
a.set_ylabel('Predictions')

lims = [1e-04, 1.05]
a.set_xlim(lims)
a.set_ylim(lims)
# Updates matplotlib's global rc settings for the tick-label font size.
plt.rc('xtick', labelsize=11)
plt.rc('ytick', labelsize=11)
a.legend()
_ = a.plot(lims, lims)  # y = x reference line
# Save the finished figure next to the notebook.
plt.savefig("parity-plot.png", dpi=300)

gp_test_mse = metrics.mean_squared_error(test_labels, gp_test_predictions)
gp_train_mse = metrics.mean_squared_error(train_labels, gp_train_predictions)

print('Gaussian Process')
print('Mean Absolute Error:', metrics.mean_absolute_error(test_labels, gp_test_predictions))
print('Mean Squared Error:', gp_test_mse)
print('Root Mean Squared Error TEST_data:', np.sqrt(gp_test_mse))
print('Root Mean Squared Error TRAIN_data:', np.sqrt(gp_train_mse))

In [None]:
# SVR: parity plot (predicted vs. true) plus test/train errors.
a = plt.axes(aspect='equal')
a.set_title(label="SVR evaluation")
a.scatter(train_labels, svr_train_predictions, label='Train_data', color='orange')
a.scatter(test_labels, svr_test_predictions, label='Test_data', color='dodgerblue')
a.set_xlabel('True Values')
a.set_ylabel('Predictions')
lims = [1e-04, 1.2]
a.set_xlim(lims)
a.set_ylim(lims)
a.legend()
_ = a.plot(lims, lims)  # y = x reference line: perfect predictions fall on it

svr_test_mse = metrics.mean_squared_error(test_labels, svr_test_predictions)
svr_train_mse = metrics.mean_squared_error(train_labels, svr_train_predictions)

# The header used to read 'Gaussian Process' (copy-paste slip); these numbers
# belong to the SVR model.
print('SVR')
print('Mean Absolute Error:', metrics.mean_absolute_error(test_labels, svr_test_predictions))
print('Mean Squared Error:', svr_test_mse)
print('Root Mean Squared Error TEST_data:', np.sqrt(svr_test_mse))
print('Root Mean Squared Error TRAIN_data:', np.sqrt(svr_train_mse))

In [None]:
# AdaBoost: parity plot (predicted vs. true) plus test/train errors.
a = plt.axes(aspect='equal')
a.set_title(label="ADAboost evaluation")
a.scatter(train_labels, adaboost_train_predictions, color='orange', label='Train_data')
a.scatter(test_labels, adaboost_test_predictions, color='dodgerblue', label='Test_data')
a.set_xlabel('True Values')
a.set_ylabel('Predictions')

lims = [1e-04, 1.2]
a.set_xlim(lims)
a.set_ylim(lims)
a.legend()
_ = a.plot(lims, lims)  # y = x reference line

adaboost_test_mse = metrics.mean_squared_error(test_labels, adaboost_test_predictions)
adaboost_train_mse = metrics.mean_squared_error(train_labels, adaboost_train_predictions)

print('adaboost')
print('Mean Absolute Error:', metrics.mean_absolute_error(test_labels, adaboost_test_predictions))
print('Mean Squared Error:', adaboost_test_mse)
print('Root Mean Squared Error TEST_data:', np.sqrt(adaboost_test_mse))
print('Root Mean Squared Error TRAIN_data:', np.sqrt(adaboost_train_mse))

In [None]:
# Getting the metrics for the NN: parity plot plus test/train errors.

a = plt.axes(aspect='equal')
a.set_title(label="Neural Net evaluation")
a.scatter(train_labels, nn_train_predictions, color='orange', label='Train_data')
a.scatter(test_labels, nn_test_predictions, color='dodgerblue', label='Test_data')
a.set_xlabel('True Values')
a.set_ylabel('Predictions')

lims = [1e-04, 1.2]
a.set_xlim(lims)
a.set_ylim(lims)
a.legend()
_ = a.plot(lims, lims)  # y = x reference line

nn_test_mse = metrics.mean_squared_error(test_labels, nn_test_predictions)
nn_train_mse = metrics.mean_squared_error(train_labels, nn_train_predictions)

print('Neural Net')
print('Mean Absolute Error:', metrics.mean_absolute_error(test_labels, nn_test_predictions))
print('Mean Squared Error:', nn_test_mse)
print('Root Mean Squared Error:', np.sqrt(nn_test_mse))

print('Train Root Mean Squared Error:', np.sqrt(nn_train_mse))

Assess feature importance for random forest

In [None]:
# random forest for feature importance on a regression problem
#
# NOTE(review): everything below is wrapped in a triple-quoted string, so none
# of it executes - the cell is effectively disabled.
# Before re-enabling it:
#   * the snippet calls `pyplot.*`, but the import cell binds matplotlib.pyplot
#     to `plt`, so it would raise NameError as written;
#   * it re-creates and re-fits `rf_regressor` with the same settings as the
#     random-forest cell, overwriting that model and its predictions;
#   * it prints one line per feature, which may be a lot of output for wide
#     inputs - TODO confirm the feature count.
'''
rf_regressor = RandomForestRegressor(n_estimators=200, random_state=0)
rf_regressor.fit(train_features, train_labels)
rf_test_predictions = rf_regressor.predict(test_features)
rf_train_predictions = rf_regressor.predict(train_features)

# get importance
importance = rf_regressor.feature_importances_

# summarize feature importance
for i,v in enumerate(importance):
	print('Feature: %0d, Score: %.5f' % (i,v))
# plot feature importance
pyplot.figure(figsize=(10, 4))

pyplot.bar([x for x in range(len(importance))], importance)

pyplot.title(label="Feature Importance Evaluation")
pyplot.ylabel('Importance')
pyplot.xlabel('Features')

pyplot.show()
'''