In [None]:
# Importing Libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.offsetbox import AnchoredText
import matplotlib.colors as colors
from mpl_toolkits import mplot3d
from math import sqrt
import warnings

from sklearn.metrics import mean_squared_error,r2_score,mean_absolute_error
from sklearn.model_selection import train_test_split,KFold,cross_val_score,GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder,LabelEncoder, MinMaxScaler


from sklearn.linear_model import LinearRegression, ElasticNet
from sklearn.ensemble import RandomForestRegressor,AdaBoostRegressor,ExtraTreesRegressor,BaggingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.cross_decomposition import PLSRegression
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor

import tensorflow as tf
import keras
from keras import layers
from keras.models import Sequential
from keras.layers import Dense
from keras.models import load_model
from keras.wrappers.scikit_learn import KerasClassifier

import matplotlib.pyplot as plt
# Figure defaults: 10x10 canvas; saved figures are written as 600-dpi TIFF.
plt.rcParams.update({
    "figure.figsize": (10,10),
    'savefig.dpi': 600,
    "savefig.format": 'tiff',
})
# Suppress all warnings.
warnings.filterwarnings("ignore")

In [None]:
# Apply seaborn's white-grid style, then switch to the "paper" context with fonts scaled 2x.
sns.set(style='whitegrid')
sns.set_context("paper", font_scale=2)


In [None]:
# Learning Rate Scheduler
def scheduler(epoch, lr):
    """Return the learning rate to use for ``epoch``.

    The rate is passed through unchanged for epochs 0-159; from epoch 160
    onward the incoming rate is multiplied by exp(-0.1) on every call.
    """
    if epoch >= 160:
        return lr * np.exp(-0.1)
    return lr

# Keras callback that applies `scheduler` to the learning rate; passed to model.fit in training_model.
callback = keras.callbacks.LearningRateScheduler(scheduler)



# Neural Network
def Neural_network():
    """Build and compile the regression network.

    Architecture: Dense(64) -> Dropout(0.1) -> Dense(32) -> Dropout(0.1) -> Dense(1, linear).
    Hidden layers use LeakyReLU(alpha=0.3); every Dense layer shares one
    random-normal kernel initializer. The input width is read from the
    notebook-level ``scaled_DF`` at call time. Compiled with Adam and MSE loss.

    Returns:
        The compiled ``Sequential`` model.
    """
    init = keras.initializers.random_normal()

    model = Sequential()
    # Hidden blocks; only the first Dense layer declares the input dimension.
    for position, width in enumerate((64, 32)):
        extra = {'input_dim': scaled_DF.shape[1]} if position == 0 else {}
        model.add(layers.Dense(width,
                               activation=keras.layers.LeakyReLU(alpha=0.3),
                               kernel_initializer=init,
                               **extra))
        model.add(layers.Dropout(0.1))

    # Single linear output unit for the regression target.
    model.add(layers.Dense(1, kernel_initializer=init, activation='linear'))

    model.compile(optimizer=tf.keras.optimizers.Adam(), loss='mse')
    return model



In [None]:
# Function to train the model
def training_model(X_train,Y_train,model):
    """Fit ``model`` on the given data and return whatever ``model.fit`` returns.

    Training runs for 200 epochs with batch size 64 and no console output,
    with the notebook-level learning-rate ``callback`` attached.
    """
    fit_options = dict(epochs=200, batch_size=64, verbose=0, callbacks=[callback])
    return model.fit(X_train, Y_train, **fit_options)

In [None]:
# Plot predicted vs. actual test-set values and print the error metrics
def plots():
    """Draw a predicted-vs-actual regression plot for the test split and report metrics.

    Reads the notebook-level globals ``model``, ``scaler``, ``testX`` and ``testY``.
    Test targets and predictions are mapped back through ``scaler.inverse_transform``
    before MAE, MSE, RMSE and R^2 are printed. The figure is annotated with R^2 and
    MAE and saved under a name taken from characters 1-5 of ``str(model)``; the file
    format comes from the ``savefig.format`` rcParam set below.
    """
    f, ax = plt.subplots(1,1)

    # Undo the target scaling so metrics and axes use the original (unscaled) values.
    actual_test = np.array(scaler.inverse_transform(testY)).reshape(-1,1)
    predicted_test = np.array(scaler.inverse_transform(model.predict(testX).reshape(-1,1)))

    plt.rcParams["figure.figsize"] = (10,10)
    plt.rcParams['savefig.dpi'] = 600
    plt.rcParams["savefig.format"] = 'tiff'

    sns.set(style='whitegrid')
    sns.set_context("paper", font_scale=2)

    # x / y must be keywords: seaborn >= 0.12 accepts only `data` positionally.
    sns.regplot(x=actual_test.ravel(), y=predicted_test.ravel(), ax=ax, color='olivedrab',
                scatter_kws={'s':75, 'alpha':0.8, 'edgecolor':'black'})

    # Compute each metric once and reuse it for both the printout and the annotation.
    mae = mean_absolute_error(actual_test,predicted_test)
    mse = mean_squared_error(actual_test,predicted_test)
    r2 = r2_score(actual_test,predicted_test)

    print("Mean absolute error (MAE):      %f" % mae)
    print("Mean squared error (MSE):       %f" % mse)
    print("Root mean squared error (RMSE): %f" % sqrt(mse))
    print("R square (R^2):                 %f" % r2)

    ax.set_xlabel('BDE (Actual)')
    ax.set_ylabel('Predicted')

    anchored_text = AnchoredText("R\u00b2 Score_test = "+str(round(r2,3)) +'\n'"MAE = "+str(round(mae,2)), loc=2,prop=dict(size=15))
    ax.add_artist(anchored_text)

    plt.tight_layout()
    plt.savefig(str(model)[1:6], bbox_inches='tight')

In [None]:
def defining_model(x):
    """Return a new, unfitted regressor for the selector string ``x``.

    Args:
        x: Model key. Only 'rndmfrst' (RandomForestRegressor with default
            settings) is supported.

    Returns:
        A fresh estimator instance.

    Raises:
        ValueError: If ``x`` is not a supported key. Previously this case printed
            a message and then failed with UnboundLocalError on ``return model``.
    """
    if x == 'rndmfrst':
        return RandomForestRegressor()
    raise ValueError("wrong selection: %r (supported: 'rndmfrst')" % (x,))



In [None]:
# Load the train and test sets from Excel workbooks
train = pd.read_excel('train.xlsx')
test = pd.read_excel('test.xlsx')



In [None]:
# Combine train and test for feature engineering (train rows first, index renumbered).
# DF_data is a working copy, so DF_raw itself is left unmodified.
DF_raw = pd.concat([train,test],ignore_index=True)
DF_data = DF_raw.copy()

In [None]:
# Drop the 'Smiles_1' and 'inchi' columns. Deriving DF_data from DF_raw (instead of
# dropping in place) keeps this cell re-runnable: an in-place drop raises KeyError
# the second time because the columns are already gone.
DF_data = DF_raw.drop(columns=['Smiles_1','inchi'])

In [None]:

# Scaling the whole DataFrame
# Every column except the first is standardised with its own feature scaler.
# NOTE(review): both scalers are fit on train+test combined, so test-set statistics
# influence the scaling - confirm this is intended.
feature_columns = DF_data.iloc[:,1:]

feature_scaler = StandardScaler()
scaled_DF = pd.DataFrame(feature_scaler.fit_transform(feature_columns),
                         columns=feature_columns.columns)

# `scaler` is dedicated to the BDE target: later cells call scaler.inverse_transform
# to map predictions back to unscaled BDE, so it must not be shared with the features.
scaler = StandardScaler()
scaled_DF['BDE'] = scaler.fit_transform(np.array(DF_data['BDE']).reshape(-1,1))

display(scaled_DF)

In [None]:
# Separate the scaled target from the features: 'BDE' is removed from scaled_DF
# and kept as the single-column frame DF_target.
DF_target = scaled_DF.pop('BDE').to_frame()

display(DF_data, DF_target)

In [None]:
# The first len(train) rows are the training rows (train was concatenated first);
# the remainder is the test set.
trainX, testX = scaled_DF.iloc[:len(train)], scaled_DF.iloc[len(train):]

trainY, testY = DF_target.iloc[:len(train)], DF_target.iloc[len(train):]

In [None]:
# Show (rows, columns) of the training feature matrix
trainX.shape

In [None]:
# Random Forest
model = defining_model(x = 'rndmfrst')
# NOTE(review): no random_state is set, so the fold assignment differs between runs.
kfold = KFold(n_splits=5, shuffle=True)

scores = []
rmse = []
# The index arrays are deliberately NOT named `train`/`valid`: the original loop
# variable `train` overwrote the `train` DataFrame, which made len(train) wrong
# whenever the train/test slicing cell was re-run afterwards.
for train_idx, valid_idx in kfold.split(trainX,trainY):
  model.fit(trainX.iloc[train_idx],trainY.iloc[train_idx])
  scores.append(model.score(trainX.iloc[valid_idx],trainY.iloc[valid_idx]))
  actual = trainY.iloc[valid_idx]
  predicted = model.predict(trainX.iloc[valid_idx])
  # RMSE is reported on unscaled targets, hence the inverse transform on both sides.
  rmse.append(sqrt(mean_squared_error(scaler.inverse_transform(actual),scaler.inverse_transform(predicted.reshape(-1,1)))))

print("Average validation R2 score after crossvalidation : ", np.mean(scores))
print("Average validation rmse score after crossvalidation : ", np.mean(rmse))


# Train model on whole train data
model = defining_model(x = 'rndmfrst')
model.fit(trainX,trainY)
plots()

In [None]:

# Feature importances of the fitted model
importance = model.feature_importances_

# Tabulate feature name vs. importance, highest first, and export the table
dicts = {'Feature': list(trainX.columns), 'Importance': importance}
DF_imp = pd.DataFrame(dicts).sort_values('Importance', ascending=False)
DF_imp.to_excel('imp.xlsx', index=None)

# Bar chart of the six highest-ranked features
plt.bar('Feature', 'Importance', data=DF_imp.head(6))
plt.xticks(rotation=90)
plt.show()

In [None]:
# NOTE(review): random_state=None means the fold assignment differs between runs.
kfold = KFold(n_splits=5, shuffle=True, random_state=None)

# Not referenced anywhere else in this cell.
best_score = 0.1

scores = []
rmse = []
# The index arrays are named train_idx/valid_idx so the `train` DataFrame is not overwritten.
for train_idx, valid_idx in kfold.split(trainX,trainY):
  # Build a fresh network for every fold. Re-using one model across folds keeps
  # the weights learned on earlier folds, whose training data contains this
  # fold's validation rows, so the validation scores would be optimistic.
  model = Neural_network()
  training_model(trainX.iloc[train_idx],trainY.iloc[train_idx],model)

  actual = trainY.iloc[valid_idx]
  predicted = model.predict(trainX.iloc[valid_idx])  # predict once, reuse for both metrics
  scores.append(r2_score(actual,predicted))
  # RMSE is reported on unscaled targets, hence the inverse transform on both sides.
  rmse.append(sqrt(mean_squared_error(scaler.inverse_transform(actual),scaler.inverse_transform(predicted))))

print("Average validation R2 score after crossvalidation : ", np.mean(scores))
print("Average validation rmse score after crossvalidation : ", np.mean(rmse))

# Train model on whole train data

model = Neural_network()
training_model(trainX,trainY,model)

# Score the final model on the training data, in unscaled target units.
actual=np.array(scaler.inverse_transform(np.array(trainY)))
predicted=np.array(scaler.inverse_transform(model.predict(np.array(trainX)).reshape(-1,1)))
model.save("nn.h5")
score = r2_score(actual,predicted)
print("\n\nTraining Accuracy : ",score) # R^2 on the training set
plots()

In [None]:
# Reload the network that the training cell saved to 'nn.h5'.
# NOTE(review): the network passes LeakyReLU layer instances as activations - confirm
# load_model can deserialize them without custom_objects in the installed Keras version.
model = load_model('nn.h5')

In [None]:
# Regenerate the test-set plot and metrics; plots() reads the notebook-level `model`.
plots()