In [None]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from scipy.stats import uniform

import numpy as np
import pandas as pd

import xgboost as xgb

from rdkit import Chem
from rdkit.Chem import Descriptors

In [8]:
df = pd.read_csv("aqsoldb.csv")

Let's create a list of dictionaries, where each list item is the dictionary of chemical descriptors describing a compound.

In [None]:
# creating list of molecules in dataset using RDKit and SMILES
mol_list = []

for i in df.SMILES:
    molecule = Chem.MolFromSmiles(i)
    mol_list.append(molecule)


# creating descriptors of all molecules
complete_mol_desc = []

for molecule in mol_list:
    mol_desc = {}

    for name, function in Descriptors._descList: # Descriptors._descList provides list of all descriptors in RDKit Library
        # try-catch in case the descriptor fails to produce a value
        try:
            desc_value = function(molecule)
        
        except:
            import traceback
            traceback.print_exc()

            desc_value = None

        mol_desc[name] = desc_value
    
    complete_mol_desc.append(mol_desc)



In [10]:
df_desc = pd.DataFrame(complete_mol_desc)
df_desc = df_desc.assign(Solubility = df.Solubility) # adding column of solubility values from AqSolDB to dataframe of descriptors

inf_locations = np.where(df_desc.values >= np.finfo(np.float32).max) # locating infinite values in dataframe
for i in inf_locations[0]: # replacing infinite values with None
    for j in inf_locations[1]:
        df_desc.iat[i, j] = None

df_desc = df_desc.dropna()

We'll now convert this list of dictionaries into a pandas dataframe, then clean/process this data.

In [16]:
x = df_desc.drop(['Solubility'], axis=1)
y = df_desc['Solubility']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
print(f"{len(x_train)} compounds in training set.")
print(f"{len(x_test)} compounds in test set.")

7272 compounds in training set.
1819 compounds in test set.


Let's train the XGBoost model, using Randomised Search for hyperparameter tuning.

In [None]:
# defining hyperparameters and distributions for randomised search
param_distributions = {
    'n_estimators': [10, 15, 20, 30, 40, 50, 75, 100],
    'max_depth': [5, 10, 20],
    'min_child_weight': [ 1, 3, 5, 7 ],
    'eta': uniform(0, 1),
    'gamma': uniform(0, 0.5),
    'colsample_bytree': uniform(0.3, 1)
}

# creating model
model = xgb.XGBRegressor()

# performing randomised search
random_search = RandomizedSearchCV(model, param_distributions, cv=2, n_iter=200, scoring='r2', random_state=42)
random_search.fit(x_train, y_train)

cv_results = random_search.cv_results_

In [None]:
print(f"Best parameters: {random_search.best_params_}")
print(f"Best R^2: {random_search.best_score_}\n")

for key in cv_results:
    print(key)
    
average_score_time = np.mean(cv_results['mean_score_time'])
print(f"\nMean inference time is {average_score_time} seconds.")

In [13]:
predictions = model.predict(x_test)

In [14]:
r2 = r2_score(y_test, predictions)
rmse = np.sqrt(mean_squared_error(y_test, predictions))
print(r2)
print(rmse)

0.8724268056602865
0.8412860268678642


The following code was an implementation of randomised search with 3-fold cross-validation for hyperparameter tuning.

In [15]:
'''
# defining hyperparameters and distributions for randomised search
param_distributions = {
    'n_estimators': [10, 50, 100, 200],
    'max_depth': [ 3, 4, 5, 6, 8, 10, 12, 15],
    'min_child_weight': [ 1, 3, 5, 7 ],
    'eta': uniform(0, 1),
    'gamma': uniform(0, 0.5),
    'colsample_bytree': uniform(0.3, 1)
}

# creating model
model = xgb.XGBRegressor()

# performing randomised search
random_search = RandomizedSearchCV(model, param_distributions, cv=3, n_iter=200, scoring='r2', random_state=42)

random_search.fit(x_train, y_train)
print(random_search.best_params_)
'''

"\n# defining hyperparameters and distributions for randomised search\nparam_distributions = {\n    'n_estimators': [10, 50, 100, 200],\n    'max_depth': [ 3, 4, 5, 6, 8, 10, 12, 15],\n    'min_child_weight': [ 1, 3, 5, 7 ],\n    'eta': uniform(0, 1),\n    'gamma': uniform(0, 0.5),\n    'colsample_bytree': uniform(0.3, 1)\n}\n\n# creating model\nmodel = xgb.XGBRegressor()\n\n# performing randomised search\nrandom_search = RandomizedSearchCV(model, param_distributions, cv=3, n_iter=200, scoring='r2', random_state=42)\n\nrandom_search.fit(x_train, y_train)\nprint(random_search.best_params_)\n"