In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
import joblib

In [None]:
import sys

# Add the parent directory to the module search path so the `src` package can be imported.
sys.path.append("../")
# NOTE(review): the star import hides where later names come from. model_train,
# grid_performance, grid_test, randint, uniform, plt and sns are used below without
# any explicit import, so they are presumably provided here — confirm.
from src.model import *

# Directory prefix prepended to the file names passed to joblib.dump further down.
model_path = "../models/"

### Model training

Here we choose random forest and XGBoost as our models for the prediction of <sup>27</sup>Al C<sub>Q</sub>. We use RandomizedSearchCV from scikit-learn to select the hyperparameters and perform 10-fold cross-validation. The resulting models are tested on a stand-alone test set consisting of 1617 Al sites.

Four models are trained here:
1. Baseline model with purely structure-based features.
2. Improved model with structural + elemental features.
3. Test model with SMOTE rebalancing.
4. XGBoost model.

The output models are saved in `../models/`.

Reload features from data/processed

In [None]:
# Read the complete feature/target table from data/processed and preview its first rows.
path = "../data/processed/nmr_param_and_features.csv"
with open(path, mode="r") as csv_file:
    nmr_struc_data = pd.read_csv(csv_file)
nmr_struc_data.head(5)

In [None]:
# Load the pre-split training table.
path_train = "../data/processed/nmr_param_and_features_train.csv"
with open(path_train, mode="r") as train_csv:
    nmr_struc_data_train = pd.read_csv(train_csv)

# Load the pre-split test table.
path_test = "../data/processed/nmr_param_and_features_test.csv"
with open(path_test, mode="r") as test_csv:
    nmr_struc_data_test = pd.read_csv(test_csv)

In [None]:
# Features: every column from "fbl_average" through the last column of each table.
X_train = nmr_struc_data_train.loc[:, "fbl_average":]
X_test = nmr_struc_data_test.loc[:, "fbl_average":]

# Targets: the "CQ" and "is_O" columns of the same tables.
y_train = nmr_struc_data_train.loc[:, ["CQ", "is_O"]]
y_test = nmr_struc_data_test.loc[:, ["CQ", "is_O"]]

#### I. Baseline model with only structural based features

In [None]:
# NOTE(review): dead cell — every line below is commented out. It would split the full
# table 80/20, whereas the cells that follow build the train and test sets from the
# pre-split CSV tables; presumably kept for reference only.
# # split y and x
# y = nmr_struc_data[["CQ", "is_O"]]
# x = nmr_struc_data.loc[:, "fbl_average":"DI"]

# X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=5)

# print(f"Size of train set: {len(X_train)}\nSize of test set: {len(X_test)}")

In [None]:
# Build the structure-only training set from the pre-split training table.
# Targets are the "CQ" and "is_O" columns; features are the columns from
# "fbl_average" through "DI".
y = nmr_struc_data_train[["CQ", "is_O"]]
x = nmr_struc_data_train.loc[:, "fbl_average":"DI"]

# Shuffle reproducibly while keeping every row. Using
# train_test_split(..., test_size=1) as a shuffle moves one row into the discarded
# "test" part, because an integer test_size is an absolute sample count.
X_train = x.sample(frac=1, random_state=5)
# x and y are slices of the same table, so selecting by X_train.index keeps the rows
# aligned (assumes a unique index, as with the default index produced by read_csv).
y_train = y.loc[X_train.index]

print(f"Size of train set: {len(X_train)}\nSize of test set: {len(X_test)}")

In [None]:
# Build the structure-only test set from the pre-split test table.
# Targets are the "CQ" and "is_O" columns; features are the columns from
# "fbl_average" through "DI".
y = nmr_struc_data_test[["CQ", "is_O"]]
x = nmr_struc_data_test.loc[:, "fbl_average":"DI"]

# Keep every test row. Using train_test_split(..., test_size=1) as a shuffle drops
# one site from the evaluation set, because an integer test_size is an absolute
# sample count.
X_test = x.sample(frac=1, random_state=5)
# x and y are slices of the same table, so selecting by X_test.index keeps the rows
# aligned (assumes a unique index, as with the default index produced by read_csv).
y_test = y.loc[X_test.index]

print(f"Size of train set: {len(X_train)}\nSize of test set: {len(X_test)}")

In [None]:
# Random-forest hyperparameter space handed to the randomized search:
# integer distributions for the tree-size settings, a fixed list for max_features.
param = dict(
    [
        ("n_estimators", randint(low=10, high=1000)),
        ("max_depth", randint(low=10, high=50)),
        ("min_samples_split", randint(low=2, high=10)),
        ("min_samples_leaf", randint(low=1, high=8)),
        ("max_features", [None, "sqrt", "log2"]),
    ]
)

grid = model_train(X_train, y_train, "randomforest", param)

In [None]:
# print the model's performance
# grid_performance is presumably provided by the star import from src.model; it is
# given the search object returned by model_train in the previous cell.
grid_performance(grid)

In [None]:
# plot model's performance over the test set
# grid_test is presumably a project helper from src.model; plot=True presumably
# switches on its figure output — confirm against the helper's definition.
grid_test(X_test, y_test, grid, plot=True)

In [None]:
# Persist the best estimator found by the search under the models directory.
joblib.dump(value=grid.best_estimator_, filename=model_path + "struc.pkl")

#### II. Improved model with structural+elemental features

In [None]:
# Targets ("CQ", "is_O") and the full feature set: every column from "fbl_average" onwards.
# NOTE(review): unlike section I, this draws a fresh random 80/20 split from the full
# table instead of using the pre-split train/test tables — confirm this is intended.
y = nmr_struc_data.loc[:, ["CQ", "is_O"]]
x = nmr_struc_data.loc[:, "fbl_average":]

X_train, X_test, y_train, y_test = train_test_split(
    x, y, random_state=5, test_size=0.2
)

print(f"Size of train set: {len(X_train)}\nSize of test set: {len(X_test)}")

In [None]:
# Random-forest hyperparameter space handed to the randomized search:
# integer distributions for the tree-size settings, a fixed list for max_features.
param = dict(
    [
        ("n_estimators", randint(low=10, high=1000)),
        ("max_depth", randint(low=10, high=50)),
        ("min_samples_split", randint(low=2, high=10)),
        ("min_samples_leaf", randint(low=1, high=8)),
        ("max_features", [None, "sqrt", "log2"]),
    ]
)

grid = model_train(X_train, y_train, "randomforest", param)

In [None]:
# print the model's performance
# grid_performance is presumably provided by the star import from src.model; it is
# given the search object returned by model_train in the previous cell.
grid_performance(grid)

In [None]:
# plot model's performance over the test set
# grid_test is presumably a project helper from src.model; plot=True presumably
# switches on its figure output — confirm against the helper's definition.
grid_test(X_test, y_test, grid, plot=True)

In [None]:
# Persist the best estimator found by the search under the models directory.
joblib.dump(value=grid.best_estimator_, filename=model_path + "struc+ele.pkl")

#### III. Test model with SMOTE rebalance

In [None]:
# Targets are the "CQ" and "is_O" columns; features are every column from
# "fbl_average" onwards.
y = nmr_struc_data[["CQ", "is_O"]]
x = nmr_struc_data.loc[:, "fbl_average":]
# x = data_nocollinear

# Random 80/20 train/test split with a fixed seed. train_test_split is already
# imported in the notebook's first cell, so it is not re-imported here.
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=5)

print(f"Size of train set: {len(X_train)}\nSize of test set: {len(X_test)}")

In [None]:
# resample the dataset using SMOTE
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline

# Resampling is driven by the "is_O" label. "CQ" is appended to the features so it is
# resampled together with them and can be split off again afterwards.
# NOTE: this cell overwrites X_train / y_train with their resampled versions, so
# re-running it without re-running the split cell resamples already-resampled data.
train = pd.concat([X_train, y_train["CQ"]], axis=1)
label = y_train["is_O"]

# Fixed seeds make the synthetic samples and the discarded rows reproducible; the
# value matches the random_state=5 used for the train/test split.
# Assuming a binary label, a float sampling_strategy is the target minority/majority
# ratio: oversample to 0.75 first, then undersample the majority down to 1.0.
over = SMOTE(sampling_strategy=0.75, random_state=5)
under = RandomUnderSampler(sampling_strategy=1.0, random_state=5)
steps = [("o", over), ("u", under)]
pipeline = Pipeline(steps=steps)

train, label = pipeline.fit_resample(train, label)
# Split the resampled table back into targets ("CQ" + "is_O") and features.
y_train = pd.concat([train["CQ"], label], axis=1)
X_train = train.drop(columns=["CQ"])

In [None]:
# After resampling the data is balanced: histogram of CQ, coloured by is_O.
balanced_train = pd.concat([X_train, y_train], axis=1)
plt.figure(figsize=(10, 6))
sns.histplot(data=balanced_train, x="CQ", hue="is_O")
plt.show()

In [None]:
# Random-forest hyperparameter space handed to the randomized search:
# integer distributions for the tree-size settings, a fixed list for max_features.
param = dict(
    [
        ("n_estimators", randint(low=10, high=1000)),
        ("max_depth", randint(low=10, high=50)),
        ("min_samples_split", randint(low=2, high=10)),
        ("min_samples_leaf", randint(low=1, high=8)),
        ("max_features", [None, "sqrt", "log2"]),
    ]
)

grid = model_train(X_train, y_train, "randomforest", param)

In [None]:
# print the model's performance
# grid_performance is presumably provided by the star import from src.model; it is
# given the search object returned by model_train in the previous cell.
grid_performance(grid)

In [None]:
# plot model's performance over the test set
# grid_test is presumably a project helper from src.model. Only this section passes
# is_O=True; its effect is defined in that helper and not visible here — confirm.
grid_test(X_test, y_test, grid, plot=True, is_O=True)

In [None]:
# Persist the best estimator found by the search under the models directory.
joblib.dump(value=grid.best_estimator_, filename=model_path + "smote.pkl")

#### IV. XGBoost

In [None]:
# Targets are the "CQ" and "is_O" columns; features are every column from
# "fbl_average" onwards.
y = nmr_struc_data[["CQ", "is_O"]]
x = nmr_struc_data.loc[:, "fbl_average":]
# x = data_nocollinear

# Random 80/20 train/test split with a fixed seed. train_test_split is already
# imported in the notebook's first cell, so it is not re-imported here.
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=5)

print(f"Size of train set: {len(X_train)}\nSize of test set: {len(X_test)}")

In [None]:
# define the param space for randomized search
# "eta" is XGBoost's documented alias for "learning_rate". Sampling both keys
# independently would give every candidate two conflicting step sizes, so only
# "learning_rate" is searched.
# NOTE(review): uniform/randint come from the star import and are presumably
# scipy.stats distributions, i.e. uniform(loc, scale) covers [loc, loc + scale] — confirm.
# If so, "subsample" and "colsample_bytree" can be drawn arbitrarily close to 0, while
# XGBoost expects them in (0, 1]; consider raising their lower bound.
param = {
    "learning_rate": uniform(0, 1),
    "max_depth": randint(3, 50),
    "min_child_weight": randint(1, 10),
    "gamma": uniform(0, 1),
    "reg_alpha": [1e-5, 1e-2, 0.1, 1, 100],
    "subsample": uniform(0, 1),
    "colsample_bytree": uniform(0, 1),
}

grid = model_train(X_train, y_train, "XGboost", param)

In [None]:
# print the model's performance
# grid_performance is presumably provided by the star import from src.model; it is
# given the search object returned by model_train in the previous cell.
grid_performance(grid)

In [None]:
# plot model's performance over the test set
# grid_test is presumably a project helper from src.model; plot=True presumably
# switches on its figure output — confirm against the helper's definition.
grid_test(X_test, y_test, grid, plot=True)

In [None]:
# Persist the best estimator found by the search under the models directory.
joblib.dump(value=grid.best_estimator_, filename=model_path + "xgboost.pkl")