In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt #data visualization
import seaborn as sns
from sklearn.model_selection import train_test_split # Split data to train and test data(after merging in this case)
from sklearn.metrics import accuracy_score,classification_report, confusion_matrix 
import warnings
# Suppress every Python warning for the rest of the session. This is a blanket
# filter, so it also hides any warnings raised by code in later cells.
warnings.filterwarnings('ignore')


# import libraries from tensorflow
from tensorflow import keras
import tensorflow as tf
from keras import backend as K
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Flatten, Dense, Dropout,Input, BatchNormalization, LeakyReLU, ReLU
from tensorflow.keras import layers
from tensorflow.keras import optimizers
from tensorflow.keras.utils import plot_model
from tensorflow.keras.optimizers import Adam, SGD
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.initializers import HeUniform

In [2]:
# Lift the row limit so displayed DataFrames are never truncated.
pd.set_option('display.max_rows', None)

# Read the training spreadsheet and discard the 'References' column in one step.
df = pd.read_excel("TrainingDataset.xlsx").drop(columns=['References'])


In [3]:
# Data imputation: fill gaps in the numeric training features with column means.
from sklearn.impute import SimpleImputer

# Columns the author noted as having missing values (kept for reference;
# nothing in this cell reads it).
missing_columns = ['Activator (equiv.)', 'Time (h)']

# Every numeric feature is run through the imputer, not only the two above.
numerical_columns = [
    'Acid (equiv.)',
    'Amine (equiv.)',
    'Activator (equiv.)',
    'Base (equiv.)',
    'Global Conc (M)',
    'Temp (C)',
    'Time (h)',
]

# Mean-strategy imputer; after fitting it holds the training-set column means.
imputer_num = SimpleImputer(strategy='mean')

# Fit on the numeric slice and write the filled values back into the same columns.
numeric_features = df[numerical_columns]
df[numerical_columns] = imputer_num.fit_transform(numeric_features)

In [4]:
# Separate the target column from the feature matrix.
y_train = df['Reaction_Yield']
X_train = df.drop('Reaction_Yield', axis=1)

In [5]:
# Load the test features that will be scored by the pickled models.
X_test_update = pd.read_excel("TestData.xlsx")

# The mean imputer was fitted on the training data but never applied to the test
# data, so a missing numeric value here would reach the models un-imputed.
# Reuse the training-set means (transform only, no re-fit) when there is anything to fill;
# a test file without gaps is left exactly as loaded.
# NOTE(review): assumes TestData.xlsx carries the same numeric feature columns as
# the training sheet - confirm against the file.
if X_test_update[numerical_columns].isna().any().any():
    X_test_update[numerical_columns] = imputer_num.transform(X_test_update[numerical_columns])

## Categorical encoding: fitted on X_train, then applied to the test data

In [7]:
from sklearn.preprocessing import LabelEncoder
import numpy as np

# List of categorical columns
categorical_columns = ['Acid (name)', 'Amine (name)', 'Activator (name)', 'Base (name)', 'Solvent']

# Fitted encoder for each categorical column, keyed by column name
label_encoders = {}

# Encode every categorical column: the encoder is fitted on the training data only,
# then the same label -> integer mapping is applied to the test data. Categories that
# occur only in the test data receive new codes placed after the training codes.
for col in categorical_columns:
    label_encoder = LabelEncoder()

    # Fit on the training column and replace it with its integer codes
    X_train[col] = label_encoder.fit_transform(X_train[col])

    # Store the label encoder for later use
    label_encoders[col] = label_encoder

    # Label -> code lookup for the categories seen during fitting. A label's code is
    # its position in classes_, which is what LabelEncoder.transform returns.
    known_classes = label_encoder.classes_
    code_by_label = {label: code for code, label in enumerate(known_classes)}

    # Categories present only in the test data get fresh codes after the known ones,
    # numbered in order of first appearance (Series.unique preserves that order).
    test_values = X_test_update[col]
    unseen_labels = test_values[~test_values.isin(known_classes)].unique()
    for offset, label in enumerate(unseen_labels):
        code_by_label[label] = len(known_classes) + offset

    # Encode the test column in one pass over the ORIGINAL labels. Mapping from the
    # raw values (rather than rewriting the column twice) means an already-assigned
    # integer code can never be mistaken for a raw label, and it avoids calling
    # transform() once per row. astype fails loudly if any value was left unmapped.
    X_test_update[col] = test_values.map(code_by_label).astype('int64')

    # Extend classes_ with the unseen labels so that position i in classes_ still
    # corresponds to code i for every code handed out above.
    label_encoder.classes_ = np.append(known_classes, unseen_labels)


## Loading all Pickled Models

In [8]:
import pickle
from sklearn.metrics import mean_squared_error, r2_score


def _load_pickle(path):
    """Open ``path`` in binary mode and return the object unpickled from it."""
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# NOTE: unpickling can execute code stored in the file - only load model
# artifacts that come from a trusted source.

# Representation-learning model combined with Gradient Boosting
loaded_regressor = _load_pickle('rlgb_optuna.pkl')

# Encoder model used to generate latent representations of the data
loaded_encoder = _load_pickle('encoder_rlgb_optuna.pkl')

## Predicting on Test Data

In [9]:
# Extract learned representations from the encoder.
# NOTE(review): the whole X_test_update frame is passed as-is, so its columns
# presumably have to match what the encoder was trained on - confirm against
# the training notebook.
X_test_encoded = loaded_encoder.predict(X_test_update)

# Feed the encoded features to the loaded regressor to get the test-set predictions.
y_pred_loaded = loaded_regressor.predict(X_test_encoded)

