# Benchmarking model-based design of experiment approaches with a pharmaceutical crystallisation emulator

**Importing the packages**

In [None]:
from obsidian import Campaign, ParamSpace, Target
from obsidian.parameters import Param_Categorical, Param_Ordinal, Param_Continuous, Param_Discrete
import pandas as pd
import io
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
from joblib import load
from joblib import dump

**Define parameters and targets for optimisation**

In [None]:
# Design variables of the crystallisation process. The order of this list is
# significant: it fixes the order of `param_column_names`, which is used to
# select the feature columns from the dataset.
# NOTE(review): units are not stated here, and "AS" presumably stands for
# anti-solvent -- confirm against the dataset documentation.
params = [
    Param_Continuous('Initial_temperature', 34, 40),
    Param_Continuous('Seed_load', 0, 2.625),
    # Only two seed sizes are allowed; they are given as string labels.
    Param_Ordinal('Seed_mean', ['46.94', '55.94']),
    Param_Continuous('Final_temperature', 10, 25),
    Param_Continuous('Cooling_rate', 0.1, 0.5),
    Param_Continuous('AS_end_frac', 0.45, 0.9),
    Param_Continuous('AS_rate_mL_min', 3, 10),
]

X_space = ParamSpace(params)
param_column_names = [param.name for param in params]

# Two objectives: minimise d90 and maximise Yield.
target = [Target('d90', aim='min'), Target('Yield', aim='max')]
target_column_names = [objective.name for objective in target]

campaign = Campaign(X_space, target)

**Loading the simulated data**

In [None]:
def load_data(path=r"file_location.xlsx"):
    """Read the simulated dataset from an Excel workbook.

    Parameters
    ----------
    path : str or path-like, optional
        Location of the Excel file. Defaults to the placeholder
        ``"file_location.xlsx"``, so existing ``load_data()`` calls keep
        working; pass the real location of the dataset instead of editing
        this function.

    Returns
    -------
    pandas.DataFrame
        The contents of the workbook as returned by ``pandas.read_excel``.
    """
    return pd.read_excel(path)

**Random forest model**

This is specific to the optimisation problem and therefore needs to be run whenever inputs and outputs are changed.

In [None]:
def preprocess_data(data):
    """Split the dataset into scaled train/test features and raw targets.

    The feature columns are those named in the module-level
    ``param_column_names`` and the target columns those named in
    ``target_column_names``.

    Parameters
    ----------
    data : pandas.DataFrame
        Table containing every parameter and target column.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test, scaler)`` where the feature
        arrays are standardised and ``scaler`` is the fitted
        ``StandardScaler``.
    """
    features = data[param_column_names].to_numpy()
    responses = data[target_column_names].to_numpy()

    # 80/20 split with a fixed seed so the partition is reproducible.
    X_train, X_test, y_train, y_test = train_test_split(
        features, responses, test_size=0.2, random_state=42
    )

    # Fit the scaler on the training rows only, then apply the same
    # transformation to both partitions.
    scaler = StandardScaler().fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)

    return X_train, X_test, y_train, y_test, scaler

def build_model():
    """Return an unfitted random forest regressor (100 trees, seed 42)."""
    return RandomForestRegressor(n_estimators=100, random_state=42)

def main():
    """Train the random forest emulator and report its test-set error.

    Loads the dataset, preprocesses it, fits the model on the training
    partition and prints the mean squared and mean absolute errors on the
    test partition.

    Returns
    -------
    tuple
        ``(model, scaler)``: the fitted regressor and the fitted feature
        scaler needed to prepare new inputs for it.
    """
    X_train, X_test, y_train, y_test, scaler = preprocess_data(load_data())

    model = build_model()
    model.fit(X_train, y_train)

    # Score on the held-out rows; both metrics use scikit-learn's default
    # aggregation over the target columns.
    test_predictions = model.predict(X_test)
    mse = mean_squared_error(y_test, test_predictions)
    mae = mean_absolute_error(y_test, test_predictions)
    print(f"Test MSE: {mse}, Test MAE: {mae}")

    return model, scaler

if __name__ == "__main__":
    # Train once, keep the fitted objects as module-level names, and persist
    # both to disk so the emulator can be reloaded without retraining.
    model, scaler = main()
    dump(value=model, filename='model.joblib')
    dump(value=scaler, filename='scaler.joblib')

**Calling the emulator**

Transform the data for use with the RF model, then predict values for your dataframe of experiments.

In [None]:
# Apply the fitted `scaler` to the new experiments, then predict the targets
# with the fitted `model`.
# NOTE(review): `new_X` is not defined in this section -- presumably it holds
# the experiments to evaluate, with columns in the same order as
# `param_column_names`; confirm before running.
new_X_scaled = scaler.transform(new_X)

# NOTE(review): the prediction columns presumably follow
# `target_column_names`, the order used when fitting -- confirm.
predictions = model.predict(new_X_scaled)