In [None]:
## Import necessary packages
import pandas as pd
import numpy as np
import ipywidgets as widgets
from IPython.display import display
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import math

In [None]:
# Load the full batch dataset into a DataFrame (path is relative to the notebook's working directory)
data = pd.read_csv('Mendeley_data/100_Batches_IndPenSim_V3.csv') 
# Load the accompanying summary statistics file into its own DataFrame
data_summary = pd.read_csv('Mendeley_data/100_Batches_IndPenSim_Statistics.csv')

In [None]:
# Column names, dtypes and non-null counts of the summary statistics table
data_summary.info()

In [None]:
# Display the summary statistics table
data_summary

In [None]:
# Split the dataframe into two parts at column "2400"
# The first part contains process variables, the second part contains Raman spectra
# Positional slicing replaces np.split here: np.split on a DataFrame goes through
# the deprecated DataFrame.swapaxes path and round-trips the frame through a single
# ndarray, whereas .iloc is the supported pandas API and keeps each column's dtype.
# Assumes the label "2400" occurs exactly once, so get_loc returns an integer position.
raman_start = data.columns.get_loc("2400")
variable_raman_data = [data.iloc[:, :raman_start], data.iloc[:, raman_start:]]

In [None]:
# Process variables are the first element of the split
# The two headers below appear to be attached to each other's columns in the
# original dataset, so their names are exchanged in a single rename
pat_control_col = '2-PAT control(PAT_ref:PAT ref)'
batch_ref_col = 'Batch reference(Batch_ref:Batch ref)'
swapped_names = {pat_control_col: batch_ref_col, batch_ref_col: pat_control_col}
variable_data = variable_raman_data[0].rename(columns=swapped_names)

In [None]:
# Column names, dtypes and non-null counts of the process-variable table
variable_data.info()

In [None]:
# Display the process-variable table
variable_data

In [None]:
# Descriptive statistics (count, mean, std, quantiles) of the numeric process variables
variable_data.describe()

In [None]:
# Hypothesis (unconfirmed): rows with Batch ID == 0 are time points at which no fault was introduced.
# Check it by testing whether every such row also carries Fault flag == 0.
is_batch_id_zero = variable_data['Batch ID'].eq(0)
batch_id_zero = variable_data.loc[is_batch_id_zero]
batch_id_zero['Fault flag'].eq(0).all()

In [None]:
# Column labels of the process-variable table (a pandas Index, not a plain list)
variable_list = variable_data.columns

In [None]:
# Display all process-variable column names
variable_list

In [None]:
# Missing values

## Only 5 off-line measurement variables contain missing values:
## 1. PAA concentration offline(PAA_offline:PAA (g L^{-1}))
## 2. NH_3 concentration off-line(NH3_offline:NH3 (g L^{-1}))
## 3. Offline Penicillin concentration(P_offline:P(g L^{-1}))
## 4. Offline Biomass concentratio(X_offline:X(g L^{-1}))
## 5. Viscosity(Viscosity_offline:centPoise)

## The missingness is due to the measurement frequency: these offline variables are measured only every 12 h and are reported with a pre-defined delay (4 h). See Table 1 in the paper.
## e.g., the first measurement of Offline Penicillin concentration (P_offline) is at 1.0 h, which corresponds to the Penicillin concentration (P) at 0.2 h.
## The second measurement of P_offline is at 12.0 h, corresponding to P at 11.2 h.
## NOTE(review): both examples show a 0.8 h offset, not the 4 h delay quoted above -- TODO confirm against the paper and the data.

In [None]:
# Columns excluded from the per-variable plots (flags, references and identifiers)
categorical_vars = [
    'Fault reference(Fault_ref:Fault ref)',
    '0 - Recipe driven 1 - Operator controlled(Control_ref:Control ref)',
    '1- No Raman spec',
    ' 1-Raman spec recorded',
    'Batch reference(Batch_ref:Batch ref)',
    '2-PAT control(PAT_ref:PAT ref)',
    'Batch ID',
    'Fault flag',
]

# Every remaining column, in the original column order
variable_list_new = list(filter(lambda name: name not in categorical_vars, variable_list))

In [None]:
dropdown_update = widgets.Dropdown(options=variable_list_new, description='Variable:', value='Penicillin concentration(P:g/L)')
output_update = widgets.Output()


def _draw_variable(variable_name):
    """Redraw the time series of ``variable_name`` (one line per batch) inside ``output_update``."""
    with output_update:
        # wait=True postpones clearing until the new figure arrives, which avoids flicker
        output_update.clear_output(wait=True)
        fig, ax = plt.subplots(figsize=(8, 6))
        sns.lineplot(data=variable_data, x='Time (h)', y=variable_name, hue='Batch reference(Batch_ref:Batch ref)', palette="Set1", legend=False, ax=ax)
        ax.set_title(f'Variable: {variable_name}')
        plt.show()


def update_plot(change):
    """Observer for the dropdown's ``value`` trait.

    Parameters
    ----------
    change : change notification passed by ``observe``; its ``new`` attribute
        is the newly selected variable name.
    """
    _draw_variable(change.new)


dropdown_update.observe(update_plot, names='value')
display(dropdown_update, output_update)
# The observer only fires when the selection changes, so the default selection
# has to be drawn explicitly; otherwise the output area starts out empty.
_draw_variable(dropdown_update.value)

In [None]:
# Configuration
n_cols = 4
n_plots_per_fig = 30

# Filter out 'Time (h)' from the variable list for plotting (it is the x axis of every panel)
variable_list_new_filtered_time = [var for var in variable_list_new if var != 'Time (h)']

# One figure per chunk of at most n_plots_per_fig variables
for start in range(0, len(variable_list_new_filtered_time), n_plots_per_fig):
    # Slicing past the end of a list is safe, so no explicit upper bound is needed
    vars_subset = variable_list_new_filtered_time[start:start + n_plots_per_fig]
    n_rows = math.ceil(len(vars_subset) / n_cols)
    # squeeze=False always yields a 2-D array of axes, so flatten() works for any grid shape
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(20, 5 * n_rows), squeeze=False)
    axes = axes.flatten()
    # One panel per variable, one line per batch (legend suppressed: too many batches to list)
    for ax, var in zip(axes, vars_subset):
        sns.lineplot(data=variable_data, x='Time (h)', y=var, hue='Batch reference(Batch_ref:Batch ref)', palette="Set1", legend=False, ax=ax)
        ax.set_title(var, fontsize=12)
        ax.tick_params(labelsize=8, axis='x')
    # Hide the grid cells left over when the chunk does not fill the last row
    for ax in axes[len(vars_subset):]:
        ax.set_visible(False)
    fig.tight_layout(rect=[0, 0, 1, 0.95])
    plt.show()
    # Release the figure so that looping over many chunks does not accumulate open figures
    plt.close(fig)

In [None]:
# End-of-batch penicillin concentration: order rows from latest to earliest time,
# then take the first non-null value of each column within every batch
rows_latest_first = variable_data.sort_values(by="Time (h)", ascending=False)
latest_values_per_batch = rows_latest_first.groupby(['Batch reference(Batch_ref:Batch ref)']).first()
variable_data_end_p = latest_values_per_batch["Penicillin concentration(P:g/L)"]
print(variable_data_end_p.describe())

# Box plot of the per-batch end concentrations
fig, ax = plt.subplots()
sns.boxplot(variable_data_end_p, ax=ax)
ax.set_ylabel('Penicillin concentration (g/L)')
ax.set_title('Distribution of Penicillin concentration at the end of the batch')
plt.show()

In [None]:
# End-of-batch penicillin concentration per batch (recomputed so this cell runs on its own):
# sort from latest to earliest time and keep the first non-null value in each batch
variable_data_end_p = variable_data.sort_values(by="Time (h)", ascending=False).groupby(['Batch reference(Batch_ref:Batch ref)']).first()["Penicillin concentration(P:g/L)"]
print(variable_data_end_p.describe())

# Same box plot as before, but on a logarithmic y axis
fig, ax = plt.subplots()
sns.boxplot(variable_data_end_p, ax=ax)
ax.set_yscale('log')
# A log-scaled axis still shows tick values in g/L, so the label must not claim
# that the plotted quantity is log10 of the concentration
ax.set_ylabel('Penicillin concentration (g/L, log scale)')
ax.set_title('Distribution of Penicillin concentration at the end of the batch')
plt.show()

In [None]:
# Configuration
n_cols = 4
n_plots_per_fig = 30
n_facets = 40
facet_row_height = 250   # pixels per facet row (10 rows give the former fixed height of 2500)
min_fig_height = 400     # floor so that a figure with very few rows stays readable

# Attach the end-of-batch penicillin concentration to every row of its batch
variable_data_copy = variable_data.copy()
variable_data_copy["P_end(P_end:g/L)"] = variable_data_copy["Batch reference(Batch_ref:Batch ref)"].map(variable_data_end_p.to_dict())

# Columns carried along unchanged for the x axis, colouring and hover text
id_columns = ['Time (h)', 'Batch reference(Batch_ref:Batch ref)', 'P_end(P_end:g/L)', "Penicillin concentration(P:g/L)"]
# A column cannot be both an identifier and a melted value; passing it as both
# leaves the outcome to pandas internals, so the overlap is removed explicitly
facet_variables = [var for var in variable_list_new if var not in id_columns]

# Create interactive plots in batches of at most n_facets variables
for start in range(0, len(facet_variables), n_facets):
    vars_subset = facet_variables[start:start + n_facets]
    n_rows = math.ceil(len(vars_subset) / n_cols)
    # Long-form table: one row per (time point, batch, variable)
    plot_data = variable_data_copy.melt(id_vars=id_columns,
                                        value_vars=vars_subset, var_name='Variable', value_name='Value')
    # One facet per variable, points coloured by the batch's final penicillin concentration
    fig = px.scatter(plot_data, x='Time (h)', y='Value', color='P_end(P_end:g/L)',
                     facet_col='Variable', hover_name="Batch reference(Batch_ref:Batch ref)",
                     hover_data={"Time (h)": True, "Value": True, "P_end(P_end:g/L)": True, "Penicillin concentration(P:g/L)": True, },
                     facet_col_wrap=n_cols,
                     )
    # Height follows the number of facet rows instead of a fixed value
    fig.update_layout(height=max(min_fig_height, facet_row_height * n_rows), showlegend=False)
    # Variables have unrelated units, so every facet gets its own y range and tick labels
    fig.update_yaxes(matches=None)
    fig.for_each_yaxis(lambda yaxis: yaxis.update(showticklabels=True))
    fig.show()

In [None]:
# Create figures with subplots for all variables over time, coloured by Penicillin concentration
# (n_cols and n_plots_per_fig come from the configuration cells above)
response_var_p = 'Penicillin concentration(P:g/L)'

# Drop the response itself (it is the colour channel) and the time axis from the panel list
variable_list_new_filtered_p = [var for var in variable_list_new if var not in (response_var_p, 'Time (h)')]

# One figure per chunk of at most n_plots_per_fig variables
for start in range(0, len(variable_list_new_filtered_p), n_plots_per_fig):
    vars_subset = variable_list_new_filtered_p[start:start + n_plots_per_fig]
    n_rows = math.ceil(len(vars_subset) / n_cols)
    # squeeze=False always yields a 2-D array of axes, so flatten() works for any grid shape
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(20, 4 * n_rows), squeeze=False)
    axes = axes.flatten()
    for ax, var in zip(axes, vars_subset):
        # Each point is one (time, value) sample, coloured by the penicillin concentration of that row
        scatter = ax.scatter(variable_data['Time (h)'], variable_data[var], c=variable_data[response_var_p], cmap='viridis', alpha=0.5, s=30)
        cb = fig.colorbar(scatter, ax=ax, shrink=0.8)
        cb.set_label(response_var_p, fontsize=10)
        ax.set_xlabel('Time (h)', fontsize=10)
        ax.set_ylabel(var, fontsize=10)
    # Hide the grid cells left over when the chunk does not fill the last row
    for ax in axes[len(vars_subset):]:
        ax.set_visible(False)
    # Figure-level title; the rect below reserves the top 5 % of the figure for it
    fig.suptitle(f'Variables over time (color = {response_var_p})', fontsize=16)
    fig.tight_layout(rect=[0, 0, 1, 0.95])
    plt.show()
    # Release the figure so that looping over many chunks does not accumulate open figures
    plt.close(fig)