In [None]:
import math
from scipy.stats import pearsonr

import ipywidgets as widgets
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import seaborn as sns
from IPython.display import display

In [None]:
# Load the batch records CSV into a DataFrame
data = pd.read_csv('Mendeley_data/100_Batches_IndPenSim_V3.csv') 
# Load the summary statistics CSV into a DataFrame
data_summary = pd.read_csv('Mendeley_data/100_Batches_IndPenSim_Statistics.csv')

In [None]:
# Print column names, dtypes and non-null counts of the summary table
data_summary.info()

In [None]:
# Display the summary table as the cell output
data_summary

In [None]:
# Sanity check: the reported total yield should be roughly the amount harvested
# during the batch plus the amount harvested at the end of the batch.
data_summary['Penicllin_harvested_total (kg)'] = data_summary['Penicllin_harvested_during_batch(kg)'].add(
    data_summary['Penicllin_harvested_end_of_batch (kg)']
)
data_summary['Difference'] = data_summary['Penicllin_yield_total (kg)'].sub(
    data_summary['Penicllin_harvested_total (kg)']
)
# Keep an independent copy of the rows whose absolute mismatch exceeds 10
is_discrepant = data_summary['Difference'].abs() > 10
discrepancies = data_summary.loc[is_discrepant].copy()
len(discrepancies)

In [None]:
# The maximum difference is 100.0 kg. (Is this amount acceptable??)
# Summary statistics first, then how often each distinct difference occurs.
difference = discrepancies['Difference']
print(difference.describe())
print(difference.value_counts())

In [None]:
# Pearson correlation coefficient for every pair of yield columns
yield_pairs = [
    ('Penicllin_harvested_during_batch(kg)', 'Penicllin_harvested_end_of_batch (kg)'),
    ('Penicllin_harvested_during_batch(kg)', 'Penicllin_yield_total (kg)'),
    ('Penicllin_harvested_end_of_batch (kg)', 'Penicllin_yield_total (kg)'),
]
for x_col, y_col in yield_pairs:
    print(pearsonr(data_summary[x_col], data_summary[y_col]))

In [None]:
# Harvest during the batch vs. harvest at the end of the batch
x_col = 'Penicllin_harvested_during_batch(kg)'
y_col = 'Penicllin_harvested_end_of_batch (kg)'
ax = sns.scatterplot(data=data_summary, x=x_col, y=y_col)
ax.set_xlabel(x_col)
ax.set_ylabel(y_col)
plt.show()

In [None]:
# Harvest during the batch vs. total yield
x_col = 'Penicllin_harvested_during_batch(kg)'
y_col = 'Penicllin_yield_total (kg)'
ax = sns.scatterplot(data=data_summary, x=x_col, y=y_col)
ax.set_xlabel(x_col)
ax.set_ylabel(y_col)
plt.show()

In [None]:
# Harvest at the end of the batch vs. total yield
x_col = 'Penicllin_harvested_end_of_batch (kg)'
y_col = 'Penicllin_yield_total (kg)'
ax = sns.scatterplot(data=data_summary, x=x_col, y=y_col)
ax.set_xlabel(x_col)
ax.set_ylabel(y_col)
plt.show()

In [None]:
# Split the dataframe into two parts at column "2400".
# The first part contains process variables, the second part contains Raman spectra.
# Positional slicing is used instead of np.split: np.split on a DataFrame goes
# through DataFrame.swapaxes (deprecated in recent pandas) and transposes the
# frame, which can lose the per-column dtypes.
raman_start = data.columns.get_loc("2400")
# Kept as a two-element list so that indexing with [0] / [1] still works
variable_raman_data = [data.iloc[:, :raman_start], data.iloc[:, raman_start:]]

In [None]:
# The process variables are the first part of the split.
# The two column names below are exchanged with each other; this appears to
# correct a mislabeling in the original dataset.
swapped_names = {
    '2-PAT control(PAT_ref:PAT ref)': 'Batch reference(Batch_ref:Batch ref)',
    'Batch reference(Batch_ref:Batch ref)': '2-PAT control(PAT_ref:PAT ref)',
}
variable_data = variable_raman_data[0].rename(columns=swapped_names)

In [None]:
# Print column names, dtypes and non-null counts of the process-variable table
variable_data.info()

In [None]:
# Missing values

## Only 5 off-line measurement variables contain missing values:
## 1. PAA concentration offline(PAA_offline:PAA (g L^{-1}))
## 2. NH_3 concentration off-line(NH3_offline:NH3 (g L^{-1}))
## 3. Offline Penicillin concentration(P_offline:P(g L^{-1}))
## 4. Offline Biomass concentratio(X_offline:X(g L^{-1}))
## 5. Viscosity(Viscosity_offline:centPoise)

## The missingness is due to the measurement frequency: these offline variables are measured every 12 hrs and include a pre-defined delay (4 time steps). See Table 1 in the paper.
## e.g., the first measurement of Offline Penicillin concentration (P_offline) is at 1.0 h, which corresponds to the Penicillin concentration (P) at 0.2 h.
## The second measurement of P_offline is at 12.0 h, corresponding to P at 11.2 h.

In [None]:
# Batch ID = 0 indicates no fault introduced at the specific time point in the batch (?)
# Check: is 'Fault flag' equal to 0 on every row whose 'Batch ID' is 0?
batch_id_zero = variable_data.loc[variable_data['Batch ID'].eq(0)]
batch_id_zero['Fault flag'].eq(0).all()

In [None]:
# Create a new column to indicate the batch length:
# 1 for batches whose maximum 'Time (h)' equals 230, 0 for every other batch.
batch_col = 'Batch reference(Batch_ref:Batch ref)'
time_group = variable_data.groupby(batch_col)['Time (h)'].max()
fixed_batch = time_group[time_group == 230].index
# One vectorised membership test replaces the per-batch loop, which ran a
# full-frame boolean mask and .loc assignment once for every fixed-length batch.
variable_data['0 - Variable 1 - Fixed(Batch_length:Batch length)'] = (
    variable_data[batch_col].isin(fixed_batch).astype('int64')
)

In [None]:
# Display the process-variable table, now including the batch-length column
variable_data

In [None]:
# All column labels of the process-variable table (a pandas Index, not a list)
variable_list = variable_data.columns

In [None]:
# Display the column labels
variable_list

In [None]:
# Columns treated as categorical / identifier variables
categorical_vars = [
    'Fault reference(Fault_ref:Fault ref)',
    '0 - Recipe driven 1 - Operator controlled(Control_ref:Control ref)',
    '1- No Raman spec',
    ' 1-Raman spec recorded',
    'Batch reference(Batch_ref:Batch ref)',
    '2-PAT control(PAT_ref:PAT ref)',
    'Batch ID',
    'Fault flag',
    '0 - Variable 1 - Fixed(Batch_length:Batch length)',
]

# Every remaining column, in the original column order
variable_list_new = list(filter(lambda name: name not in categorical_vars, variable_list))

In [None]:
# Configuration for plotting
n_cols = 4  # number of subplot columns in the matplotlib grids
n_plots_per_fig = 30  # chunk size: variables drawn per static matplotlib figure
n_facets = 40  # chunk size: variables melted into one interactive Plotly figure

In [None]:
# Response variable that the other variables are compared against
response_var_p = 'Penicillin concentration(P:g/L)'

# Candidate variables: everything except the response itself and the time axis
variable_list_new_filtered_p = [
    name for name in variable_list_new
    if name not in (response_var_p, 'Time (h)')
]

In [None]:
# Interactive viewer: pick a variable and plot it over time, one line per batch
dropdown_update = widgets.Dropdown(options=variable_list_new, description='Variable:', value='Penicillin concentration(P:g/L)')
output_update = widgets.Output()


def _draw_variable(variable_name):
    """Redraw the time course of ``variable_name`` inside ``output_update``."""
    with output_update:
        # wait=True keeps the previous figure until the new one is ready,
        # which avoids the output area collapsing and flickering.
        output_update.clear_output(wait=True)
        plt.figure(figsize=(8,6))
        sns.lineplot(data=variable_data, x='Time (h)', y=variable_name, hue='Batch reference(Batch_ref:Batch ref)', palette="Set1", legend=False)
        plt.title(f'Variable: {variable_name}')
        plt.show()


def update_plot(change):
    """Dropdown observer: redraw the plot for the newly selected value."""
    _draw_variable(change.new)


dropdown_update.observe(update_plot, names='value')
display(dropdown_update, output_update)
# The observer only fires on a change, so draw the initial selection explicitly;
# otherwise the output area stays empty until the user picks another variable.
_draw_variable(dropdown_update.value)

In [None]:
# Filter out 'Time (h)' from the variable list for plotting (it is the x axis)
variable_list_new_filtered_time = [var for var in variable_list_new if var != 'Time (h)']

# One figure per chunk of at most n_plots_per_fig variables
for i in range(math.ceil(len(variable_list_new_filtered_time)/n_plots_per_fig)):
    # Slicing clamps at the end of the list, so no explicit min() is needed
    vars_subset = variable_list_new_filtered_time[i*n_plots_per_fig:(i+1)*n_plots_per_fig]
    n_rows = math.ceil(len(vars_subset)/n_cols)
    # squeeze=False always returns a 2-D array of Axes, so flatten() is valid
    # for any grid shape (including a single row or a single column)
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(20, 5*n_rows), squeeze=False)
    axes = axes.flatten()  # Flatten the axes array for easy indexing
    # One line plot per variable, one line per batch
    for j, var in enumerate(vars_subset):
        sns.lineplot(data=variable_data, x='Time (h)', y=var, hue='Batch reference(Batch_ref:Batch ref)', palette="Set1", legend=False, ax=axes[j])
        axes[j].set_title(var, fontsize=12)
        axes[j].tick_params(labelsize=8, axis='x')
    # Hide unused subplots
    for k in range(len(vars_subset), len(axes)):
        axes[k].set_visible(False)
    # No legend is drawn (the plots use legend=False), so the previously
    # fetched-but-unused legend handles have been dropped.
    plt.tight_layout(rect=[0, 0, 1, 0.95])
    plt.show()

In [None]:
# Create interactive plots in batches of at most n_facets variables
id_vars = ['Time (h)', 'Batch reference(Batch_ref:Batch ref)', "Penicillin concentration(P:g/L)"]
for i in range(math.ceil(len(variable_list_new)/n_facets)):
    # Select the subset of variables for this batch of plots
    vars_subset = variable_list_new[i*n_facets:(i+1)*n_facets]
    # Not used in this cell; still assigned because a later cell reads n_rows
    n_rows = math.ceil(len(vars_subset)/n_cols)
    # A column cannot sensibly be both an identifier and a melted value, and how
    # melt treats such an overlap depends on the pandas version, so exclude them
    value_vars = [var for var in vars_subset if var not in id_vars]
    if not value_vars:
        continue  # nothing left to facet in this chunk
    # Create a long-form DataFrame for the selected variables
    plot_data = variable_data.melt(id_vars=id_vars, value_vars=value_vars,
                                   var_name='Variable', value_name='Value')
    # Create the interactive scatter plot, coloured by penicillin concentration
    fig = px.scatter(plot_data, x='Time (h)', y='Value', color='Penicillin concentration(P:g/L)',
                     facet_col='Variable', hover_name="Batch reference(Batch_ref:Batch ref)",
                     hover_data={"Time (h)": True, "Value": True, "Penicillin concentration(P:g/L)": True, },
                     facet_col_wrap=4
                     )
    fig.update_layout(height=2000, showlegend=False)
    # Give every facet its own y range and visible tick labels
    fig.update_yaxes(matches=None)
    fig.for_each_yaxis(lambda yaxis: yaxis.update(showticklabels=True))
    # No plt.tight_layout() here: it is a matplotlib call that does not affect
    # the Plotly figure and only creates an empty matplotlib figure.
    fig.show()

In [None]:
# Penicillin concentration over time, coloured by batch reference
batch_ref_col = "Batch reference(Batch_ref:Batch ref)"
fig = px.scatter(
    variable_data,
    x='Time (h)',
    y='Penicillin concentration(P:g/L)',
    color=batch_ref_col,
    hover_name=batch_ref_col,
    hover_data={"Time (h)": True, "Penicillin concentration(P:g/L)": True},
    labels={batch_ref_col: "Batch reference"},
)
fig.update_layout(height=600, width=800)
fig.show()

In [None]:
# Columns left out of the peak analysis: the response, the time axis and
# the Raman / batch / PAT / fault bookkeeping columns
excluded_from_peak_analysis = (
    'Penicillin concentration(P:g/L)',
    'Time (h)',
    '1- No Raman spec',
    ' 1-Raman spec recorded',
    'Batch reference(Batch_ref:Batch ref)',
    '2-PAT control(PAT_ref:PAT ref)',
    'Batch ID',
    'Fault flag',
)
variable_list_filtered_p = [var for var in variable_list if var not in excluded_from_peak_analysis]

In [None]:
# Find the peak penicillin concentration time for each batch
def analyze_penicillin_concentration(variable_data, process_vars=None):
    """Summarise each batch at the moment of its peak penicillin concentration.

    Parameters
    ----------
    variable_data : pandas.DataFrame
        Must contain 'Batch reference(Batch_ref:Batch ref)', 'Time (h)',
        'Penicillin concentration(P:g/L)' and every column in ``process_vars``.
    process_vars : iterable of str, optional
        Columns whose value at the peak row is copied into the result.
        Defaults to the notebook-level ``variable_list_filtered_p`` so that
        existing one-argument calls behave as before.

    Returns
    -------
    pandas.DataFrame
        One row per batch with 'Batch', 'Peak time (h)',
        'Peak concentration (g/L)', 'Has decline' (True when the last
        concentration in time order is below the peak) and one column per
        process variable.
    """
    if process_vars is None:
        # Backward-compatible default: fall back to the notebook global
        process_vars = variable_list_filtered_p
    process_vars = list(process_vars)

    batch_col = 'Batch reference(Batch_ref:Batch ref)'
    conc_col = 'Penicillin concentration(P:g/L)'

    results = []
    for batch, batch_data in variable_data.groupby(batch_col):
        # Time order matters for "last concentration" below
        batch_data = batch_data.sort_values('Time (h)')
        # Find peak concentration and its time
        peak_idx = batch_data[conc_col].idxmax()
        peak_time = batch_data.loc[peak_idx, 'Time (h)']
        peak_concentration = batch_data.loc[peak_idx, conc_col]
        # Does the concentration end below its peak?
        last_concentration = batch_data[conc_col].iloc[-1]
        decline = bool(last_concentration < peak_concentration)
        # Extract process variables at peak time (per-column lookup keeps dtypes)
        var_values = {var: batch_data.loc[peak_idx, var] for var in process_vars}
        results.append({
            'Batch': batch,
            'Peak time (h)': peak_time,
            'Peak concentration (g/L)': peak_concentration,
            'Has decline': decline,
            **var_values
        })
    # Build the frame once from the list of records
    peak_results = pd.DataFrame(results)
    return peak_results

In [None]:
def analyze_correlation(peak_results, process_vars=None):
    """Plot a correlation heatmap of peak time, peak concentration and process variables.

    Parameters
    ----------
    peak_results : pandas.DataFrame
        Must contain 'Peak time (h)', 'Peak concentration (g/L)' and every
        column in ``process_vars``.
    process_vars : iterable of str, optional
        Additional columns to include in the correlation matrix. Defaults to
        the notebook-level ``variable_list_filtered_p`` so that existing
        one-argument calls behave as before.

    Returns
    -------
    None
        The heatmap is shown as a side effect.
    """
    if process_vars is None:
        # Backward-compatible default: fall back to the notebook global
        process_vars = variable_list_filtered_p
    # Select variables for correlation
    correlation_vars = ['Peak time (h)', 'Peak concentration (g/L)']
    selected_columns = correlation_vars + list(process_vars)
    # Create correlation matrix
    corr_matrix = peak_results[selected_columns].corr()
    # Plot heatmap on a fixed [-1, 1] colour scale
    plt.figure(figsize=(24, 10))
    sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', vmin=-1, vmax=1)
    plt.title('Correlation between variables and peak penicillin concentration')
    plt.tight_layout()
    plt.show()

In [None]:
# Build the per-batch peak summary and display it
peak_results = analyze_penicillin_concentration(variable_data)
peak_results

In [None]:
# Print column names, dtypes and non-null counts of the peak summary
peak_results.info()

In [None]:
# Count the batches for each value of 'Has decline'
peak_results['Has decline'].value_counts()

In [None]:
# Draw the correlation heatmap for the peak summary
analyze_correlation(peak_results)

In [None]:
# Summary statistics of three process variables at peak time
peak_results[['Agitator RPM(RPM:RPM)', 'Oil flow(Foil:L/hr)', 'Ammonia shots(NH3_shots:kgs)']].describe()

In [None]:
# Box plots of every process variable at peak time, split by 'Has decline'.
# The grid is sized from the variables plotted in this cell instead of relying
# on an n_rows value left behind by an earlier cell's loop.
n_rows = max(1, math.ceil(len(variable_list_filtered_p) / n_cols))
# squeeze=False always returns a 2-D array of Axes, so flatten() is valid
fig, axes = plt.subplots(n_rows, n_cols, figsize=(20, 5*n_rows), squeeze=False)
axes = axes.flatten()  # Flatten the axes array for easy indexing
# List to store statistics (one dict per variable)
stats_list = []
# Create a boxplot for each variable
for i, var in enumerate(variable_list_filtered_p):
    sns.boxplot(data=peak_results, x='Has decline', y=var, ax=axes[i])
    axes[i].set_xlabel('Has decline', fontsize=10)
    axes[i].set_ylabel(var, fontsize=10)
    # Calculate basic statistics for the declining and non-declining groups
    stats_dict = {'Variable': var}
    for group in [True, False]:
        group_data = peak_results[peak_results['Has decline'] == group][var]
        group_name = 'Decline' if group else 'No decline'
        stats_dict[f'{group_name} Count'] = len(group_data)
        stats_dict[f'{group_name} Mean'] = group_data.mean()
        stats_dict[f'{group_name} Median'] = group_data.median()
        stats_dict[f'{group_name} Std'] = group_data.std()
    stats_list.append(stats_dict)
# Hide the grid cells that have no variable
for i in range(len(variable_list_filtered_p), len(axes)):
    axes[i].set_visible(False)
plt.tight_layout()
plt.show()

In [None]:
# Create and display statistics DataFrame (one row per variable,
# built from the dicts collected in stats_list)
stats_df = pd.DataFrame(stats_list)
stats_df