In [1]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt

# Load raw data
raw_data_path = "../data/LogisticGrowthData_Cleaned.csv"
raw_data = pd.read_csv(raw_data_path)

# Compute the number of data points for each ID
n_values = raw_data.groupby("ID")["Time"].count().to_dict()

# Define file paths for fitted model results
file_paths = {
    "Quadratic": "../results/nlls_quadratic_fits.csv",
    "Cubic": "../results/nlls_cubic_fits.csv",
    "Logistic": "../results/nlls_logistic_fits.csv",
    "Gompertz": "../results/nlls_gompertz_fits.csv"
}

# Define the number of parameters for each model
param_counts = {
    "Quadratic": 3,   # a, b, c
    "Cubic": 4,       # a, b, c, d
    "Logistic": 3,    # N_0, N_max, r
    "Gompertz": 4     # N_0, N_max, r_max, t_lag
}


def compute_rss(df, model_name):
    """Compute RSS for each subset in the dataset."""
    rss_results = []
    for _, row in df.iterrows():
        subset = raw_data[raw_data["ID"] == row["ID"]]
        t = subset["Time"].values
        y_true = subset["PopBio"].values  

        if model_name == "Quadratic":
            y_pred = row["a"] * t**2 + row["b"] * t + row["c"]
        elif model_name == "Cubic":
            y_pred = row["a"] * t**3 + row["b"] * t**2 + row["c"] * t + row["d"]
        elif model_name == "Logistic":
            N_0, N_max, r = row["N_0"], row["N_max"], row["r"]
            y_pred = (N_0 * N_max * np.exp(r * t)) / (N_max + N_0 * (np.exp(r * t) - 1))
        elif model_name == "Gompertz":
            N_0, N_max, r_max, t_lag = row["N_0"], row["N_max"], row["r_max"], row["t_lag"]
            exp_term = np.exp(r_max * np.exp(1) * (t_lag - t) / ((N_max - N_0) * np.log(10)) + 1)
            y_pred = N_0 + (N_max - N_0) * np.exp(-exp_term)
        else:
            continue

        rss = np.sum((y_true - y_pred) ** 2)
        rss_results.append({"ID": row["ID"], "Model": model_name, "RSS": rss})
    
    return pd.DataFrame(rss_results)


def calculate_aic_bic(df, rss_df, model_name, k):
    """Compute AIC and BIC for each model."""
    epsilon = 1e-10  # Prevent log(0)
    results = []
    for _, row in df.iterrows():
        rss_row = rss_df[rss_df["ID"] == row["ID"]]
        if rss_row.empty:
            continue

        rss = rss_row["RSS"].values[0] + epsilon  # Ensure RSS is nonzero
        n = n_values.get(row["ID"], 100)  # Get number of data points

        if model_name in ["Quadratic", "Cubic"]:
            aic = 2 * k + n * np.log(rss / n)
            bic = k * np.log(n) + n * np.log(rss / n)
        else:
            sigma_squared = rss / (n - k)
            log_likelihood = -0.5 * n * np.log(2 * np.pi * sigma_squared) - (rss / (2 * sigma_squared))
            aic = 2 * k - 2 * log_likelihood
            bic = k * np.log(n) - 2 * log_likelihood

        results.append({"ID": row["ID"], "Model": model_name, "AIC": aic, "BIC": bic, "R²": row["R²"], "n": n, "RSS": rss})

    return pd.DataFrame(results)

final_results = []
for model_name, file_path in file_paths.items():
    if os.path.exists(file_path):
        df = pd.read_csv(file_path)
        rss_df = compute_rss(df, model_name)
        final_results.append(calculate_aic_bic(df, rss_df, model_name, param_counts[model_name]))

comparison_df = pd.concat(final_results, ignore_index=True)

# Identify the best model based on AIC
best_models = comparison_df.groupby("ID").apply(lambda x: x.loc[x["AIC"].idxmin()])
best_models = best_models.dropna().reset_index(drop=True)


def filter_best_models(group):
    """Filter best models ensuring sufficient AIC/BIC difference."""
    if len(group) > 1:
        sorted_models = group.sort_values("AIC")
        best_model = sorted_models.iloc[0]
        second_best = sorted_models.iloc[1]
        
        if (second_best["AIC"] - best_model["AIC"]) > 4 and (second_best["BIC"] - best_model["BIC"]) > 6:
            return best_model.to_frame().T
        else:
            return sorted_models.iloc[:2]
    return group

best_models_filtered = best_models.groupby("ID").apply(filter_best_models).reset_index(drop=True)

# Save results to CSV
comparison_df.to_csv("../results/model_comparison_results_filtered.csv", index=False)
best_models_filtered.to_csv("../results/best_models_by_AIC_filtered.csv", index=False)

# Print summary results
print("\nFiltered model comparison results (Top 10 rows):")
print(comparison_df.head(10).to_string(index=False))

print("\nBest models for each ID (Selected by AIC):")
print(best_models_filtered.head(10).to_string(index=False))

print("\nResults saved as:")
print(" - model_comparison_results_filtered.csv")
print(" - best_models_by_AIC_filtered.csv")


Filtered model comparison results (Top 10 rows):
                                                                                                                                                                                                                                                                    ID     Model         AIC         BIC       R²  n      RSS
  Chryseobacterium.balustinum_5_TSB_Bae, Y.M., Zheng, L., Hyun, J.E., Jung, K.S., Heu, S. and Lee, S.Y., 2014. Growth characteristics and biofilm formation of various spoilage bacteria isolated from fresh produce. Journal of food science, 79(10), pp.M2072-M2080. Quadratic -280.737222 -276.740608 0.995391 28 0.000999
             Enterobacter.sp._5_TSB_Bae, Y.M., Zheng, L., Hyun, J.E., Jung, K.S., Heu, S. and Lee, S.Y., 2014. Growth characteristics and biofilm formation of various spoilage bacteria isolated from fresh produce. Journal of food science, 79(10), pp.M2072-M2080. Quadratic -148.343800 -144.347186 0.729903 28 0.113

  y_pred = (N_0 * N_max * np.exp(r * t)) / (N_max + N_0 * (np.exp(r * t) - 1))
  y_pred = (N_0 * N_max * np.exp(r * t)) / (N_max + N_0 * (np.exp(r * t) - 1))
  exp_term = np.exp(r_max * np.exp(1) * (t_lag - t) / ((N_max - N_0) * np.log(10)) + 1)


In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
from matplotlib.backends.backend_pdf import PdfPages

# Define file paths
results_path = "../results"
pdf_filename = "aic_bic_distribution.pdf"
pdf_path = os.path.join(results_path, pdf_filename)  # Final PDF file path

# Load model comparison results
df = pd.read_csv(os.path.join(results_path, "model_comparison_results_filtered.csv"))

# Remove infinite and NaN values from AIC/BIC
df = df[np.isfinite(df["AIC"]) & np.isfinite(df["BIC"])]

# Create a PDF to save plots
with PdfPages(pdf_path) as pdf:
    # Create subplots
    fig, ax = plt.subplots(1, 2, figsize=(15, 5))

    # Plot AIC distribution
    for model in df["Model"].unique():
        subset = df[df["Model"] == model]
        ax[0].hist(subset["AIC"], bins=20, alpha=0.5, label=model)
    
    ax[0].set_xlabel("AIC value")
    ax[0].set_ylabel("Frequency")
    ax[0].set_title("AIC Distribution of Different Models")
    ax[0].legend()

    # Plot BIC distribution
    for model in df["Model"].unique():
        subset = df[df["Model"] == model]
        ax[1].hist(subset["BIC"], bins=20, alpha=0.5, label=model)
    
    ax[1].set_xlabel("BIC value")
    ax[1].set_ylabel("Frequency")
    ax[1].set_title("BIC Distribution of Different Models")
    ax[1].legend()

    plt.tight_layout()
    
    # Save plots to PDF
    pdf.savefig(fig)
    plt.close()

print(f"\n AIC/BIC distribution plots saved as PDF: {pdf_path}")



 AIC/BIC distribution plots saved as PDF: ../results/aic_bic_distribution.pdf


In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy.stats as stats
import os
from matplotlib.backends.backend_pdf import PdfPages

# Define file paths
results_path = "../results"
pdf_filename = "model_ranking_analysis.pdf"
pdf_path = os.path.join(results_path, pdf_filename)  # Final PDF file path

# Load model comparison results
comparison_df = pd.read_csv(os.path.join(results_path, "model_comparison_results_filtered.csv"))

# Compute the percentage of best model selection based on AIC and BIC
best_model_counts_aic = comparison_df.loc[comparison_df.groupby("ID")["AIC"].idxmin()]["Model"].value_counts()
best_model_counts_bic = comparison_df.loc[comparison_df.groupby("ID")["BIC"].idxmin()]["Model"].value_counts()

total_ids = len(comparison_df["ID"].unique())  # Total number of groups

best_model_percentages_aic = best_model_counts_aic / total_ids * 100
best_model_percentages_bic = best_model_counts_bic / total_ids * 100


def confidence_interval(p, n, confidence=0.95):
    """Compute confidence interval for a proportion."""
    z = stats.norm.ppf(1 - (1 - confidence) / 2)  # 1.96 for 95% CI
    se = np.sqrt((p * (100 - p)) / n)  # Standard error
    lower = max(0, p - z * se)
    upper = min(100, p + z * se)
    return lower, upper

# Compute confidence intervals
ci_results_aic = {model: confidence_interval(p, total_ids) for model, p in best_model_percentages_aic.items()}
ci_results_bic = {model: confidence_interval(p, total_ids) for model, p in best_model_percentages_bic.items()}

ci_df_aic = pd.DataFrame(ci_results_aic, index=["Lower CI", "Upper CI"]).T
ci_df_bic = pd.DataFrame(ci_results_bic, index=["Lower CI", "Upper CI"]).T

# Create a PDF and save visualizations
with PdfPages(pdf_path) as pdf:
    fig, ax = plt.subplots(1, 2, figsize=(15, 5))

    # Plot best model selection based on AIC
    ax[0].bar(best_model_percentages_aic.index, best_model_percentages_aic.values, 
              yerr=[best_model_percentages_aic.values - ci_df_aic["Lower CI"], 
                    ci_df_aic["Upper CI"] - best_model_percentages_aic.values], 
              capsize=5, color="skyblue", alpha=0.8)
    ax[0].set_xlabel("Model")
    ax[0].set_ylabel("Best Model Percentage (%)")
    ax[0].set_title("Best Model Selection Based on AIC")
    ax[0].set_xticklabels(best_model_percentages_aic.index, rotation=45)
    ax[0].grid(axis="y", linestyle="--", alpha=0.7)

    # Plot best model selection based on BIC
    ax[1].bar(best_model_percentages_bic.index, best_model_percentages_bic.values, 
              yerr=[best_model_percentages_bic.values - ci_df_bic["Lower CI"], 
                    ci_df_bic["Upper CI"] - best_model_percentages_bic.values], 
              capsize=5, color="lightcoral", alpha=0.8)
    ax[1].set_xlabel("Model")
    ax[1].set_ylabel("Best Model Percentage (%)")
    ax[1].set_title("Best Model Selection Based on BIC")
    ax[1].set_xticklabels(best_model_percentages_bic.index, rotation=45)
    ax[1].grid(axis="y", linestyle="--", alpha=0.7)

    plt.tight_layout()
    pdf.savefig(fig)
    plt.close()

# Save statistical results
ci_df_aic.to_csv(os.path.join(results_path, "model_best_percentage_with_CI_AIC.csv"), index=True)
ci_df_bic.to_csv(os.path.join(results_path, "model_best_percentage_with_CI_BIC.csv"), index=True)

# Print results
print("\nBest model selection percentages based on AIC:")
print(best_model_percentages_aic.to_string())

print("\nBest model selection percentages based on BIC:")
print(best_model_percentages_bic.to_string())

print("\n95% confidence intervals for each model (AIC):")
print(ci_df_aic.to_string())

print("\n95% confidence intervals for each model (BIC):")
print(ci_df_bic.to_string())

print(f"\n Visualization analysis saved as PDF: {pdf_path}")



Best model selection percentages based on AIC:
Model
Quadratic    53.068592
Cubic        36.462094
Logistic      9.025271
Gompertz      1.444043

Best model selection percentages based on BIC:
Model
Quadratic    53.429603
Cubic        36.101083
Logistic      9.025271
Gompertz      1.444043

95% confidence intervals for each model (AIC):
            Lower CI   Upper CI
Quadratic  47.191550  58.945634
Cubic      30.793890  42.130298
Logistic    5.650854  12.399687
Gompertz    0.039162   2.848925

95% confidence intervals for each model (BIC):
            Lower CI   Upper CI
Quadratic  47.555330  59.303876
Cubic      30.445009  41.757157
Logistic    5.650854  12.399687
Gompertz    0.039162   2.848925

 Visualization analysis saved as PDF: ../results/model_ranking_analysis.pdf


  ax[0].set_xticklabels(best_model_percentages_aic.index, rotation=45)
  ax[1].set_xticklabels(best_model_percentages_bic.index, rotation=45)
