In [1]:
# Install the parent directory (`..`) as an editable package into the kernel's
# environment; `-q` keeps pip's output quiet.
# NOTE(review): the recorded output of the next cell is
# "ModuleNotFoundError: No module named 'thesis'" — the kernel may need a
# restart after this install (see pip's note below), or the package layout
# does not expose `thesis`; confirm.
%pip install -q -e ..
# (Re)load the autoreload extension and reload all modules before each cell
# execution, so edits to the editable package are picked up without a restart.
%reload_ext autoreload
%autoreload 2

Note: you may need to restart the kernel to use updated packages.


In [2]:
import matplotlib.pyplot as plt
import numpy as np

from thesis.fuzzy.membership import compute_ndg_dense, compute_ndg_streaming

# Quick sanity plot: evaluate both implementations on the same grid and overlay
# them. The streaming curve is dashed so it stays visible on top of the dense one.
x = np.linspace(-4, 4, 800)
data = np.array([-2, 0, 2])
sigma = 0.7

dense_curve = compute_ndg_dense(x, data, sigma)
streaming_curve = compute_ndg_streaming(x, data, sigma)

plt.plot(x, dense_curve, label="dense")
plt.plot(x, streaming_curve, "--", label="stream")
plt.legend()
plt.show()

ModuleNotFoundError: No module named 'thesis'

In [2]:
# Load the autoreload extension and reload all modules before each cell
# execution.
# NOTE(review): the first cell of this notebook already runs
# `%reload_ext autoreload` / `%autoreload 2`; this cell is only needed when
# that first cell is skipped.
%load_ext autoreload
%autoreload 2

In [3]:
from new_source.run import run_cases

ModuleNotFoundError: No module named 'new_source'

In [None]:
# Call run_cases() (imported from new_source.run) and keep its return value;
# every analysis cell below reads from `results_df`.
# NOTE(review): the later cells index it like a pandas DataFrame with columns
# such as "Case", "Normalized", "Sigma_Option", "AIC_s1" — confirm against
# run_cases().
results_df = run_cases()
# Plain-text dump of the whole result object.
print(results_df)

In [None]:
# Rows of the results whose "Normalized" flag is set (boolean-mask row selection).
norm_only_df = results_df.loc[results_df["Normalized"]]

In [None]:
# Assuming results_df is already loaded from running run_cases()

# --- Options (defaults reproduce the unfiltered analysis) ---
# None keeps every row; True / False keeps only the rows whose "Normalized"
# value equals that flag.
NORMALIZED_FILTER = None
# When True, drop the rows whose Sigma_Option (compared as text) starts with "r".
DROP_RELATIVE_SIGMA = False
# ------------------------------------------------------------


def _top_n_per_group(frame, group_cols, metric, n=2):
    """Return the ``n`` rows with the smallest ``metric`` in each group.

    Rows come back ordered by group key and then by ``metric`` (ascending),
    with their original index labels and every column of ``frame`` intact.

    ``groupby().apply(lambda g: g.sort_values(metric).head(n))`` gives the same
    selection but operates on the grouping columns, which pandas deprecated in
    2.2; once they are excluded, selecting a grouping column such as "Case"
    from the result raises ``KeyError``. Sorting first and using
    ``groupby().head()`` avoids that.
    """
    ranked = frame.sort_values([*group_cols, metric])
    return ranked.groupby(group_cols, sort=False).head(n)


# --- Optional filtering (always work on a copy of results_df) ---
if NORMALIZED_FILTER is None:
    results_df_filtered = results_df.copy()
else:
    results_df_filtered = results_df[
        results_df["Normalized"] == NORMALIZED_FILTER
    ].copy()

if DROP_RELATIVE_SIGMA:
    # Compare as text so that non-string options cannot break .str.startswith.
    sigma_option_text = results_df_filtered["Sigma_Option"].astype(str)
    results_df_filtered = results_df_filtered[
        ~sigma_option_text.str.startswith("r")
    ].copy()

# --- Grouping and display columns ---
# NOTE(review): the report headers below say "Method=nd, Empirical=kde", but
# this cell never filters on "Method" or "Empirical_Method" — confirm that
# run_cases() only yields that combination, or filter before ranking.
grouping_cols = ["Case", "Normalized"]
display_cols = ["Case", "Normalized", "Sigma_Option", "Sigma"]

# "Normalized" carries no information when the column is missing or holds a
# single value after filtering, so drop it from grouping and display.
has_normalized = "Normalized" in results_df_filtered.columns
single_status = has_normalized and results_df_filtered["Normalized"].nunique() == 1
if single_status or not has_normalized:
    grouping_cols.remove("Normalized")
    display_cols.remove("Normalized")
if single_status:
    print("Note: Analyzing results for a single normalization status.")

# Not used by the ranking below; still defined in case another cell reads it.
grouped = results_df_filtered.groupby(grouping_cols, group_keys=False)

# --- Top 2 rows per group for each metric (smaller value ranks first) ---
top2_aic = _top_n_per_group(results_df_filtered, grouping_cols, "AIC_s1")
top2_bic = _top_n_per_group(results_df_filtered, grouping_cols, "BIC_s1")
top2_mse = _top_n_per_group(results_df_filtered, grouping_cols, "MSE_s1")

# --- Display simplified results ---
# To report KL divergence as well, rank "KL_Divergence_s1" with
# _top_n_per_group and append ("KL Divergence", "KL_Divergence_s1", <frame>).
reports = [
    ("AIC", "AIC_s1", top2_aic),
    ("BIC", "BIC_s1", top2_bic),
    ("MSE", "MSE_s1", top2_mse),
]
for position, (label, metric_col, top_rows) in enumerate(reports):
    # Every header after the first is preceded by a blank line.
    lead = "" if position == 0 else "\n"
    print(
        f"{lead}--- Top 2 Sigma options based on {label} (Method=nd, Empirical=kde) ---"
    )
    # Show the metric column itself next to the identifying columns.
    print(top_rows[display_cols + [metric_col]])
    print("-" * 50)  # Separator

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Metrics to average and plot, one subplot each (2 x 3 grid below).
metrics = [
    "Similarity_Overlap",
    "Distance_Euclidean",
    "MSE_s1",
    "KL_Divergence_s1",
    "AIC_s1",
    "BIC_s1",
]

# Keep only the non-normalized runs whose empirical method is "counts".
# The mask must be negated element-wise with `~`: Python's `not` applied to a
# Series raises "ValueError: The truth value of a Series is ambiguous".
# astype(bool) makes `~` a logical negation even for an object-dtype column.
is_raw_counts = ~results_df["Normalized"].astype(bool) & (
    results_df["Empirical_Method"] == "counts"
)

# Mean of every metric per (Sigma_Option, Method) pair.
df_case = (
    results_df.loc[is_raw_counts, metrics + ["Sigma_Option", "Method"]]
    .groupby(["Sigma_Option", "Method"])
    .mean()
    .reset_index()
)

fig, axes = plt.subplots(2, 3, figsize=(18, 10))  # 2 rows, 3 columns
axes = axes.flatten()  # Flatten the axes array for easy pairing with metrics

# One strip plot per metric, coloured by Method.
for ax, metric in zip(axes, metrics):
    sns.stripplot(
        data=df_case, x="Sigma_Option", y=metric, ax=ax, hue="Method", marker="o"
    )
    ax.set_title(f"{metric} vs. Sigma_Option")
    ax.set_xlabel("Sigma_Option")
    ax.set_ylabel(metric)
    # Rotate x-axis labels for better readability
    ax.tick_params(axis="x", rotation=45)

# Adjust layout to prevent overlap
plt.tight_layout()

# Show the plot
plt.show()