In [1]:
import sleap
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import os
import csv
from pathlib import Path

# Apply the "seaborn-deep" matplotlib style sheet to all figures in this notebook.
mpl.style.use("seaborn-deep")
# Print the SLEAP / TensorFlow / NumPy / Python / OS versions so the run is traceable.
sleap.versions()
# NOTE(review): presumably stops TensorFlow from reserving all GPU memory up front —
# confirm against the SLEAP documentation.
sleap.disable_preallocation()
# sleap.use_cpu_only()

SLEAP: 1.3.3
TensorFlow: 2.7.0
Numpy: 1.21.6
Python: 3.7.12
OS: Windows-10-10.0.22621-SP0


In [2]:
# Figure appearance and export defaults, applied as a single batch.
# (print(mpl.rcParams.keys()) lists every available setting.)
mpl.rcParams.update(
    {
        "figure.facecolor": "w",
        "figure.dpi": 150,
        "savefig.dpi": 600,
        "savefig.transparent": True,
        # "savefig.bbox_inches": "tight",
        "font.size": 15,
        "font.family": "sans-serif",
        "font.sans-serif": ["Arial"],
        "axes.titlesize": "xx-large",  # medium, large, x-large, xx-large
        # This might be necessary to export to PDF without messing up the text:
        "pdf.fonttype": 42,
        "ps.fonttype": 42,
    }
)

# Style sheet for the colour cycle; print(mpl.style.available) lists the choices.
# Alternatives that were tried: "ggplot", "seaborn-paper".
mpl.style.use("seaborn-deep")

In [3]:
# Image scale in pixels per millimetre. Pixel-space error distances are divided
# by this value to express them in mm.
px_per_mm = 10.6

In [4]:
# Human-readable dataset label for each sample-efficiency experiment folder.
# The insertion order of this dict is the order in which folders are processed.
se_names = {
    "sample_efficiency_plantwise_split_rice_10do_main": "Rice (10 DAG, crown)",
    "sample_efficiency_plantwise_split_rice_3do_long": "Rice (3 DAG, primary)",
    "sample_efficiency_plantwise_split_rice_3do_main": "Rice (3 DAG, crown)",
    "sample_efficiency_plantwise_split_soy_lr": "Soybean (5-8 DAG, lateral)",
    "sample_efficiency_plantwise_split_soy_pr": "Soybean (5-8 DAG, primary)",
    "sample_efficiency_plantwise_split_arabidopsis_7dap_lr": "Arabidopsis (7 DAG, lateral)",
    "sample_efficiency_plantwise_split_arabidopsis_7dap_pr": "Arabidopsis (7 DAG, primary)",
    "sample_efficiency_plantwise_split_canola_lr": "Canola (5-13 DAG, lateral)",
    "sample_efficiency_plantwise_split_canola_pr": "Canola (5-13 DAG, primary)",
}

# Experiment folders to process: exactly the keys above, in the same order.
se_folders = list(se_names)

In [5]:
# Display the parsed folder list as a sanity check.
se_folders

['sample_efficiency_plantwise_split_rice_10do_main',
 'sample_efficiency_plantwise_split_rice_3do_long',
 'sample_efficiency_plantwise_split_rice_3do_main',
 'sample_efficiency_plantwise_split_soy_lr',
 'sample_efficiency_plantwise_split_soy_pr',
 'sample_efficiency_plantwise_split_arabidopsis_7dap_lr',
 'sample_efficiency_plantwise_split_arabidopsis_7dap_pr',
 'sample_efficiency_plantwise_split_canola_lr',
 'sample_efficiency_plantwise_split_canola_pr']

In [6]:
# Make sure the figure output directory (and its parent) exists before saving plots.
Path("figures/sample_efficiency").mkdir(parents=True, exist_ok=True)

In [7]:
# Output CSV listing every trained model with its experiment folder and the
# training-set size of the split it was trained on.
output_csv_path = Path("sample_efficiency.csv")

# One record per (experiment folder, split, model directory).
data_rows = []

for se_folder in se_folders:
    # Each experiment folder holds a splits.csv; its "path" column is the split's
    # base folder and "sample_size" is the number of training samples.
    splits_df = pd.read_csv(Path(se_folder) / "splits.csv")

    for split_path, sample_size in zip(splits_df["path"], splits_df["sample_size"]):
        models_path = Path(split_path) / "models"

        # iterdir() yields entries in arbitrary, filesystem-dependent order, so
        # sort them to make the row order (and the CSV) reproducible. Stray
        # files are skipped: only directories are treated as model folders.
        model_dirs = sorted(entry for entry in models_path.iterdir() if entry.is_dir())

        for model_dir in model_dirs:
            data_rows.append(
                {
                    "se_folder": se_folder,
                    "model_path": str(model_dir),
                    "sample_size": sample_size,
                }
            )

# Explicit columns keep the schema stable even when no model was found.
output_df = pd.DataFrame(data_rows, columns=["se_folder", "model_path", "sample_size"])

# Persist the table so later sessions can reload it without rescanning the folders.
output_df.to_csv(output_csv_path, index=False)


In [9]:
# Work on a copy of the in-memory model table. The commented-out line below
# reloads the table from the saved CSV instead.
# df = pd.read_csv("sample_efficiency.csv")
df = output_df.copy()

In [10]:
# Experiment folder of every model, row-aligned with the model table.
se_folder = df["se_folder"].tolist()
len(se_folder)

225

In [11]:
# Training-set size of every model, row-aligned with the model table.
train_n = df["sample_size"].tolist()
len(train_n)

225

In [12]:
# Model folder path of every model, row-aligned with the model table.
models = df["model_path"].tolist()
len(models)

225

In [None]:
# Per-model test-split metrics. Error distances are converted from px to mm;
# the OKS mAP is stored unchanged.
dist_p50, dist_p90, dist_p95, dist_p99, dist_avg = [], [], [], [], []
dist_std, oks_map = [], []

# Metric key -> list that collects its per-model value in mm.
scaled_metrics = {
    "dist.p50": dist_p50,
    "dist.p90": dist_p90,
    "dist.p95": dist_p95,
    "dist.p99": dist_p99,
    "dist.avg": dist_avg,
}

for model in models:
    metrics = sleap.load_metrics(model, split="test")

    for key, values in scaled_metrics.items():
        values.append(metrics[key] / px_per_mm)

    # Standard deviation over all individual error distances, ignoring NaNs.
    dist_std.append(np.nanstd(metrics["dist.dists"].flatten()) / px_per_mm)
    oks_map.append(metrics["oks_voc.mAP"])

In [None]:
# One row per trained model: provenance columns followed by its test metrics.
summary_columns = {
    "SE folder": se_folder,
    "Model": models,
    "Train sample size": train_n,
    "Error distance (50%) mm": dist_p50,
    "Error distance (90%) mm": dist_p90,
    "Error distance (95%) mm": dist_p95,
    "Error distance (99%) mm": dist_p99,
    "Error distance (avg) mm": dist_avg,
    "Error std mm": dist_std,
    "oks mAP": oks_map,
}

df_summary = pd.DataFrame(summary_columns).assign(
    # Human-readable dataset label; a folder missing from se_names raises KeyError.
    Dataset=lambda frame: frame["SE folder"].map(lambda folder: se_names[folder])
)

df_summary

In [None]:
# Export the per-model sample-efficiency summary table.
df_summary.to_csv("sample_efficiency_summary.csv", index=False)

In [None]:
# Number of distinct experiment folders present in the summary.
len(df_summary["SE folder"].unique())

In [None]:
# # subset_df = df_summary[df_summary["SE folder"] == se_folder]

# # Calculate the median and standard deviation of "Error distance (90%) mm" for each "Train sample size" value
# # median_df = df_summary.groupby("Train sample size")["Error distance (90%) mm"].agg(["median", "std"])
# # median_df.reset_index(inplace=True)

# plt.figure(figsize=(8, 6))

# # Create a scatter plot of "Train sample size" vs "Error distance (90%) mm"
# ax = sns.scatterplot(x="Train sample size", y="Error distance (90%) mm", hue="Dataset", data=df_summary)

# ax = sns.lineplot(x="Train sample size", y="Error distance (90%) mm", hue="Dataset", errorbar=None, data=df_summary, legend=False, lw=3)

# # Add error bars to the plot using the standard deviation values
# # col = sns.color_palette("tab10")[0]
# # plt.errorbar(median_df["Train sample size"], median_df["median"], yerr=median_df["std"], fmt='none', ecolor=col)#, ecolor='k')

# # # Add lines connecting the median values for each "Train sample size" value
# # sns.lineplot(x="Train sample size", y="median", data=median_df, ax=ax, color=col)

# # # Set the title of the subplot to the current SE_folder value
# # ax.set_title(se_folder)

# # ax.set_xlabel("Train sample size")

# # ax.set_ylabel("Error distance (90%) mm")

# # plt.savefig(f"{output_folder}/{se_folder}_error_dist_90.png", dpi=300, bbox_inches='tight', pad_inches=0.1, facecolor='w')

# # plt.show()
# # plt.close()

In [None]:
# Distinct dataset labels in the summary (the values used for the plot hue).
df_summary['Dataset'].unique()

In [None]:
# 90th-percentile localisation error vs. training-set size, one colour per dataset.
fig, ax = plt.subplots(figsize=(10, 7))

# Order in which the datasets are coloured and listed in the legend.
hue_order = ["Arabidopsis (7 DAG, primary)", "Rice (3 DAG, primary)", "Canola (5-13 DAG, primary)",
             "Arabidopsis (7 DAG, lateral)", "Soybean (5-8 DAG, lateral)", "Soybean (5-8 DAG, primary)",
             "Rice (3 DAG, crown)", "Rice (10 DAG, crown)", "Canola (5-13 DAG, lateral)"]

# Individual models as points ...
sns.scatterplot(
    x="Train sample size",
    y="Error distance (90%) mm",
    hue="Dataset",
    hue_order=hue_order,
    data=df_summary,
    ax=ax,
)

# ... and the per-sample-size trend of each dataset as a line (no extra legend entries).
sns.lineplot(
    x="Train sample size",
    y="Error distance (90%) mm",
    hue="Dataset",
    hue_order=hue_order,
    errorbar=None,
    data=df_summary,
    legend=False,
    lw=3,
    ax=ax,
)

# Rebuild the legend only to set its title. The labels must be the ones that
# came with the handles: passing a separate list (e.g. hue_order) merely renames
# the entries positionally and would silently mislabel colours on any mismatch.
handles, labels = ax.get_legend_handles_labels()
ax.legend(handles=handles, labels=labels, title="Root model")

# Save a raster and a vector version of the same figure.
for ext in ("png", "pdf"):
    fig.savefig(
        f"figures/sample_efficiency/sample_efficiency_all_models.{ext}",
        bbox_inches="tight",
        facecolor="white",
    )

In [None]:
threshold = 3  # mm
error_col = "Error distance (90%) mm"

# Order in which the datasets are listed on the bar plot.
hue_order = ["Arabidopsis (7 DAG, primary)", "Rice (3 DAG, primary)", "Canola (5-13 DAG, primary)",
             "Arabidopsis (7 DAG, lateral)", "Soybean (5-8 DAG, lateral)", "Soybean (5-8 DAG, primary)",
             "Rice (3 DAG, crown)", "Rice (10 DAG, crown)", "Canola (5-13 DAG, lateral)"]


def is_accurate(df):
    """Flag rows whose error is within `threshold` mm of their dataset's best.

    Args:
        df: DataFrame with "Dataset" and "Error distance (90%) mm" columns. It
            may hold one dataset or several; the best (minimum) error is taken
            per dataset.

    Returns:
        A copy of `df` with an added boolean "is_accurate" column. The input
        frame is not modified.
    """
    # transform("min") broadcasts each dataset's minimum back onto its rows, so
    # no groupby.apply (whose result index differs between pandas versions) and
    # no in-place mutation of a group is needed.
    best = df.groupby("Dataset")[error_col].transform("min")
    return df.assign(is_accurate=(df[error_col] - best) < threshold)


# Median error across replicate models for each (dataset, training-set size).
median_error = (
    df_summary.groupby(["Dataset", "Train sample size"])[error_col].median().reset_index()
)

# Smallest training-set size per dataset whose median error is within the
# threshold of that dataset's best. The result is bound to `df` because the
# export cell that follows writes `df` to CSV.
df = (
    is_accurate(median_error)
    .loc[lambda frame: frame["is_accurate"]]
    .sort_values("Train sample size")
    .groupby("Dataset")
    .first()
    .reset_index()
    .drop(columns="is_accurate")
)

fig, ax = plt.subplots(figsize=(8, 6))
sns.barplot(x="Train sample size", y="Dataset", data=df, order=hue_order, ax=ax)
# Derive the label from `threshold` so the text cannot drift from the value used.
ax.set_xlabel(f"Labels required to attain accuracy within {threshold} mm of best performance")
ax.set_ylabel("Root model")

# Save a raster and a vector version of the same figure.
for ext in ("png", "pdf"):
    fig.savefig(
        f"figures/sample_efficiency/number_labeled_frames_required_for_accurate_predictions.{ext}",
        bbox_inches="tight",
        facecolor="white",
    )

In [None]:
# Export, per dataset, the smallest training-set size that reaches accuracy
# within the threshold of the best performance.
df.to_csv("labels_for_minimally_accurate_model.csv", index=False)