# Spyfish Status Board

This notebook builds the status board of Surveys and BUV Deployments, including:
- File presence validation (checking if video files exist in S3)
- Annotation status (expert, ML, and Biigle annotations)
- Survey-level summaries and statistics


In [None]:
# Last change: 2025.11.22

In [None]:
# DEV
# Automatically reload edited local modules so code changes are picked up continuously.
# Comment these two lines out if local code changes should not be reloaded.
%load_ext autoreload
%autoreload 2

## Load Data from S3


In [None]:
import pandas as pd

from sftk.s3_handler import S3Handler
from sftk.common import S3_SHAREPOINT_SPECIES_CSV, S3_SHAREPOINT_SITE_CSV, S3_SHAREPOINT_SURVEY_CSV, S3_KSO_ANNOTATIONS_CSV, S3_SHAREPOINT_DEPLOYMENT_CSV, FILE_PRESENCE_RULES
from sftk.utils import filter_file_paths_by_extension, read_file_to_df

# Display the file presence rules imported from sftk.common for reference.
FILE_PRESENCE_RULES

In [None]:
s3_handler = S3Handler()

# Only the survey, annotation and deployment tables are read here; the species
# and site reads stay disabled.
# scientific_names_df = s3_handler.read_df_from_s3_csv(S3_SHAREPOINT_SPECIES_CSV)
# site_id_df = s3_handler.read_df_from_s3_csv(S3_SHAREPOINT_SITE_CSV)
surveys_df, annotations_df, deployment_df = (
    s3_handler.read_df_from_s3_csv(csv_key)
    for csv_key in (
        S3_SHAREPOINT_SURVEY_CSV,
        S3_KSO_ANNOTATIONS_CSV,
        S3_SHAREPOINT_DEPLOYMENT_CSV,
    )
)

# Uncomment any of these to inspect the column names of a loaded table:
# scientific_names_df.columns,
# site_id_df.columns,
# surveys_df.columns,
# annotations_df.columns,
# deployment_df.columns,



## Check which surveys are missing/extra in BUV Deployments

In [None]:
# Distinct survey identifiers found in each table.
survey_ids_surveys = set(surveys_df["SurveyID"].unique())
survey_ids_deps = set(deployment_df["SurveyID"].unique())

# Report the identifiers that appear on one side only, in both directions.
print(
    f"{len(survey_ids_surveys.difference(survey_ids_deps))} surveys in surveys but not in deployments:\n",
    survey_ids_surveys.difference(survey_ids_deps),
)
print(
    f"{len(survey_ids_deps.difference(survey_ids_surveys))} Surveys in deployments but not in surveys:\n",
    survey_ids_deps.difference(survey_ids_surveys),
)

## Merge Surveys and Deployments


In [None]:
# Left join: every survey row is kept, including surveys without any deployment.
dep_surv_df = surveys_df.merge(
    right=deployment_df,
    how="left",
    on="SurveyID",
)
dep_surv_df

In [None]:
# Sanity check: the number of merged rows without a DropID should equal the
# number of surveys that have no deployments (the cell shows True when it does).
dep_surv_df["DropID"].isna().sum() == len(survey_ids_surveys - survey_ids_deps)

## Create Deployment Status Dataset

In [None]:
# Work on an independent copy holding only the columns the status board needs.
dep_surv_min_df = dep_surv_df.loc[
    :, ["SurveyID", "DropID", "LinkToVideoFile", "IsBadDeployment"]
].copy()
dep_surv_min_df

## Add Expert Annotations Count 

Counts the number of expert annotation rows per `DropID`; deployments without any annotation rows get a count of 0.


In [None]:
# Number of annotation rows recorded for each DropID.
counts = annotations_df["DropID"].value_counts()

# Look up each deployment's count; deployments without annotation rows get 0.
dep_surv_min_df["expert_annotations"] = (
    dep_surv_min_df["DropID"]
    .map(counts)
    .fillna(0)
    .astype(int)
)

# Shape of the subset of deployments that have at least one expert annotation row.
dep_surv_min_df.loc[dep_surv_min_df["expert_annotations"].ne(0)].shape

## Validate File Presence in S3


In [None]:
from sftk.validation_strategies import FilePresenceValidator

# NOTE(review): FILE_PRESENCE_RULES is already imported in the notebook's first
# import cell, so this second import is redundant.
from sftk.common import FILE_PRESENCE_RULES
# Build the validator from the presence rules and the shared S3 handler.
file_presence_validator = FilePresenceValidator(FILE_PRESENCE_RULES, s3_handler)
# presumably missing_files are expected files absent from S3 and extra_files are
# files in S3 that are not expected — confirm against get_file_differences.
missing_files, extra_files = file_presence_validator.get_file_differences(FILE_PRESENCE_RULES)

# Previously observed (missing, extra) counts — presumably dated day.month:
# (558, 3242) 15.11
# (307, 2135) 22.11

# Show the current counts as the cell output.
"Missing files", len(missing_files), "Extra files", len(extra_files)

In [None]:
# A file counts as present when the row has a video link and that link is not
# among the files reported as missing by the file presence validation.
dep_surv_min_df["file_present"] = ~(
    dep_surv_min_df["LinkToVideoFile"].isna()
    | dep_surv_min_df["LinkToVideoFile"].isin(missing_files)
)

# Number of rows without a present file, per survey
dep_surv_min_df.loc[~dep_surv_min_df["file_present"]].value_counts("SurveyID")

In [None]:
# Spot-check 10 random rows (no random_state is set, so the rows differ between runs).
dep_surv_min_df.sample(10)

## Add annotations info

In [None]:
# ML and Biigle annotation flags are hard-coded placeholders for now: every
# deployment is marked as having neither.
dep_surv_min_df["ML_annotations"] = False
dep_surv_min_df["biigle_annotations"] = False

# Cast the boolean flags to int before summing. Adding two boolean columns is a
# logical OR, so without the cast a deployment with both flags set would only
# contribute 1 to the total instead of 2.
dep_surv_min_df["annotations_count"] = (
    dep_surv_min_df["ML_annotations"].astype(int)
    + dep_surv_min_df["biigle_annotations"].astype(int)
    + dep_surv_min_df["expert_annotations"]
)

# A deployment is considered annotated as soon as any source contributes.
dep_surv_min_df["annotations"] = dep_surv_min_df["annotations_count"] > 0
dep_surv_min_df

In [None]:
# A deployment is accounted for when its video file is present or it is flagged
# as a bad deployment.
# NOTE(review): presumably IsBadDeployment is missing on rows of surveys without
# deployments after the left merge — confirm `|` treats those values as False.
dep_surv_min_df["file_present_or_bad_deployment"] = dep_surv_min_df["file_present"] | dep_surv_min_df["IsBadDeployment"]

In [None]:
# Spot-check the two status flags on 10 random rows (unseeded, so rows differ between runs).
dep_surv_min_df[["SurveyID", "DropID", "file_present_or_bad_deployment", "annotations" ]].sample(10)

## Count Deployments per Survey


In [None]:

# Attach to every row the number of non-null DropIDs recorded for its survey.
deployments_by_survey = dep_surv_min_df.groupby("SurveyID")["DropID"]
dep_surv_min_df["DropID_per_SurveyID_count"] = deployments_by_survey.transform("count")
del deployments_by_survey  # keep the notebook namespace unchanged
dep_surv_min_df

In [None]:
# Show the per-deployment status columns side by side.
dep_surv_min_df[["SurveyID", "DropID","IsBadDeployment", "DropID_per_SurveyID_count", "file_present_or_bad_deployment", "annotations" ]]

In [None]:
# List the columns now present on the deployment status frame.
dep_surv_min_df.columns

## Create Survey-Level Summary


In [None]:
# One row per survey:
#   DropID_count                          - number of non-null DropIDs
#   file_present_or_bad_deployment_count  - rows whose file is present or that are bad deployments
#   annotations_count                     - rows flagged as having annotations
summary_df = dep_surv_min_df.groupby("SurveyID", as_index=False).agg(
    DropID_count=("DropID", "count"),
    file_present_or_bad_deployment_count=("file_present_or_bad_deployment", "sum"),
    annotations_count=("annotations", "sum"),
)


In [None]:

# Surveys that have at least one deployment flagged as annotated.
summary_df[summary_df["annotations_count"] != 0]

In [None]:
# Lift pandas' display limits so the tables below are rendered in full.
pd.options.display.max_colwidth = None  # never truncate the content of a cell
pd.options.display.max_rows = None  # show every row, however many there are
pd.options.display.max_columns = None  # show every column, however many there are

In [None]:
# Surveys ordered by their number of deployments, largest first.
summary_df.sort_values(by="DropID_count", ascending=False)

In [None]:
# Deployments per survey that are neither backed by a present file nor marked
# as bad, and deployments per survey that have no annotations.
summary_df["files_missing"] = summary_df["DropID_count"].sub(
    summary_df["file_present_or_bad_deployment_count"]
)
summary_df["annotations_missing"] = summary_df["DropID_count"].sub(
    summary_df["annotations_count"]
)

# Spot-check 5 random surveys.
summary_df.loc[:, ["SurveyID", "DropID_count", "files_missing", "annotations_missing"]].sample(5)

In [None]:
# List the columns of the survey-level summary.
summary_df.columns

In [None]:
# END