# Combine annotations into one csv file

This notebook is part of the Spyfish Aotearoa data cleaning efforts. It concatenates the files containing extracted expert annotations into a single CSV file, which is then uploaded to the S3 bucket.

In the second part of the notebook, there are some visual checks to see whether anything looks suspicious in the annotations. The input files should already be ready for upload, so there shouldn't be any irregularities. If there are, the previous notebook (legacy_annotations_extract) needs to be updated and the export files regenerated - or the fix can be made by hand, but with a TODO added to that notebook.

TODO: Some of these checks could be repurposed for the automatic tests of the annotations being saved in the S3 buckets in the future.


In [1]:
# Last changed 2025.05.03

In [27]:
import os
import pandas as pd
import datetime

from sftk.common import S3_SHAREPOINT_SURVEY_CSV
from sftk.utils import filter_file_paths_by_extension, read_file_to_df
from sftk.s3_handler import S3Handler

In [5]:
# Folder whose CSV files are listed and combined in the cells below.
# NOTE(review): "/path/to/data/..." looks like a placeholder - set it to the
# local folder holding the extracted annotation files before running.
annotations_folder = "/path/to/data/Video analysis/export"

In [None]:
# Collect the CSV files to combine. Names containing "~" are skipped
# (presumably temporary/lock files left by an open spreadsheet editor).
all_files = os.listdir(annotations_folder)
csv_file_names = filter_file_paths_by_extension(all_files, ["csv"])
# os.listdir returns entries in arbitrary order; sort the paths so the files
# are always combined in the same order and the exported CSV is reproducible.
tabular_files = sorted(
    os.path.join(annotations_folder, file_name)
    for file_name in csv_file_names
    if "~" not in file_name
)
tabular_files

## Combine the files into one dataframe


In [7]:
def combine_annotations(all_tab_files):
    """Read every annotation file and stack the results into one DataFrame.

    Files that cannot be read are reported on stdout and skipped, so a single
    broken file does not stop the whole run.

    Args:
        all_tab_files: Iterable of paths to the tabular annotation files.

    Returns:
        pd.DataFrame: The rows of all readable files, concatenated in the
        order the files were given.

    Raises:
        ValueError: If no file could be read (including an empty input).
    """
    dfs = []
    for f in all_tab_files:
        try:
            dfs.append(read_file_to_df(f))
        except Exception as e:
            # Best effort: report the failure and carry on with the other files.
            print(f"{f} not read, {e}")
    if not dfs:
        # pd.concat([]) would also raise ValueError, but its message says
        # nothing about the actual cause.
        raise ValueError(
            "No annotation files could be read, so there is nothing to combine."
        )
    combined_df = pd.concat(dfs, axis=0)
    return combined_df

In [None]:
combined_df = combine_annotations(tabular_files)
print(combined_df.shape)
print(combined_df.columns)
# Preview up to 10 random rows. sample() raises ValueError when asked for more
# rows than the frame holds, so the size is capped for small inputs.
combined_df.sample(min(10, len(combined_df)))

In [15]:
# These are expert annotations, so ConfidenceAgreement is set to the literal
# string "NA" on every row.
combined_df["ConfidenceAgreement"] = "NA"

# Check validity of various columns

## Review null deployments

In [None]:
# Check for missing values in the key columns. A notebook cell only displays
# its last bare expression, so the three columns are checked with one combined
# mask rather than three separate expressions (two of which would be dropped).
# TimeOfMax is the column that most often shows irregularities.
null_check_columns = ["ScientificName", "MaxInterval", "TimeOfMax"]
# Number of missing values per checked column.
print(combined_df[null_check_columns].isna().sum())
# Every row that is missing a value in at least one of the checked columns.
combined_df[combined_df[null_check_columns].isna().any(axis=1)]

In [None]:
# Replace every missing value with the literal string "NULL"; the TimeOfMax
# format check further down treats that string as the "no value" marker.
combined_df = combined_df.fillna("NULL")

## Check species names

Species prefixed with FIX_ need review, as do sp1, sp2, sp3, sp4, sp5, sp6, sp7, and any mention of unknown/undefined.
TODO:
- check species with species name checker to make sure all is good


In [None]:
# Number of rows per species label, for a visual check of the names in use.
species_counts = combined_df["ScientificName"].value_counts()
species_counts

In [17]:
def rename_species(name):
    """Prefix placeholder species labels with "FIX_" so they stand out for review.

    The notebook's review rule lists sp1 through sp7 as labels that need
    review, so all seven are flagged. Any other value (including one that is
    already prefixed with "FIX_") is returned unchanged, which makes the
    function safe to apply more than once.

    Args:
        name: Species label taken from the ScientificName column.

    Returns:
        "FIX_<name>" for a placeholder label, otherwise ``name`` unchanged.
    """
    # sp1 ... sp7, matching the list in the "Check species names" section.
    placeholder_labels = {f"sp{number}" for number in range(1, 8)}
    if name in placeholder_labels:
        return f"FIX_{name}"
    return name

In [18]:
# Run every species label through rename_species and store the result back.
scientific_names = combined_df["ScientificName"]
combined_df["ScientificName"] = scientific_names.apply(rename_species)

## Check MaxInterval & TimeOfMax

In [None]:
# Distinct MaxInterval values, for a visual check.
max_interval_values = combined_df["MaxInterval"].unique()
max_interval_values

In [None]:
# Distinct TimeOfMax values, for a visual check.
time_of_max_values = combined_df["TimeOfMax"].unique()
time_of_max_values

In [None]:
# Rows whose TimeOfMax is neither 8 characters long nor the 'NULL' marker.
# TODO: this only checks the string length; a regex check would be stricter.
time_of_max = combined_df["TimeOfMax"]
is_expected_length = time_of_max.str.len() == 8
is_null_marker = time_of_max == 'NULL'
combined_df[~(is_expected_length | is_null_marker)]

## Compare SurveyID presence in annotation vs metadata 

In [None]:
# The first 16 characters of each DropID are used as its SurveyID.
drop_id_prefixes = combined_df['DropID'].str[:16]
surveyIDs_annotations_set = set(drop_id_prefixes.unique())
len(surveyIDs_annotations_set), surveyIDs_annotations_set

In [28]:
# Read the survey metadata CSV from S3 and collect the values of its SurveyID
# column into a set, for comparison with the SurveyIDs found in the annotations.
# NOTE(review): S3Handler() presumably picks up credentials/bucket settings
# from the environment - confirm before running.
s3_handler = S3Handler()
surveys_df = s3_handler.read_df_from_s3_csv(S3_SHAREPOINT_SURVEY_CSV )
surveyIDs_metadata_set = set(surveys_df["SurveyID"])

In [None]:
# Split the SurveyIDs into: present in both sources, annotations only, and
# survey metadata only.
common = surveyIDs_annotations_set.intersection(surveyIDs_metadata_set)
only_in_annotations = surveyIDs_annotations_set.difference(surveyIDs_metadata_set)
only_in_surveys = surveyIDs_metadata_set.difference(surveyIDs_annotations_set)

# Report each group as a count followed by the sorted IDs.
print(f"Reviewing files annotations and surveys, there are {len(common)} SurveyIDs in common.")
print(f"The two files have the following {len(common)} SurveyIds in common:")
print(sorted(common))

print(f"The {len(only_in_annotations)} SurveyIDs present only in annotations are:")
print(sorted(only_in_annotations))

print(f"The {len(only_in_surveys)} SurveyIDs present only in surveys are:")
print(sorted(only_in_surveys))

## Review duplicates

In [None]:
# Show every row that has an exact duplicate; keep=False marks all copies,
# not just the repeats.
is_duplicate_row = combined_df.duplicated(keep=False)
combined_df[is_duplicate_row]

# Export combined_df to combined annotations file

In [39]:
# Create an "export" subfolder inside annotations_folder to hold the combined
# files; exist_ok=True makes re-running this cell a no-op if it already exists.
path_to_export = os.path.join(annotations_folder, "export")
os.makedirs(path_to_export, exist_ok=True)

In [None]:
# Today's date as text; it is used as the prefix of the exported file names.
today = datetime.date.today()
current_date = str(today)
current_date

In [None]:
# (rows, columns) of the combined annotations about to be exported.
combined_df.shape

In [None]:
# NOTE: despite its name, export_excel_file_name holds a CSV file name; the
# variable name is kept so that anything else referring to it keeps working.
export_excel_file_name = f"{current_date}_annotations_buv_doc_combined.csv"
export_location = os.path.join(path_to_export, export_excel_file_name)

# Write first and report afterwards, so the message only appears when the
# file was actually written.
combined_df.to_csv(export_location, index=False)
print(f"File containing the concatenated annotations exported to: '{export_location}'")

In [37]:
def _row_has_fix_marker(row):
    """Return True when any cell of the row, rendered as text, starts with "FIX"."""
    cells_as_text = row.astype(str)
    return cells_as_text.str.startswith("FIX").any()


# Keep only the rows in which no cell starts with "FIX".
rows_to_fix = combined_df.apply(_row_has_fix_marker, axis=1)
new_df = combined_df[~rows_to_fix]

In [None]:
# (rows, columns) left after dropping the rows flagged with "FIX".
new_df.shape

In [None]:

# NOTE: despite its name, export_excel_file_name holds a CSV file name; the
# variable name is kept so that anything else referring to it keeps working.
export_excel_file_name = f"{current_date}_annotations_buv_doc_combined_fix_removed.csv"
export_location = os.path.join(path_to_export, export_excel_file_name)

# Write first and report afterwards, so the message only appears when the
# file was actually written.
new_df.to_csv(export_location, index=False)
print(f"File containing the concatenated annotations without the rows to be fixed exported to: '{export_location}'")