# Extract annotation data from Video analysis annotation files.

This notebook is part of the Spyfish Aotearoa existing-data cleaning effort. It extracts the annotations - specifically the SurveyIDs, scientific names, (max) count, time of max and time interval - from the files containing video analysis.

These annotations are provided by experts. Spyfish Aotearoa also has citizen science annotations and ML annotations (the ML models will be trained with the existing expert and citizen science annotations).

This notebook guides through the export of the annotations from the video analysis files, while also checking the validity and cleaning the various entries. 


What's happening in this notebook:
- load file/relevant excel sheet into df      
- clean scientific names
- review siteID
- define ReplicateWithinSite
- create DropID
- ...

In [None]:
# Last changed 2025.05.03

In [None]:
# When reviewing this notebook next time, take a look at the Gemini comments here: 
# https://github.com/wildlifeai/Spyfish-Aotearoa-toolkit/pull/36

In [262]:
import pandas as pd
import numpy as np
import os

import datetime

from sftk.s3_handler import S3Handler
from sftk.common import S3_SHAREPOINT_SPECIES_CSV, S3_BUCKET, S3_SHAREPOINT_SITE_CSV, S3_SHAREPOINT_SURVEY_CSV
from sftk.utils import filter_file_paths_by_extension, read_file_to_df

# Load reference tables (scientific names, sites, surveys)

In [None]:
# Handler used by the cells below to read the reference CSV files from S3.
# NOTE(review): how credentials are resolved is not visible here — see sftk.s3_handler.
s3_handler = S3Handler()

In [None]:
# Species reference table. Its `scientificName` and `commonName` columns are used
# further down to validate and translate the names found in the annotation file.
scientific_names_df = s3_handler.read_df_from_s3_csv(S3_SHAREPOINT_SPECIES_CSV, S3_BUCKET)
scientific_names_df.columns

In [None]:
# Site reference table. Later cells look up rows in it by `SiteID` and `SiteCode`.
site_id_df = s3_handler.read_df_from_s3_csv(S3_SHAREPOINT_SITE_CSV, S3_BUCKET)
site_id_df.columns

In [None]:
# Survey reference table. Its `SurveyID` column is the list of valid survey ids
# that the SurveyIDs found in the annotation file are checked against.
surveys_df = s3_handler.read_df_from_s3_csv(S3_SHAREPOINT_SURVEY_CSV, S3_BUCKET)
surveys_df.columns

# Get sharepoint files 

# Get and select file to process:

1. Download the files from General > Spyfish > Video analysis
[link](https://docnz.sharepoint.com/teams/SpyfishAotearoa/Shared%20Documents/Forms/AllItems.aspx?id=%2Fteams%2FSpyfishAotearoa%2FShared%20Documents%2FGeneral%2FSpyfish%2FVideo%20analysis)

2. Set `video_analysis_folder` below to the local folder containing the downloaded files



In [3]:
# Local folder holding the downloaded "Video analysis" files — edit before running.
# The export cell at the end writes into an "export" sub-folder of this folder.
video_analysis_folder = "/path/to/data/Video analysis"
# Extensions treated as tabular annotation files when the folder is listed.
tabular_file_extensions = ("xlsx", "xls", "xlsm", "csv")

In [None]:
# Collect the tabular files found in the folder, skipping temporary files
# (names containing "~"), and store their full paths.
# The list is sorted: os.listdir() returns entries in arbitrary order, and the
# index printed below is what `selected_file` refers to, so it must be stable.
all_files = os.listdir(video_analysis_folder)
tabular_files = sorted(
    os.path.join(video_analysis_folder, tabular_name)
    for tabular_name in filter_file_paths_by_extension(all_files, tabular_file_extensions)
    if "~" not in tabular_name
)

# Enumerate current files and select which one to process next
for file_index, tabular_path in enumerate(tabular_files):
    print(file_index, os.path.basename(tabular_path))

### Select file to process by changing number below: 

The process from here on is to be repeated for each file we need to process.
Restart here for each file that needs processing.

In [968]:
# Index (as printed by the listing cell) of the file to process in this pass.
selected_file = 0

In [None]:

# Resolve the chosen index to a path. `file_name` is reused further down for
# messages, for a file-specific SurveyID fix and for the export file name.
file_path = tabular_files[selected_file]
# basename() rather than split("/") so other path separators are handled too
file_name = os.path.basename(file_path)
print(f"File selected: {file_name}")

Read data sheet: 

In [None]:
# If 'All counts compiled' exists it is used automatically, otherwise select the sheet containing the annotations.
# DATA is often used, however make sure to check for interval here.
# NOTE(review): pd.ExcelFile only opens Excel workbooks; the "csv" entries allowed by
# `tabular_file_extensions` will fail here — confirm how csv files should be handled.

# The context manager closes the workbook handle once the sheet names are read.
with pd.ExcelFile(file_path) as workbook:
    sheets = workbook.sheet_names

sheet_name = "All counts compiled"
if sheet_name not in sheets:
    for sheet_index, candidate_sheet in enumerate(sheets):
        print(sheet_index, candidate_sheet)
    # Keep asking until a valid index is typed instead of crashing on a typo.
    while True:
        answer = input("select sheet you want to process: ").strip()
        if answer.isdecimal() and int(answer) < len(sheets):
            break
        print(f"Please type a number between 0 and {len(sheets) - 1}")
    sheet_name = sheets[int(answer)]
    print("\nSelected sheet name: ", sheet_name)

current_file_df = read_file_to_df(file_path, sheet_name=sheet_name)
print(current_file_df.shape)
print(current_file_df.columns)
# sample() raises when asked for more rows than exist, so cap it at the frame length
current_file_df.sample(min(3, len(current_file_df)))


In [973]:
# Extract & rename the relevant columns to 'ScientificName', 'TimeOfMax', 'MaxInterval'.
# TODO test if it works for atypical files
# TODO should start interval also be a column

# Target names, in the order (species name, time of max, max count).
output_columns = ('ScientificName', 'TimeOfMax', 'MaxInterval')
columns_in_current_sheet = set(current_file_df.columns)

# Known column-name variants, each written in the SAME order as `output_columns`.
# Tuples are used on purpose: sets have no defined order, so zipping two sets
# could pair e.g. the count column with 'ScientificName'.
columns_in_sheets = [
    ('ScientificName', 'TimeOfMaxN', 'MaxN'),
    ('ScientificName', 'TimeOfMax', 'MaxN'),
    ('ScientificName', 'TimeMaxN', 'MaxN'),
    ('ScientificName', 'TimeOfMax', 'MaxInterval'),
    ('ScientificName', 'Timeof MaxN', 'MaxN2'),
    ('ScientificNameFish', 'TimeMaxN', 'MaxN'),
    ('CommonName', 'TimeOfMax', 'MaxInterval'),
]

if not set(output_columns) <= columns_in_current_sheet:
    for columns in columns_in_sheets:
        if set(columns) <= columns_in_current_sheet:
            # Positional pairing: old name -> target name; identical names are skipped.
            rename_dict = {
                old_name: new_name
                for old_name, new_name in zip(columns, output_columns)
                if old_name != new_name
            }
            print(f"original column names to be renamed: {rename_dict}")
            current_file_df = current_file_df.rename(columns=rename_dict)
            # Stop at the first matching variant so a second rename cannot create duplicate columns.
            break
    else:
        print(f"No known column combination found in: {list(current_file_df.columns)}")

In [None]:
# Confirm/fix SurveyID to match the BUV SurveyID metadata values

# Inspection only: list the raw SurveyIDs present in the sheet so they can be
# compared with the valid ids from the survey reference table.
unique_survey_ids = current_file_df['SurveyID'].unique()
print("Current file name: ", file_name)
print(f"SurveyIDs in file: {unique_survey_ids}")

In [None]:
# Sorted valid SurveyIDs from the survey reference table, for visual comparison.
np.sort(surveys_df["SurveyID"].unique())

In [976]:
# Set of valid SurveyIDs; fix_survey_id() checks every mapped id against it.
existing_survey_ids = set(surveys_df["SurveyID"].unique())

# Raw SurveyID as written in a video analysis file -> SurveyID used in the survey
# metadata. Grouped by the source file the raw ids were found in.
# Every key appears exactly once: a repeated key in a dict literal is silently
# overwritten, which hides conflicting entries.
survey_mapping = {
    'BUV_BNP_20181216': 'BNP_20181216_BUV',
    # MPAMAR Data BUV Akaroa Pohatu 2017 Video analysis data sheet.xlsm
    'BUV_BNP_20170223': 'BNP_20170223_BUV',
    # MPAMAR data Akaroa Pohatu BUV 2021 - Video analysis data sheet - DOC-7166069.xlsm
    'BUV_BNP_20210127': 'BNP_20210127_BUV',
    # MPAMAR Data BUV Te Tapuwae o Rongokako 2021 - Video analysis sheet - DOC-6731514.xlsm
    'BUV_TTR_20210125': 'RON_20210125_BUV',
    # MPAMAR Data BUV Tuhua 2021 Video analysis sheet - DOC-6891090.xlsm
    'BUV_TUH_20210311': 'TUH_20210309_BUV',
    'BUV_TUH_20210310': 'TUH_20210309_BUV',
    'BUV_TUH_20210309': 'TUH_20210309_BUV',

    # "MRMDATA - BUV - Parininihi - 2012_2014 - DOC2787054 - DOC-2787054.xlsm"
    'BUV_TNK_20140405': 'PAR_20140404_BUV',
    'BUV_TNK_20120423': 'PAR_20120224_BUV',
    # "MRMDATA - BUV - Tapuae - 2011_2015 - DOC-2639983.xlsm":
    'BUV_TNK_20110324': 'SLI_20110413_BUV',
    'BUV_TNK_20130227': 'SLI_20130227_BUV',
    'BUV_TNK_20150216': 'SLI_20150216_BUV',
    # MPAMAR Data BUV Tonga Island 2021 Video analysis data sheet.xlsm
    'BUV_TON_20210101': 'TON_20211026_BUV',
    'BUV_TON_20211026': 'TON_20211026_BUV',
    'BUV_TON_20211027': 'TON_20211026_BUV',
    # MPAMAR Data BUV Tuhua 2020 Video analysis sheet.xlsm
    'BUV_TUH_20200922': 'TUH_20200830_BUV',
    'BUV_TUH_20200831': 'TUH_20200830_BUV',
    'BUV_TUH_20200901': 'TUH_20200830_BUV',
    'BUV_TUH_20200830': 'TUH_20200830_BUV',

    # MPAMAR Data BUV Horoirangi 2021 - Video analysis sheet.xlsm
    'HMR_20211122_BUV': 'HOR_20211122_BUV',

    'BUV_TEA_20210313': 'ANG_20210313_BUV',

    'BUV_TAP_01012022': 'SLI_20220228_BUV',
}

In [977]:
def fix_survey_id(survey_id):
    """Translate a raw SurveyID from the sheet into the id used in the survey metadata.

    Missing values come back as "FIX". Ids that are still unknown after the
    `survey_mapping` lookup come back prefixed with "FIX_" so they stand out.
    Reads the notebook globals `file_name`, `survey_mapping` and
    `existing_survey_ids`.
    """
    if pd.isna(survey_id):
        print("survey issue")
        return "FIX"

    # For the Tapuae 2024 sheet every row is forced to one survey id,
    # whatever the cell contains.
    raw_id = (
        'SLI_20240124_BUV'
        if file_name == "MPAMAR Data BUV Tapuae 2024 - Video analysis sheet.xlsm"
        else survey_id
    )

    mapped_id = survey_mapping.get(raw_id, raw_id)
    if mapped_id in existing_survey_ids:
        return mapped_id

    print(f"survey issue,{mapped_id}, {raw_id}")
    return f"FIX_{mapped_id}"

In [None]:
# Confirm/fix SurveyID to match the BUV SurveyID metadata values
# Raw ids are printed first so the effect of the fix can be compared below.
unique_survey_ids = current_file_df['SurveyID'].unique()

print("Current file name: ", file_name)
print(f"SurveyIDs in file: {unique_survey_ids}")

# Overwrites the column in place; values starting with "FIX" need manual review.
current_file_df['SurveyID'] = current_file_df['SurveyID'].apply(fix_survey_id)

print("Validated survey ids:", current_file_df['SurveyID'].unique()) 

In [None]:
# Check if you need to fix SiteID, if it looks good go to next section
# For each site/survey column: number of distinct values, number of missing
# values, then the distinct values themselves.
print(f"row nums: {current_file_df.shape[0]}")
for label, column_name in (
    ("siteIDs", "SiteID"),
    ("SiteName", "SiteName"),
    ("SiteCode", "SiteCode"),
    ("SurveyName", "SurveyName"),
):
    distinct_values = current_file_df[column_name].unique()
    missing_count = current_file_df[column_name].isna().sum()
    print(f"len {label}: {len(distinct_values)}, nan values: {missing_count}")
    print(distinct_values)


In [None]:
# TODO so that it matches survey?
# TODO not robust, built on the go, does not cover all cases

def fix_site_id(site_id):
    """Normalise a SiteID read from a video analysis sheet.

    - Missing or non-text values are returned unchanged (no crash on NaN).
    - Surrounding whitespace is removed.
    - The "TNK" prefix is replaced by "SLI".
    - A purely numeric suffix after the last "_" is zero-padded to three
      digits, e.g. "BNP_97" -> "BNP_097".

    Returns the normalised id, or the input itself when it is not a string.
    """
    if not isinstance(site_id, str):
        # NaN / None / numbers: nothing to normalise, leave for later review
        return site_id

    site_id = site_id.strip()
    if site_id.startswith("TNK"):
        site_id = "SLI" + site_id[3:]

    prefix, separator, number = site_id.rpartition("_")
    if separator and prefix and number.isdecimal():
        return f"{prefix}_{number.zfill(3)}"
    return site_id

In [774]:
# Apply the SiteID corrections to every row and list the resulting ids for review.
current_file_df["SiteID"] = current_file_df["SiteID"].apply(fix_site_id)
print(current_file_df['SiteID'].unique())

In [747]:
# One-off correction BNP_97 -> BNP_097 (the same replacement fix_site_id() performs).
current_file_df.loc[current_file_df['SiteID'] == "BNP_97", 'SiteID'] = "BNP_097"

In [None]:
# Spot check: coordinates recorded in the sheet for site TNK_011 ...
current_file_df[current_file_df["SiteID"] == "TNK_011"][["Latitude", "Longitude"]]

In [None]:
# ... versus the reference entry SLI_011 (fix_site_id() maps the TNK prefix to SLI).
site_id_df[site_id_df["SiteID"] == "SLI_011"]

# Create DropID

SurveyID_SiteID_ReplicateWithinSite


### Get replicate within site

ReplicateWithinSite is a 2-digit number that starts at 01 and increases by one for each repeated deployment at the same site.

If there are multiple years per file, repeat this process.

In [926]:

# Work on a copy so the cleaning steps below do not modify current_file_df.
clean_df = current_file_df.copy()

### SiteID checks

In [None]:
# current_file_df[ "SiteID"] = current_file_df[ "SiteName"]

# if current_file_df["SurveyID"].unique() == ["RON_20210125_BUV"]:
#     current_file_df[ "SiteID"] = current_file_df[ "SiteCode"]

In [None]:

# Optional per-survey filter: when a file contains several surveys, uncomment ONE
# of the filter lines below to keep a single survey. The row count is printed
# before and after so the effect of the filter is visible.
print(len(clean_df))

# ['PAR_20120224_BUV' 'PAR_20140404_BUV']
# clean_df = clean_df[clean_df['SurveyID'] == 'PAR_20120224_BUV']
# clean_df = clean_df[clean_df['SurveyID'] == 'PAR_20140404_BUV']

# ['SLI_20110413_BUV', 'SLI_20130227_BUV' , 'SLI_20150216_BUV'  ]
# clean_df = clean_df[clean_df['SurveyID'] == 'SLI_20110413_BUV']
# clean_df = clean_df[clean_df['SurveyID'] == 'SLI_20130227_BUV']
# clean_df = clean_df[clean_df['SurveyID'] == 'SLI_20150216_BUV']

print(len(clean_df))

In [None]:

# Rows flagged as bad deployments are selected once and reused for the summary.
bad_deployment_rows = clean_df[clean_df['IsBadDeployment'] == True]
bad_site_ids = bad_deployment_rows["SiteID"]

print("SiteIDs with bad deployments: ", bad_site_ids.unique())
print("Unique SiteIDs with Bad Deployments:", len(bad_site_ids.unique()))
print("Total bad deployments:", len(bad_site_ids))
bad_deployment_rows[["SurveyName", "SiteID", "SiteName", "SiteCode"]] # ,"deployment_number"

In [None]:
# Check bad deployments
# Only the last expression of a cell is rendered automatically, so display()
# is used to show both look-ups.
display(clean_df[clean_df['SiteID'] == "TON_007"])
display(clean_df[clean_df["SiteID"] == "TUH_009"])

In [None]:
# Rows that have no TimeOfMax value.
clean_df[clean_df['TimeOfMax'].isna()]

In [None]:
# Find if there is a good deployment out of the bad deployment
# For every bad-deployment row: total number of rows recorded for its site.
# More than one row means the site also has other entries.
bad_site_ids = clean_df.loc[clean_df['IsBadDeployment'] == True, "SiteID"]
for site_id in bad_site_ids:
    rows_for_site = clean_df[clean_df['SiteID'] == site_id]
    print(len(rows_for_site), site_id)

In [None]:
# Check things
# Only the last expression of a cell is rendered automatically, so display()
# is used to show all three look-ups.
display(clean_df[clean_df['SiteID'] == "SLI_072"])
display(site_id_df[site_id_df["SiteCode"] == "SEPPT_D1"])
display(site_id_df[site_id_df["SiteID"] == "TON_006"])

## Create ReplicateWithinSite equivalent:

In [None]:
# Number of rows flagged as bad deployment, then as null sample.
for flag_column in ("IsBadDeployment", "IsNullSample"):
    flagged_rows = clean_df[flag_column] == True
    print(flagged_rows.sum())

In [934]:

# Number the deployments within each site, in row order: a site starts at 1 and
# the number goes up by one after every row flagged as a bad deployment, so the
# rows following a bad deployment belong to the next replicate.
# NOTE(review): this assumes a bad deployment is recorded as a single flagged row —
# several flagged rows for one drop would each advance the number. Confirm.
next_number_per_site = {}
deployment_numbers = []
for site_id, is_bad in zip(clean_df["SiteID"], clean_df["IsBadDeployment"]):
    current_number = next_number_per_site.get(site_id, 1)
    deployment_numbers.append(current_number)
    # Compare with True explicitly, as the other cells do: a missing flag (NaN)
    # is truthy in a plain `if` and must not count as a bad deployment.
    if pd.notna(is_bad) and is_bad == True:
        next_number_per_site[site_id] = current_number + 1

clean_df["deployment_number"] = deployment_numbers



In [None]:
# Sanity check: distinct deployment numbers present in clean_df.
clean_df["deployment_number"].unique()

## Create DropIDs

In [None]:
def make_new_DropID(row):
    """Build the DropID "<SurveyID>_<SiteID>_<NN>" for one row.

    NN is the row's deployment number as an integer, zero-padded to two digits.
    """
    replicate = format(int(row["deployment_number"]), "02d")
    return "_".join([str(row["SurveyID"]), str(row["SiteID"]), replicate])

# Build a DropID for every row, then show three random examples as a spot check.
clean_df["DropID"] = clean_df.apply(make_new_DropID, axis=1)
clean_df["DropID"].sample(3)

In [937]:
# clean_df[clean_df['SiteID']== "SLI_072"]

# Fix Scientific Names

In [None]:
# Rows without a scientific name.
# NOTE(review): this inspects current_file_df, while the name cleaning below runs on
# clean_df — confirm that is intended if clean_df was filtered in between.
current_file_df[current_file_df["ScientificName"].isnull()]

In [939]:
# New species / new nomenclature / typos found in the files, each mapped to the
# name that should appear in the output. clean_name() consults this dictionary
# when a name is not found in the scientific-name list.
dict_added = {
    # New species
    "Cheilodactylus spectabilis": "Chirodactylus spectabilis",
    "Pseudolabrus miles": "Pseudolabrus miles",
    "Conger wilsoni": "Conger wilsoni",
    "Pseudocaranx georgianus": "Pseudocaranx georgianus",
    "Chelidonichthys cuculus": "Chelidonichthys kumu",

    # Not accepted anymore
    "Cephaloscyllium isabellum": "Cephaloscyllium isabella",
    "Chromis dispilus": "Chromis dispila",
    "Upeneichthys porosus": "Upeneichthys lineatus",
    "Upeneichthys porsus": "Upeneichthys lineatus",  # typo?
    "Pagrus aurastus": "Pagrus auratus",
    "Dasyatis brevicaudata": "Bathytoshia brevicaudata",
    "Octopus maorium": "Macroctopus maorum",

    # Typos
    "Pseduolabrus miles": "Pseudolabrus miles",
    "Psedolabrus miles": "Pseudolabrus miles",
    "Odax pullas": "Odax pullus",
    "Chiroremus marmoratus": "Chironemus marmoratus",
    "Psedudocaranx georgianus": "Pseudocaranx georgianus",
    "Gymnothoraz numilus": "Gymnothorax nubilus",
    "Psuedophyscis bacchus": "Pseudophycis bachus",
    "Psuedophycis bachus": "Pseudophycis bachus",
    "Parus auratus": "Pagrus auratus",
    "Chrysophrys auratus": "Pagrus auratus",
    "Chelidonchthys kumu": "Chelidonichthys kumu",
    "Paraperis colias": "Parapercis colias",
    "Seriola lalandi lalandi": "Seriola lalandi",

    # Too broad, to fix
    "Chondrichthyes": "_FIXChondrichthyes",

    # To check (didn't exist)
    "Notoclinus cinctus": "Notolabrus cinctus",  # is it Notolabrus cinctus?
    "Oligoplites saurus": "Oligoplites saurus",
    "Zearaja nasuta": "Dipturus nasutus",
}

# dict_scientific_scientific.update(dict_added)

In [940]:
# Placeholder labels for undefined species, mapped to their lower-case form
# (TODO: these are getting fixed).
# Invalid / null deployments ("Bad deployment", "Null", "Null sample") are not
# part of this dictionary.
extras_dict = {
    label: label.lower()
    for label in ("Sp1", "Sp2", "Sp3", "Sp4", "Sp5", "Sp6", "Sp7", "Unknown", "Other")
}
    

In [941]:
# Lookups built from the species reference table and used by clean_name():
# the set of accepted scientific names, and common name -> scientific name.
# NOTE(review): clean_name() capitalises the incoming name before looking it up,
# so entries whose casing differs from "First letter upper, rest lower" would
# never match — confirm the casing used in the reference table.
scientific_name_set = set(scientific_names_df["scientificName"])
common_name_dict = dict(zip(scientific_names_df["commonName"], scientific_names_df["scientificName"]))

In [942]:
# Initialize counters for different name categories
sn = 0  # Scientific names found
cn = 0  # Common names found
dn = 0  # Names found in user-added dictionary
nn = 0  # Names not found (require fixing)
en = 0  # Names found in extras dictionary
nd = 0  # Null or 'Null sample' values
bd = 0  # 'Bad deployment' entries
def clean_name(name):
    """Map one raw ScientificName cell to its cleaned value.

    The text is whitespace-normalised and capitalised (first letter upper case,
    the rest lower case), then looked up in this order: accepted scientific
    names, `dict_added`, common names, placeholder labels (Sp1..Sp7, Unknown,
    Other), null markers, bad-deployment marker.

    Returns the accepted scientific name, a lower-cased placeholder, "NULL",
    "BAD DEPLOYMENT", or "FIX_<name>" when nothing matched. Each call also
    increments the matching module-level counter.
    Reads the notebook globals `scientific_name_set`, `dict_added` and
    `common_name_dict`.
    """
    global sn, cn, dn, nn, en, nd, bd

    # Missing values have to be caught BEFORE any string method is called on
    # them: NaN is a float and has no .strip().
    if pd.isna(name):
        nd += 1
        return "NULL"

    # str() so a numeric cell does not crash; split/join collapses all
    # whitespace runs and trims both ends.
    name = " ".join(str(name).split()).capitalize()

    # A blank cell carries no name either.
    if not name:
        nd += 1
        return "NULL"

    if name in scientific_name_set:
        sn += 1
        return name

    if name in dict_added:
        dn += 1
        return dict_added[name]

    if name in common_name_dict:
        cn += 1
        return common_name_dict[name]

    if name in {"Sp1", "Sp2", "Sp3", "Sp4", "Sp5", "Sp6", "Sp7", "Unknown", "Other"}:
        en += 1
        return name.lower()

    if name in {'Null', "Null sample"}:
        nd += 1
        return "NULL"

    if name == "Bad deployment":
        bd += 1
        return "BAD DEPLOYMENT"

    print("name not found", name)
    nn += 1
    return f"FIX_{name}"

In [None]:
## print(file_name)
# Reset the counters first: clean_name() increments module-level counters, so
# without the reset a re-run of this cell would add to the previous totals.
sn = cn = dn = nn = en = nd = bd = 0
clean_df["ScientificName"] = clean_df["ScientificName"].apply(clean_name)
# Print summary of results
print(f"""Summary of name cleaning:
- Scientific names found        : {sn}
- Dictionary-added names        : {dn}
- Common names found            : {cn}
- Extras (sp, unknown) found    : {en}
- Null/Null sample entries      : {nd}
- Bad deployment entries        : {bd}
- Names not found (FIX_...)     : {nn}
""")

# Alphabetical list of the cleaned names for review
np.sort(clean_df["ScientificName"].unique())

In [None]:
# TODO check NULL SAMPLE in max and time
# Normalise the text values of MaxInterval and TimeOfMax: trim whitespace and
# replace the "NULL SAMPLE" marker (and, for MaxInterval, the empty string) by 'NULL'.
# Assignments go through .loc on the frame itself: the chained form
# df[col][mask] = value writes to a temporary object and is not guaranteed to
# change the frame.

def _strip_if_text(value):
    """Trim surrounding whitespace from strings; leave every other value untouched."""
    return value.strip() if isinstance(value, str) else value

print(sum(clean_df["ScientificName"]== "NULL SAMPLE"), "should be 0")

clean_df['MaxInterval'] = clean_df['MaxInterval'].apply(_strip_if_text)
print(len(clean_df[clean_df['TimeOfMax']==""]))
clean_df.loc[clean_df['MaxInterval'] == "NULL SAMPLE", 'MaxInterval'] = 'NULL'
clean_df.loc[clean_df['MaxInterval'] == "", 'MaxInterval'] = 'NULL'
print(len(clean_df[clean_df['TimeOfMax']==""]))

clean_df['TimeOfMax'] = clean_df['TimeOfMax'].apply(_strip_if_text)
clean_df.loc[clean_df['TimeOfMax'] == "NULL SAMPLE", 'TimeOfMax'] = 'NULL'

In [None]:
# remove bad deployments
# Row count and number of 'BAD DEPLOYMENT' names, before and after the filter.
print(len(clean_df), sum(clean_df["ScientificName"] == 'BAD DEPLOYMENT'))
# .copy() makes clean_df an independent frame, so the column assignments in the
# following cells do not operate on a slice of the unfiltered data.
clean_df = clean_df[clean_df["IsBadDeployment"] != True].copy()
len(clean_df), sum(clean_df["ScientificName"] == 'BAD DEPLOYMENT')

In [None]:
# Rows that still have no TimeOfMax value.
clean_df[clean_df["TimeOfMax"].isna()]

In [947]:
# fix datetime

def _parse_time_text(text):
    """Parse "HH:MM:SS" or "MM:SS" (separated by ":" or ";") into a datetime.time.

    Raises ValueError when the text has another shape, a part is not an integer,
    or a value is out of range.
    """
    parts = text.strip().replace(";", ":").split(":")
    if len(parts) not in (2, 3):
        raise ValueError(f"expected MM:SS or HH:MM:SS, got {text!r}")
    # Left-pad with 0 hours so "MM:SS" and "HH:MM:SS" share one code path.
    hours, mins, seconds = ([0] + [int(part) for part in parts])[-3:]
    return datetime.time(hours, mins, seconds)


def fix_datetime(row):
    """Return the row's TimeOfMax as a datetime.time, or the string "NULL".

    - datetime.time values are returned unchanged.
    - Missing values (NaN, None, NaT) become "NULL".
    - datetime.datetime values (e.g. timestamps produced when reading Excel
      time cells) are reduced to their time of day.
    - Strings are parsed as HH:MM:SS / MM:SS, with ";" accepted instead of ":".
      When a string cannot be parsed the user is asked to type the time, and is
      asked again until the answer is valid; typing NULL records "NULL".
    - Any other type is printed and becomes "NULL".

    `row` must provide 'TimeOfMax'; 'DropID' and 'ScientificName' are only read
    to identify the row when asking for a correction.
    """
    value = row['TimeOfMax']

    if isinstance(value, datetime.time):
        return value

    if not isinstance(value, str):
        # isna() is checked before the datetime test because pd.NaT is itself
        # a datetime instance.
        if pd.isna(value):
            return "NULL"
        if isinstance(value, datetime.datetime):
            return value.time()
        print(type(value), value)
        return "NULL"

    text = value.strip()
    while True:
        if text.upper() == "NULL":
            return "NULL"
        try:
            return _parse_time_text(text)
        except ValueError:
            print("Current date doesn't fit format: ", text, row["DropID"], row["ScientificName"])
            text = input("Type out the time in the following format HH:MM:SS (or NULL): ").strip()

In [None]:
# Number of distinct TimeOfMax / MaxInterval values before the conversion ...
print(len(clean_df['TimeOfMax'].unique()), len(clean_df['MaxInterval'].unique()))

# Convert every TimeOfMax value; fix_datetime() may ask for manual input when a
# value cannot be parsed.
clean_df['TimeOfMax'] = clean_df.apply(fix_datetime, axis=1)

# ... and after it, followed by the distinct values themselves for review.
print(len(clean_df['TimeOfMax'].unique()), len(clean_df['MaxInterval'].unique()))

print(clean_df['TimeOfMax'].unique())
print(clean_df['MaxInterval'].unique())


In [949]:
# current_file_df[current_file_df["SiteID"] == "SLI_076"]

In [None]:
# Keep only the columns that go into the annotation export. This is an
# independent copy: later changes to clean_df do not reach selected_df.
selected_df = clean_df[['DropID','ScientificName', 'TimeOfMax', 'MaxInterval']].copy()
selected_df

# export annotation file 


# Check validity of various columns


## Review null deployments:


In [None]:
# There should be the same amount of nulls...
# Number of rows holding the "NULL" marker, printed per column.
for column_name in ("ScientificName", "TimeOfMax", "MaxInterval"):
    null_rows = selected_df[selected_df[column_name] == "NULL"]
    print(len(null_rows))

In [None]:
# Rows whose TimeOfMax holds the "NULL" marker.
selected_df[selected_df["TimeOfMax"] == "NULL"] 

In [None]:

# Number of rows with a missing (NaN) value, printed per column.
for column_name in ("ScientificName", "TimeOfMax", "MaxInterval"):
    missing_rows = selected_df[selected_df[column_name].isna()]
    print(len(missing_rows))

In [None]:

# Replace missing values in clean_df by the "NULL" marker; the number of "NULL"
# names is printed before and after.
# NOTE(review): selected_df was copied from clean_df in an earlier cell, so these
# fillna calls do not change selected_df. selected_df is filled separately in
# the export preparation cell.
print(sum(clean_df["ScientificName"] == "NULL"))
clean_df['ScientificName'] = clean_df['ScientificName'].fillna('NULL')
clean_df['TimeOfMax'] = clean_df['TimeOfMax'].fillna('NULL')
clean_df['MaxInterval'] = clean_df['MaxInterval'].fillna('NULL')
print(sum(clean_df["ScientificName"] == "NULL"))

## Check species names

Species prefixed with FIX_ need review, as do sp1, sp2, sp3, sp4, sp5, sp6 and sp7, and any mention of unknown/undefined.

TODO:
- check species with species name checker to make sure all is good

In [None]:
# Number of rows per cleaned name.
selected_df["ScientificName"].value_counts()

In [None]:
# Print every distinct name in alphabetical order for review.
# All names are listed: unique() returns values in order of appearance, so
# slicing off the first element would drop an arbitrary name from the review.
species_names = selected_df["ScientificName"].dropna().unique()
for species_name in sorted(species_names, key=str):
    print(species_name)

## Check MaxInterval & TimeOfMax


In [None]:
# Distinct MaxInterval values, for a visual check.
selected_df["MaxInterval"].unique()

In [959]:
# Are there any times that do not follow the predefined format or NULL
# Prints every TimeOfMax value that is still plain text other than "NULL".
# Nothing printed means all values are time objects or NULL markers.
leftover_texts = [
    value
    for value in selected_df["TimeOfMax"]
    if type(value) == str and value != "NULL"
]
for leftover_text in leftover_texts:
    print(leftover_text)

## Review duplicates

In [None]:
# TODO: duplicates usually when TimeofMax missing
# Show every row that has an exact duplicate (all copies are listed).
selected_df[selected_df.duplicated(keep=False)]

In [961]:
# Keep the first occurrence of each fully identical row.
selected_df = selected_df.drop_duplicates()

In [None]:
# Spot check of a single row by index label (215 is specific to the file being reviewed).
selected_df[selected_df.index == 215]

# Export

In [None]:
# selected_df = current_file_df[['DropID','ScientificName', 'TimeOfMax', 'MaxInterval']].copy()

# TODO select the right annotations (should it be an input?)
# All Counts Compiled -> IntervalAnnotation 30,
# Max Count Compiled more -> IntervalAnnotation 1800,

# interval_annotation = 1800
interval_annotation = 30

# Add the constant export columns. Missing values are replaced by "NULL" before
# ConfidenceAgreement is appended.
selected_df = (
    selected_df
    .assign(AnnotatedBy="expert", IntervalAnnotation=interval_annotation)
    .fillna("NULL")
    .assign(ConfidenceAgreement="NA")
)

In [None]:
# SurveyID used in the export file name: the first distinct SurveyID in clean_df.
# NOTE(review): if clean_df still contains several surveys only the first one is
# reflected in the file name — confirm the frame was filtered to one survey.
surveyID = clean_df["SurveyID"].unique()[0]
surveyID

In [None]:

def export_to_annotations(df_with_vals, file_name, selected_folder, export_csv_file_name=None):
    """Write the extracted annotations to a CSV file inside `<selected_folder>/export`.

    Args:
        df_with_vals: DataFrame to export; written without its index.
        file_name: name (or path) of the source file; its base name without
            extension becomes part of the default CSV name.
        selected_folder: folder in which the "export" sub-folder is created.
        export_csv_file_name: optional explicit CSV file name. When empty, the
            name is built from the notebook globals `interval_annotation` and
            `surveyID` plus the source file name.
    """
    if export_csv_file_name:
        csv_name = export_csv_file_name
    else:
        source_stem, _extension = os.path.splitext(os.path.basename(file_name))
        csv_name = f"annotations_buv__{interval_annotation}__{surveyID}__{source_stem}.csv"

    # The export folder lives next to the annotation files and is created on demand.
    # TODO check this
    export_dir = os.path.join(selected_folder, "export")
    os.makedirs(export_dir, exist_ok=True)

    export_location = os.path.join(export_dir, csv_name)
    print(f"Exporting data to file: '{export_location}'")
    df_with_vals.to_csv(export_location, index=False)
    
print(f"Showing sample of export with shape: {selected_df.shape}")
# sample() raises when asked for more rows than the frame holds, which would
# stop the cell before the export runs; cap the sample at the frame length.
display(selected_df.sample(min(10, len(selected_df))))
export_to_annotations(selected_df, file_name, video_analysis_folder)
