# Extract annotation data from files

Running this notebook creates an `export` folder inside the selected folder (the one containing the annotation files).


In [8]:
import os
import re
import pandas as pd

from ipyfilechooser import FileChooser
from IPython.display import display

In [9]:
### Constants

# SAMPLE_SHEET_LINES defines how many lines of the file get shown each time a new file is processed
SAMPLE_SHEET_LINES = 5

# Column names used for the extracted data; the input columns are renamed to these.
OUTPUT_COLUMNS = ['DeploymentID', 'ScientificName', 'TimeOfMax', 'MaxInterval']


# Different combinations of columns
# Each entry lists four input columns in the same order as OUTPUT_COLUMNS.
# get_the_right_columns tries the entries from top to bottom and uses the first
# one whose columns are all present, renaming them position by position.
COLUMNS = [['DeploymentID', 'ScientificName', 'TimeOfMaxN', 'MaxN'],
           ['DeploymentID', 'ScientificName', 'TimeOfMax', 'MaxN'],
           ['DeploymentID', 'ScientificName', 'TimeMaxN', 'MaxN'], 
           ['DeploymentID' ,'ScientificNameFish', 'TimeMaxN', 'MaxN'],
           ['UnitID' ,'ScientificNameFish', 'TimeMaxN', 'MaxN'],
           ['DeploymentID', 'ScientificName', 'TimeOfMax', 'MaxInterval']]

# Most often the annotations are held in the "All counts compiled" sheet

#### Select the folder containing the files from which to extract video analysis




In [10]:
# Show the folder picker; the next cell reads the choice from folder_chooser.selected_path.
folder_chooser = FileChooser(title='<b>Select a folder</b>')
display(folder_chooser)


FileChooser(path='/Users/kalindi/Desktop/dt/spyfish/notebooks', filename='', title='<b>Select a folder</b>', s…

In [13]:
# Folder picked in the widget above; the assert stops the cell when nothing was selected.
selected_folder = folder_chooser.selected_path
assert selected_folder is not None, "Select folder in the cell above."
print(f"The selected folder is {selected_folder}")

# Keep only the tabular files (Excel or CSV) and store them as full paths.
all_files = os.listdir(selected_folder)
all_tab_files = [os.path.join(selected_folder, file) for file in all_files if file.endswith((".xlsx", ".xls", ".csv"))]
all_tab_files

The selected folder is /Users/kalindi/Desktop/dt/spyfish/og/VideoAnalysis


['/Users/kalindi/Desktop/dt/spyfish/og/VideoAnalysis/MPAMAR Data BUV Te Tapuwae o Rongokako 2021 - DOC-6731514.csv',
 '/Users/kalindi/Desktop/dt/spyfish/og/VideoAnalysis/MPAMAR Data BUV Akaroa Pohatu 2019 Video analysis data sheet - DOC-6649702.csv',
 '/Users/kalindi/Desktop/dt/spyfish/og/VideoAnalysis/MPAMAR Data BUV Akaroa Pohatu 2017 Video analysis data sheet - DOC-5452215.csv']

## Functions

In [14]:
def export_metadata_from_files_to_excel(all_tab_files, export_excel_file_name=None, include_file_name=False, columns=OUTPUT_COLUMNS):
    """Interactively review each file and export the accepted ones to Excel.

    For every file the relevant columns are extracted with get_df_from_sheet.
    The unique values of each extracted column are printed and the user is
    asked, once per column, whether the data looks right:

    * "1" accepts the column,
    * "2" on the DeploymentID column reformats the IDs with format_DeploymentID,
    * any other answer marks the file as needing a manual fix and skips it.

    Files whose columns were all accepted are written with export_to_annotations.

    Args:
        all_tab_files: Paths of the CSV/Excel files to process.
        export_excel_file_name: Accepted for backward compatibility. It is not
            forwarded to export_to_annotations, so each file is exported under
            a name derived from its own file name.
        include_file_name: If True, add a "FileName" column holding the source
            path to each exported frame.
        columns: Accepted for backward compatibility. It is not used and is
            never modified (the default is the shared OUTPUT_COLUMNS list).

    Returns:
        A tuple (done_files, problem_files, to_fix_files) of lists of paths:
        exported files, files from which no data could be extracted, and files
        the user flagged for fixing.
    """
    problem_files = []
    done_files = []
    to_fix_files = []

    for file_name in all_tab_files:
        print(f"Working on file: '{file_name}'. Example rows:\n")
        # Extract data frame from Excel sheet
        selected_df = get_df_from_sheet(file_name)
        if selected_df.empty:
            print(f"!!!!!!!!!!!!!!!!!!!!!!!!!!!!!File not exported {file_name}!!!!!!!!!!!!!!!!!!!!!!!!!!!!!")
            problem_files.append(file_name)
            continue

        to_export = True
        print(f"Reviewing file: {file_name}.")
        # Iterate over a snapshot of the column names because selected_df may be rebound below.
        for col in list(selected_df.columns):
            print(f"Showing unique values from column: {col}\n", selected_df[col].unique())
            file_to_fix = input("1: if all good, 2: if the order of BUV is off, 3: if the df needs fixing")
            if file_to_fix == "2" and col == "DeploymentID":
                # assign() builds a new frame instead of writing into a column selection
                selected_df = selected_df.assign(
                    DeploymentID=selected_df["DeploymentID"].map(format_DeploymentID)
                )
                print(selected_df["DeploymentID"].unique())
            elif file_to_fix != "1":
                to_fix_files.append(file_name)
                to_export = False
                break

        if not to_export:
            continue

        if include_file_name:
            selected_df = selected_df.assign(FileName=file_name)
        print(f"Showing sample of export with shape: {selected_df.shape}")
        # sample() raises when asked for more rows than the frame holds
        display(selected_df.sample(min(10, len(selected_df))))
        # NOTE(review): export_excel_file_name is intentionally left as None here;
        # a shared name would make every file overwrite the previous export.
        export_to_annotations(selected_df, file_name, export_excel_file_name=None)
        done_files.append(file_name)

    return done_files, problem_files, to_fix_files
            

In [15]:
def get_the_right_columns(current_file_df):
    """Select the annotation columns of a sheet and rename them to OUTPUT_COLUMNS.

    The column combinations in COLUMNS are tried in order. The first one whose
    columns all exist in ``current_file_df`` is selected and renamed, position
    by position, to the names in OUTPUT_COLUMNS.

    Args:
        current_file_df: Data frame read from a CSV file or an Excel sheet.

    Returns:
        A new data frame with the selected, renamed columns, or an empty data
        frame when no combination in COLUMNS matches.
    """
    print("Current columns of file\n", current_file_df.columns)
    for c in COLUMNS:
        try:
            selected_df = current_file_df[c]
        except KeyError as error:
            # A missing column means this combination does not apply; try the next one.
            print(f"Something is wrong with the columns: {error}\n.")
            continue
        print("Shape of current sheet with relevant columns: ", selected_df.shape)
        # rename() returns a new frame, so the caller's frame is left untouched
        return selected_df.rename(columns=dict(zip(c, OUTPUT_COLUMNS)))
    return pd.DataFrame()

In [16]:
def get_df_from_sheet(file_name):
    """Read a CSV or Excel file and return its annotation columns.

    CSV files are read directly. For any other file every sheet of the
    workbook is checked in order and the first sheet that yields a non-empty
    result from get_the_right_columns is returned.

    Args:
        file_name: Path of the file to read; a ".csv" suffix selects the CSV
            reader, everything else is opened as an Excel workbook.

    Returns:
        The data frame produced by get_the_right_columns, or an empty data
        frame when the file cannot be read or holds no matching columns.
    """
    if file_name.endswith(".csv"):
        try:
            current_file_df = pd.read_csv(file_name)
            return get_the_right_columns(current_file_df)
        except Exception as error:
            print(f"File {file_name} doesn't open with CSV reader.", error)
        return pd.DataFrame()

    try:
        # Open the workbook once and close it when done, instead of reopening it per sheet.
        with pd.ExcelFile(file_name) as workbook:
            for sheet_name in workbook.sheet_names:
                print(f"Checking sheet: {sheet_name}")
                try:
                    current_file_df = pd.read_excel(workbook, sheet_name=sheet_name)
                    selected_df = get_the_right_columns(current_file_df)
                except Exception as error:
                    print(f"File {file_name} wasn't processed because of:", error)
                    continue
                if not selected_df.empty:
                    print(f"Exporting sheet: {sheet_name} into df.")
                    return selected_df
    except Exception as error:
        # An unreadable workbook is reported like an unreadable CSV, not raised.
        print(f"File {file_name} wasn't processed because of:", error)
    return pd.DataFrame()

In [17]:
def format_DeploymentID(dep_id):
    """Bring a DeploymentID into the form ``AAA_20dddddd_BUV_dddd``.

    Three spellings are recognised (A = upper-case letter, d = digit):

    * ``AAA_20dddddd_BUV_dddd`` is already correct and returned unchanged.
    * ``AAA_20dddddd_BUV_<digits>`` gets its trailing number zero-padded to
      four digits.
    * ``BUV_AAA_20dddddd_<digits>`` gets "BUV" moved behind the second part
      and its trailing number zero-padded to four digits.

    Anything else, including non-string values such as missing cells and IDs
    without a trailing number, is reported with a message and returned
    unchanged.

    Args:
        dep_id: The DeploymentID value to normalise.

    Returns:
        The normalised ID, or ``dep_id`` itself when it could not be fixed.
    """
    correct_pattern = r'^[A-Z]{3}_20\d{6}_BUV_\d{4}$'
    # At least one digit is required: an empty trailing number cannot be converted to int.
    fix_zeros = r'^([A-Z]{3}_20\d{6}_BUV_)(\d+)$'
    fix_pattern = r'^BUV_([A-Z]{3}_20\d{6}_)(\d+)$'

    # Missing cells arrive as non-strings (e.g. NaN), which re.match cannot handle.
    if not isinstance(dep_id, str):
        print(f"DeploymentID another issue: {dep_id}")
        return dep_id

    if re.match(correct_pattern, dep_id):
        return dep_id

    padded = re.match(fix_zeros, dep_id)
    if padded:
        prefix, number = padded.groups()
        return f"{prefix}{int(number):04}"

    reordered = re.match(fix_pattern, dep_id)
    if reordered:
        prefix, number = reordered.groups()
        return f"{prefix}BUV_{int(number):04}"

    print(f"DeploymentID another issue: {dep_id}")
    return dep_id

In [21]:
def export_to_annotations(df_with_vals, file_name, export_excel_file_name=None):
    """Write a data frame to an Excel file in the ``export`` folder.

    The export folder is created, if needed, inside the notebook-level
    ``selected_folder`` (this function reads that global variable).

    Args:
        df_with_vals: Data frame to export.
        file_name: Path of the source file; used to derive the export name
            when ``export_excel_file_name`` is not given.
        export_excel_file_name: Name of the Excel file to write. When empty,
            ``annotations_buv_doc_<source name without extension>.xlsx`` is used.
    """
    if not export_excel_file_name:
        # splitext removes only the extension, so dots elsewhere in the name are
        # kept and a name without an extension is left intact.
        export_file_name, _ = os.path.splitext(os.path.basename(file_name))
        export_excel_file_name = f"annotations_buv_doc_{export_file_name}.xlsx"

    # make export folder in folder containing the annotation files
    path_to_export = os.path.join(selected_folder, "export")
    print(path_to_export)
    os.makedirs(path_to_export, exist_ok=True)
    export_location = os.path.join(path_to_export, export_excel_file_name)
    print(f"Exporting data to file: '{export_location}'")
    df_with_vals.to_excel(export_location)



## Run extractor

# Interactive: asks for input once per column of every file and exports the accepted files.
done_files, problem_files, to_fix_files = export_metadata_from_files_to_excel(all_tab_files=all_tab_files)

In [23]:
# Number of problem files, files to fix, all input files and exported files, in that order
len(problem_files), len(to_fix_files), len(all_tab_files), len(done_files)

(0, 1, 3, 2)

In [24]:
done_files

['/Users/kalindi/Desktop/dt/spyfish/og/VideoAnalysis/MPAMAR Data BUV Te Tapuwae o Rongokako 2021 - DOC-6731514.csv',
 '/Users/kalindi/Desktop/dt/spyfish/og/VideoAnalysis/MPAMAR Data BUV Akaroa Pohatu 2019 Video analysis data sheet - DOC-6649702.csv']

In [25]:
problem_files

[]

In [26]:
to_fix_files

['/Users/kalindi/Desktop/dt/spyfish/og/VideoAnalysis/MPAMAR Data BUV Akaroa Pohatu 2017 Video analysis data sheet - DOC-5452215.csv']

# Notes



- Column names in the export folder were renamed to match those of the original files (case adjusted, spaces removed)
- ReplicateWithinSite and DeploymentID were added manually on a few occasions

```
E2 = SurveyID 
R = DeploymentID 
AF2 = ReplicateWithinSite 
AC2 = SiteID 
```
__DeploymentID__

If the order of the values in SurveyID needs to be fixed too:
```
=CONCAT(RIGHT(E2, 12), "_BUV_",TEXT(AF2,"0000")) 
```
If SurveyID is ok:
```
=CONCAT(E2,"_",TEXT(AF2,"0000"))  
```

__ReplicateWithinSite__
```
=IF(AC2=AC3,AF2,AF2+1)
```
AF1 = 0

# WORK IN PROGRESS NOTES: 




## Potentially ok: 

Every line is a different DeploymentID
- Te Whanganui-O-Hei BUV 2016 DATA - DOC-7180687.xlsx

Potentially able to extract: 
- 'MPAMAR Data BUV Tapuae MR 2011 - DOC-1243658.xlsx'
- 'MRMDATA - BUV - Long Bay Okura Marine Reserve 2021 - DOC-7164369.xlsx'


Different format?
- 'MPAMAR Akaroa Pohatu BUV summary data 2021 - DOC-7166068.csv'


Other data:
- 'Poor Knights Islands BUV analysis April 2015 - DOC-2654059.xlsx',

TODO: 
- should I reupload the changed files that I used for the extraction (where I changed replica IDs etc)
- should I check if the deploymentIDs are good
- 
# TODO: 

1. consolidate names of columns in annotate file




What about when ReplicateWithinSite is not continuous? Check the exports.


## Comments/Questions:
- Bad deployment entries? Copy them over? No?


## Extra columns in BUV sheet 
- Should metadata be extracted from here too?
- how to define replicateWithinSite?
- What about the order?
- 2 IDs? what order?


Files for the above:
- 'Te Whanganui-O-Hei BUV 2018 DATA - DOC-7180695.xlsx',
- 'Te Whanganui-O-Hei BUV 2016 DATA - DOC-7180687.xlsx',
- 'Te Whanganui-O-Hei BUV 2020 DATA - DOC-7180706.xlsx',
- 'MRMDATA BUV CROP and TAW 2023 - DOC-7433664.xlsx',
-  'MPAMAR Data BUV Te Tapuwae o Rongokako 2021 - DOC-6731514.csv',
    - what about the SiteIDs and replicate num they don't seem to match
-  'MPAMAR Data BUV Akaroa Pohatu 2019 Video analysis data sheet - DOC-6649702.csv',
-   'MPAMAR Data BUV Akaroa Pohatu 2017 Video analysis data sheet - DOC-5452215.csv', 


   
Sheet 1 works? Is this an example?
 'Copy of BUV data entry sheet for Horoirangi 2022 example sites jane.xlsx', 

Just BUV info: 

 - 'MPAMAR_TeWhanganui-o-Hei_BUV_DATA_Fish_2023 - DOC-7375238.xlsx', 
 - 'Poor Knights Islands BUV analysis April 2015 - DOC-2654059.xlsx', (and a bit different)


## Different data: 
- 'MPAMAR Data BUV Tuhua MR 2004 - DOC-1159857.xlsx',
- 'MPAMAR Data BUV Tapuae MR 2011 - DOC-1243658.xlsx',
- 'MPAMAR Data BUV Horoirangi Tonga Island MR 2004 - DOC-6831278.xlsx',
-  'MRMDATA - BUV - Long Bay Okura Marine Reserve 2021 - DOC-7164369.xlsx'

Not that exciting.... annotations_buv_doc_None

## Comments on the files

- 'MPAMAR Data BUV Te Angiangi 2021 Video analysis data sheet.xlsm'
    - has 3 different surveyIDs
- 'MPAMAR Data BUV Akaroa Pohatu 2017 Video analysis data sheet.xlsm'
    - missing SiteID data for second half
    - difference between xlsm and csv file
    - 
 
ReplicateWithinSite and DeploymentID last 4 digits don't match?
Take BUV out of data files? take video info out of data files

how many should there be in annotations



### problem files
- 'MPAMAR Metadata BUV Te Tapuwae o Rongokako 2021.xlsx'
    - sheet is about video not metadata
- 'MPAMAR Metadata BUV Te Angiangi 2021.csv'
    - sheet is very different info


 'MPAMAR Akaroa Pohatu BUV metadata 2021 - DOC-7166068.csv', 
 'MPAMAR Metadata BUV Te Angiangi 2021.csv', 
 'Poor Knights Islands BUV analysis April 2015 - DOC-2654059.xlsx'



### 9.11 Old notes, done
 
Many different deploymentID dates:
[?] MPAMAR Data BUV Tuhua 2020 Video analysis sheet.xlsm
        TUH_20200830_BUV went with that
[?] MPAMAR Data BUV Tonga Island 2021 Video analysis data sheet.xlsm
      TON_20211026_BUV is in the survey file, so I'll go with that
[?] MPAMAR Data BUV Tuhua 2021 Video analysis sheet - DOC-6891090.xlsm
      siteIds not in order, TUH_20210309_BUV go with this
[?] MPAMAR Data BUV Tapuae 2021 Video analysis sheet .xlsm.
      TAP in the video data, but in survey it's SLI_20220228_BUV, so went with that

      
Duplicate files/not matching entries?
- MPAMAR Data BUV Akaroa Pohatu 2019 Video analysis data sheet.xlsm
- MPAMAR Data BUV Akaroa Pohatu 2019 Video analysis data sheet - DOC-6649702.csv [ignore]
and
- MPAMAR Data BUV Akaroa Pohatu 2017 Video analysis data sheet - DOC-5452215.csv [ignore]
- MPAMAR Data BUV Akaroa Pohatu 2017 Video analysis data sheet.xlsm

        

# TODO etc

In [None]:
# try additional sheets get sheet info?
# reset info?
# excel sheets get didn't work to get sheet info
# if deployment id not ok, put files into to change files


# Extra code bits hold:


In [None]:
## Print the created metadata dataframe (that was exported to the excel file)
# NOTE(review): in the cells shown, current_file_df and file_name are only assigned
# inside functions, so this held snippet is expected to fail if run as a cell on its own.
# Display sample of data frame
# The sample size is capped at the number of rows so sample() does not ask for more rows than exist.
with pd.option_context('display.max_columns', None): 
    display(current_file_df.sample(min(SAMPLE_SHEET_LINES, current_file_df.shape[0])))
print(f"Working on file: '{file_name}'.\n")

In [None]:
# Scratch check: rename one input column combination to the output column names.
c = ['DeploymentID', 'ScientificName', 'TimeMaxN', 'MaxN']
a = pd.DataFrame([[1, 2, 3, 4]], columns=list(c))

# Pair every input name with its output name by position.
a = a.rename(columns=dict(zip(c, ['DeploymentID', 'ScientificName', 'TimeOfMax', 'MaxInterval'])))
a