# Avian data transformation notebook

This notebook performs and documents the transformations applied to the data received from Colibri by The Water Institute as part of this project. 

The main goals of these transformations are:
  - Create a unified dataset for 2010-2021 data
  - Rename both high resolution photos and screenshots to have them in a unified folder structure
  - Create thumbnails for the high resolution photos to be visualized on the web dashboard
  - Analyze the differences between the summary data and the totals calculated from the raw data (if any)
  - Generate the datasets used in the visualizations

In [None]:
#%pip install tqdm pandas boto3


### Parameters

In [None]:
# Size passed to PIL's Image.thumbnail() inside generate_thumbnail().
thumbnail_size = (518, 345)
# When True, the thumbnail generation cell runs (and the source photo listing is loaded).
create_thumbnails = False
# When True, the copy/rename cells run (and the source photo listing is loaded).
rename_files=False
# Together with create_thumbnails, gates the thumbnail MIME-type refresh cell.
replace = False
# Root prefix of every new S3 key built by this notebook.
_base_folder = "avian_monitoring"



### Common imports

In [None]:
from PIL import Image
import re
import pandas as pd

# import geopandas as gpd
import pandas_access
from datetime import datetime
import pyodbc
import numpy as np
from multiprocessing import Pool
from functools import partial
import geopandas as gp
import boto3
import re
from tqdm.notebook import tqdm
from dateutil.parser import parse
from concurrent.futures import ThreadPoolExecutor
from io import BytesIO
from functools import partial

# Register tqdm's `progress_apply` on pandas objects.
tqdm.pandas()

# Every S3 call in this notebook goes through the "GLO" AWS profile.
boto3.setup_default_session(profile_name="GLO")
bucket_name = "twi-aviandata"
# Prefix under which the original high resolution photos are listed.
starting_folder = "HighResolutionImages"
# NOTE(review): not referenced anywhere in the visible part of the notebook.
new_folder_hr = "avian_monitoring_"
aws_s3 = boto3.resource(
    "s3",
)
# Bucket handle used as a global by the S3 helper functions.
avian_data = aws_s3.Bucket(bucket_name)

### Utilities

In [None]:
def clean_date(text):
    """Normalise a free-form date string to ``YYYY-Mon-DD`` (e.g. 2022-May-15).

    The text is parsed with ``dateutil.parser.parse`` and re-formatted with
    the ``%Y-%b-%d`` pattern.
    """
    parsed = parse(text)
    return parsed.strftime("%Y-%b-%d")


def update_mime_type(t, mime="image/png"):
    """Set the Content-Type of the S3 object stored at key ``t``.

    The object is copied onto itself with ``MetadataDirective="REPLACE"``,
    re-sending its current user metadata and using ``mime`` as the new
    ``ContentType``.
    """
    target = avian_data.Object(t)
    self_source = {"Bucket": avian_data.name, "Key": t}
    target.copy_from(
        CopySource=self_source,
        Metadata=target.metadata,
        MetadataDirective="REPLACE",
        ContentType=mime,
    )


def copy_wnew_mime_type(t, k, mime="image/png"):
    """Copy the S3 object at key ``t`` to key ``k``, changing its MIME type.

    Args:
        t: Key of the source object in the avian_data bucket.
        k: Key of the copy to create in the same bucket.
        mime: Content-Type to store on the copy.

    The source object's user metadata is carried over to the copy.
    """
    s3_object = avian_data.Object(t)
    # Bucket.copy() is a managed transfer: per-object settings are not
    # accepted as keyword arguments and have to travel inside ExtraArgs.
    avian_data.copy(
        {"Bucket": avian_data.name, "Key": t},
        k,
        ExtraArgs={
            "Metadata": s3_object.metadata,
            "MetadataDirective": "REPLACE",
            "ContentType": mime,
        },
    )


def rename(key, new_name, replace=False):
    """Copy an object (key) to a new location (new_name) on the avian_data bucket.

    Args:
        key: Source key inside the bucket.
        new_name: Destination key inside the bucket.
        replace: When False, nothing is copied if any object already exists
            under the ``new_name`` prefix.

    Returns:
        True when the destination already existed or a copy was made;
        False when neither ``key`` nor a variant of it with the other
        ``.jpg``/``.JPG`` casing could be found.
    """
    if not replace and list(avian_data.objects.filter(Prefix=new_name)):
        return True

    try:
        avian_data.copy({"Bucket": avian_data.name, "Key": key}, new_name)
        # Report the successful copy (this path used to fall through and
        # return None, which callers would read as a failure).
        return True
    except Exception as e:
        # Best effort: log the failure and retry with the other extension case.
        print(e)

    for current_ext, other_ext in ((".JPG", ".jpg"), (".jpg", ".JPG")):
        candidates = list(
            avian_data.objects.filter(Prefix=key.replace(current_ext, other_ext))
        )
        if candidates:
            avian_data.copy(
                {"Bucket": avian_data.name, "Key": candidates[0].key}, new_name
            )
            return True
    print(f"{key} not found")
    return False


def generate_thumbnail(high_res_key, thumb_key, regenerate=False):
    """Create a PNG thumbnail of ``high_res_key`` and store it at ``thumb_key``.

    The thumbnail is bounded by the global ``thumbnail_size``. Unless
    ``regenerate`` is set, nothing is done when an object already exists under
    the ``thumb_key`` prefix. Returns False (after printing a message) when no
    object is found under the ``high_res_key`` prefix, True otherwise.
    """
    if not regenerate:
        if list(avian_data.objects.filter(Prefix=thumb_key)):
            return True

    sources = list(avian_data.objects.filter(Prefix=high_res_key))
    if not sources:
        print(f"there is no {high_res_key}")
        return False

    # The first object listed under the prefix is used as the source image.
    photo = Image.open(BytesIO(sources[0].get()["Body"].read()))
    png_bytes = BytesIO()
    photo.thumbnail(thumbnail_size)
    photo.save(png_bytes, format="png")
    png_bytes.seek(0)  # rewind so the upload reads from the start
    avian_data.put_object(Key=thumb_key, Body=png_bytes, ContentType="image/png")
    return True


# Unified 2010-2021 Data

Data has been unified by Colibri into a single Access database; however, it contains tables split by year, with some differences in their schemas. This process combines all the datasets into a single one with a common definition of the total birds and total nests. 

In [None]:
# Notes: In order to use all the fields, I did a rename of the columns containing '?' or '/'
# Earlier database copies, kept for reference:
# acc_db = "/mnt/c/Users/carizaporras/Downloads/Colibri_tblsSpeciesData2010-2021_18sept22.accdb"
#acc_db = "/mnt/z/Colibri2010-2021CWBColonies_12Nov2022_working_copy.accdb"
#acc_db = "/mnt/z/Colibri2010-2021CWBColonies_2Jan2023.accdb"
# Path of the Access database read by every pandas_access call in this notebook.
acc_db = "Colibri2010-2021CWBColonies_2Jan2023.accdb"

# Schema of the database, indexed by table name in the following cells.
schema = pandas_access.read_schema(acc_db)


In [None]:
# Name of the colony inventory table; it is read again later with read_table.
ct_name = "tblRWCWB_ColonyInventory_10Nov22"
# colonies_table = schema.pop("tblRWCWB_ColonyInventory_13Sept2022")
# Remove the colony table from `schema` so only the remaining tables are compared.
colonies_table = schema.pop(ct_name)


There are three tables with slightly different schemas

In [None]:
schema


Taking the 2015 to 2021 table as a reference, lets compare the schemas.

Fields in the reference but not in the `tblSpeciesData2011_2013` table

In [None]:
schema["tblSpeciesData2015_2018_2021"].keys() - schema["tblSpeciesData2011_2013"].keys()


Fields in the `tblSpeciesData2011_2013` table but not in the reference

In [None]:
schema["tblSpeciesData2011_2013"].keys() - schema["tblSpeciesData2015_2018_2021"].keys()

Fields in the reference but not in the 2010 table

In [None]:
schema["tblSpeciesData2015_2018_2021"].keys() - schema["tblSpeciesData2010"].keys()


Fields in the 2010 table but not in the reference

In [None]:
schema["tblSpeciesData2010"].keys() - schema["tblSpeciesData2015_2018_2021"].keys()


That also means that we have different formulas to calculate the totals. The BestForBPE field is used as a filter in 2013 to 2021, but it is not used in 2010.

### Formulas

|                | 2010                                                                                                                                                               | 2011-2013                                                                                                                                                          | 2015-2021                                                                                                 |
|----------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------|-----------------------------------------------------------------------------------------------------------|
| Nests          | sum(<br>[WBN]<br>+[ChickNestw/outAdult]<br>+[AbandNest]<br>+[EmptyNest]<br>+[PBN]<br>+[Site]<br>+[Brood])                                                          | Sum(<br>[WBN]<br>+[ChickNestw/outAdult]<br>+[AbandNest]<br>+[EmptyNest]<br>+[PBN]<br>+[Site]<br>+[Brood])                                                          | Sum(<br>[WBN]<br>+[ChickNest]<br>+[ChickNestw/outAdult]<br>+[AbandNest]<br>+[PBN]<br>+[Site]<br>+[Brood]) |
| Birds          | sum([WBN]<br>+[PBN]<br>+[Site]<br>+[OtherAdultsInColony]<br>+[OtherImmInColony]<br>+[RoostingBirds]<br>+[RoostingAdults]<br>+[RoostingImmatures]<br>+[UnknownAge]) | Sum([WBN]<br>+[PBN]<br>+[Site]<br>+[OtherAdultsInColony]<br>+[OtherImmInColony]<br>+[RoostingBirds]<br>+[RoostingAdults]<br>+[RoostingImmatures]<br>+[UnknownAge]) | Sum(<br>[WBN]<br>+[ChickNest]<br>+[PBN]<br>+[Territory]<br>+[Site]<br>+[OtherBirds])                      |
| SumOfEmptyNest | sum(EmptyNest)                                                                                                                                                     | EmptyNest                                                                                                                                                          | EmptyNest                                                                                                 |


But 2015 to 2021 has additional categories: 

```SQL

Sum(WBN) AS SumOfWBN

Sum(ChickNest) AS SumOfChickNest

Sum([ChickNestw/outAdult]) AS [SumOfChickNestw/outAdult] 

Sum(Brood) AS SumOfBrood

Sum(AbandNest) AS SumOfAbandNest

Sum(PBN) AS SumOfPBN

Sum(Territory) AS SumOfTerritory
```

## Generating a common dataset:
- merge 2010 species with the colonies. 
- select only the common columns 

In [None]:
# Column names of each species table, as sets so they can be compared.
cols_2015_2021 = set(schema["tblSpeciesData2015_2018_2021"].keys())
cols_2011_2013 = set(schema["tblSpeciesData2011_2013"].keys())
cols_2010 = set(schema["tblSpeciesData2010"].keys())
# Columns shared by the 2011-2013 and 2015-2021 tables (2010 is compared separately).
common_fields = cols_2011_2013.intersection(cols_2015_2021)


In [None]:
cols_2010 - common_fields


In [None]:
common_fields - cols_2010


In [None]:
# Load the 2010 species table from the Access database.
pd_species_2010 = pandas_access.read_table(acc_db, "tblSpeciesData2010")


In [None]:
# Give the 2010 rows an empty Notes value and a default BestForBPE of "N";
# the rows selected further down are switched to "Y".
# NOTE(review): presumably both columns are absent from the 2010 table - confirm.
pd_species_2010["Notes"] = ""
pd_species_2010["BestForBPE"] = "N"


In [None]:
# Load the two remaining species tables with all their columns
# (the restriction to common_fields is disabled).
pd_species_2011_2013 = pandas_access.read_table(
    acc_db, "tblSpeciesData2011_2013"
)  # [common_fields]
pd_species_2015_2021 = pandas_access.read_table(
    acc_db, "tblSpeciesData2015_2018_2021"
)  # [common_fields]


In [None]:
# Stack the three tables into one frame with a fresh index.
pd_species = pd.concat(
    [pd_species_2010, pd_species_2011_2013, pd_species_2015_2021], ignore_index=True
)


In [None]:
# Discard the AutoID column from the combined frame.
pd_species = pd_species.drop(columns="AutoID")


In [None]:
# Store Year as an integer-formatted string (e.g. "2010"); later cells compare
# it against string literals and use it to build paths.
pd_species["Year"] = pd_species["Year"].astype(int).astype(str)


In [None]:
# Load the colony inventory and force ColonyName to str, the key of the merge below.
pd_colonies = pandas_access.read_table(acc_db, ct_name)
pd_colonies["ColonyName"] = pd_colonies["ColonyName"].astype(str)


In [None]:
# Attach the colony attributes to every species row. With pandas' default
# inner join, species rows whose ColonyName has no match in the inventory are
# dropped, and columns present in both frames get the _x/_y suffixes.
pd_species = pd.merge(pd_species, pd_colonies, on="ColonyName")


In [None]:
# Every new S3 key shares the same "<Year>/<GeoRegion>/<ColonyName>/<file stem>"
# tail; only the top-level folder and the extension differ.
_visit_day = pd.to_datetime(
    pd_species["Date"].astype("str"), format="%m/%d/%y %H:%M:%S"
).dt.strftime("%d%B%y")
# A missing card number is written as card "1".
_card = pd_species["CardNumber"].where(pd_species["CardNumber"].notna(), other="1")
_photo_stem = (
    pd_species["Year"].astype(str)
    + "/"
    + pd_species["GeoRegion"]
    + "/"
    + pd_species["ColonyName"]
    + "/"
    + _visit_day
    + "Camera"
    + pd_species["CameraNumber"]
    + "-"
    + "Card"
    + _card
    + "-"
    + pd_species["PhotoNumber"]
)

pd_species["HighResImage_new"] = (
    f"{_base_folder}/high_resolution_photos/" + _photo_stem + ".jpg"
)
pd_species["screenshot_new"] = f"{_base_folder}/screenshots/" + _photo_stem + ".jpg"
pd_species["thumbnail_new"] = f"{_base_folder}/thumbnails/" + _photo_stem + ".png"


In [None]:
# The 2010-2013 rows and the later rows are totalled with different column sets.
_early_years = pd_species["Year"].isin(["2010", "2011", "2012", "2013"])


def _add_columns(names):
    """Add the named pd_species columns from left to right."""
    return sum((pd_species[name] for name in names[1:]), pd_species[names[0]])


pd_species["total_nests"] = np.where(
    _early_years,
    # "EmptyNest" is not part of this sum (it is disabled in the notebook).
    _add_columns(["WBN", "ChickNestwithoutAdult", "AbandNest", "PBN", "Site", "Brood"]),
    _add_columns(
        [
            "WBN",
            "ChickNestwithoutAdult",
            "AbandNest",
            "ChickNest",
            "PBN",
            "Site",
            "Brood",
        ]
    ),
)
pd_species["total_birds"] = np.where(
    _early_years,
    _add_columns(
        [
            "WBN",
            "PBN",
            "Site",
            "OtherAdultsInColony",
            "OtherImmInColony",
            "RoostingBirds",
            "RoostingAdults",
            "RoostingImmatures",
            "UnknownAge",
        ]
    ),
    _add_columns(["WBN", "ChickNest", "PBN", "Territory", "Site", "OtherBirds"]),
)


In [None]:
# Normalised visit date ("YYYY-Mon-DD") and its month abbreviation.
pd_species["date2"] = pd_species["Date"].astype("str").apply(clean_date)
pd_species["month"] = pd_species["date2"].str.split("-").str[1]
# Field that ranks the candidate visits when the best 2010 one is selected.
order_by_field = "total_nests"
# 2010 totals per visit date, colony and species.
_rows_2010 = pd_species[pd_species.Year == "2010"]
_by_visit = _rows_2010.groupby(["Year", "month", "date2", "ColonyName", "SpeciesCode"])
agg_2010 = _by_visit.agg({"total_nests": "sum", "total_birds": "sum"}).reset_index()


### Exceptions
For the 2010 data there are a few instances where taking the month with the largest total_nests is not the approach that was used.
We encode those exceptions here:

- Breton Island, ROSA, we pick data from June
- Do not include values of "UNTE", "UNGT","UNWA","UNGU","UNIB","UNEG" for "Breton Island","Cat Bay Island","Martin Island"
- Martin Island, exclude ROSA counts
- Drum Bay Island UNIB data not best for BPE
- Photos for BLSK in Martin Island for May 18 are not best for BPE


![image.png](attachment:image.png)

In [None]:
# Exceptions
# Exception flag per aggregated 2010 row: 0 = no special handling,
# 1 = preferred, -1 = excluded. The selection cell keeps only rows with
# Exception > -1 and sorts by this flag first, so a 1 wins over a 0.
# The assignments run in order: a later line overrides an earlier one
# wherever their masks overlap.
agg_2010["Exception"] = 0
# Breton Island / ROSA: prefer the June rows.
agg_2010.loc[(agg_2010["ColonyName"]=="Breton Island") & (agg_2010.SpeciesCode == "ROSA") & (agg_2010.month == "Jun"), "Exception"] =1
#agg_2010.loc[agg_2010.SpeciesCode.isin(["UNTE", "UNGT","UNWA","UNGU","UNIB","UNEG"]) & (agg_2010.month == "Jun"), "Exception"] = 1
# Martin Island / BLSK: exclude the 18 May rows.
agg_2010.loc[(agg_2010.SpeciesCode=="BLSK") & (agg_2010.ColonyName=="Martin Island") & (agg_2010.date2 == "2010-May-18"), "Exception"] = -1
# Martin Island: exclude every ROSA row.
agg_2010.loc[(agg_2010.SpeciesCode=="ROSA") & (agg_2010.ColonyName=="Martin Island"), "Exception"] = -1
# Drum Bay Island / UNWA: exclude the June rows.
# NOTE(review): this case is not in the exception list of the markdown cell - confirm.
agg_2010.loc[(agg_2010.SpeciesCode=="UNWA") & (agg_2010.ColonyName=="Drum Bay Island") & (agg_2010.month=="Jun"), "Exception"] = -1
# Exclude the listed "UN*" species codes for these three colonies.
agg_2010.loc[agg_2010.SpeciesCode.isin(["UNTE", "UNGT","UNWA","UNGU","UNIB","UNEG"]) & agg_2010.ColonyName.isin(["Breton Island","Cat Bay Island","Martin Island"]), "Exception"] = -1
# Drum Bay Island: exclude every UNIB row.
agg_2010.loc[(agg_2010.SpeciesCode=="UNIB") & (agg_2010.ColonyName=="Drum Bay Island"), "Exception"] = -1


In [None]:
agg_2010.loc[agg_2010.SpeciesCode.isin(["UNTE", "UNGT","UNWA","UNGU","UNIB","UNEG"]) & agg_2010.ColonyName.isin(["Breton Island","Cat Bay Island","Martin Island"])]

In [None]:
# For each (Year, ColonyName, SpeciesCode) keep the visit that sorts last:
# highest Exception flag first, then highest order_by_field, then the other total.
_tie_breaker = "total_nests" if order_by_field == "total_birds" else "total_birds"
_candidates = agg_2010[agg_2010.Exception > -1]
_ranked = _candidates.sort_values(["Exception", order_by_field, _tie_breaker])
_best_visit = _ranked.drop_duplicates(["Year", "ColonyName", "SpeciesCode"], keep="last")
selected = _best_visit[
    ["Year", "month", "date2", "ColonyName", "SpeciesCode"]
].reset_index(drop=True)


In [None]:
agg_2010[(agg_2010.Exception>-1) & agg_2010.SpeciesCode.isin(["UNTE", "UNGT","UNWA"])]

In [None]:
# Flag as best-for-BPE every row whose (Year, month, date2, ColonyName,
# SpeciesCode) combination appears in `selected`. Each row is rendered as the
# string form of its value list so that whole rows can be compared with isin().
pd_species.loc[
    pd_species[["Year", "month",'date2', "ColonyName", "SpeciesCode"]]
    .apply(lambda row: str([x for x in row]), axis=1)
    .isin(selected.apply(lambda row: str([x for x in row]), axis=1)),
    "BestForBPE",
] = "Y"


In [None]:
# Row identifier: "<new high resolution key>#<species code>", with "N/A"
# standing in for a missing species code.
_species_label = pd_species["SpeciesCode"].where(
    pd_species["SpeciesCode"].notna(), "N/A"
)
pd_species["uid"] = pd_species["HighResImage_new"] + "#" + _species_label


In [None]:
# Species codes are compared in upper case from here on.
pd_species["SpeciesCode"] = pd_species["SpeciesCode"].str.upper()
# For 2021 REEG data is divided in subspecies on the access database, but reported as REEG in the summary.
# Collapse every 2021 code starting with "REEG " into plain "REEG".
pd_species.loc[
    (pd_species.Year == "2021") & pd_species.SpeciesCode.str.startswith("REEG "),
    "SpeciesCode",
] = "REEG"

# Rows coded COGA are re-labelled as COMO.
pd_species.loc[pd_species.SpeciesCode=="COGA","SpeciesCode"] = "COMO"


In [None]:
pd_species.to_excel("avianmonitoring_2010-2021.xlsx", index=False)


# Reorganize Files
Photos from 2010 to 2021 were not originally organized by GeoRegion/Colony and follow different naming standards.
We use the results of the dotting process to organize the images referenced from the Access database. 

We are also generating a thumbnail and reorganizing and renaming the screenshots from the dotting process. 

Note: This will be a subset of the photos, you can see all the high resolution photos available on the HighResolutionImages folder in the S3 bucket. 

In [None]:
pd_species[pd_species["HighResImage_new"].isna()]


In [None]:
# List the source photos only when they are needed; the S3 listing is skipped
# when neither renaming nor thumbnail creation is enabled.
files = []
if rename_files or create_thumbnails:
    _photo_key = re.compile(r".*/20[1-2][0-9]/.*(\.jp(.{0,1})g|.tiff)")
    for _candidate in avian_data.objects.filter(Prefix=starting_folder):
        # The pattern is applied to the lower-cased key.
        if _photo_key.match(_candidate.key.lower()):
            files.append(_candidate)


We need to obtain the photo information from the high resolution photo path. The file naming pattern varies, even within the same year, but we can define a regular expression general enough to capture most of the information in all cases. 

In [None]:
# Case-insensitive pattern applied to the high resolution photo keys.
# Named groups: year (4 digits right after "HighResolutionImages/"),
# date (digits, letters, then 2-4 digits), camera (digits after "Camera"/"Cam"),
# card (digits after "Card", optional), photo (digits before the extension,
# optionally prefixed by "IMGP") and extension (jp?g or tif/tiff).
image_groups = re.compile(
    r"HighResolutionImages/(?P<year>\d{4}).*[/ ,]+(?P<date>\d+\s*[A-Z]+\s*\d{2,4}).*(Camera|Cam)\s*(?P<camera>\d+)[ /-]*(Card\s*(?P<card>\d+)){0,1}[\s-]*((\w+/))*(IMGP){0,1}(?P<photo>\d+)\.(?P<extension>jp.?g|tiff?)",
    flags=re.IGNORECASE,
)


In [None]:
m = image_groups.match(
    "HighResolutionImages/2010/June 2010/10 June 2010/10 June 2010 Camera 1 Card 1/10 June 2010 Camera 1 Card 1 010.JPG"
)


In [None]:
# df_files: one dict per key the pattern could parse (key, S3 object summary
# and the named groups of image_groups).
# no_files: keys the pattern could not parse, kept for the report below.
df_files = []
no_files = []
for o in tqdm(files):
    # Run the pattern once per key and reuse the match object.
    parsed_key = image_groups.match(o.key)
    if parsed_key:
        dict_t = parsed_key.groupdict()
        df_files.append({"key": o.key, "object": o, **dict_t})
    else:
        no_files.append(o.key)


There are a few files with not enough information:

In [None]:
no_files


In [None]:
# Leave out of the report the keys that contain "numbering off".
no_files = [k for k in no_files if "numbering off" not in k]


In [None]:
# Write the unparsed keys to a text report, one key per line.
with open("n_files_report.txt", "w") as n_files_report:
    n_files_report.write("\n".join(no_files))

Create a dataframe from the collected information and merge it with the species dataset, so that the photos can be rearranged/renamed to the desired location based on the dotting information. 

In [None]:
if df_files:
    # One row per parsed S3 photo key.
    fdf = pd.DataFrame(df_files)
    fdf["extension"] = fdf["extension"].str.lower()
    fdf["date2"] = fdf["date"].astype("str").apply(clean_date)
    fdf["month"] = fdf["date2"].apply(lambda x: x.split("-")[1])
    fdf["day"] = fdf["date2"].apply(lambda x: x.split("-")[2])

    # Same date parts on the species side, so both frames can be joined by day.
    pd_species["date2"] = pd_species["Date"].astype("str").apply(clean_date)
    pd_species["month"] = pd_species["date2"].apply(lambda x: x.split("-")[1])
    pd_species["day"] = pd_species["date2"].apply(lambda x: x.split("-")[2])
    pd_species[~pd_species["HighResImage_new"].isna()].to_csv(
        "avianData20102021.csv.gz", index=False
    )

    # Use the species column names for the fields parsed from the key.
    fdf = fdf.rename(
        columns={
            "camera": "CameraNumber",
            "card": "CardNumber",
            "photo": "PhotoNumber",
            "year": "Year",
        }
    )
    # Photo numbers are compared as text left-padded with zeros to 5 characters.
    fdf["PhotoNumber"] = fdf["PhotoNumber"].str.rjust(5, "0")
    pd_species["PhotoNumber"] = pd_species["PhotoNumber"].str.rjust(5, "0")
    join_cols = ["CameraNumber", "CardNumber", "PhotoNumber", "Year", "month", "day"]
    for c in join_cols:
        fdf[c] = fdf[c].astype("str").str.strip()
        pd_species[c] = pd_species[c].astype("str").str.strip()

    # Keys where the pattern found no card show up as the text "None" after
    # the str conversion. Look for a "Card <n>" anywhere in the key and fall
    # back to card "1". The fallback has to be the *string* "1": the join keys
    # are strings, so an integer 1 would never match a species row.
    c = "CardNumber"
    missing_card = fdf[c] == "None"
    fdf.loc[missing_card, c] = (
        fdf.loc[missing_card, "key"]
        .str.extract(r".*Card\s*(\d+).*", flags=re.IGNORECASE, expand=False)
        .fillna("1")
    )
    # NOTE(review): a null CardNumber on the species side becomes the text
    # "nan" above, whereas the new key names treat it as card "1" - confirm
    # whether those rows are expected to match a photo here.
    merged = pd_species.merge(fdf, on=join_cols, how="left")

    # Distinct (new key, source key, ...) combinations. Grouping drops the
    # rows where any of these columns is null, e.g. species rows without a
    # matching source photo.
    photo_cols = [
        "HighResImage_new",
        "key",
        "CameraNumber",
        "CardNumber",
        "PhotoNumber",
        "Date",
        "Year",
        "month",
        "thumbnail_new",
    ]
    m_grouped = merged[photo_cols].groupby(photo_cols).count().reset_index()


In [None]:
# r = m_grouped.progress_apply(lambda x:avian_data.copy ({'Bucket': avian_data.name,'Key':x['key']}, x['HighResImage_new']), axis=1)
if rename_files:
    # Copy every source photo to its new key, 16 copies at a time.
    _source_keys = m_grouped["key"].tolist()
    _target_keys = m_grouped["HighResImage_new"].tolist()
    with ThreadPoolExecutor(max_workers=16) as pool:
        futures = list(
            tqdm(pool.map(rename, _source_keys, _target_keys), total=m_grouped.shape[0])
        )

    # Show one known photo as a spot check and save the list of used photos.
    _spot_check = "avian_monitoring/high_resolution_photos/2021/Birdsfoot West/Birdsfoot West 6 A/22May21Camera1-Card2-3297.jpg"
    with pd.option_context("display.max_colwidth", None):
        display(m_grouped[m_grouped["HighResImage_new"] == _spot_check])
        m_grouped.to_excel("UsedPhotos2010-2021.xlsx", index=False)



In [None]:
display(pd_species[
    pd_species["HighResImage_new"]
    == "avian_monitoring/high_resolution_photos/2021/Biloxi North/Biloxi North 4/16June21Camera2-Card2-6268.jpg"
]
)


In [None]:
if create_thumbnails:
    # Build the missing thumbnails in parallel; existing ones are left alone.
    _make_thumbnail = partial(generate_thumbnail, regenerate=False)
    _photo_keys = m_grouped["HighResImage_new"].tolist()
    _thumb_keys = m_grouped["thumbnail_new"].tolist()
    with ThreadPoolExecutor(max_workers=16) as pool:
        futures = list(
            tqdm(
                pool.map(_make_thumbnail, _photo_keys, _thumb_keys),
                total=m_grouped.shape[0],
            )
        )


# Previous version code
```python
#acc_db = "/mnt/c/Users/carizaporras/Downloads/Colibri_tblSpeciesData2015_2018_2021_2Sept2022 (1).accdb"
#schema = pandas_access.read_schema(acc_db)
#pd_species = pandas_access.read_table(acc_db, "tblSpeciesData2015_2018_2021")
#pd_species["ColonyName"] = pd_species["ColonyName"].astype(str)
#pd_colonies = pandas_access.read_table(acc_db, "tblRWCWB_ColonyInventory_2022")
#pd_colonies["ColonyName"] = pd_colonies["ColonyName"].astype(str)
#pd_species = pd.merge(pd_species, pd_colonies, on="ColonyName")
#pd_species["HighResImage_new"] = f"{_base_folder}/high_resolution_photos/"+pd_species['Year'].astype(str)+'/'+pd_species['GeoRegion']+'/'+pd_species['ColonyName']+'/'+pd.to_datetime(pd_species["Date"].astype('str'), format="%m/%d/%y %H:%M:%S").dt.strftime("%d%B%y")+'Camera'+pd_species["CameraNumber"]+'-'+'Card'+(pd_species["CardNumber"].where(~pd_species["CardNumber"].isnull(), other="1"))+"-"+pd_species["PhotoNumber"]+".jpg"
#pd_species["screenshot_new"] = f'{_base_folder}/screenshots/'+pd_species['Year'].astype(str)+'/'+pd_species['GeoRegion']+'/'+pd_species['ColonyName']+'/'+pd.to_datetime(pd_species["Date"].astype('str'), format="%m/%d/%y %H:%M:%S").dt.strftime("%d%B%y")+'Camera'+pd_species["CameraNumber"]+'-'+'Card'+(pd_species["CardNumber"].where(~pd_species["CardNumber"].isnull(), other="1"))+"-"+pd_species["PhotoNumber"]+".jpg"
#pd_species["thumbnail_new"] =  f'{_base_folder}/thumbnails/'+pd_species['Year'].astype(str)+'/'+pd_species['GeoRegion']+'/'+pd_species['ColonyName']+'/'+pd.to_datetime(pd_species["Date"].astype('str'), format="%m/%d/%y %H:%M:%S").dt.strftime("%d%B%y")+'Camera'+pd_species["CameraNumber"]+'-'+'Card'+(pd_species["CardNumber"].where(~pd_species["CardNumber"].isnull(), other="1"))+"-"+pd_species["PhotoNumber"]+".png"
#gdf = gp.GeoDataFrame(
#    pd_species, geometry=gp.points_from_xy(pd_species["Longitude"], pd_species["Latitude"]))
#set_index(["State","GeoRegion","ColonyName","Year", "Latitude", "Longitude", "Date", "SpeciesCode"])
#pd_species = pd_species.drop(columns=["AutoID","Subcolony"])
#gdf["huc"] = "TBD"

def _convert_to_degress(value):
    """
    Helper function to convert the GPS coordinates stored in the EXIF to degress in float format
    Borrowed from: https://gist.github.com/snakeye/fdc372dbf11370fe29eb
    Modified to recieve a tuple instead of a exifread.utils.Ratio
    :param value:
    :type value: tuple
    :rtype: float
    """
    d = float(value[0][0]) / float(value[0][1])
    m = float(value[1][0]) / float(value[1][1])
    s = float(value[2][0]) / float(value[2][1])

    return d + (m / 60.0) + (s / 3600.0)
```

# UPDATE Mime types 

In [None]:
def update_mime_type(t, mime="image/png"):
    """Set the Content-Type of the S3 object at key ``t`` by copying it onto itself.

    The current user metadata is re-sent with ``MetadataDirective="REPLACE"``
    and ``mime`` becomes the new ``ContentType``.
    """
    target = avian_data.Object(t)
    self_source = {"Bucket": avian_data.name, "Key": t}
    target.copy_from(
        CopySource=self_source,
        Metadata=target.metadata,
        MetadataDirective="REPLACE",
        ContentType=mime,
    )


def copy_wnew_mime_type(t, k, mime="image/png"):
    """Copy the S3 object at key ``t`` to key ``k``, changing its MIME type.

    Args:
        t: Key of the source object in the avian_data bucket.
        k: Key of the copy to create in the same bucket.
        mime: Content-Type to store on the copy.

    The source object's user metadata is carried over to the copy.
    """
    # Look the source object up here; the function previously relied on an
    # `s3_object` name that it never defined.
    s3_object = avian_data.Object(t)
    # Bucket.copy() is a managed transfer: per-object settings are not
    # accepted as keyword arguments and have to travel inside ExtraArgs.
    avian_data.copy(
        {"Bucket": avian_data.name, "Key": t},
        k,
        ExtraArgs={
            "Metadata": s3_object.metadata,
            "MetadataDirective": "REPLACE",
            "ContentType": mime,
        },
    )


In [None]:
# Refresh the Content-Type of every thumbnail (default "image/png").
if replace and create_thumbnails:
    m_grouped["thumbnail_new"].progress_apply(update_mime_type)


In [None]:
if rename_files:
    # Mark every renamed high resolution photo as "image/jpeg", in parallel.
    _set_jpeg = partial(update_mime_type, mime="image/jpeg")
    _photo_keys = m_grouped["HighResImage_new"].tolist()
    with ThreadPoolExecutor(max_workers=16) as pool:
        futures = list(tqdm(pool.map(_set_jpeg, _photo_keys), total=m_grouped.shape[0]))


# Reorganize/rename screenshots

In [None]:
# Reload the list of used photos written by the rename cell, plus the
# spreadsheet that carries the original screenshot paths.
m_grouped = pd.read_excel("UsedPhotos2010-2021.xlsx")
# to_rename = pd.read_excel("toRename2.xlsx")
to_rename = pd.read_excel("Renaming_Avian.xlsx")


In [None]:
# Spreadsheets without an "Original_Path" column use "Path" instead.
if "Original_Path" not in to_rename:
    to_rename["Original_Path"] = to_rename["Path"]


In [None]:
to_rename = to_rename[["HighResImage_new", "Original_Path"]]


In [None]:
# Keep only the rows where both the new key and the original path are known.
# (The previous filter tested "Original_Path" twice and never looked at
# "HighResImage_new".)
to_rename = to_rename.dropna(subset=["HighResImage_new", "Original_Path"])


In [None]:
to_rename["Original_Path"] = to_rename["Original_Path"].str.replace(
    "/to82sp", "DottedImages"
)


In [None]:
to_rename = to_rename.merge(m_grouped, on="HighResImage_new")


In [None]:
to_rename["screenshot_new"] = to_rename["HighResImage_new"].str.replace(
    "/high_resolution_photos/", "/screenshots/"
)


In [None]:
to_rename[["screenshot_new", "Original_Path"]]


In [None]:
if rename_files:
    # Copy every screenshot to its new key, 16 copies at a time.
    _source_paths = to_rename["Original_Path"].tolist()
    _target_keys = to_rename["screenshot_new"].tolist()
    with ThreadPoolExecutor(max_workers=16) as pool:
        futures = list(
            tqdm(pool.map(rename, _source_paths, _target_keys), total=to_rename.shape[0])
        )


# Join summary excel with locations from colonies

In [None]:
pd_species = pd.read_excel("avianmonitoring_2010-2021.xlsx")


In [None]:
totals = pd.read_excel("/mnt/z/Colibri2010-21ColonyTotalsMayJuneCombined_8Nov22.xlsx")


In [None]:
totals


In [None]:
unique_colonies = (
    pd_species[
        ["Year", "State", "GeoRegion", "ColonyName", "Longitude_y", "Latitude_y"]
    ]
    .drop_duplicates()
    .reset_index(drop=True)
)


In [None]:
# Compare the join keys as stripped text on both sides.
join_cols = ["Year", "State", "GeoRegion", "ColonyName"]
for c in join_cols:
    for _frame in (totals, unique_colonies):
        _frame[c] = _frame[c].astype("str").str.strip()


In [None]:
joined_totals = pd.merge(totals, unique_colonies, on=join_cols)


In [None]:
joined_totals


In [None]:
joined_totals.to_excel("joined_totals.xlsx", index=False)


# Compare totals from totals and pd_species

In [None]:
joined_totals = pd.read_excel("joined_totals.xlsx")
joined_totals["Date"] = joined_totals["Date"].astype("str").apply(clean_date)
pd_species["Date"] =  pd_species["Date"].astype("str").apply(clean_date)
pd_species.columns


In [None]:
join_cols = [
    "Year",
    "State",
    "GeoRegion",
    "ColonyName",
    "SpeciesCode",
    "Longitude_y",
    "Latitude_y",
]
# Compare the join keys as stripped text on both sides.
for c in join_cols:
    pd_species[c] = pd_species[c].astype("str").str.strip()
    joined_totals[c] = joined_totals[c].astype("str").str.strip()

# Species codes are compared in upper case.
pd_species["SpeciesCode"] = pd_species["SpeciesCode"].str.upper()
joined_totals["SpeciesCode"] = joined_totals["SpeciesCode"].str.upper()

# For 2021 REEG data is divided in subspecies on the access database, but reported as REEG in the summary.
_reeg_2021 = (pd_species.Year == "2021") & pd_species.SpeciesCode.str.startswith("REEG ")
pd_species.loc[_reeg_2021, "SpeciesCode"] = "REEG"

# COGA is reported as COMO in both datasets.
pd_species.loc[pd_species.SpeciesCode == "COGA", "SpeciesCode"] = "COMO"
joined_totals.loc[joined_totals.SpeciesCode == "COGA", "SpeciesCode"] = "COMO"

# Totals of the best-for-BPE rows per date, colony and species.
_group_keys = [join_cols[0], "Date", *join_cols[1:]]
_best_rows = pd_species.loc[
    pd_species.BestForBPE == "Y", _group_keys + ["total_nests", "total_birds"]
]
agg_pd_species = _best_rows.groupby(_group_keys).sum()



In [None]:
# Summary totals per colony and species, grouped by the same keys as join_cols.
_summary_keys = list(join_cols)
agg_totals = joined_totals.groupby(_summary_keys).sum()


In [None]:
# Collapse the raw totals over the dates, then line them up with the summary
# totals; the outer join keeps the combinations present on only one side.
_raw_by_colony = agg_pd_species.groupby(list(join_cols)).sum()
joined_totals = agg_totals.join(
    _raw_by_colony, on=join_cols, how="outer"
).reset_index()


In [None]:
joined_totals[joined_totals.Nests.isna() ]


In [None]:
# Summary value minus recomputed value, with a missing value counted as 0.
for _summary_col, _raw_col, _diff_col in (
    ("Nests", "total_nests", "diff_nests"),
    ("Birds", "total_birds", "diff_birds"),
):
    joined_totals[_diff_col] = joined_totals[_summary_col].fillna(0) - joined_totals[
        _raw_col
    ].fillna(0)


In [None]:
# Export the rows where the summary and the recomputed totals disagree on
# nests or birds, sorted by the difference in birds.
joined_totals[
    ((joined_totals["diff_nests"] != 0) | (joined_totals["diff_birds"] != 0))
   # & (joined_totals["SpeciesCode"] == "ROSA")
].sort_values(
#    ["Year","SpeciesCode","State","GeoRegion","ColonyName"]
    "diff_birds"
).to_excel("/mnt/z/differences_with_previous_summary.xlsx")


In [None]:
#joined_totals.to_excel("/mnt/z/joined_totals.xlsx")


In [None]:
pd_species.loc[
    (pd_species.Year == "2021") & pd_species.SpeciesCode.str.startswith("REEG "),
    "SpeciesCode",
]


In [None]:
agg_pd_species.reset_index().rename(columns={"total_nests": "Nests", "total_birds": "Birds"}).to_excel("SummaryFileGenerated.xlsx", index=False)

# Check existing files

In [None]:
# Start again from the exported dataset, without the _x coordinate columns
# and without the exact duplicate rows.
pd_species = pd.read_excel("avianmonitoring_2010-2021.xlsx")
pd_species = pd_species.drop(columns=["Latitude_x", "Longitude_x"]).drop_duplicates()


In [None]:
print(pd_species["thumbnail_new"][0])


In [None]:
def exists_key(key, replace=False):
    """Tell whether at least one object in the bucket has ``key`` as prefix.

    Any error raised while listing is printed and reported as "not found".
    The ``replace`` argument is accepted but not used.
    """
    try:
        matches = list(avian_data.objects.filter(Prefix=key))
    except Exception as e:
        print(f"key:{key}, exception {e}")
        return False
    return bool(matches)


In [None]:
# This is equivalent, but faster than: pd_species["thumbnail_new"] = pd_species["thumbnail_new"].progress_apply(exists_key)
# Distinct combinations of the "*_new" key columns; for each column,
# futures[column] holds one existence flag per row of _grouped.
futures = {}
_new_key_columns = [name for name in pd_species.columns if "_new" in name]
_grouped = pd_species[_new_key_columns].drop_duplicates()
with ThreadPoolExecutor(max_workers=128) as e:
    for f in _grouped.columns:
        _keys = _grouped[f].tolist()
        futures[f] = list(tqdm(e.map(exists_key, _keys), total=_grouped.shape[0]))

In [None]:
for x in futures:
    with pd.option_context("display.max_colwidth", None):
        display(_grouped.loc[~np.array(futures[x]), x])


In [None]:
x = "HighResImage_new"
pd_species[pd_species[x].isin(_grouped.loc[~np.array(futures[x]), x])]


In [None]:
for x in futures:
    # Keys of column x that were not found on S3.
    _missing_flags = ~np.array(futures[x])
    print(x, np.sum(_missing_flags))
    _missing_keys = _grouped.loc[_missing_flags, x]
    _affected = pd_species[x].isin(_missing_keys)
    with pd.option_context("display.max_colwidth", None):
        # Show the affected rows and how many distinct keys are missing per
        # year, then blank the key out.
        _no_exists = pd_species[_affected]
        display(_no_exists)
        display(_no_exists[["Year", x]].drop_duplicates().groupby(["Year"]).count())
        pd_species.loc[_affected, x] = None
#   with pd.option_context('display.max_colwidth', None):
#       display(pd_species.loc[~np.array(futures[x]), x].drop_duplicates())
#       pd_species.loc[~np.array(futures[x]), x].drop_duplicates().to_csv(f"not_found_{x}.csv", index=False)

#   display(pd_species.loc[~np.array(futures[x]),[x,"Year"]].drop_duplicates().groupby("Year").count())
#   pd_species.loc[~np.array(futures[x]), x] = None


In [None]:
pd_species.to_csv("avianmonitoring_2010-2021_Nulls.csv.gz", index=False)


In [None]:
pd_species.to_excel("avianmonitoring_2010-2021_Nulls.xlsx", index=False)


### Delete inconsistent photos
It seems that some photos were created incorrectly during earlier processing. This cell displays the photos that have a high resolution image but no thumbnail, which indicates an anomalous situation. 

In [None]:
# Rows that kept their high resolution key but lost the thumbnail key,
# reduced to the key columns. They are only displayed: the delete call
# below stays disabled.
with pd.option_context("display.max_colwidth", None):
    to_delete = pd_species.loc[
        pd_species["thumbnail_new"].isna() & ~pd_species["HighResImage_new"].isna(),
        futures.keys(),
    ].drop_duplicates()
    display(to_delete)
    # to_delete["HighResImage_new"].apply(lambda x:avian_data.Object(key=x).delete())

## Update list of files on AWS (for file browsing)

In [None]:
# Nested dict mirroring the bucket: one level per "/"-separated key segment.
# Keys ending in "/" are left out.
files_in_bucket = list(avian_data.objects.all())
filenames = [f.key for f in files_in_bucket if not f.key.endswith("/")]
tree = {}
for file in filenames:
    parent = tree
    for p in file.split("/"):
        # Step into the child level, creating it on first use.
        parent = parent.setdefault(p, {})

In [None]:
def createListings(tree, path):
    """Flatten a nested dict tree into per-folder listings.

    Returns a dict that maps each folder path (``path`` itself plus every
    sub-folder, built as ``path + "/" + name``) to
    ``{"dirs": [...], "files": [...]}``, the format required by jquery
    browse files. An entry with children counts as a folder; an entry
    without children counts as a file only when its name contains a ".".
    """
    listings = {}
    folder_names = []
    file_names = []
    for name, children in tree.items():
        if children:
            folder_names.append(name)
            listings.update(createListings(children, "/".join((path, name))))
        elif "." in name:
            file_names.append(name)

    listings[path] = {"dirs": folder_names, "files": file_names}
    return listings

In [None]:
# Flatten the bucket tree, starting from an empty root path, and save the
# listings as JSON.
listings = createListings(tree, "")
import json

with open("file_listing.json", "w") as jsonfile:
    json.dump(listings, jsonfile)