# Avian data Data transformation notebook

This notebook performs and document the transformations performed on the data received from Colibri by The Water Institute as part of this project. 

The main goals of this transformations are:
  - Create a unified dataset for 2010-2021 data
  - Rename both high resolution photos and screenshots to have them in a unified folder structure
  - Create thumbnails for the high resolution photos to be visualized on the web dashboard
  - Analyze difference between the summary data and the totals calculated from the raw data (if any)
  - Generate datasets for being used in the visualizations

In [1]:
#%pip install tqdm pandas boto3

### Parameters

In [2]:
thumbnail_size = (518, 345)
create_thumbnails = True
replace = False
_base_folder = "avian_monitoring"

### Common imports

In [3]:
from PIL import Image
import re
import pandas as pd
#import geopandas as gpd
import pandas_access
from datetime import datetime
import pyodbc
import numpy as np
from multiprocessing import Pool
from functools import partial
import geopandas as gp
import boto3
import re
from tqdm.notebook import tqdm
from dateutil.parser import parse
from concurrent.futures import ThreadPoolExecutor
from io import BytesIO
from functools import partial

tqdm.pandas()

boto3.setup_default_session(profile_name="GLO")
bucket_name = "twi-avian-data"
starting_folder = "HighResolutionImages"
new_folder_hr = "avian_monitoring_"
aws_s3 = boto3.resource(
    "s3",
)
avian_data = aws_s3.Bucket(bucket_name)

### Utilities

In [4]:
def clean_date(text):
    """Return a date as yyyy-MMM-dd (e.g. 2022-May-15)"""
    datetimestr = parse(text)
    text = datetime.strftime(datetimestr, "%Y-%b-%d")
    return text


def update_mime_type(t, mime="image/png"):
    """Update the metadata of s3 objects to the given Mime type"""
    s3_object = avian_data.Object(t)
    s3_object.copy_from(
        CopySource={"Bucket": avian_data.name, "Key": t},
        Metadata=s3_object.metadata,
        MetadataDirective="REPLACE",
        ContentType=mime,
    )


def copy_wnew_mime_type(t, k, mime="image/png"):
    """Copy a s3 object, modifying its mime type"""
    s3_object = avian_data.Object(t)
    avian_data.copy(
        {"Bucket": avian_data.name, "Key": t},
        k,
        Metadata=s3_object.metadata,
        MetadataDirective="REPLACE",
        ContentType=mime,
    )


def rename(key, new_name, replace=False):
    """Copy an object (key) to a new location (new_name) on the avian_data bucket"""
    if not replace and [o for o in avian_data.objects.filter(Prefix=new_name)]:
        return True

    try:
        avian_data.copy({"Bucket": avian_data.name, "Key": key}, new_name)
    except Exception as e:
        print(e)
        check_lowercase = list(
            avian_data.objects.filter(Prefix=key.replace(".JPG", ".jpg"))
        )
        if check_lowercase:
            avian_data.copy(
                {"Bucket": avian_data.name, "Key": check_lowercase[0].key}, new_name
            )
            return True
        check_uppercase = list(
            avian_data.objects.filter(Prefix=key.replace(".jpg", ".JPG"))
        )
        if check_uppercase:
            avian_data.copy(
                {"Bucket": avian_data.name, "Key": check_uppercase[0].key}, new_name
            )
            return True
        print(f"{key} not found")
        return False


def generate_thumbnail(high_res_key, thumb_key, regenerate=False):
    """Generate a low resolution thumbnail at thumb_key of the high_res_key object.
    it uses the global thumbnail_size to define the size.
    """
    current = avian_data.objects.filter(Prefix=thumb_key)
    if not regenerate and [o for o in current]:
        return True
    objs = [o for o in avian_data.objects.filter(Prefix=high_res_key)]
    if not objs:
        print(f"there is no {high_res_key}")
        return False
    image = Image.open(BytesIO(objs[0].get()["Body"].read()))
    buffer = BytesIO()
    image.thumbnail(thumbnail_size)
    image.save(buffer, format="png")
    buffer.seek(0)
    avian_data.put_object(Key=thumb_key, Body=buffer, ContentType="image/png")
    return True

# Unified 2010-2021 Data

Data has been unified by Colibri into a single access database, however it contains tables discriminated by years, with some differences in the schema. This process will combine all the datasets into a single one with a common deffinition of the total birds and total nests. 

In [5]:
# Notes: In order to use all the fields, I did a rename of the columns containing '?' or '/'
# acc_db = "/mnt/c/Users/carizaporras/Downloads/Colibri_tblsSpeciesData2010-2021_18sept22.accdb"
acc_db = "/mnt/z/Colibri2010-2021CWBColonies_12Nov2022_working_copy.accdb"
schema = pandas_access.read_schema(acc_db)

In [6]:
ct_name = "tblRWCWB_ColonyInventory_10Nov22"
# colonies_table = schema.pop("tblRWCWB_ColonyInventory_13Sept2022")
colonies_table = schema.pop(ct_name)

There are three tables with slighly different schema

In [7]:
schema

{'tblColonySiteNotes2010': {'ID': 'Long Integer',
  'Latitude': 'Text (100)',
  'Longitude': 'Text (100)',
  'Dotter': 'Text (100)',
  'ColonyName': 'Text (100)',
  'Habitat': 'Text (100)',
  'Oil': 'Text (2)',
  'Notes': 'Text (510)'},
 'tblSpeciesCodes': {'SpeciesCode': 'Text (100) NOT NULL'},
 'tblSpeciesData2010': {'AutoID': 'Double',
  'Year': 'Double',
  'Date': 'DateTime',
  'ColonyName': 'Text (510)',
  'Latitude': 'Text (510)',
  'Longitude': 'Text (510)',
  'DottingAreaNumber': 'Text (510)',
  'CameraNumber': 'Text (510)',
  'CardNumber': 'Text (510)',
  'PhotoNumber': 'Text (510)',
  'PQ': 'Text (510)',
  'SpeciesCode': 'Text (510)',
  'WBN': 'Double',
  'ChickNestwithoutAdult': 'Double',
  'AbandNest': 'Double',
  'EmptyNest': 'Double',
  'PBN': 'Double',
  'Site': 'Double',
  'Brood': 'Double',
  'OtherAdultsInColony': 'Double',
  'OtherImmInColony': 'Double',
  'RoostingBirds': 'Double',
  'RoostingAdults': 'Double',
  'RoostingImmatures': 'Double',
  'UnknownAge': 'Doubl

Taking the 2015 to 2021 table as a reference, lets compare the schemas.

Fields in the reference but not in the `tblSpeciesData2011_2013` table

In [8]:
schema["tblSpeciesData2015_2018_2021"].keys() - schema["tblSpeciesData2011_2013"].keys()

{'ChickNest', 'OtherBirds', 'Subcolony', 'Territory'}

Fields in the `tblSpeciesData2011_2013` table but not in the reference

In [9]:
schema["tblSpeciesData2011_2013"].keys() - schema["tblSpeciesData2015_2018_2021"].keys() 

{'ChicksNestlings',
 'OtherAdultsInColony',
 'OtherImmInColony',
 'RoostingAdults',
 'RoostingBirds',
 'RoostingImmatures',
 'UnknownAge'}

Fields in the referece but not in the 2010 table

In [10]:
schema["tblSpeciesData2015_2018_2021"].keys() - schema["tblSpeciesData2010"].keys()

{'BestForBPE', 'ChickNest', 'Notes', 'OtherBirds', 'Subcolony', 'Territory'}

Fields in the referece but not in the 2010 table

In [11]:
schema["tblSpeciesData2010"].keys() - schema["tblSpeciesData2015_2018_2021"].keys()

{'Latitude',
 'Longitude',
 'OtherAdultsInColony',
 'OtherImmInColony',
 'RoostingAdults',
 'RoostingBirds',
 'RoostingImmatures',
 'UnknownAge'}

That also means that we have different formulas to calculate the total. The bestForBPE field is used as filter in 2013 to 2021, but it is not used in the 2010.

### Formulas

|                | 2010                                                                                                                                                               | 2011-2013                                                                                                                                                          | 2015-2021                                                                                                 |
|----------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------|-----------------------------------------------------------------------------------------------------------|
| Nests          | sum(<br>[WBN]<br>+[ChickNestw/outAdult]<br>+[AbandNest]<br>+[EmptyNest]<br>+[PBN]<br>+[Site]<br>+[Brood])                                                          | Sum(<br>[WBN]<br>+[ChickNestw/outAdult]<br>+[AbandNest]<br>+[EmptyNest]<br>+[PBN]<br>+[Site]<br>+[Brood])                                                          | Sum(<br>[WBN]<br>+[ChickNest]<br>+[ChickNestw/outAdult]<br>+[AbandNest]<br>+[PBN]<br>+[Site]<br>+[Brood]) |
| Birds          | sum([WBN]<br>+[PBN]<br>+[Site]<br>+[OtherAdultsInColony]<br>+[OtherImmInColony]<br>+[RoostingBirds]<br>+[RoostingAdults]<br>+[RoostingImmatures]<br>+[UnknownAge]) | Sum([WBN]<br>+[PBN]<br>+[Site]<br>+[OtherAdultsInColony]<br>+[OtherImmInColony]<br>+[RoostingBirds]<br>+[RoostingAdults]<br>+[RoostingImmatures]<br>+[UnknownAge]) | Sum(<br>[WBN]<br>+[ChickNest]<br>+[PBN]<br>+[Territory]<br>+[Site]<br>+[OtherBirds])                      |
| SumOfEmptyNest | sum(EmptyNest)                                                                                                                                                     | EmptyNest                                                                                                                                                          | EmptyNest                                                                                                 |


But 2015 to 2021 has additional categories: 

```SQL

Sum(WBN) AS SumOfWBN

Sum(ChickNest) AS SumOfChickNest

Sum([ChickNestw/outAdult]) AS [SumOfChickNestw/outAdult] 

Sum(Brood) AS SumOfBrood

Sum(AbandNest) AS SumOfAbandNest

Sum(PBN) AS SumOfPBN

Sum(Territory) AS SumOfTerritory
```

## Generating a common dataset:
- merge 2010 species with the colonies. 
- select only the common columns 

In [12]:
cols_2015_2021 = set(schema["tblSpeciesData2015_2018_2021"].keys())
cols_2011_2013 = set(schema["tblSpeciesData2011_2013"].keys())
cols_2010 = set(schema["tblSpeciesData2010"].keys())
common_fields = cols_2011_2013.intersection(cols_2015_2021)

In [13]:
cols_2010 - common_fields

{'Latitude',
 'Longitude',
 'OtherAdultsInColony',
 'OtherImmInColony',
 'RoostingAdults',
 'RoostingBirds',
 'RoostingImmatures',
 'UnknownAge'}

In [14]:
common_fields - cols_2010

{'BestForBPE', 'Notes'}

In [15]:
pd_species_2010 = pandas_access.read_table(acc_db, "tblSpeciesData2010")

In [16]:
pd_species_2010["Notes"] = ""
pd_species_2010["BestForBPE"] = "N"

In [17]:
pd_species_2011_2013 = pandas_access.read_table(
    acc_db, "tblSpeciesData2011_2013"
)  # [common_fields]
pd_species_2015_2021 = pandas_access.read_table(
    acc_db, "tblSpeciesData2015_2018_2021"
)  # [common_fields]

In [18]:
pd_species = pd.concat(
    [pd_species_2010, pd_species_2011_2013, pd_species_2015_2021], ignore_index=True
)

In [19]:
pd_species = pd_species.drop(columns="AutoID")

In [20]:
pd_species["Year"] = pd_species["Year"].astype(int).astype(str)

In [21]:
pd_colonies = pandas_access.read_table(acc_db, ct_name)
pd_colonies["ColonyName"] = pd_colonies["ColonyName"].astype(str)

In [22]:
pd_species = pd.merge(pd_species, pd_colonies, on="ColonyName")

In [23]:
pd_species["HighResImage_new"] = (
    f"{_base_folder}/high_resolution_photos/"
    + pd_species["Year"].astype(str)
    + "/"
    + pd_species["GeoRegion"]
    + "/"
    + pd_species["ColonyName"]
    + "/"
    + pd.to_datetime(
        pd_species["Date"].astype("str"), format="%m/%d/%y %H:%M:%S"
    ).dt.strftime("%d%B%y")
    + "Camera"
    + pd_species["CameraNumber"]
    + "-"
    + "Card"
    + (pd_species["CardNumber"].where(~pd_species["CardNumber"].isnull(), other="1"))
    + "-"
    + pd_species["PhotoNumber"]
    + ".jpg"
)
pd_species["screenshot_new"] = (
    f"{_base_folder}/screenshots/"
    + pd_species["Year"].astype(str)
    + "/"
    + pd_species["GeoRegion"]
    + "/"
    + pd_species["ColonyName"]
    + "/"
    + pd.to_datetime(
        pd_species["Date"].astype("str"), format="%m/%d/%y %H:%M:%S"
    ).dt.strftime("%d%B%y")
    + "Camera"
    + pd_species["CameraNumber"]
    + "-"
    + "Card"
    + (pd_species["CardNumber"].where(~pd_species["CardNumber"].isnull(), other="1"))
    + "-"
    + pd_species["PhotoNumber"]
    + ".jpg"
)
pd_species["thumbnail_new"] = (
    f"{_base_folder}/thumbnails/"
    + pd_species["Year"].astype(str)
    + "/"
    + pd_species["GeoRegion"]
    + "/"
    + pd_species["ColonyName"]
    + "/"
    + pd.to_datetime(
        pd_species["Date"].astype("str"), format="%m/%d/%y %H:%M:%S"
    ).dt.strftime("%d%B%y")
    + "Camera"
    + pd_species["CameraNumber"]
    + "-"
    + "Card"
    + (pd_species["CardNumber"].where(~pd_species["CardNumber"].isnull(), other="1"))
    + "-"
    + pd_species["PhotoNumber"]
    + ".png"
)

In [24]:
pd_species["total_nests"] = np.where(
    pd_species["Year"].isin(["2010", "2011", "2012", "2013"]),
    pd_species["WBN"]
    + pd_species["ChickNestwithoutAdult"]
    + pd_species["AbandNest"]
    + pd_species["EmptyNest"]
    + pd_species["PBN"]
    + pd_species["Site"]
    + pd_species["Brood"],
    pd_species["WBN"]
    + pd_species["ChickNestwithoutAdult"]
    + pd_species["AbandNest"]
    + pd_species["ChickNest"]
    + pd_species["PBN"]
    + pd_species["Site"]
    + pd_species["Brood"],
)
pd_species["total_birds"] = np.where(
    pd_species["Year"].isin(["2010", "2011", "2012", "2013"]),
    pd_species["WBN"]
    + pd_species["PBN"]
    + pd_species["Site"]
    + pd_species["OtherAdultsInColony"]
    + pd_species["OtherImmInColony"]
    + pd_species["RoostingBirds"]
    + pd_species["RoostingAdults"]
    + pd_species["RoostingImmatures"]
    + pd_species["UnknownAge"],
    pd_species["WBN"]
    + +pd_species["ChickNest"]
    + pd_species["PBN"]
    + pd_species["Territory"]
    + pd_species["Site"]
    + pd_species["OtherBirds"],
)

In [25]:
pd_species["date2"] = pd_species["Date"].astype("str").apply(clean_date)
pd_species["month"] = pd_species["date2"].apply(lambda x: x.split("-")[1])

agg_2010 = (
    pd_species[pd_species.Year == "2010"]
    .groupby(["Year", "month", "ColonyName", "SpeciesCode"])
    .agg({"total_birds": "sum"})
    .reset_index()
)

In [26]:
selected = (
    agg_2010.sort_values("total_birds")
    .drop_duplicates(["Year", "ColonyName", "SpeciesCode"], keep="last")[
        ["Year", "month", "ColonyName", "SpeciesCode"]
    ]
    .reset_index(drop=True)
)

In [27]:
selected[selected.SpeciesCode == "ROSA"]

Unnamed: 0,Year,month,ColonyName,SpeciesCode
25,2010,Jun,North Deer Island,ROSA
64,2010,May,Little Mud Grass Island Northeast,ROSA
592,2010,May,Biloxi North 32,ROSA
848,2010,May,Biloxi North 15,ROSA
977,2010,Jun,West Bay Bird Island New,ROSA
1076,2010,May,Louisiana West 1,ROSA
1228,2010,May,Horn Island,ROSA
1260,2010,Jun,Martin Island,ROSA
1321,2010,Jun,Saint George Causeway,ROSA
1351,2010,Jun,Long Reef Deadman Islands B,ROSA


In [28]:
pd_species.loc[
    pd_species[["Year", "month", "ColonyName", "SpeciesCode"]]
    .apply(lambda row: str([x for x in row]), axis=1)
    .isin(selected.apply(lambda row: str([x for x in row]), axis=1)),
    "BestForBPE",
] = "Y"

In [29]:
pd_species["uid"] = (
    pd_species["HighResImage_new"]
    + "#"
    + np.where(pd_species["SpeciesCode"].isna(), "N/A", pd_species["SpeciesCode"])
)

In [30]:
pd_species.to_csv("avianmonitoring_2010-2021.csv.gz", index=False)

In [31]:
pd_species.to_excel("avianmonitoring_2010-2021.xlsx", index=False)

# Reorganize Files
Photos from 2010 to 2021 originally are not organized by GeoRegion/Colony and can have different naming standard.
We are using the results of the dotting process to organize the images referenced from the Access database. 

We are also generating a thumbnail and reorganizing and renaming the screenshots from the dotting process. 

Note: This will be a subset of the photos, you can see all the high resolution photos available on the HighResolutionImages folder in the S3 bucket. 

In [32]:
pd_species[pd_species["HighResImage_new"].isna()]

Unnamed: 0,Year,Date,ColonyName,Latitude_x,Longitude_x,DottingAreaNumber,CameraNumber,CardNumber,PhotoNumber,PQ,...,OrigDotterID,NOTES August 2022,HighResImage_new,screenshot_new,thumbnail_new,total_nests,total_birds,date2,month,uid
24903,2012,05/27/12 00:00:00,Raccoon Island,,,146,,,,E,...,Raccoon Island,,,,,0.0,0.0,2012-May-27,May,
29427,2013,05/23/13 00:00:00,Queen Bess Island,,,78,,,,,...,MWP2; MWP13,,,,,0.0,0.0,2013-May-23,May,


In [33]:
files = [
    o
    for o in avian_data.objects.filter(Prefix=starting_folder)
    if re.match(r".*/20[1-2][0-9]/.*(\.jp(.{0,1})g|.tiff)", o.key.lower())
]

We will need to obtain the photo information from the HighResolution photo path. The file naming patter varies, even in the same year, but we can define a regular expression general enough to catch all the cases for most of the information. 

In [34]:
image_groups = re.compile(
    r"HighResolutionImages/(?P<year>\d{4}).*[/ ,]+(?P<date>\d+\s*[A-Z]+\s*\d{2,4}).*(Camera|Cam)\s*(?P<camera>\d+)[ /-]*(Card\s*(?P<card>\d+)){0,1}[\s-]*((\w+/))*(IMGP){0,1}(?P<photo>\d+)\.(?P<extension>jp.?g|tiff?)",
    flags=re.IGNORECASE,
)

In [35]:
m = image_groups.match(
    "HighResolutionImages/2010/June 2010/10 June 2010/10 June 2010 Camera 1 Card 1/10 June 2010 Camera 1 Card 1 010.JPG"
)

In [36]:
df_files = []
no_files = []
for i, o in tqdm(enumerate(files)):
    if image_groups.match(o.key):
        dict_t = image_groups.match(o.key).groupdict()
        df_files.append({"key": o.key, "object": o, **dict_t})
    else:
        no_files.append(o.key)

0it [00:00, ?it/s]

There are a few files with not enough information:

In [37]:
no_files

['HighResolutionImages/2010/June 2010/10 June 2010/10 June 2010 Camera 1 Card 1/corms and pelicans.jpg',
 'HighResolutionImages/2010/June 2010/8 June 2010/8 June 2010 Camera 1 Card 1/8 June 2010 Camera 1 Card 1 numbering off/100.JPG',
 'HighResolutionImages/2010/June 2010/8 June 2010/8 June 2010 Camera 1 Card 1/8 June 2010 Camera 1 Card 1 numbering off/101.JPG',
 'HighResolutionImages/2010/June 2010/8 June 2010/8 June 2010 Camera 1 Card 1/8 June 2010 Camera 1 Card 1 numbering off/102.JPG',
 'HighResolutionImages/2010/June 2010/8 June 2010/8 June 2010 Camera 1 Card 1/8 June 2010 Camera 1 Card 1 numbering off/103.JPG',
 'HighResolutionImages/2010/June 2010/8 June 2010/8 June 2010 Camera 1 Card 1/8 June 2010 Camera 1 Card 1 numbering off/104.JPG',
 'HighResolutionImages/2010/June 2010/8 June 2010/8 June 2010 Camera 1 Card 1/8 June 2010 Camera 1 Card 1 numbering off/105.JPG',
 'HighResolutionImages/2010/June 2010/8 June 2010/8 June 2010 Camera 1 Card 1/8 June 2010 Camera 1 Card 1 numbering

In [38]:
no_files = list(filter(lambda k: not "numbering off" in k, no_files))

In [39]:
with open("n_files_report.txt", "w") as n_files_report:
    n_files_report.write("\n".join(no_files))

Create a dataframe from the collected information, and merge it with the species dataset to be able to rearrange/rename the photos to the desire location based on the dotting information. 

In [40]:
fdf = pd.DataFrame(df_files)

In [41]:
fdf["extension"] = fdf["extension"].str.lower()

In [42]:
fdf["date2"] = fdf["date"].astype("str").apply(clean_date)
fdf["month"] = fdf["date2"].apply(lambda x: x.split("-")[1])
fdf["day"] = fdf["date2"].apply(lambda x: x.split("-")[2])

In [43]:
pd_species["date2"] = pd_species["Date"].astype("str").apply(clean_date)
pd_species["month"] = pd_species["date2"].apply(lambda x: x.split("-")[1])
pd_species["day"] = pd_species["date2"].apply(lambda x: x.split("-")[2])

In [44]:
pd_species[~pd_species["HighResImage_new"].isna()].to_csv(
    "avianData20102021.csv.gz", index=False
)

In [45]:
pd_species[~pd_species["HighResImage_new"].isna()] = pd_species[
    ~pd_species["HighResImage_new"].isna()
]

In [46]:
fdf = fdf.rename(
    columns={
        "camera": "CameraNumber",
        "card": "CardNumber",
        "photo": "PhotoNumber",
        "year": "Year",
    }
)

In [47]:
fdf["PhotoNumber"] = fdf["PhotoNumber"].str.rjust(5, "0")
pd_species["PhotoNumber"] = pd_species["PhotoNumber"].str.rjust(5, "0")

In [48]:
join_cols = ["CameraNumber", "CardNumber", "PhotoNumber", "Year", "month", "day"]
for c in join_cols:
    fdf[c] = fdf[c].astype("str").str.strip()
    pd_species[c] = pd_species[c].astype("str").str.strip()

c = "CardNumber"
fdf.loc[fdf[c] == "None", c] = (
    fdf.loc[fdf[c] == "None", "key"]
    .str.extract(r".*Card\s*(\d+).*", flags=re.IGNORECASE, expand=False)
    .where(fdf.loc[fdf[c] == "None", "key"].str.match(r".*Card\s(\d+).*", False), 1)
)

In [49]:
merged = pd_species.merge(fdf, on=join_cols, how="left")

In [50]:
m_grouped = (
    merged[
        [
            "HighResImage_new",
            "key",
            "CameraNumber",
            "CardNumber",
            "PhotoNumber",
            "Date",
            "Year",
            "month",
            "thumbnail_new",
        ]
    ]
    .groupby(
        [
            "HighResImage_new",
            "key",
            "CameraNumber",
            "CardNumber",
            "PhotoNumber",
            "Date",
            "Year",
            "month",
            "thumbnail_new",
        ]
    )
    .count()
    .reset_index()
)

In [51]:
# r = m_grouped.progress_apply(lambda x:avian_data.copy ({'Bucket': avian_data.name,'Key':x['key']}, x['HighResImage_new']), axis=1)
with ThreadPoolExecutor(max_workers=16) as e:
    futures = list(
        tqdm(
            e.map(
                lambda x, y: rename(x, y),
                m_grouped["key"].tolist(),
                m_grouped["HighResImage_new"].tolist(),
            ),
            total=m_grouped.shape[0],
        )
    )

  0%|          | 0/13622 [00:00<?, ?it/s]

In [52]:
with pd.option_context("display.max_colwidth", None):
    display(
        m_grouped[
            m_grouped["HighResImage_new"]
            == "avian_monitoring/high_resolution_photos/2021/Birdsfoot West/Birdsfoot West 6 A/22May21Camera1-Card2-3297.jpg"
        ]
    )

Unnamed: 0,HighResImage_new,key,CameraNumber,CardNumber,PhotoNumber,Date,Year,month,thumbnail_new
11877,avian_monitoring/high_resolution_photos/2021/Birdsfoot West/Birdsfoot West 6 A/22May21Camera1-Card2-3297.jpg,HighResolutionImages/2021/May2021/22May21/22May21Camera1 Card 2/22May2021Camera1-3297.jpg,1,2,3297,05/22/21 00:00:00,2021,May,avian_monitoring/thumbnails/2021/Birdsfoot West/Birdsfoot West 6 A/22May21Camera1-Card2-3297.png


In [53]:
m_grouped.to_excel("UsedPhotos2010-2021.xlsx", index=False)

In [54]:
# As checkpoint, we can run:
# m_grouped = pd.read_excel("UsedPhotos2010-2021.xlsx")

In [55]:
pd_species[
    pd_species["HighResImage_new"]
    == "avian_monitoring/high_resolution_photos/2021/Biloxi North/Biloxi North 4/16June21Camera2-Card2-6268.jpg"
]

Unnamed: 0,Year,Date,ColonyName,Latitude_x,Longitude_x,DottingAreaNumber,CameraNumber,CardNumber,PhotoNumber,PQ,...,NOTES August 2022,HighResImage_new,screenshot_new,thumbnail_new,total_nests,total_birds,date2,month,uid,day
14002,2021,06/16/21 00:00:00,Biloxi North 4,,,1,2,2,6268,E,...,,avian_monitoring/high_resolution_photos/2021/B...,avian_monitoring/screenshots/2021/Biloxi North...,avian_monitoring/thumbnails/2021/Biloxi North/...,16.0,40.0,2021-Jun-16,Jun,avian_monitoring/high_resolution_photos/2021/B...,16
14003,2021,06/16/21 00:00:00,Biloxi North 4,,,1,2,2,6268,E,...,,avian_monitoring/high_resolution_photos/2021/B...,avian_monitoring/screenshots/2021/Biloxi North...,avian_monitoring/thumbnails/2021/Biloxi North/...,1.0,1.0,2021-Jun-16,Jun,avian_monitoring/high_resolution_photos/2021/B...,16
14004,2021,06/16/21 00:00:00,Biloxi North 4,,,1,2,2,6268,E,...,,avian_monitoring/high_resolution_photos/2021/B...,avian_monitoring/screenshots/2021/Biloxi North...,avian_monitoring/thumbnails/2021/Biloxi North/...,1.0,1.0,2021-Jun-16,Jun,avian_monitoring/high_resolution_photos/2021/B...,16


In [56]:
with ThreadPoolExecutor(max_workers=16) as e:
    futures = list(
        tqdm(
            e.map(
                partial(generate_thumbnail, regenerate=False),
                m_grouped["HighResImage_new"].tolist(),
                m_grouped["thumbnail_new"].tolist(),
            ),
            total=m_grouped.shape[0],
        )
    )

  0%|          | 0/13622 [00:00<?, ?it/s]

# Previous version code
```python
#acc_db = "/mnt/c/Users/carizaporras/Downloads/Colibri_tblSpeciesData2015_2018_2021_2Sept2022 (1).accdb"
#schema = pandas_access.read_schema(acc_db)
#pd_species = pandas_access.read_table(acc_db, "tblSpeciesData2015_2018_2021")
#pd_species["ColonyName"] = pd_species["ColonyName"].astype(str)
#pd_colonies = pandas_access.read_table(acc_db, "tblRWCWB_ColonyInventory_2022")
#pd_colonies["ColonyName"] = pd_colonies["ColonyName"].astype(str)
#pd_species = pd.merge(pd_species, pd_colonies, on="ColonyName")
#pd_species["HighResImage_new"] = f"{_base_folder}/high_resolution_photos/"+pd_species['Year'].astype(str)+'/'+pd_species['GeoRegion']+'/'+pd_species['ColonyName']+'/'+pd.to_datetime(pd_species["Date"].astype('str'), format="%m/%d/%y %H:%M:%S").dt.strftime("%d%B%y")+'Camera'+pd_species["CameraNumber"]+'-'+'Card'+(pd_species["CardNumber"].where(~pd_species["CardNumber"].isnull(), other="1"))+"-"+pd_species["PhotoNumber"]+".jpg"
#pd_species["screenshot_new"] = f'{_base_folder}/screenshots/'+pd_species['Year'].astype(str)+'/'+pd_species['GeoRegion']+'/'+pd_species['ColonyName']+'/'+pd.to_datetime(pd_species["Date"].astype('str'), format="%m/%d/%y %H:%M:%S").dt.strftime("%d%B%y")+'Camera'+pd_species["CameraNumber"]+'-'+'Card'+(pd_species["CardNumber"].where(~pd_species["CardNumber"].isnull(), other="1"))+"-"+pd_species["PhotoNumber"]+".jpg"
#pd_species["thumbnail_new"] =  f'{_base_folder}/thumbnails/'+pd_species['Year'].astype(str)+'/'+pd_species['GeoRegion']+'/'+pd_species['ColonyName']+'/'+pd.to_datetime(pd_species["Date"].astype('str'), format="%m/%d/%y %H:%M:%S").dt.strftime("%d%B%y")+'Camera'+pd_species["CameraNumber"]+'-'+'Card'+(pd_species["CardNumber"].where(~pd_species["CardNumber"].isnull(), other="1"))+"-"+pd_species["PhotoNumber"]+".png"
#gdf = gp.GeoDataFrame(
#    pd_species, geometry=gp.points_from_xy(pd_species["Longitude"], pd_species["Latitude"]))
#set_index(["State","GeoRegion","ColonyName","Year", "Latitude", "Longitude", "Date", "SpeciesCode"])
#pd_species = pd_species.drop(columns=["AutoID","Subcolony"])
#gdf["huc"] = "TBD"

def _convert_to_degress(value):
    """
    Helper function to convert the GPS coordinates stored in the EXIF to degress in float format
    Borrowed from: https://gist.github.com/snakeye/fdc372dbf11370fe29eb
    Modified to recieve a tuple instead of a exifread.utils.Ratio
    :param value:
    :type value: tuple
    :rtype: float
    """
    d = float(value[0][0]) / float(value[0][1])
    m = float(value[1][0]) / float(value[1][1])
    s = float(value[2][0]) / float(value[2][1])

    return d + (m / 60.0) + (s / 3600.0)
```

# UPDATE Mime types 

In [57]:
def update_mime_type(t, mime="image/png"):
    s3_object = avian_data.Object(t)
    s3_object.copy_from(
        CopySource={"Bucket": avian_data.name, "Key": t},
        Metadata=s3_object.metadata,
        MetadataDirective="REPLACE",
        ContentType=mime,
    )


def copy_wnew_mime_type(t, k, mime="image/png"):
    avian_data.copy(
        {"Bucket": avian_data.name, "Key": t},
        k,
        Metadata=s3_object.metadata,
        MetadataDirective="REPLACE",
        ContentType=mime,
    )

In [58]:
if replace and create_thumbnails:
    m_grouped["thumbnail_new"].progress_apply(lambda t: update_mime_type(t))

In [59]:
with ThreadPoolExecutor(max_workers=16) as e:
    futures = list(
        tqdm(
            e.map(
                lambda t: update_mime_type(t, mime="image/jpeg"),
                m_grouped["HighResImage_new"].tolist(),
            ),
            total=m_grouped.shape[0],
        )
    )

  0%|          | 0/13622 [00:00<?, ?it/s]

# Reorganize/rename screenshots

In [60]:
m_grouped = pd.read_excel("UsedPhotos2010-2021.xlsx")
# to_rename = pd.read_excel("toRename2.xlsx")
to_rename = pd.read_excel("Renaming_Avian.xlsx")

In [61]:
if not "Original_Path" in to_rename:
    to_rename["Original_Path"] = to_rename["Path"]

In [62]:
to_rename = to_rename[["HighResImage_new", "Original_Path"]]

In [63]:
to_rename = to_rename[
    ~(to_rename["Original_Path"].isna() | to_rename["Original_Path"].isna())
]

In [64]:
to_rename["Original_Path"] = to_rename["Original_Path"].str.replace(
    "/to82sp", "DottedImages"
)

In [65]:
to_rename = to_rename.merge(m_grouped, on="HighResImage_new")

In [66]:
to_rename["screenshot_new"] = to_rename["HighResImage_new"].str.replace(
    "/high_resolution_photos/", "/screenshots/"
)

In [67]:
to_rename[["screenshot_new", "Original_Path"]]

Unnamed: 0,screenshot_new,Original_Path
0,avian_monitoring/screenshots/2010/Barataria Ba...,DottedImages/2010-2013 Dotted Images/2010/SLF ...
1,avian_monitoring/screenshots/2010/Barataria Ba...,DottedImages/2010-2013 Dotted Images/2010/SLF ...
2,avian_monitoring/screenshots/2010/Barataria Ba...,DottedImages/2010-2013 Dotted Images/2010/SLF ...
3,avian_monitoring/screenshots/2010/Barataria Ba...,DottedImages/2010-2013 Dotted Images/2010/SLF ...
4,avian_monitoring/screenshots/2010/Barataria Ba...,DottedImages/2010-2013 Dotted Images/2010/MWP ...
...,...,...
11105,avian_monitoring/screenshots/2021/Vermilion Ba...,DottedImages/Task 2 2021 Waterbird Colony Phot...
11106,avian_monitoring/screenshots/2021/Vermilion Ba...,DottedImages/Task 2 2021 Waterbird Colony Phot...
11107,avian_monitoring/screenshots/2021/Vermilion Ba...,DottedImages/Task 2 2021 Waterbird Colony Phot...
11108,avian_monitoring/screenshots/2021/Vermilion Ba...,DottedImages/Task 2 2021 Waterbird Colony Phot...


In [68]:
with ThreadPoolExecutor(max_workers=16) as e:
    futures = list(
        tqdm(
            e.map(
                lambda x, y: rename(x, y),
                to_rename["Original_Path"].tolist(),
                to_rename["screenshot_new"].tolist(),
            ),
            total=to_rename.shape[0],
        )
    )

  0%|          | 0/11110 [00:00<?, ?it/s]

# Join summary excel with locations from colonies

In [69]:
pd_species = pd.read_excel("avianmonitoring_2010-2021.xlsx")

In [70]:
totals = pd.read_excel("/mnt/z/Colibri2010-21ColonyTotalsMayJuneCombined_8Nov22.xlsx")

In [71]:
totals

Unnamed: 0,Year,Date,State,GeoRegion,ColonyName,SpeciesCode,Nests,Birds,CombinedMayJuneTotal?
0,2021,2021-05-24,AL,Mississippi Sound,Cat Island,BLSK,40,89,N
1,2021,2021-06-15,AL,Mississippi Sound,Cat Island,BRPE,0,150,N
2,2021,2021-05-24,AL,Mississippi Sound,Cat Island,GBTE,3,5,N
3,2021,2021-05-24,AL,Mississippi Sound,Cat Island,ROYT,373,374,N
4,2021,2021-05-24,AL,Mississippi Sound,Cat Island,SATE,4,4,N
...,...,...,...,...,...,...,...,...,...
5925,2010,2010-06-26,TX,Salt Bayou,High Island,ROSP,24,98,N
5926,2010,2010-06-26,TX,Salt Bayou,High Island,SNEG,1,2,N
5927,2010,2010-06-26,TX,Salt Bayou,High Island,TRHE,18,21,N
5928,2010,2010-06-26,TX,Salt Bayou,High Island,WHIB,0,1,N


In [72]:
unique_colonies = (
    pd_species[
        ["Year", "State", "GeoRegion", "ColonyName", "Longitude_y", "Latitude_y"]
    ]
    .drop_duplicates()
    .reset_index(drop=True)
)

In [73]:
join_cols = ["Year", "State", "GeoRegion", "ColonyName"]
for c in join_cols:
    totals[c] = totals[c].astype("str").str.strip()
    unique_colonies[c] = unique_colonies[c].astype("str").str.strip()

In [74]:
joined_totals = pd.merge(totals, unique_colonies, on=join_cols)

In [75]:
joined_totals

Unnamed: 0,Year,Date,State,GeoRegion,ColonyName,SpeciesCode,Nests,Birds,CombinedMayJuneTotal?,Longitude_y,Latitude_y
0,2021,2021-05-24,AL,Mississippi Sound,Cat Island,BLSK,40,89,N,-88.2099,30.32070
1,2021,2021-06-15,AL,Mississippi Sound,Cat Island,BRPE,0,150,N,-88.2099,30.32070
2,2021,2021-05-24,AL,Mississippi Sound,Cat Island,GBTE,3,5,N,-88.2099,30.32070
3,2021,2021-05-24,AL,Mississippi Sound,Cat Island,ROYT,373,374,N,-88.2099,30.32070
4,2021,2021-05-24,AL,Mississippi Sound,Cat Island,SATE,4,4,N,-88.2099,30.32070
...,...,...,...,...,...,...,...,...,...,...,...
5915,2010,2010-06-26,TX,Salt Bayou,High Island,ROSP,24,98,N,-94.3893,29.57351
5916,2010,2010-06-26,TX,Salt Bayou,High Island,SNEG,1,2,N,-94.3893,29.57351
5917,2010,2010-06-26,TX,Salt Bayou,High Island,TRHE,18,21,N,-94.3893,29.57351
5918,2010,2010-06-26,TX,Salt Bayou,High Island,WHIB,0,1,N,-94.3893,29.57351


In [76]:
joined_totals.to_excel("joined_totals.xlsx", index=False)

# Compare totals from totals and pd_species

In [77]:
joined_totals = pd.read_excel("joined_totals.xlsx")
pd_species.columns

Index(['Year', 'Date', 'ColonyName', 'Latitude_x', 'Longitude_x',
       'DottingAreaNumber', 'CameraNumber', 'CardNumber', 'PhotoNumber', 'PQ',
       'SpeciesCode', 'WBN', 'ChickNestwithoutAdult', 'AbandNest', 'EmptyNest',
       'PBN', 'Site', 'Brood', 'OtherAdultsInColony', 'OtherImmInColony',
       'Chicks/Nestlings', 'RoostingBirds', 'RoostingAdults',
       'RoostingImmatures', 'UnknownAge', 'Dotter', 'Dotter'sColonyNumber',
       'DateDotted', 'Notes', 'BestForBPE', 'ChicksNestlings',
       'AdditionalNotes', 'Subcolony', 'ChickNest', 'Territory', 'OtherBirds',
       'ColonyID', 'ActiveInventory', 'ColonyGroupBuffer', 'State',
       'Longitude_y', 'Latitude_y', 'PrimaryHabitat', 'LandForm', 'GeoRegion',
       'ExtrapArea', 'TerrestEcoRegion', 'MarineEcoRegion', 'FormerNames',
       'OrigDotterID', 'NOTES August 2022', 'HighResImage_new',
       'screenshot_new', 'thumbnail_new', 'total_nests', 'total_birds',
       'date2', 'month', 'uid'],
      dtype='object')

In [78]:
join_cols = [
    "Year",
    "State",
    "GeoRegion",
    "ColonyName",
    "SpeciesCode",
    "Longitude_y",
    "Latitude_y",
]
for c in join_cols:
    pd_species[c] = pd_species[c].astype("str").str.strip()
    joined_totals[c] = joined_totals[c].astype("str").str.strip()
# For 2021 REEG data is divided in subspecies on the access database, but reported as REEG in the summary.
pd_species.loc[
    (pd_species.Year == "2021") & pd_species.SpeciesCode.str.startswith("REEG "),
    "SpeciesCode",
] = "REEG"
agg_pd_species = (
    pd_species.loc[
        pd_species.BestForBPE == "Y",
        [
            "Year",
            "State",
            "GeoRegion",
            "ColonyName",
            "SpeciesCode",
            "Longitude_y",
            "Latitude_y",
            "total_nests",
            "total_birds",
        ],
    ]
    .groupby(
        [
            "Year",
            "State",
            "GeoRegion",
            "ColonyName",
            "SpeciesCode",
            "Longitude_y",
            "Latitude_y",
        ]
    )
    .sum()
)

In [79]:
agg_totals = joined_totals.groupby(
    [
        "Year",
        "State",
        "GeoRegion",
        "ColonyName",
        "SpeciesCode",
        "Longitude_y",
        "Latitude_y",
    ]
).sum()

In [80]:
joined_totals = agg_totals.join(agg_pd_species, on=join_cols, how="outer").reset_index()

In [81]:
joined_totals[joined_totals.total_nests.isna() & (joined_totals.Year == "2021")]

Unnamed: 0,Year,State,GeoRegion,ColonyName,SpeciesCode,Longitude_y,Latitude_y,Nests,Birds,total_nests,total_birds


In [82]:
joined_totals["diff_nests"] = joined_totals["Nests"].fillna(0) - joined_totals[
    "total_nests"
].fillna(0)
joined_totals["diff_birds"] = joined_totals["Birds"].fillna(0) - joined_totals[
    "total_birds"
].fillna(0)

In [83]:
joined_totals[
    ((joined_totals["diff_nests"] != 0) | (joined_totals["diff_birds"] != 0))
    & (joined_totals["SpeciesCode"] == "ROSA")
]

Unnamed: 0,Year,State,GeoRegion,ColonyName,SpeciesCode,Longitude_y,Latitude_y,Nests,Birds,total_nests,total_birds,diff_nests,diff_birds
871,2010,LA,Breton-Chandeleur Islands,Breton Island,ROSA,-89.1742,29.4955,8937.0,7858.0,5415.0,8896.0,3522.0,-1038.0
5965,2010,LA,Biloxi North,Martin Island,ROSA,-89.1984,29.959,,,107.0,154.0,-107.0,-154.0


In [84]:
joined_totals.to_excel("/mnt/z/joined_totals.xlsx")

In [85]:
pd_species.loc[
    (pd_species.Year == "2021") & pd_species.SpeciesCode.str.startswith("REEG "),
    "SpeciesCode",
]

Series([], Name: SpeciesCode, dtype: object)

# Check existing files

In [86]:
pd_species = pd.read_excel("avianmonitoring_2010-2021.xlsx")
pd_species = pd_species.drop(columns=["Latitude_x", "Longitude_x"]).drop_duplicates()

In [87]:
print(pd_species["thumbnail_new"][0])

avian_monitoring/thumbnails/2010/Biloxi South/Biloxi South 2/08May10Camera1-Card1-0012.png


In [88]:
def exists_key(key, replace=False):
    try:
        return len(list(avian_data.objects.filter(Prefix=key))) > 0
    except Exception as e:
        print(f"key:{key}, exception {e}")
        return False

In [89]:
# This is equivalent, but faster than: pd_species["thumbnail_new"] = pd_species["thumbnail_new"].progress_apply(exists_key)
with ThreadPoolExecutor(max_workers=128) as e:
    futures = {}
    _grouped = pd_species[
        filter(lambda x: "_new" in x, pd_species.columns)
    ].drop_duplicates()
    for f in _grouped.columns:
        futures[f] = list(
            tqdm(
                e.map(lambda y: exists_key(y), _grouped[f].tolist()),
                total=_grouped.shape[0],
            )
        )


  0%|          | 0/18305 [00:00<?, ?it/s]

key:nan, exception Parameter validation failed:
Invalid type for parameter Prefix, value: nan, type: <class 'float'>, valid types: <class 'str'>


  0%|          | 0/18305 [00:00<?, ?it/s]

key:nan, exception Parameter validation failed:
Invalid type for parameter Prefix, value: nan, type: <class 'float'>, valid types: <class 'str'>


  0%|          | 0/18305 [00:00<?, ?it/s]

key:nan, exception Parameter validation failed:
Invalid type for parameter Prefix, value: nan, type: <class 'float'>, valid types: <class 'str'>


In [90]:
for x in futures:
    with pd.option_context("display.max_colwidth", None):
        display(_grouped.loc[~np.array(futures[x]), x])

4618                      avian_monitoring/high_resolution_photos/2018/Biloxi South/Long Bay Island/23June18Camera2-Card1-0523.jpg
11186                    avian_monitoring/high_resolution_photos/2010/Biloxi North/Biloxi North 10 A/18May10Camera1-Card1-0261.jpg
11188                    avian_monitoring/high_resolution_photos/2010/Biloxi North/Biloxi North 10 A/18May10Camera1-Card1-0276.jpg
11189                    avian_monitoring/high_resolution_photos/2010/Biloxi North/Biloxi North 10 A/18May10Camera1-Card1-0265.jpg
11194                    avian_monitoring/high_resolution_photos/2010/Biloxi North/Biloxi North 10 A/18May10Camera1-Card1-0270.jpg
                                                                   ...                                                            
39008                       avian_monitoring/high_resolution_photos/2021/Biloxi North/Biloxi North 35/17June21Camera1-Card1-33.jpg
43651     avian_monitoring/high_resolution_photos/2018/Terrebonne Bay/Houma Navigat

6                           avian_monitoring/screenshots/2013/Biloxi South/Biloxi South 2/24May13Camera2-Card2-0732.jpg
8                           avian_monitoring/screenshots/2013/Biloxi South/Biloxi South 2/24May13Camera2-Card2-0734.jpg
10                          avian_monitoring/screenshots/2013/Biloxi South/Biloxi South 2/24May13Camera2-Card2-0737.jpg
12                          avian_monitoring/screenshots/2013/Biloxi South/Biloxi South 2/24May13Camera2-Card2-0739.jpg
14                          avian_monitoring/screenshots/2013/Biloxi South/Biloxi South 2/24May13Camera2-Card2-0742.jpg
                                                              ...                                                      
48349                 avian_monitoring/screenshots/2021/Birdsfoot West/Birdsfoot West 6 A/22May21Camera1-Card2-3285.jpg
48350                 avian_monitoring/screenshots/2021/Birdsfoot West/Birdsfoot West 6 A/22May21Camera1-Card2-3297.jpg
48522                  avian_monitoring/

4618                      avian_monitoring/thumbnails/2018/Biloxi South/Long Bay Island/23June18Camera2-Card1-0523.png
11186                    avian_monitoring/thumbnails/2010/Biloxi North/Biloxi North 10 A/18May10Camera1-Card1-0261.png
11188                    avian_monitoring/thumbnails/2010/Biloxi North/Biloxi North 10 A/18May10Camera1-Card1-0276.png
11189                    avian_monitoring/thumbnails/2010/Biloxi North/Biloxi North 10 A/18May10Camera1-Card1-0265.png
11194                    avian_monitoring/thumbnails/2010/Biloxi North/Biloxi North 10 A/18May10Camera1-Card1-0270.png
                                                             ...                                                      
39008                       avian_monitoring/thumbnails/2021/Biloxi North/Biloxi North 35/17June21Camera1-Card1-33.png
43651     avian_monitoring/thumbnails/2018/Terrebonne Bay/Houma Navigation Canal Island/25June18Camera2-Card1-3477.png
45585                avian_monitoring/thumbnails

In [91]:
x = "HighResImage_new"
pd_species[pd_species[x].isin(_grouped.loc[~np.array(futures[x]), x])]

Unnamed: 0,Year,Date,ColonyName,DottingAreaNumber,CameraNumber,CardNumber,PhotoNumber,PQ,SpeciesCode,WBN,...,OrigDotterID,NOTES August 2022,HighResImage_new,screenshot_new,thumbnail_new,total_nests,total_birds,date2,month,uid
4618,2018,06/23/18 00:00:00,Long Bay Island,7,2.0,1,523.0,E,LAGU,0,...,PJC14,,avian_monitoring/high_resolution_photos/2018/B...,avian_monitoring/screenshots/2018/Biloxi South...,avian_monitoring/thumbnails/2018/Biloxi South/...,0.0,2.0,2018-Jun-23,Jun,avian_monitoring/high_resolution_photos/2018/B...
4619,2018,06/23/18 00:00:00,Long Bay Island,7,2.0,1,523.0,E,ROYT,0,...,PJC14,,avian_monitoring/high_resolution_photos/2018/B...,avian_monitoring/screenshots/2018/Biloxi South...,avian_monitoring/thumbnails/2018/Biloxi South/...,0.0,139.0,2018-Jun-23,Jun,avian_monitoring/high_resolution_photos/2018/B...
4620,2018,06/23/18 00:00:00,Long Bay Island,7,2.0,1,523.0,E,BRPE,0,...,PJC14,,avian_monitoring/high_resolution_photos/2018/B...,avian_monitoring/screenshots/2018/Biloxi South...,avian_monitoring/thumbnails/2018/Biloxi South/...,0.0,24.0,2018-Jun-23,Jun,avian_monitoring/high_resolution_photos/2018/B...
4621,2018,06/23/18 00:00:00,Long Bay Island,7,2.0,1,523.0,E,AWPE,0,...,PJC14,,avian_monitoring/high_resolution_photos/2018/B...,avian_monitoring/screenshots/2018/Biloxi South...,avian_monitoring/thumbnails/2018/Biloxi South/...,0.0,6.0,2018-Jun-23,Jun,avian_monitoring/high_resolution_photos/2018/B...
11186,2010,05/18/10 00:00:00,Biloxi North 10 A,1,1.0,1,261.0,E,WHIB,0,...,SLF17,Name changed from BN_10 to BN_10 A. Dotted sep...,avian_monitoring/high_resolution_photos/2010/B...,avian_monitoring/screenshots/2010/Biloxi North...,avian_monitoring/thumbnails/2010/Biloxi North/...,8.0,20.0,2010-May-18,May,avian_monitoring/high_resolution_photos/2010/B...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45587,2021,06/20/21 00:00:00,Bahia Grande B,1,1.0,3,8288.0,E,BCNH,0,...,,,avian_monitoring/high_resolution_photos/2021/L...,avian_monitoring/screenshots/2021/Lower Laguna...,avian_monitoring/thumbnails/2021/Lower Laguna ...,1.0,1.0,2021-Jun-20,Jun,avian_monitoring/high_resolution_photos/2021/L...
45588,2021,06/20/21 00:00:00,Bahia Grande B,1,1.0,3,8288.0,E,LAGU,0,...,,,avian_monitoring/high_resolution_photos/2021/L...,avian_monitoring/screenshots/2021/Lower Laguna...,avian_monitoring/thumbnails/2021/Lower Laguna ...,0.0,23.0,2021-Jun-20,Jun,avian_monitoring/high_resolution_photos/2021/L...
47605,2021,06/18/21 00:00:00,South Deer Island,19,1.0,2,265.0,E,LAGU,0,...,,,avian_monitoring/high_resolution_photos/2021/G...,avian_monitoring/screenshots/2021/Galveston/So...,avian_monitoring/thumbnails/2021/Galveston/Sou...,5.0,5.0,2021-Jun-18,Jun,avian_monitoring/high_resolution_photos/2021/G...
48729,2021,05/17/21 00:00:00,Naval Air Station Islands O,1,2.0,3,1328.0,E,LAGU,0,...,,,avian_monitoring/high_resolution_photos/2021/U...,avian_monitoring/screenshots/2021/Upper Laguna...,avian_monitoring/thumbnails/2021/Upper Laguna ...,256.0,374.0,2021-May-17,May,avian_monitoring/high_resolution_photos/2021/U...


In [92]:
for x in futures:
    print(x, np.sum(~np.array(futures[x])))
    with pd.option_context("display.max_colwidth", None):
        _no_exists = pd_species[
            pd_species[x].isin(_grouped.loc[~np.array(futures[x]), x])
        ]
        display(_no_exists)
        display(_no_exists[["Year", x]].drop_duplicates().groupby(["Year"]).count())
        pd_species.loc[
            pd_species[x].isin(_grouped.loc[~np.array(futures[x]), x]), x
        ] = None
#   with pd.option_context('display.max_colwidth', None):
#       display(pd_species.loc[~np.array(futures[x]), x].drop_duplicates())
#       pd_species.loc[~np.array(futures[x]), x].drop_duplicates().to_csv(f"not_found_{x}.csv", index=False)

#   display(pd_species.loc[~np.array(futures[x]),[x,"Year"]].drop_duplicates().groupby("Year").count())
#   pd_species.loc[~np.array(futures[x]), x] = None

HighResImage_new 190


Unnamed: 0,Year,Date,ColonyName,DottingAreaNumber,CameraNumber,CardNumber,PhotoNumber,PQ,SpeciesCode,WBN,...,OrigDotterID,NOTES August 2022,HighResImage_new,screenshot_new,thumbnail_new,total_nests,total_birds,date2,month,uid
4618,2018,06/23/18 00:00:00,Long Bay Island,7,2.0,1,523.0,E,LAGU,0,...,PJC14,,avian_monitoring/high_resolution_photos/2018/Biloxi South/Long Bay Island/23June18Camera2-Card1-0523.jpg,avian_monitoring/screenshots/2018/Biloxi South/Long Bay Island/23June18Camera2-Card1-0523.jpg,avian_monitoring/thumbnails/2018/Biloxi South/Long Bay Island/23June18Camera2-Card1-0523.png,0.0,2.0,2018-Jun-23,Jun,avian_monitoring/high_resolution_photos/2018/Biloxi South/Long Bay Island/23June18Camera2-Card1-0523.jpg#LAGU
4619,2018,06/23/18 00:00:00,Long Bay Island,7,2.0,1,523.0,E,ROYT,0,...,PJC14,,avian_monitoring/high_resolution_photos/2018/Biloxi South/Long Bay Island/23June18Camera2-Card1-0523.jpg,avian_monitoring/screenshots/2018/Biloxi South/Long Bay Island/23June18Camera2-Card1-0523.jpg,avian_monitoring/thumbnails/2018/Biloxi South/Long Bay Island/23June18Camera2-Card1-0523.png,0.0,139.0,2018-Jun-23,Jun,avian_monitoring/high_resolution_photos/2018/Biloxi South/Long Bay Island/23June18Camera2-Card1-0523.jpg#ROYT
4620,2018,06/23/18 00:00:00,Long Bay Island,7,2.0,1,523.0,E,BRPE,0,...,PJC14,,avian_monitoring/high_resolution_photos/2018/Biloxi South/Long Bay Island/23June18Camera2-Card1-0523.jpg,avian_monitoring/screenshots/2018/Biloxi South/Long Bay Island/23June18Camera2-Card1-0523.jpg,avian_monitoring/thumbnails/2018/Biloxi South/Long Bay Island/23June18Camera2-Card1-0523.png,0.0,24.0,2018-Jun-23,Jun,avian_monitoring/high_resolution_photos/2018/Biloxi South/Long Bay Island/23June18Camera2-Card1-0523.jpg#BRPE
4621,2018,06/23/18 00:00:00,Long Bay Island,7,2.0,1,523.0,E,AWPE,0,...,PJC14,,avian_monitoring/high_resolution_photos/2018/Biloxi South/Long Bay Island/23June18Camera2-Card1-0523.jpg,avian_monitoring/screenshots/2018/Biloxi South/Long Bay Island/23June18Camera2-Card1-0523.jpg,avian_monitoring/thumbnails/2018/Biloxi South/Long Bay Island/23June18Camera2-Card1-0523.png,0.0,6.0,2018-Jun-23,Jun,avian_monitoring/high_resolution_photos/2018/Biloxi South/Long Bay Island/23June18Camera2-Card1-0523.jpg#AWPE
11186,2010,05/18/10 00:00:00,Biloxi North 10 A,1,1.0,1,261.0,E,WHIB,0,...,SLF17,Name changed from BN_10 to BN_10 A. Dotted separately but all island fragments originally BN_10; letter designations assigned with island fragmentation and colony location changes.,avian_monitoring/high_resolution_photos/2010/Biloxi North/Biloxi North 10 A/18May10Camera1-Card1-0261.jpg,avian_monitoring/screenshots/2010/Biloxi North/Biloxi North 10 A/18May10Camera1-Card1-0261.jpg,avian_monitoring/thumbnails/2010/Biloxi North/Biloxi North 10 A/18May10Camera1-Card1-0261.png,8.0,20.0,2010-May-18,May,avian_monitoring/high_resolution_photos/2010/Biloxi North/Biloxi North 10 A/18May10Camera1-Card1-0261.jpg#WHIB
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45587,2021,06/20/21 00:00:00,Bahia Grande B,1,1.0,3,8288.0,E,BCNH,0,...,,,avian_monitoring/high_resolution_photos/2021/Lower Laguna Madre/Bahia Grande B/20June21Camera1-Card3-8288.jpg,avian_monitoring/screenshots/2021/Lower Laguna Madre/Bahia Grande B/20June21Camera1-Card3-8288.jpg,avian_monitoring/thumbnails/2021/Lower Laguna Madre/Bahia Grande B/20June21Camera1-Card3-8288.png,1.0,1.0,2021-Jun-20,Jun,avian_monitoring/high_resolution_photos/2021/Lower Laguna Madre/Bahia Grande B/20June21Camera1-Card3-8288.jpg#BCNH
45588,2021,06/20/21 00:00:00,Bahia Grande B,1,1.0,3,8288.0,E,LAGU,0,...,,,avian_monitoring/high_resolution_photos/2021/Lower Laguna Madre/Bahia Grande B/20June21Camera1-Card3-8288.jpg,avian_monitoring/screenshots/2021/Lower Laguna Madre/Bahia Grande B/20June21Camera1-Card3-8288.jpg,avian_monitoring/thumbnails/2021/Lower Laguna Madre/Bahia Grande B/20June21Camera1-Card3-8288.png,0.0,23.0,2021-Jun-20,Jun,avian_monitoring/high_resolution_photos/2021/Lower Laguna Madre/Bahia Grande B/20June21Camera1-Card3-8288.jpg#LAGU
47605,2021,06/18/21 00:00:00,South Deer Island,19,1.0,2,265.0,E,LAGU,0,...,,,avian_monitoring/high_resolution_photos/2021/Galveston/South Deer Island/18June21Camera1-Card2-265.jpg,avian_monitoring/screenshots/2021/Galveston/South Deer Island/18June21Camera1-Card2-265.jpg,avian_monitoring/thumbnails/2021/Galveston/South Deer Island/18June21Camera1-Card2-265.png,5.0,5.0,2021-Jun-18,Jun,avian_monitoring/high_resolution_photos/2021/Galveston/South Deer Island/18June21Camera1-Card2-265.jpg#LAGU
48729,2021,05/17/21 00:00:00,Naval Air Station Islands O,1,2.0,3,1328.0,E,LAGU,0,...,,,avian_monitoring/high_resolution_photos/2021/Upper Laguna Madre/Naval Air Station Islands O/17May21Camera2-Card3-1328.jpg,avian_monitoring/screenshots/2021/Upper Laguna Madre/Naval Air Station Islands O/17May21Camera2-Card3-1328.jpg,avian_monitoring/thumbnails/2021/Upper Laguna Madre/Naval Air Station Islands O/17May21Camera2-Card3-1328.png,256.0,374.0,2021-May-17,May,avian_monitoring/high_resolution_photos/2021/Upper Laguna Madre/Naval Air Station Islands O/17May21Camera2-Card3-1328.jpg#LAGU


Unnamed: 0_level_0,HighResImage_new
Year,Unnamed: 1_level_1
2010,139
2011,12
2012,3
2013,2
2018,12
2021,21


screenshot_new 3053


Unnamed: 0,Year,Date,ColonyName,DottingAreaNumber,CameraNumber,CardNumber,PhotoNumber,PQ,SpeciesCode,WBN,...,OrigDotterID,NOTES August 2022,HighResImage_new,screenshot_new,thumbnail_new,total_nests,total_birds,date2,month,uid
6,2013,05/24/13 00:00:00,Biloxi South 2,1,2.0,2,732.0,E,LAGU,0,...,PJC1,,avian_monitoring/high_resolution_photos/2013/Biloxi South/Biloxi South 2/24May13Camera2-Card2-0732.jpg,avian_monitoring/screenshots/2013/Biloxi South/Biloxi South 2/24May13Camera2-Card2-0732.jpg,avian_monitoring/thumbnails/2013/Biloxi South/Biloxi South 2/24May13Camera2-Card2-0732.png,22.0,37.0,2013-May-24,May,avian_monitoring/high_resolution_photos/2013/Biloxi South/Biloxi South 2/24May13Camera2-Card2-0732.jpg#LAGU
7,2013,05/24/13 00:00:00,Biloxi South 2,1,2.0,2,732.0,G,FOTE,0,...,PJC1,,avian_monitoring/high_resolution_photos/2013/Biloxi South/Biloxi South 2/24May13Camera2-Card2-0732.jpg,avian_monitoring/screenshots/2013/Biloxi South/Biloxi South 2/24May13Camera2-Card2-0732.jpg,avian_monitoring/thumbnails/2013/Biloxi South/Biloxi South 2/24May13Camera2-Card2-0732.png,13.0,43.0,2013-May-24,May,avian_monitoring/high_resolution_photos/2013/Biloxi South/Biloxi South 2/24May13Camera2-Card2-0732.jpg#FOTE
8,2013,05/24/13 00:00:00,Biloxi South 2,2,2.0,2,734.0,E,LAGU,0,...,PJC1,,avian_monitoring/high_resolution_photos/2013/Biloxi South/Biloxi South 2/24May13Camera2-Card2-0734.jpg,avian_monitoring/screenshots/2013/Biloxi South/Biloxi South 2/24May13Camera2-Card2-0734.jpg,avian_monitoring/thumbnails/2013/Biloxi South/Biloxi South 2/24May13Camera2-Card2-0734.png,33.0,39.0,2013-May-24,May,avian_monitoring/high_resolution_photos/2013/Biloxi South/Biloxi South 2/24May13Camera2-Card2-0734.jpg#LAGU
9,2013,05/24/13 00:00:00,Biloxi South 2,2,2.0,2,734.0,G,FOTE,0,...,PJC1,,avian_monitoring/high_resolution_photos/2013/Biloxi South/Biloxi South 2/24May13Camera2-Card2-0734.jpg,avian_monitoring/screenshots/2013/Biloxi South/Biloxi South 2/24May13Camera2-Card2-0734.jpg,avian_monitoring/thumbnails/2013/Biloxi South/Biloxi South 2/24May13Camera2-Card2-0734.png,29.0,51.0,2013-May-24,May,avian_monitoring/high_resolution_photos/2013/Biloxi South/Biloxi South 2/24May13Camera2-Card2-0734.jpg#FOTE
10,2013,05/24/13 00:00:00,Biloxi South 2,3,2.0,2,737.0,E,LAGU,0,...,PJC1,,avian_monitoring/high_resolution_photos/2013/Biloxi South/Biloxi South 2/24May13Camera2-Card2-0737.jpg,avian_monitoring/screenshots/2013/Biloxi South/Biloxi South 2/24May13Camera2-Card2-0737.jpg,avian_monitoring/thumbnails/2013/Biloxi South/Biloxi South 2/24May13Camera2-Card2-0737.png,8.0,10.0,2013-May-24,May,avian_monitoring/high_resolution_photos/2013/Biloxi South/Biloxi South 2/24May13Camera2-Card2-0737.jpg#LAGU
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48528,2021,06/20/21 00:00:00,Green Island,26,2.0,3,6289.0,E,LBHE,0,...,,,avian_monitoring/high_resolution_photos/2021/Lower Laguna Madre/Green Island/20June21Camera2-Card3-6289.jpg,avian_monitoring/screenshots/2021/Lower Laguna Madre/Green Island/20June21Camera2-Card3-6289.jpg,avian_monitoring/thumbnails/2021/Lower Laguna Madre/Green Island/20June21Camera2-Card3-6289.png,0.0,1.0,2021-Jun-20,Jun,avian_monitoring/high_resolution_photos/2021/Lower Laguna Madre/Green Island/20June21Camera2-Card3-6289.jpg#LBHE
48529,2021,06/20/21 00:00:00,Green Island,26,2.0,3,6289.0,E,REEG WM,0,...,,,avian_monitoring/high_resolution_photos/2021/Lower Laguna Madre/Green Island/20June21Camera2-Card3-6289.jpg,avian_monitoring/screenshots/2021/Lower Laguna Madre/Green Island/20June21Camera2-Card3-6289.jpg,avian_monitoring/thumbnails/2021/Lower Laguna Madre/Green Island/20June21Camera2-Card3-6289.png,3.0,2.0,2021-Jun-20,Jun,avian_monitoring/high_resolution_photos/2021/Lower Laguna Madre/Green Island/20June21Camera2-Card3-6289.jpg#REEG WM
48729,2021,05/17/21 00:00:00,Naval Air Station Islands O,1,2.0,3,1328.0,E,LAGU,0,...,,,,avian_monitoring/screenshots/2021/Upper Laguna Madre/Naval Air Station Islands O/17May21Camera2-Card3-1328.jpg,avian_monitoring/thumbnails/2021/Upper Laguna Madre/Naval Air Station Islands O/17May21Camera2-Card3-1328.png,256.0,374.0,2021-May-17,May,avian_monitoring/high_resolution_photos/2021/Upper Laguna Madre/Naval Air Station Islands O/17May21Camera2-Card3-1328.jpg#LAGU
48730,2021,05/17/21 00:00:00,Naval Air Station Islands O,1,2.0,3,1328.0,E,RUTU,0,...,,,,avian_monitoring/screenshots/2021/Upper Laguna Madre/Naval Air Station Islands O/17May21Camera2-Card3-1328.jpg,avian_monitoring/thumbnails/2021/Upper Laguna Madre/Naval Air Station Islands O/17May21Camera2-Card3-1328.png,0.0,1.0,2021-May-17,May,avian_monitoring/high_resolution_photos/2021/Upper Laguna Madre/Naval Air Station Islands O/17May21Camera2-Card3-1328.jpg#RUTU


Unnamed: 0_level_0,screenshot_new
Year,Unnamed: 1_level_1
2010,259
2011,408
2012,259
2013,1862
2015,3
2018,36
2021,225


thumbnail_new 189


Unnamed: 0,Year,Date,ColonyName,DottingAreaNumber,CameraNumber,CardNumber,PhotoNumber,PQ,SpeciesCode,WBN,...,OrigDotterID,NOTES August 2022,HighResImage_new,screenshot_new,thumbnail_new,total_nests,total_birds,date2,month,uid
4618,2018,06/23/18 00:00:00,Long Bay Island,7,2.0,1,523.0,E,LAGU,0,...,PJC14,,,,avian_monitoring/thumbnails/2018/Biloxi South/Long Bay Island/23June18Camera2-Card1-0523.png,0.0,2.0,2018-Jun-23,Jun,avian_monitoring/high_resolution_photos/2018/Biloxi South/Long Bay Island/23June18Camera2-Card1-0523.jpg#LAGU
4619,2018,06/23/18 00:00:00,Long Bay Island,7,2.0,1,523.0,E,ROYT,0,...,PJC14,,,,avian_monitoring/thumbnails/2018/Biloxi South/Long Bay Island/23June18Camera2-Card1-0523.png,0.0,139.0,2018-Jun-23,Jun,avian_monitoring/high_resolution_photos/2018/Biloxi South/Long Bay Island/23June18Camera2-Card1-0523.jpg#ROYT
4620,2018,06/23/18 00:00:00,Long Bay Island,7,2.0,1,523.0,E,BRPE,0,...,PJC14,,,,avian_monitoring/thumbnails/2018/Biloxi South/Long Bay Island/23June18Camera2-Card1-0523.png,0.0,24.0,2018-Jun-23,Jun,avian_monitoring/high_resolution_photos/2018/Biloxi South/Long Bay Island/23June18Camera2-Card1-0523.jpg#BRPE
4621,2018,06/23/18 00:00:00,Long Bay Island,7,2.0,1,523.0,E,AWPE,0,...,PJC14,,,,avian_monitoring/thumbnails/2018/Biloxi South/Long Bay Island/23June18Camera2-Card1-0523.png,0.0,6.0,2018-Jun-23,Jun,avian_monitoring/high_resolution_photos/2018/Biloxi South/Long Bay Island/23June18Camera2-Card1-0523.jpg#AWPE
11186,2010,05/18/10 00:00:00,Biloxi North 10 A,1,1.0,1,261.0,E,WHIB,0,...,SLF17,Name changed from BN_10 to BN_10 A. Dotted separately but all island fragments originally BN_10; letter designations assigned with island fragmentation and colony location changes.,,,avian_monitoring/thumbnails/2010/Biloxi North/Biloxi North 10 A/18May10Camera1-Card1-0261.png,8.0,20.0,2010-May-18,May,avian_monitoring/high_resolution_photos/2010/Biloxi North/Biloxi North 10 A/18May10Camera1-Card1-0261.jpg#WHIB
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
45587,2021,06/20/21 00:00:00,Bahia Grande B,1,1.0,3,8288.0,E,BCNH,0,...,,,,,avian_monitoring/thumbnails/2021/Lower Laguna Madre/Bahia Grande B/20June21Camera1-Card3-8288.png,1.0,1.0,2021-Jun-20,Jun,avian_monitoring/high_resolution_photos/2021/Lower Laguna Madre/Bahia Grande B/20June21Camera1-Card3-8288.jpg#BCNH
45588,2021,06/20/21 00:00:00,Bahia Grande B,1,1.0,3,8288.0,E,LAGU,0,...,,,,,avian_monitoring/thumbnails/2021/Lower Laguna Madre/Bahia Grande B/20June21Camera1-Card3-8288.png,0.0,23.0,2021-Jun-20,Jun,avian_monitoring/high_resolution_photos/2021/Lower Laguna Madre/Bahia Grande B/20June21Camera1-Card3-8288.jpg#LAGU
47605,2021,06/18/21 00:00:00,South Deer Island,19,1.0,2,265.0,E,LAGU,0,...,,,,,avian_monitoring/thumbnails/2021/Galveston/South Deer Island/18June21Camera1-Card2-265.png,5.0,5.0,2021-Jun-18,Jun,avian_monitoring/high_resolution_photos/2021/Galveston/South Deer Island/18June21Camera1-Card2-265.jpg#LAGU
48729,2021,05/17/21 00:00:00,Naval Air Station Islands O,1,2.0,3,1328.0,E,LAGU,0,...,,,,,avian_monitoring/thumbnails/2021/Upper Laguna Madre/Naval Air Station Islands O/17May21Camera2-Card3-1328.png,256.0,374.0,2021-May-17,May,avian_monitoring/high_resolution_photos/2021/Upper Laguna Madre/Naval Air Station Islands O/17May21Camera2-Card3-1328.jpg#LAGU


Unnamed: 0_level_0,thumbnail_new
Year,Unnamed: 1_level_1
2010,139
2011,12
2012,3
2013,2
2018,12
2021,20


In [93]:
pd_species.to_csv("avianmonitoring_2010-2021_Nulls.csv.gz", index=False)

In [94]:
pd_species.to_excel("avianmonitoring_2010-2021_Nulls.xlsx", index=False)

### Delete inconsistent photos
In some previous processing it seems some photos have been incorrectly created. This cell allows to display photos with high resolution images but without thumbnail, that will indicate an annomalus situation. 

In [95]:
with pd.option_context("display.max_colwidth", None):
    to_delete = pd_species.loc[
        pd_species["thumbnail_new"].isna() & ~pd_species["HighResImage_new"].isna(),
        futures.keys(),
    ].drop_duplicates()
    display(to_delete)
    # to_delete["HighResImage_new"].apply(lambda x:avian_data.Object(key=x).delete())

Unnamed: 0,HighResImage_new,screenshot_new,thumbnail_new
