In [None]:
!pip install tqdm

In [1]:
# Bounding box handed to PIL's Image.thumbnail() when thumbnails are generated.
thumbnail_size = (518,345)
# Both flags must be True for the thumbnail mime-type refresh cell
# (`if replace and create_thumbnails:`) to run.
create_thumbnails = False
replace = False
# Root key prefix under which the reorganised high-resolution photo, screenshot
# and thumbnail keys are built.
_base_folder = 'avian_monitoring'

In [15]:
from PIL import Image, ExifTags
import os
import re
import json
import pandas as pd
import geopandas as gpd
from pathlib import Path
import platform
import pandas_access
from datetime import datetime
import pyodbc

# Pillow has methods to read EXIF data; however, in the tests we ran
# it returned an empty dict for images that do contain EXIF data
import piexif
import fiona
import numpy as np
from multiprocessing import Pool
from functools import partial
import sqlite3
import tempfile
from pandas.io.json import json_normalize
import geopandas as gp
import numpy as np
import boto3
import re
from tqdm.notebook import tqdm
from dateutil.parser import parse
from concurrent.futures import ThreadPoolExecutor,as_completed
tqdm.pandas()

boto3.setup_default_session(profile_name='GLO')

In [3]:
def clean_date(text):
  """Parse a free-form date string and re-emit it as ``YYYY-Mon-DD``.

  The input is handed to ``dateutil.parser.parse``; the result is formatted
  with ``%Y-%b-%d`` (four-digit year, abbreviated month name, zero-padded
  day). Later cells split the returned string on ``-`` to obtain the month
  and day parts.
  """
  return parse(text).strftime('%Y-%b-%d')

# Unified 2010-2021 Data

In [4]:
# Notes: In order to use all the fields, I did a rename of the columns containing '?' or '/'
#acc_db = "/mnt/c/Users/carizaporras/Downloads/Colibri_tblsSpeciesData2010-2021_18sept22.accdb"
acc_db = "/mnt/z/Colibri2010-2021CWBColonies_12Nov2022_working_copy.accdb"
schema = pandas_access.read_schema(acc_db) 

In [5]:
ct_name = "tblRWCWB_ColonyInventory_10Nov22"
#colonies_table = schema.pop("tblRWCWB_ColonyInventory_13Sept2022")
colonies_table =  schema.pop(ct_name)

There are three tables with slightly different schemas

In [6]:
schema

{'tblColonySiteNotes2010': {'ID': 'Long Integer',
  'Latitude': 'Text (100)',
  'Longitude': 'Text (100)',
  'Dotter': 'Text (100)',
  'ColonyName': 'Text (100)',
  'Habitat': 'Text (100)',
  'Oil': 'Text (2)',
  'Notes': 'Text (510)'},
 'tblSpeciesCodes': {'SpeciesCode': 'Text (100) NOT NULL'},
 'tblSpeciesData2010': {'AutoID': 'Double',
  'Year': 'Double',
  'Date': 'DateTime',
  'ColonyName': 'Text (510)',
  'Latitude': 'Text (510)',
  'Longitude': 'Text (510)',
  'DottingAreaNumber': 'Text (510)',
  'CameraNumber': 'Text (510)',
  'CardNumber': 'Text (510)',
  'PhotoNumber': 'Text (510)',
  'PQ': 'Text (510)',
  'SpeciesCode': 'Text (510)',
  'WBN': 'Double',
  'ChickNestwithoutAdult': 'Double',
  'AbandNest': 'Double',
  'EmptyNest': 'Double',
  'PBN': 'Double',
  'Site': 'Double',
  'Brood': 'Double',
  'OtherAdultsInColony': 'Double',
  'OtherImmInColony': 'Double',
  'RoostingBirds': 'Double',
  'RoostingAdults': 'Double',
  'RoostingImmatures': 'Double',
  'UnknownAge': 'Doubl

Taking the 2015 to 2021 table as a reference, let's compare the schemas.

Fields in the reference but not in the `tblSpeciesData2011_2013` table

In [7]:
schema["tblSpeciesData2015_2018_2021"].keys() - schema["tblSpeciesData2011_2013"].keys()

{'ChickNest', 'OtherBirds', 'Subcolony', 'Territory'}

Fields in the `tblSpeciesData2011_2013` table but not in the reference

In [8]:
 schema["tblSpeciesData2011_2013"].keys() - schema["tblSpeciesData2015_2018_2021"].keys() 

{'ChicksNestlings',
 'OtherAdultsInColony',
 'OtherImmInColony',
 'RoostingAdults',
 'RoostingBirds',
 'RoostingImmatures',
 'UnknownAge'}

Fields in the reference but not in the 2010 table

In [9]:
schema["tblSpeciesData2015_2018_2021"].keys() - schema["tblSpeciesData2010"].keys()

{'BestForBPE', 'ChickNest', 'Notes', 'OtherBirds', 'Subcolony', 'Territory'}

Fields in the 2010 table but not in the reference

In [10]:
schema["tblSpeciesData2010"].keys() - schema["tblSpeciesData2015_2018_2021"].keys() 

{'Latitude',
 'Longitude',
 'OtherAdultsInColony',
 'OtherImmInColony',
 'RoostingAdults',
 'RoostingBirds',
 'RoostingImmatures',
 'UnknownAge'}

That also means that we have different formulas to calculate the totals. The `BestForBPE` field is used as a filter in 2013 to 2021, but it is not used in 2010 (the 2010 table has no such column).

### Formulas

|                | 2010                                                                                                                                                               | 2011-2013                                                                                                                                                          | 2015-2021                                                                                                 |
|----------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------|-----------------------------------------------------------------------------------------------------------|
| Nests          | sum(<br>[WBN]<br>+[ChickNestw/outAdult]<br>+[AbandNest]<br>+[EmptyNest]<br>+[PBN]<br>+[Site]<br>+[Brood])                                                          | Sum(<br>[WBN]<br>+[ChickNestw/outAdult]<br>+[AbandNest]<br>+[EmptyNest]<br>+[PBN]<br>+[Site]<br>+[Brood])                                                          | Sum(<br>[WBN]<br>+[ChickNest]<br>+[ChickNestw/outAdult]<br>+[AbandNest]<br>+[PBN]<br>+[Site]<br>+[Brood]) |
| Birds          | sum([WBN]<br>+[PBN]<br>+[Site]<br>+[OtherAdultsInColony]<br>+[OtherImmInColony]<br>+[RoostingBirds]<br>+[RoostingAdults]<br>+[RoostingImmatures]<br>+[UnknownAge]) | Sum([WBN]<br>+[PBN]<br>+[Site]<br>+[OtherAdultsInColony]<br>+[OtherImmInColony]<br>+[RoostingBirds]<br>+[RoostingAdults]<br>+[RoostingImmatures]<br>+[UnknownAge]) | Sum(<br>[WBN]<br>+[ChickNest]<br>+[PBN]<br>+[Territory]<br>+[Site]<br>+[OtherBirds])                      |
| SumOfEmptyNest | sum(EmptyNest)                                                                                                                                                     | EmptyNest                                                                                                                                                          | EmptyNest                                                                                                 |


But 2015 to 2021 has additional categories: 

```SQL

Sum(WBN) AS SumOfWBN

Sum(ChickNest) AS SumOfChickNest

Sum([ChickNestw/outAdult]) AS [SumOfChickNestw/outAdult] 

Sum(Brood) AS SumOfBrood

Sum(AbandNest) AS SumOfAbandNest

Sum(PBN) AS SumOfPBN

Sum(Territory) AS SumOfTerritory
```

## Generating a common dataset:
- merge 2010 species with the colonies. 
- select only the common columns 

In [11]:
# Column-name sets of the three yearly species tables, plus the columns shared
# by the 2011-2013 and 2015-2021 tables.
cols_2015_2021, cols_2011_2013, cols_2010 = (
    set(schema[table_name])
    for table_name in (
        "tblSpeciesData2015_2018_2021",
        "tblSpeciesData2011_2013",
        "tblSpeciesData2010",
    )
)
common_fields = cols_2011_2013 & cols_2015_2021

In [12]:
cols_2010 - common_fields

{'Latitude',
 'Longitude',
 'OtherAdultsInColony',
 'OtherImmInColony',
 'RoostingAdults',
 'RoostingBirds',
 'RoostingImmatures',
 'UnknownAge'}

In [13]:
common_fields - cols_2010

{'BestForBPE', 'Notes'}

In [14]:
pd_species_2010 = pandas_access.read_table(acc_db, "tblSpeciesData2010")

In [15]:
pd_species_2010['Notes'] = ""
pd_species_2010["BestForBPE"] = "N"

In [16]:
pd_species_2010 = pd_species_2010#[common_fields]

In [17]:
pd_species_2011_2013 = pandas_access.read_table(acc_db, "tblSpeciesData2011_2013")#[common_fields]
pd_species_2015_2021 = pandas_access.read_table(acc_db, "tblSpeciesData2015_2018_2021")#[common_fields]

In [18]:
pd_species = pd.concat([pd_species_2010, pd_species_2011_2013, pd_species_2015_2021], ignore_index=True)

In [19]:
pd_species = pd_species.drop(columns="AutoID")

In [20]:
pd_species["Year"] = pd_species["Year"].astype(int).astype(str) 

In [21]:
pd_colonies = pandas_access.read_table(acc_db, ct_name)
pd_colonies["ColonyName"] = pd_colonies["ColonyName"].astype(str)

In [22]:
pd_species = pd.merge(pd_species, pd_colonies, on="ColonyName")

In [23]:
def _media_key(frame, base_folder, subfolder, extension):
    """Build the reorganised S3 key for every row of ``frame``.

    The key has the shape
    ``<base_folder>/<subfolder>/<Year>/<GeoRegion>/<ColonyName>/<ddMonthyy>Camera<n>-Card<n>-<photo><extension>``.

    :param frame: DataFrame with the columns Year, GeoRegion, ColonyName, Date,
        CameraNumber, CardNumber and PhotoNumber.
    :param base_folder: root prefix of the reorganised tree.
    :param subfolder: folder below the root (e.g. ``"thumbnails"``).
    :param extension: file extension including the leading dot.
    :return: Series of keys aligned with ``frame``.
    """
    # The Date column is parsed with a fixed month/day/two-digit-year format
    # and rendered as e.g. "10June10".
    capture_day = pd.to_datetime(
        frame["Date"].astype('str'), format="%m/%d/%y %H:%M:%S"
    ).dt.strftime("%d%B%y")
    # Rows without a card number are filed under card "1".
    card = frame["CardNumber"].where(~frame["CardNumber"].isnull(), other="1")
    return (
        f"{base_folder}/{subfolder}/"
        + frame['Year'].astype(str) + '/'
        + frame['GeoRegion'] + '/'
        + frame['ColonyName'] + '/'
        + capture_day
        + 'Camera' + frame["CameraNumber"]
        + '-' + 'Card' + card
        + "-" + frame["PhotoNumber"]
        + extension
    )


pd_species["HighResImage_new"] = _media_key(pd_species, _base_folder, "high_resolution_photos", ".jpg")
pd_species["screenshot_new"] = _media_key(pd_species, _base_folder, "screenshots", ".jpg")
pd_species["thumbnail_new"] = _media_key(pd_species, _base_folder, "thumbnails", ".png")

In [24]:
def _running_total(frame, columns):
    """Add the named columns of ``frame`` left to right and return the Series."""
    total = frame[columns[0]]
    for column in columns[1:]:
        total = total + frame[column]
    return total


# The 2010-2013 tables use one set of count columns, later years another.
_is_early_survey = pd_species["Year"].isin(["2010","2011","2012","2013"])

pd_species["total_nests"] = np.where(
    _is_early_survey,
    _running_total(pd_species, ["WBN", "ChickNestwithoutAdult", "AbandNest", "EmptyNest", "PBN", "Site", "Brood"]),
    _running_total(pd_species, ["WBN", "ChickNestwithoutAdult", "AbandNest", "ChickNest", "PBN", "Site", "Brood"]),
)
pd_species["total_birds"] = np.where(
    _is_early_survey,
    _running_total(
        pd_species,
        ["WBN", "PBN", "Site", "OtherAdultsInColony", "OtherImmInColony",
         "RoostingBirds", "RoostingAdults", "RoostingImmatures", "UnknownAge"],
    ),
    _running_total(pd_species, ["WBN", "ChickNest", "PBN", "Territory", "Site", "OtherBirds"]),
)

In [25]:
pd_species['date2']=pd_species['Date'].astype('str').apply(clean_date)
pd_species['month'] = pd_species["date2"].apply(lambda x: x.split('-')[1])

agg_2010 = pd_species[pd_species.Year=="2010"].groupby(["Year","month","ColonyName","SpeciesCode"]).agg({"total_birds":"sum"}).reset_index()


In [26]:
selected = agg_2010.sort_values("total_birds").drop_duplicates(["Year","ColonyName","SpeciesCode"], keep="last")[["Year","month","ColonyName","SpeciesCode"]].reset_index(drop=True)

In [27]:
selected[selected.SpeciesCode == "ROSA"]

Unnamed: 0,Year,month,ColonyName,SpeciesCode
25,2010,Jun,North Deer Island,ROSA
64,2010,May,Little Mud Grass Island Northeast,ROSA
592,2010,May,Biloxi North 32,ROSA
848,2010,May,Biloxi North 15,ROSA
977,2010,Jun,West Bay Bird Island New,ROSA
1076,2010,May,Louisiana West 1,ROSA
1228,2010,May,Horn Island,ROSA
1260,2010,Jun,Martin Island,ROSA
1321,2010,Jun,Saint George Causeway,ROSA
1351,2010,Jun,Long Reef Deadman Islands B,ROSA


In [28]:
# `selected` holds, for each 2010 (Year, ColonyName, SpeciesCode), the month whose
# summed total_birds is the largest. Each row's (Year, month, ColonyName,
# SpeciesCode) tuple is rendered as a string so it can be tested with `isin`
# against the same rendering of `selected`; matching rows get BestForBPE = 'Y'.
pd_species.loc[pd_species[["Year","month","ColonyName","SpeciesCode"]].apply(lambda row: str([x for x in row]), axis =1).isin(selected.apply(lambda row: str([x for x in row]), axis =1)),"BestForBPE"] = 'Y'

In [29]:
# Row identifier: target image key + "#" + species code, with "N/A" standing in
# for a missing species code. (The previous expression appended the boolean
# result of isna(), i.e. the text "False", instead of the code itself.)
pd_species["uid"] = pd_species["HighResImage_new"] + "#" + pd_species["SpeciesCode"].fillna("N/A")

In [30]:
pd_species.to_csv("avianmonitoring_2010-2021.csv.gz", index=False)

In [31]:
pd_species.to_excel("avianmonitoring_2010-2021.xlsx", index=False)

# Reorganize Files

In [4]:
bucket_name  = "twi-avian-data"
starting_folder = "HighResolutionImages"
new_folder_hr = "avian_monitoring_"
aws_s3 = boto3.resource(
        "s3",
   )
avian_data = aws_s3.Bucket(bucket_name)

In [None]:
# List every object under `starting_folder` and keep those whose lower-cased key
# has a "/20xx/" path segment (x in 10-29) followed somewhere by ".jpg"/".jpeg"-like
# text or "tiff". NOTE: the "." in front of "tiff" is an unescaped regex wildcard,
# and the pattern is not anchored at the end of the key.
files = [o for o in avian_data.objects.filter(Prefix=starting_folder) if re.match(r".*/20[1-2][0-9]/.*(\.jp(.{0,1})g|.tiff)",o.key.lower())]

In [None]:
image_groups = re.compile(r'HighResolutionImages/(?P<year>\d{4}).*[/ ,]+(?P<date>\d+\s*[A-Z]+\s*\d{2,4}).*(Camera|Cam)\s*(?P<camera>\d+)[ /-]*(Card\s*(?P<card>\d+)){0,1}[\s-]*((\w+/))*(IMGP){0,1}(?P<photo>\d+)\.(?P<extension>jp.?g|tiff?)', flags=re.IGNORECASE)

In [None]:
m = image_groups.match("HighResolutionImages/2010/June 2010/10 June 2010/10 June 2010 Camera 1 Card 1/10 June 2010 Camera 1 Card 1 010.JPG")

In [None]:
# Split the listed objects into keys the `image_groups` pattern can parse
# (stored with their named groups) and keys it cannot (reported later).
df_files = []
no_files = []
# Iterating tqdm over the list itself lets the bar show a total.
for o in tqdm(files):
    parsed = image_groups.match(o.key)  # match once, reuse the result
    if parsed:
        df_files.append({"key": o.key, "object": o, **parsed.groupdict()})
    else:
        no_files.append(o.key)

In [None]:
# Drop unparsed keys that contain the text "numbering off" from the report.
no_files = [unparsed_key for unparsed_key in no_files if "numbering off" not in unparsed_key]

In [None]:
with open("n_files_report.txt","w") as n_files_report: 
    n_files_report.write("\n".join(no_files))

In [None]:

fdf = pd.DataFrame(df_files) 

In [None]:
fdf["extension"] = fdf["extension"].str.lower()

In [None]:

fdf['date2'] = fdf['date'].astype('str').apply(clean_date)
fdf['month'] = fdf["date2"].apply(lambda x: x.split('-')[1])
fdf['day'] = fdf["date2"].apply(lambda x: x.split('-')[2])

In [None]:
pd_species['date2']=pd_species['Date'].astype('str').apply(clean_date)
pd_species['month'] = pd_species["date2"].apply(lambda x: x.split('-')[1])
pd_species['day'] = pd_species["date2"].apply(lambda x: x.split('-')[2])

In [None]:
pd_species[~pd_species["HighResImage_new"].isna()].to_csv("avianData20102021.csv.gz", index=False)

In [None]:
pd_species[~pd_species["HighResImage_new"].isna()]

In [None]:
fdf = fdf.rename(columns={"camera":"CameraNumber","card":"CardNumber", "photo":"PhotoNumber", "year":"Year" })

In [None]:
fdf["PhotoNumber"] = fdf["PhotoNumber"].str.rjust(5,'0') 
pd_species["PhotoNumber"] = pd_species["PhotoNumber"].str.rjust(5,'0')

In [None]:
# Columns used to match database rows to listed image files. Both sides are
# cast to stripped strings so the merge compares like with like.
join_cols = ['CameraNumber','CardNumber','PhotoNumber','Year', 'month','day']
for c in join_cols:
    fdf[c] = fdf[c].astype('str').str.strip()
    pd_species[c] = pd_species[c].astype('str').str.strip()
c = 'CardNumber'
# A missing card number means card '1'. After the string cast a missing value
# shows up as 'None', 'nan' or '<NA>' depending on the dtype, so all spellings
# are handled, and on BOTH frames: the target paths built from pd_species
# already use "Card1" for rows without a card number.
missing_card = ['None', 'nan', '<NA>']
fdf.loc[fdf[c].isin(missing_card), c] = '1'
pd_species.loc[pd_species[c].isin(missing_card), c] = '1'

In [None]:
merged = pd_species.merge(fdf, on=join_cols, how="left")

In [None]:
pd_species["PhotoNumber"]

In [None]:
m_grouped =merged[["HighResImage_new","key","CameraNumber","CardNumber","PhotoNumber","Date","Year","month", "thumbnail_new"]].groupby(["HighResImage_new","key","CameraNumber","CardNumber","PhotoNumber","Date","Year","month","thumbnail_new"]).count().reset_index()

In [None]:
r = m_grouped.progress_apply(lambda x:avian_data.copy ({'Bucket': avian_data.name,'Key':x['key']}, x['HighResImage_new']), axis=1)

In [None]:
np.any(~r.isnull())

In [None]:
m_grouped

In [None]:
m_grouped.to_excel("UsedPhotos2010-2021.xlsx", index=False)

In [None]:
m_grouped = pd.read_excel("UsedPhotos2010-2021.xlsx")

In [54]:
from PIL import Image, ExifTags
from io import BytesIO
from concurrent.futures import ThreadPoolExecutor,as_completed
from functools import partial

In [None]:
def generate_thumbnail(high_res_key, thumb_key, regenerate=False):
    """Create a PNG thumbnail of one bucket object and upload it.

    :param high_res_key: key of the source image in the ``avian_data`` bucket.
    :param thumb_key: key under which the thumbnail is stored.
    :param regenerate: when False, an already existing thumbnail is kept.
    :return: True if the thumbnail exists after the call, False if the source
        image was not found.
    """
    def _find_exact(key):
        # A prefix listing also returns longer keys that merely start with
        # `key`, so only an exact key match counts as "present".
        for candidate in avian_data.objects.filter(Prefix=key):
            if candidate.key == key:
                return candidate
        return None

    if not regenerate and _find_exact(thumb_key) is not None:
        return True
    source = _find_exact(high_res_key)
    if source is None:
        print(f"there is no {high_res_key}")
        return False
    buffer = BytesIO()
    # The context manager closes the decoded image once the thumbnail is encoded.
    with Image.open(BytesIO(source.get()["Body"].read())) as image:
        image.thumbnail(thumbnail_size)  # resizes in place, keeps aspect ratio
        image.save(buffer, format="png")
    buffer.seek(0)
    avian_data.put_object(Key=thumb_key, Body=buffer,ContentType="image/png")
    return True

In [None]:
with ThreadPoolExecutor(max_workers=16) as e:
    futures = list(tqdm(e.map(partial(generate_thumbnail, regenerate=True), m_grouped["HighResImage_new"].tolist(), m_grouped["thumbnail_new"].tolist()), total=m_grouped.shape[0]))

In [None]:
m_grouped.shape

# Previous version code
```python
#acc_db = "/mnt/c/Users/carizaporras/Downloads/Colibri_tblSpeciesData2015_2018_2021_2Sept2022 (1).accdb"
#schema = pandas_access.read_schema(acc_db)
#pd_species = pandas_access.read_table(acc_db, "tblSpeciesData2015_2018_2021")
#pd_species["ColonyName"] = pd_species["ColonyName"].astype(str)
#pd_colonies = pandas_access.read_table(acc_db, "tblRWCWB_ColonyInventory_2022")
#pd_colonies["ColonyName"] = pd_colonies["ColonyName"].astype(str)
#pd_species = pd.merge(pd_species, pd_colonies, on="ColonyName")
#pd_species["HighResImage_new"] = f"{_base_folder}/high_resolution_photos/"+pd_species['Year'].astype(str)+'/'+pd_species['GeoRegion']+'/'+pd_species['ColonyName']+'/'+pd.to_datetime(pd_species["Date"].astype('str'), format="%m/%d/%y %H:%M:%S").dt.strftime("%d%B%y")+'Camera'+pd_species["CameraNumber"]+'-'+'Card'+(pd_species["CardNumber"].where(~pd_species["CardNumber"].isnull(), other="1"))+"-"+pd_species["PhotoNumber"]+".jpg"
#pd_species["screenshot_new"] = f'{_base_folder}/screenshots/'+pd_species['Year'].astype(str)+'/'+pd_species['GeoRegion']+'/'+pd_species['ColonyName']+'/'+pd.to_datetime(pd_species["Date"].astype('str'), format="%m/%d/%y %H:%M:%S").dt.strftime("%d%B%y")+'Camera'+pd_species["CameraNumber"]+'-'+'Card'+(pd_species["CardNumber"].where(~pd_species["CardNumber"].isnull(), other="1"))+"-"+pd_species["PhotoNumber"]+".jpg"
#pd_species["thumbnail_new"] =  f'{_base_folder}/thumbnails/'+pd_species['Year'].astype(str)+'/'+pd_species['GeoRegion']+'/'+pd_species['ColonyName']+'/'+pd.to_datetime(pd_species["Date"].astype('str'), format="%m/%d/%y %H:%M:%S").dt.strftime("%d%B%y")+'Camera'+pd_species["CameraNumber"]+'-'+'Card'+(pd_species["CardNumber"].where(~pd_species["CardNumber"].isnull(), other="1"))+"-"+pd_species["PhotoNumber"]+".png"
#gdf = gp.GeoDataFrame(
#    pd_species, geometry=gp.points_from_xy(pd_species["Longitude"], pd_species["Latitude"]))
#set_index(["State","GeoRegion","ColonyName","Year", "Latitude", "Longitude", "Date", "SpeciesCode"])
#pd_species = pd_species.drop(columns=["AutoID","Subcolony"])
#gdf["huc"] = "TBD"

def _convert_to_degress(value):
    """
    Helper function to convert the GPS coordinates stored in the EXIF to degrees in float format
    Borrowed from: https://gist.github.com/snakeye/fdc372dbf11370fe29eb
    Modified to receive a tuple instead of an exifread.utils.Ratio
    :param value:
    :type value: tuple
    :rtype: float
    """
    d = float(value[0][0]) / float(value[0][1])
    m = float(value[1][0]) / float(value[1][1])
    s = float(value[2][0]) / float(value[2][1])

    return d + (m / 60.0) + (s / 3600.0)
```

# UPDATE Mime types 

In [None]:
def update_mime_type(t, mime="image/png"):
    """Copy object ``t`` onto itself so that its Content-Type becomes ``mime``.

    The object's current user metadata is read and written back unchanged;
    ``MetadataDirective="REPLACE"`` is what makes S3 accept the new content type.
    """
    target = avian_data.Object(t)
    copy_source = {'Bucket':avian_data.name, 'Key':t}
    target.copy_from(
        CopySource=copy_source,
        Metadata=target.metadata,
        MetadataDirective="REPLACE",
        ContentType=mime,
    )
    
def copy_wnew_mime_type(t, k, mime="image/png"):
    """Copy object ``t`` to key ``k`` in the same bucket with Content-Type ``mime``.

    The source object's user metadata is carried over to the copy.
    """
    # The source handle is needed to read the existing metadata.
    s3_object = avian_data.Object(t)
    # Bucket.copy() takes per-object settings through ExtraArgs, not as keywords.
    avian_data.copy(
        {'Bucket':avian_data.name, 'Key':t},
        k,
        ExtraArgs={
            "Metadata": s3_object.metadata,
            "MetadataDirective": "REPLACE",
            "ContentType": mime,
        },
    )
    

In [None]:
if replace and create_thumbnails:
    m_grouped["thumbnail_new"].progress_apply(lambda t:update_mime_type(t))

In [None]:
with ThreadPoolExecutor(max_workers=16) as e:
    futures = list(tqdm(e.map(lambda t:update_mime_type(t, mime="image/jpeg"),m_grouped["HighResImage_new"].tolist()), total=m_grouped.shape[0]))

# Reorganize/rename screenshots

In [5]:
m_grouped = pd.read_excel("UsedPhotos2010-2021.xlsx")
#to_rename = pd.read_excel("toRename2.xlsx")
to_rename = pd.read_excel("/mnt/c/Users/carizaporras/Downloads/Renaming_Final.xlsx")

In [6]:
if not "Original_Path" in to_rename:
    to_rename["Original_Path"] = to_rename["Path"]
    

In [7]:
to_rename = to_rename[["HighResImage_new","Original_Path"]]

In [8]:
# Keep only rows where both the target image key and the original screenshot
# path are present (the earlier filter tested "Original_Path" twice).
to_rename = to_rename.dropna(subset=["HighResImage_new", "Original_Path"])

In [9]:
to_rename["Original_Path"] = to_rename["Original_Path"].str.replace("/to82sp","DottedImages")

In [10]:
to_rename = to_rename.merge(m_grouped, on="HighResImage_new")

In [11]:
to_rename["screenshot_new"] = to_rename["HighResImage_new"].str.replace("/high_resolution_photos/","/screenshots/")

In [12]:
to_rename[["screenshot_new", "Original_Path"]] 

Unnamed: 0,screenshot_new,Original_Path
0,avian_monitoring/screenshots/2010/Apalachee Ba...,DottedImages/2010-2013 Dotted Images/2010/KMR ...
1,avian_monitoring/screenshots/2010/Apalachee Ba...,DottedImages/2010-2013 Dotted Images/2010/KMR ...
2,avian_monitoring/screenshots/2010/Apalachee Ba...,DottedImages/2010-2013 Dotted Images/2010/KMR ...
3,avian_monitoring/screenshots/2010/Apalachee Ba...,DottedImages/2010-2013 Dotted Images/2010/KMR ...
4,avian_monitoring/screenshots/2010/Apalachee Ba...,DottedImages/2010-2013 Dotted Images/2010/KMR ...
...,...,...
15664,avian_monitoring/screenshots/2021/Vermilion Ba...,DottedImages/Task 2 2021 Waterbird Colony Phot...
15665,avian_monitoring/screenshots/2021/Vermilion Ba...,DottedImages/Task 2 2021 Waterbird Colony Phot...
15666,avian_monitoring/screenshots/2021/Vermilion Ba...,DottedImages/Task 2 2021 Waterbird Colony Phot...
15667,avian_monitoring/screenshots/2021/Vermilion Ba...,DottedImages/Task 2 2021 Waterbird Colony Phot...


In [13]:
def rename(key, new_name, replace=False):
    """Copy bucket object ``key`` to ``new_name`` inside the ``avian_data`` bucket.

    :param key: source key.
    :param new_name: destination key.
    :param replace: when False, an already existing destination is left untouched.

    If the copy of ``key`` fails, the same key with its ``.JPG`` / ``.jpg``
    extension case flipped is tried. Failures are printed, not raised.
    """
    def _exists(candidate):
        # A prefix listing can return longer keys too; require an exact match.
        return any(o.key == candidate for o in avian_data.objects.filter(Prefix=candidate))

    # The guard must look at the DESTINATION: checking the source (as before)
    # skipped every file that was actually available to copy.
    if not replace and _exists(new_name):
        return

    try:
        avian_data.copy({'Bucket': avian_data.name,'Key':key},new_name)
        return
    except Exception as e:
        print(e)

    # Retry with the extension case flipped, lower-case variant first.
    for alternative in (key.replace(".JPG",".jpg"), key.replace(".jpg",".JPG")):
        # Skip variants identical to the key that has just failed.
        if alternative != key and _exists(alternative):
            avian_data.copy({'Bucket': avian_data.name,'Key':alternative},new_name)
            return
    print(f"{key} not found")

In [16]:
# Copy every original screenshot to its new key, 16 copies in flight at a time.
with ThreadPoolExecutor(max_workers=16) as e:
    copy_jobs = e.map(
        rename,
        to_rename["Original_Path"].tolist(),
        to_rename["screenshot_new"].tolist(),
    )
    futures = list(tqdm(copy_jobs, total=to_rename.shape[0]))

  0%|          | 0/15669 [00:00<?, ?it/s]

An error occurred (404) when calling the HeadObject operation: Not Found
DottedImages/2010-2013 Dotted Images/2010/KKN 2010 Screen Captures/7May10KKN005-AREA07.JPG not found
An error occurred (404) when calling the HeadObject operation: Not Found
An error occurred (404) when calling the HeadObject operation: Not Found
DottedImages/2010-2013 Dotted Images/2011-2012ScreenCaptures_1October2013/KMR_ScreenCaptures/PJC16_17May11/17May11PJC16Area90.JPG not found
An error occurred (404) when calling the HeadObject operation: Not Found
An error occurred (404) when calling the HeadObject operation: Not Found
An error occurred (404) when calling the HeadObject operation: Not Found
An error occurred (404) when calling the HeadObject operation: Not Found
An error occurred (404) when calling the HeadObject operation: Not FoundAn error occurred (404) when calling the HeadObject operation: Not Found
An error occurred (404) when calling the HeadObject operation: Not Found

An error occurred (404) when 

DottedImages/2010-2013 Dotted Images/2011-2012ScreenCaptures_1October2013/MWP_ScreenCaptures/2011/May 2011/Gaillard Island/20 May 2011/20May2011 Gaillard island MWP8 Area115.JPG not found
DottedImages/2010-2013 Dotted Images/2011-2012ScreenCaptures_1October2013/MWP_ScreenCaptures/2011/May 2011/Gaillard Island/20 May 2011/20May2011 Gaillard island MWP8 Area122.JPG not found
DottedImages/2010-2013 Dotted Images/2011-2012ScreenCaptures_1October2013/MWP_ScreenCaptures/2011/May 2011/Gaillard Island/20 May 2011/20May2011 Gaillard island MWP8 Area116.JPG not foundAn error occurred (404) when calling the HeadObject operation: Not Found

An error occurred (404) when calling the HeadObject operation: Not Found
DottedImages/2010-2013 Dotted Images/2011-2012ScreenCaptures_1October2013/MWP_ScreenCaptures/2011/May 2011/Gaillard Island/20 May 2011/20May2011 Gaillard island MWP8 Area124.JPG not found
DottedImages/2010-2013 Dotted Images/2011-2012ScreenCaptures_1October2013/MWP_ScreenCaptures/2011/May 

An error occurred (404) when calling the HeadObject operation: Not Found
An error occurred (404) when calling the HeadObject operation: Not Found
An error occurred (404) when calling the HeadObject operation: Not Found
An error occurred (404) when calling the HeadObject operation: Not Found
An error occurred (404) when calling the HeadObject operation: Not Found
An error occurred (404) when calling the HeadObject operation: Not Found
An error occurred (404) when calling the HeadObject operation: Not Found
DottedImages/2010-2013 Dotted Images/2011-2012ScreenCaptures_1October2013/MWP_ScreenCaptures/2011/May 2011/Gaillard Island/20 May 2011/20May2011 Gaillard island MWP8 Area174.JPG not found
DottedImages/2010-2013 Dotted Images/2011-2012ScreenCaptures_1October2013/MWP_ScreenCaptures/2011/May 2011/Gaillard Island/20 May 2011/20May2011 Gaillard island MWP8 Area193.JPG not found
An error occurred (404) when calling the HeadObject operation: Not Found
An error occurred (404) when calling the

An error occurred (404) when calling the HeadObject operation: Not Found
An error occurred (404) when calling the HeadObject operation: Not Found
An error occurred (404) when calling the HeadObject operation: Not Found
DottedImages/2018 LA Waterbird Colony Photo Analysis/Group 2 Deliverables/Chandeleur South/24June18_ChandeleurSouth_SubcolonyD_Area4.jpg not found
An error occurred (404) when calling the HeadObject operation: Not Found
An error occurred (404) when calling the HeadObject operation: Not Found
An error occurred (404) when calling the HeadObject operation: Not Found
An error occurred (404) when calling the HeadObject operation: Not Found
An error occurred (404) when calling the HeadObject operation: Not Found
An error occurred (404) when calling the HeadObject operation: Not Found
An error occurred (404) when calling the HeadObject operation: Not Found
An error occurred (404) when calling the HeadObject operation: Not Found
An error occurred (404) when calling the HeadObjec

In [None]:
r = m_grouped.progress_apply(lambda x:avian_data.copy ({'Bucket': avian_data.name,'Key':x['key']}, x['HighResImage_new']), axis=1)

# Join summary excel with locations from colonies

In [None]:
pd_species = pd.read_excel("avianmonitoring_2010-2021.xlsx")

In [None]:
totals = pd.read_excel("/mnt/z/Colibri2010-21ColonyTotalsMayJuneCombined_8Nov22.xlsx")

In [None]:
totals

In [None]:
unique_colonies=pd_species[["Year","State","GeoRegion","ColonyName", "Longitude_y","Latitude_y"]].drop_duplicates().reset_index(drop=True)

In [None]:
unique_colonies.merge

In [None]:
join_cols =["Year", "State","GeoRegion", "ColonyName"]
for c in join_cols:
    totals[c] = totals[c].astype('str').str.strip()
    unique_colonies[c] = unique_colonies[c].astype('str').str.strip()

In [None]:
joined_totals = pd.merge(totals,unique_colonies, on=join_cols )

In [None]:
joined_totals

In [None]:
joined_totals.to_excel("joined_totals.xlsx", index=False)

# Compare totals from totals and pd_species

In [32]:
joined_totals= pd.read_excel("joined_totals.xlsx")
pd_species.columns

Index(['Year', 'Date', 'ColonyName', 'Latitude_x', 'Longitude_x',
       'DottingAreaNumber', 'CameraNumber', 'CardNumber', 'PhotoNumber', 'PQ',
       'SpeciesCode', 'WBN', 'ChickNestwithoutAdult', 'AbandNest', 'EmptyNest',
       'PBN', 'Site', 'Brood', 'OtherAdultsInColony', 'OtherImmInColony',
       'Chicks/Nestlings', 'RoostingBirds', 'RoostingAdults',
       'RoostingImmatures', 'UnknownAge', 'Dotter', 'Dotter'sColonyNumber',
       'DateDotted', 'Notes', 'BestForBPE', 'ChicksNestlings',
       'AdditionalNotes', 'Subcolony', 'ChickNest', 'Territory', 'OtherBirds',
       'ColonyID', 'ActiveInventory', 'ColonyGroupBuffer', 'State',
       'Longitude_y', 'Latitude_y', 'PrimaryHabitat', 'LandForm', 'GeoRegion',
       'ExtrapArea', 'TerrestEcoRegion', 'MarineEcoRegion', 'FormerNames',
       'OrigDotterID', 'NOTES August 2022', 'HighResImage_new',
       'screenshot_new', 'thumbnail_new', 'total_nests', 'total_birds',
       'date2', 'month', 'uid'],
      dtype='object')

In [33]:
join_cols =["Year", "State","GeoRegion", "ColonyName", "SpeciesCode",'Longitude_y', 'Latitude_y']
for c in join_cols:
    pd_species[c]=pd_species[c].astype('str').str.strip()
    joined_totals[c]=joined_totals[c].astype('str').str.strip()
# For 2021 REEG data is divided in subspecies on the access database, but reported as REEG in the summary.
pd_species.loc[(pd_species.Year == "2021") & pd_species.SpeciesCode.str.startswith("REEG "), "SpeciesCode"] = "REEG"
agg_pd_species = pd_species.loc[pd_species.BestForBPE == 'Y', ["Year", "State","GeoRegion", "ColonyName", "SpeciesCode", 'Longitude_y', 'Latitude_y', 'total_nests', 'total_birds']].groupby(["Year", "State","GeoRegion", "ColonyName", "SpeciesCode",'Longitude_y', 'Latitude_y']).sum()

In [34]:
# Sum only the two reported count columns. A bare .sum() over every remaining
# column depends on how the installed pandas treats non-numeric columns.
agg_totals = joined_totals.groupby(["Year", "State","GeoRegion", "ColonyName", "SpeciesCode",'Longitude_y', 'Latitude_y'])[["Nests", "Birds"]].sum()

In [36]:
joined_totals = agg_totals.join(agg_pd_species, on=join_cols, how="outer").reset_index()

In [37]:
joined_totals[joined_totals.total_nests.isna() & (joined_totals.Year == "2021")]

Unnamed: 0,Year,State,GeoRegion,ColonyName,SpeciesCode,Longitude_y,Latitude_y,Nests,Birds,total_nests,total_birds


In [38]:
joined_totals["diff_nests"] = joined_totals["Nests"].fillna(0) -joined_totals["total_nests"].fillna(0)
joined_totals["diff_birds"] = joined_totals["Birds"].fillna(0) -joined_totals["total_birds"].fillna(0)

In [42]:
joined_totals[((joined_totals["diff_nests"] !=0) | (joined_totals["diff_birds"] !=0)) & (joined_totals["SpeciesCode"] == "ROSA")]

Unnamed: 0,Year,State,GeoRegion,ColonyName,SpeciesCode,Longitude_y,Latitude_y,Nests,Birds,total_nests,total_birds,diff_nests,diff_birds
871,2010,LA,Breton-Chandeleur Islands,Breton Island,ROSA,-89.1742,29.4955,8937.0,7858.0,5415.0,8896.0,3522.0,-1038.0
5965,2010,LA,Biloxi North,Martin Island,ROSA,-89.1984,29.959,,,107.0,154.0,-107.0,-154.0


In [40]:
joined_totals.to_excel("/mnt/z/joined_totals.xlsx")

In [None]:
pd_species.loc[(pd_species.Year == "2021") & pd_species.SpeciesCode.str.startswith("REEG "), "SpeciesCode"]