In [1]:
!pip install tqdm



In [1]:
# Maximum (width, height) passed to PIL's Image.thumbnail() when thumbnails are generated.
thumbnail_size = (518, 345)
# NOTE(review): these two flags are not referenced by any visible cell — confirm they are still needed.
create_thumbnails = True
replace = True
# Root prefix of the new S3 keys composed for photos, screenshots and thumbnails.
_base_folder = "avian_monitoring"

In [2]:
from PIL import Image, ExifTags
import os
import re
import json
import pandas as pd
import geopandas as gpd
from pathlib import Path
import platform
import pandas_access
from datetime import datetime
import pyodbc

# Pillow has methods to read EXIF; however, in the tests we ran,
# it returned an empty dict for images that do contain EXIF data.
import piexif
import fiona
import numpy as np
from multiprocessing import Pool
from functools import partial
import sqlite3
import tempfile
from pandas.io.json import json_normalize
import geopandas as gp
import numpy as np
import boto3
import re
from tqdm.notebook import tqdm
tqdm.pandas()

boto3.setup_default_session(profile_name='GLO')

# Unified 2010-2021 Data

In [4]:
# Note: in order to use all the fields, the columns containing '?' or '/' were renamed.
# The database location can be overridden with the AVIAN_ACCDB_PATH environment variable so
# the notebook is not tied to a single machine; the original path remains the default.
acc_db = os.environ.get(
    "AVIAN_ACCDB_PATH",
    "/mnt/c/Users/carizaporras/Downloads/Colibri_tblsSpeciesData2010-2021_18sept22.accdb",
)
schema = pandas_access.read_schema(acc_db)

In [5]:
# Take the colony-inventory table out of `schema` (kept in `colonies_table`) so that only the
# species tables remain in `schema`.
# NOTE(review): pop() mutates `schema`; re-running this cell without re-reading the schema
# presumably raises KeyError — confirm.
colonies_table = schema.pop("tblRWCWB_ColonyInventory_13Sept2022")

There are three tables with slightly different schemas:

In [6]:
schema

{'tblSpeciesData2015_2018_2021': {'AutoID': 'Long Integer',
  'Year': 'Long Integer',
  'Date': 'DateTime',
  'ColonyName': 'Text (510)',
  'Subcolony': 'Text (510)',
  'DottingAreaNumber': 'Text (6)',
  'CameraNumber': 'Text (2)',
  'CardNumber': 'Text (2)',
  'PhotoNumber': 'Text (10)',
  'PQ': 'Text (2)',
  'SpeciesCode': 'Text (100)',
  'WBN': 'Long Integer',
  'ChickNest': 'Long Integer',
  'ChickNestwithoutAdult': 'Long Integer',
  'Brood': 'Long Integer',
  'AbandNest': 'Long Integer',
  'EmptyNest': 'Long Integer',
  'PBN': 'Long Integer',
  'Territory': 'Long Integer',
  'Site': 'Long Integer',
  'OtherBirds': 'Long Integer',
  'Dotter': 'Text (6)',
  'DateDotted': 'DateTime',
  'BestForBPE': 'Text (100)',
  'Notes': 'Text (510)'},
 'tblSpeciesData2011_2013': {'AutoID': 'Double',
  'Year': 'Double',
  'Date': 'DateTime',
  'ColonyName': 'Text (510)',
  'DottingAreaNumber': 'Text (510)',
  'CameraNumber': 'Text (510)',
  'CardNumber': 'Text (510)',
  'PhotoNumber': 'Text (510)'

Taking the 2015 to 2021 table as a reference, let's compare the schemas.

Fields in the reference but not in the `tblSpeciesData2011_2013` table

In [7]:
schema["tblSpeciesData2015_2018_2021"].keys() - schema["tblSpeciesData2011_2013"].keys()

{'ChickNest', 'OtherBirds', 'Subcolony', 'Territory'}

Fields in the `tblSpeciesData2011_2013` table but not in the reference

In [8]:
 schema["tblSpeciesData2011_2013"].keys() - schema["tblSpeciesData2015_2018_2021"].keys() 

{'OtherAdultsInColony',
 'OtherImmInColony',
 'RoostingAdults',
 'RoostingBirds',
 'RoostingImmatures',
 'UnknownAge'}

Fields in the reference but not in the 2010 table

In [9]:
schema["tblSpeciesData2015_2018_2021"].keys() - schema["tblSpeciesData2010"].keys()

{'BestForBPE', 'ChickNest', 'Notes', 'OtherBirds', 'Subcolony', 'Territory'}

Fields in the 2010 table but not in the reference

In [10]:
schema["tblSpeciesData2010"].keys() - schema["tblSpeciesData2015_2018_2021"].keys() 

{'Latitude',
 'Longitude',
 'OtherAdultsInColony',
 'OtherImmInColony',
 'RoostingAdults',
 'RoostingBirds',
 'RoostingImmatures',
 'UnknownAge'}

That also means that we have different formulas to calculate the totals. The `BestForBPE` field is used as a filter for 2013 to 2021, but it is not used for 2010.

### Formulas

|                | 2010                                                                                                                                                               | 2011-2013                                                                                                                                                          | 2015-2021                                                                                                 |
|----------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------------------------------------------------------------------------------|-----------------------------------------------------------------------------------------------------------|
| Nests          | sum(<br>[WBN]<br>+[ChickNestw/outAdult]<br>+[AbandNest]<br>+[EmptyNest]<br>+[PBN]<br>+[Site]<br>+[Brood])                                                          | Sum(<br>[WBN]<br>+[ChickNestw/outAdult]<br>+[AbandNest]<br>+[EmptyNest]<br>+[PBN]<br>+[Site]<br>+[Brood])                                                          | Sum(<br>[WBN]<br>+[ChickNest]<br>+[ChickNestw/outAdult]<br>+[AbandNest]<br>+[PBN]<br>+[Site]<br>+[Brood]) |
| Birds          | sum([WBN]<br>+[PBN]<br>+[Site]<br>+[OtherAdultsInColony]<br>+[OtherImmInColony]<br>+[RoostingBirds]<br>+[RoostingAdults]<br>+[RoostingImmatures]<br>+[UnknownAge]) | Sum([WBN]<br>+[PBN]<br>+[Site]<br>+[OtherAdultsInColony]<br>+[OtherImmInColony]<br>+[RoostingBirds]<br>+[RoostingAdults]<br>+[RoostingImmatures]<br>+[UnknownAge]) | Sum(<br>[WBN]<br>+[ChickNest]<br>+[PBN]<br>+[Territory]<br>+[Site]<br>+[OtherBirds])                      |
| SumOfEmptyNest | sum(EmptyNest)                                                                                                                                                     | EmptyNest                                                                                                                                                          | EmptyNest                                                                                                 |


But 2015 to 2021 has additional categories: 

```SQL

Sum(WBN) AS SumOfWBN

Sum(ChickNest) AS SumOfChickNest

Sum([ChickNestw/outAdult]) AS [SumOfChickNestw/outAdult] 

Sum(Brood) AS SumOfBrood

Sum(AbandNest) AS SumOfAbandNest

Sum(PBN) AS SumOfPBN

Sum(Territory) AS SumOfTerritory
```

## Generating a common dataset:
- merge 2010 species with the colonies. 
- select only the common columns 

In [11]:
# Column names of each species table, taken from the schema dictionary.
cols_2015_2021, cols_2011_2013, cols_2010 = (
    set(schema[table_name].keys())
    for table_name in (
        "tblSpeciesData2015_2018_2021",
        "tblSpeciesData2011_2013",
        "tblSpeciesData2010",
    )
)
# Columns present in both the 2011-2013 and the 2015-2021 tables.
common_fields = cols_2011_2013 & cols_2015_2021

In [12]:
cols_2010 - common_fields

{'Latitude',
 'Longitude',
 'OtherAdultsInColony',
 'OtherImmInColony',
 'RoostingAdults',
 'RoostingBirds',
 'RoostingImmatures',
 'UnknownAge'}

In [13]:
common_fields - cols_2010

{'BestForBPE', 'Notes'}

In [14]:
pd_species_2010 = pandas_access.read_table(acc_db, "tblSpeciesData2010")

In [15]:
# The 2010 table has neither of these columns; add them so it lines up with the later tables.
# NOTE(review): "Y" presumably lets every 2010 row pass a BestForBPE filter — confirm.
pd_species_2010['Notes'] = ""
pd_species_2010["BestForBPE"] = "Y"

In [16]:
# Keep only the columns shared with the other species tables. `common_fields` is a set, which
# recent pandas versions do not accept as a column indexer and which has no stable order, so
# it is converted to a sorted list first.
pd_species_2010 = pd_species_2010[sorted(common_fields)]

In [17]:
# Load the remaining two species tables, keeping only the shared columns. The set is turned
# into a sorted list because recent pandas versions do not accept a set as a column indexer,
# and sorting gives the same column order on every run.
pd_species_2011_2013 = pandas_access.read_table(acc_db, "tblSpeciesData2011_2013")[sorted(common_fields)]
pd_species_2015_2021 = pandas_access.read_table(acc_db, "tblSpeciesData2015_2018_2021")[sorted(common_fields)]

In [18]:
pd_species = pd.concat([pd_species_2010, pd_species_2011_2013, pd_species_2015_2021])

In [19]:
pd_species = pd_species.drop(columns="AutoID")

In [20]:
# Year is stored as Double in the older tables, so it is cast to int before str; presumably
# this keeps the text as "2011" rather than "2011.0" in the paths and join keys built from it.
pd_species["Year"] = pd_species["Year"].astype(int).astype(str) 

In [21]:
pd_colonies = pandas_access.read_table(acc_db, "tblRWCWB_ColonyInventory_13Sept2022")
pd_colonies["ColonyName"] = pd_colonies["ColonyName"].astype(str)

In [22]:
# Attach the colony inventory columns to every species record, matching on ColonyName
# (the GeoRegion column used for the new paths presumably comes from the inventory).
# NOTE(review): pd.merge defaults to an inner join, so species rows whose ColonyName has no
# match in the inventory are dropped without warning — confirm that this is intended.
pd_species = pd.merge(pd_species, pd_colonies, on="ColonyName")

In [23]:
# The three keys share the same "<Year>/<GeoRegion>/<ColonyName>/<date>Camera<n>-Card<n>-<photo>"
# tail, so it is built once instead of repeating the whole expression for every column.
capture_date = pd.to_datetime(pd_species["Date"].astype('str'), format="%m/%d/%y %H:%M:%S").dt.strftime("%d%B%y")
# Records without a card number are filed under card "1".
card_number = pd_species["CardNumber"].where(~pd_species["CardNumber"].isnull(), other="1")
photo_stem = (
    pd_species['Year'].astype(str) + '/' + pd_species['GeoRegion'] + '/' + pd_species['ColonyName'] + '/'
    + capture_date + 'Camera' + pd_species["CameraNumber"] + '-' + 'Card' + card_number
    + "-" + pd_species["PhotoNumber"]
)
pd_species["HighResImage_new"] = f"{_base_folder}/high_resolution_photos/" + photo_stem + ".jpg"
pd_species["screenshot_new"] = f'{_base_folder}/screenshots/' + photo_stem + ".jpg"
pd_species["thumbnail_new"] = f'{_base_folder}/thumbnails/' + photo_stem + ".png"

In [None]:
pd_species.to_csv("avianmonitoring_2010-2021.csv.gz")

# Reorganize Files

In [24]:
# Bucket holding the imagery and the prefix under which the original photos are stored.
bucket_name = "twi-avian-data"
starting_folder = "HighResolutionImages"
# NOTE(review): `new_folder_hr` is not referenced by any visible cell — confirm it is needed.
new_folder_hr = "avian_monitoring_"
# Resource-level S3 handle and the Bucket object used by the cells that list, copy and upload.
aws_s3 = boto3.resource("s3")
avian_data = aws_s3.Bucket(bucket_name)

In [None]:
# Keep only the image objects (jpg/jpeg/tiff) stored under a year folder (2010-2029).
# The dot before "tiff" is escaped so that it matches a literal "." — unescaped, it matched
# any character, so keys merely ending in e.g. "xtiff" were accepted as images.
files = [
    o
    for o in avian_data.objects.filter(Prefix=starting_folder)
    if re.match(r".*/20[1-2][0-9]/.*(\.jp(.{0,1})g|\.tiff)", o.key.lower())
]

In [None]:
# Pattern that pulls year, date, camera, card, photo number and extension out of an original
# S3 key. The pieces below are adjacent raw strings, so they concatenate into one pattern.
image_groups = re.compile(
    r'HighResolutionImages/(?P<year>\d{4})'           # year folder right after the root prefix
    r'.*[/ ,]+(?P<date>\d+\s*[A-Z]+\s*\d{2,4})'       # day, month name, 2-4 digit year
    r'.*(Camera|Cam)\s*(?P<camera>\d+)'               # camera number
    r'[ /-]*(Card\s*(?P<card>\d+)){0,1}'              # optional card number
    r'[\s-]*((\w+/))*(IMGP){0,1}'                     # optional sub-folders and "IMGP" prefix
    r'(?P<photo>\d+)'                                 # photo number
    r'\.(?P<extension>jp.?g|tiff?)',                  # file extension
    flags=re.IGNORECASE,
)

In [None]:
m = image_groups.match("HighResolutionImages/2010/June 2010/10 June 2010/10 June 2010 Camera 1 Card 1/10 June 2010 Camera 1 Card 1 010.JPG")

In [None]:
# Split the listed objects into those whose key could be parsed (df_files, one dict per
# object holding the key, the object and the captured groups) and those whose key does not
# follow the expected naming (no_files, keys only).
df_files = []
no_files = []
for o in tqdm(files):
    # Match once and reuse the result instead of running the regex twice per key.
    key_match = image_groups.match(o.key)
    if key_match:
        df_files.append({"key": o.key, "object": o, **key_match.groupdict()})
    else:
        no_files.append(o.key)

In [None]:
# Leave the keys containing "numbering off" out of the list of unparsed keys.
no_files = [key for key in no_files if "numbering off" not in key]

In [None]:
# Write the keys that did not match the naming pattern to a text report, one key per line.
with open("n_files_report.txt","w") as n_files_report: 
    n_files_report.write("\n".join(no_files))

In [None]:

fdf = pd.DataFrame(df_files) 

In [None]:
fdf["extension"] = fdf["extension"].str.lower()

In [None]:
from dateutil.parser import parse

In [None]:
def clean_date(text):
    """Re-format a free-form date string as ``<year>-<abbreviated month>-<day>``.

    The text is parsed with ``dateutil.parser.parse`` and rendered with the
    ``'%Y-%b-%d'`` format, so the three parts can later be split on ``'-'``.

    :param text: date string to parse.
    :return: the formatted date string.
    """
    parsed = parse(text)
    return parsed.strftime('%Y-%b-%d')
# Normalise the date captured from each key, then split it into the month and day join keys.
fdf['date2'] = fdf['date'].astype('str').apply(clean_date)
for part_name, part_index in (('month', 1), ('day', 2)):
    fdf[part_name] = fdf["date2"].apply(lambda stamp, part_index=part_index: stamp.split('-')[part_index])

In [None]:
# Same normalisation for the species records, so both frames carry comparable month/day keys.
pd_species['date2'] = pd_species['Date'].astype('str').apply(clean_date)
for part_name, part_index in (('month', 1), ('day', 2)):
    pd_species[part_name] = pd_species["date2"].apply(lambda stamp, part_index=part_index: stamp.split('-')[part_index])

In [None]:
pd_species.to_csv("avianData20102021.csv.gz")

In [None]:
fdf = fdf.rename(columns={"camera":"CameraNumber","card":"CardNumber", "photo":"PhotoNumber", "year":"Year" })

In [None]:
# Left-pad the photo numbers with zeros to 5 characters in both frames, so that values such
# as "010" and "10" become the same text ("00010") before they are used as a join key.
fdf["PhotoNumber"] = fdf["PhotoNumber"].str.rjust(5,'0') 
pd_species["PhotoNumber"] = pd_species["PhotoNumber"].str.rjust(5,'0')

In [None]:
# Columns shared by the species records and the parsed S3 keys; both sides are converted to
# stripped text so that equal values compare equal in the merge.
join_cols = ['CameraNumber','CardNumber','PhotoNumber','Year', 'month','day']
for col in join_cols:
    fdf[col] = fdf[col].astype('str').str.strip()
    pd_species[col] = pd_species[col].astype('str').str.strip()
# A missing card number means card "1" (the same default used when the new paths are built).
# astype('str') turns a missing value into the text 'None' or 'nan', so both spellings are
# replaced, and on BOTH frames — otherwise species rows without a card never find their photo.
for frame in (fdf, pd_species):
    frame.loc[frame['CardNumber'].isin(['None', 'nan']), 'CardNumber'] = '1'

In [None]:
merged = pd_species.merge(fdf, on=join_cols, how="left")

In [None]:
pd_species["PhotoNumber"]

In [None]:
# One row per distinct combination of the columns below: grouping by every selected column
# and counting leaves only the group keys, which reset_index() turns back into columns.
# NOTE(review): groupby presumably drops rows with a missing value in any of these columns
# (e.g. species records without a matching S3 key) — confirm.
used_photo_cols = [
    "HighResImage_new", "key", "CameraNumber", "CardNumber", "PhotoNumber",
    "Date", "Year", "month", "thumbnail_new",
]
m_grouped = merged[used_photo_cols].groupby(used_photo_cols).count().reset_index()

In [None]:
# Copy every matched source object ('key') to its new key ('HighResImage_new') inside the same
# bucket, one S3 copy per row; progress_apply (enabled by tqdm.pandas()) shows a progress bar.
# `r` holds whatever each copy call returned.
r = m_grouped.progress_apply(lambda x:avian_data.copy ({'Bucket': avian_data.name,'Key':x['key']}, x['HighResImage_new']), axis=1)

In [None]:
np.any(~r.isnull())

In [None]:
m_grouped

In [None]:
m_grouped.to_excel("UsedPhotos2010-2021.xlsx", index=False)

In [25]:
m_grouped = pd.read_excel("UsedPhotos2010-2021.xlsx")

In [26]:
from PIL import Image, ExifTags
from io import BytesIO
from concurrent.futures import ThreadPoolExecutor,as_completed
from functools import partial

In [27]:
def generate_thumbnail(high_res_key, thumb_key, regenerate=False):
    """Create the PNG thumbnail of one high-resolution photo and upload it to the bucket.

    Reads the module-level ``avian_data`` bucket and ``thumbnail_size``.

    :param high_res_key: key of the source image in the bucket.
    :param thumb_key: key under which the thumbnail is stored.
    :param regenerate: when False, an already existing thumbnail is left untouched.
    :return: True if the thumbnail already existed or was written, False if the source
        image is not in the bucket.
    """
    # ``objects.filter(Prefix=...)`` is a prefix search, so the key itself is compared too:
    # a longer key that merely starts with the same text must not count as the object.
    if not regenerate and any(
        o.key == thumb_key for o in avian_data.objects.filter(Prefix=thumb_key)
    ):
        return True
    source = next(
        (o for o in avian_data.objects.filter(Prefix=high_res_key) if o.key == high_res_key),
        None,
    )
    if source is None:
        print(f"there is no {high_res_key}")
        return False
    buffer = BytesIO()
    # The context manager releases the image as soon as the thumbnail has been encoded.
    with Image.open(BytesIO(source.get()["Body"].read())) as image:
        image.thumbnail(thumbnail_size)  # resizes in place to fit within thumbnail_size
        image.save(buffer, format="png")
    # Rewind so the upload reads the encoded bytes from the start.
    buffer.seek(0)
    avian_data.put_object(Key=thumb_key, Body=buffer)
    return True

In [None]:
# Generate every thumbnail on a pool of 16 threads, always overwriting existing ones
# (regenerate=True). `futures` ends up holding the boolean returned for each photo.
make_thumbnail = partial(generate_thumbnail, regenerate=True)
with ThreadPoolExecutor(max_workers=16) as executor:
    futures = list(
        tqdm(
            executor.map(
                make_thumbnail,
                m_grouped["HighResImage_new"].tolist(),
                m_grouped["thumbnail_new"].tolist(),
            ),
            total=m_grouped.shape[0],
        )
    )

  0%|          | 0/18168 [00:00<?, ?it/s]

In [None]:
m_grouped.shape

# Previous version code
```python
#acc_db = "/mnt/c/Users/carizaporras/Downloads/Colibri_tblSpeciesData2015_2018_2021_2Sept2022 (1).accdb"
#schema = pandas_access.read_schema(acc_db)
#pd_species = pandas_access.read_table(acc_db, "tblSpeciesData2015_2018_2021")
#pd_species["ColonyName"] = pd_species["ColonyName"].astype(str)
#pd_colonies = pandas_access.read_table(acc_db, "tblRWCWB_ColonyInventory_2022")
#pd_colonies["ColonyName"] = pd_colonies["ColonyName"].astype(str)
#pd_species = pd.merge(pd_species, pd_colonies, on="ColonyName")
#pd_species["HighResImage_new"] = f"{_base_folder}/high_resolution_photos/"+pd_species['Year'].astype(str)+'/'+pd_species['GeoRegion']+'/'+pd_species['ColonyName']+'/'+pd.to_datetime(pd_species["Date"].astype('str'), format="%m/%d/%y %H:%M:%S").dt.strftime("%d%B%y")+'Camera'+pd_species["CameraNumber"]+'-'+'Card'+(pd_species["CardNumber"].where(~pd_species["CardNumber"].isnull(), other="1"))+"-"+pd_species["PhotoNumber"]+".jpg"
#pd_species["screenshot_new"] = f'{_base_folder}/screenshots/'+pd_species['Year'].astype(str)+'/'+pd_species['GeoRegion']+'/'+pd_species['ColonyName']+'/'+pd.to_datetime(pd_species["Date"].astype('str'), format="%m/%d/%y %H:%M:%S").dt.strftime("%d%B%y")+'Camera'+pd_species["CameraNumber"]+'-'+'Card'+(pd_species["CardNumber"].where(~pd_species["CardNumber"].isnull(), other="1"))+"-"+pd_species["PhotoNumber"]+".jpg"
#pd_species["thumbnail_new"] =  f'{_base_folder}/thumbnails/'+pd_species['Year'].astype(str)+'/'+pd_species['GeoRegion']+'/'+pd_species['ColonyName']+'/'+pd.to_datetime(pd_species["Date"].astype('str'), format="%m/%d/%y %H:%M:%S").dt.strftime("%d%B%y")+'Camera'+pd_species["CameraNumber"]+'-'+'Card'+(pd_species["CardNumber"].where(~pd_species["CardNumber"].isnull(), other="1"))+"-"+pd_species["PhotoNumber"]+".png"
#gdf = gp.GeoDataFrame(
#    pd_species, geometry=gp.points_from_xy(pd_species["Longitude"], pd_species["Latitude"]))
#set_index(["State","GeoRegion","ColonyName","Year", "Latitude", "Longitude", "Date", "SpeciesCode"])
#pd_species = pd_species.drop(columns=["AutoID","Subcolony"])
#gdf["huc"] = "TBD"

def _convert_to_degress(value):
    """
    Helper function to convert GPS coordinates stored in the EXIF to degrees in float format.
    Borrowed from: https://gist.github.com/snakeye/fdc372dbf11370fe29eb
    Modified to receive a tuple instead of an exifread.utils.Ratio.
    :param value: three (numerator, denominator) pairs: degrees, minutes, seconds
    :type value: tuple
    :rtype: float
    """
    degrees, minutes, seconds = (
        float(part[0]) / float(part[1]) for part in (value[0], value[1], value[2])
    )

    return degrees + (minutes / 60.0) + (seconds / 3600.0)
```