# This notebook is the foundation of the Avian Data ingester
The ingester component should
  - Generate the thumbnail
  - Get the EXIF metadata from the image
  - Add the dotting information to the image metadata (currently in an MS Access database)
  - Read the KMZ/KML file with the routes to assign the GPS location to each of the photos if needed.
    <img src="./img/AvianDataIngestor.png" width=800px/>
  

In [1]:
# parameters
import ipywidgets as widgets

# Text box used to pick the folder the rest of the notebook works on
# (TIFF images, dotting database and KML files).
input_path = widgets.Text(
    value='./input',
    placeholder='InputData',
    description='Input folder (folder containing the tiff images, the dotting database and the KMLs):',
    disabled=False,
    # Descriptions longer than the default label width are cut off by
    # ipywidgets; 'initial' lets the label grow to fit the whole sentence.
    style={'description_width': 'initial'},
    # Give the widget the full cell width so the text field stays usable
    # next to the long label.
    layout=widgets.Layout(width='100%'),
)
display(input_path)

Text(value='./input', description='Input folder (folder containing the tiff images, the dotting database and t…

In [None]:
from PIL import Image, ExifTags
import os
import re
import json
import pandas as pd
import geopandas as gpd
from pathlib import Path
import platform
import pandas_access
from datetime import datetime
# Pillow has methods to read EXIF; however, in the tests
# made, they returned an empty dict for images with actual EXIF data
import piexif
import fiona
import numpy as np

# KML support is disabled by default in fiona: register both spellings of the
# driver name in read mode ('r') so the KML files can be opened.
fiona.drvsupport.supported_drivers.update({'kml': 'r', 'KML': 'r'})

def _convert_to_degress(value):
    """
    Turn a GPS coordinate as stored in the EXIF into decimal degrees.

    Adapted from https://gist.github.com/snakeye/fdc372dbf11370fe29eb, changed to
    take plain tuples instead of exifread.utils.Ratio objects.

    :param value: three (numerator, denominator) pairs holding, in this order,
        the degrees, the minutes and the seconds
    :type value: tuple
    :return: degrees + minutes / 60 + seconds / 3600
    :rtype: float
    :raises ZeroDivisionError: if one of the denominators is zero
    """
    def _ratio(pair):
        # every component is a rational written as (numerator, denominator)
        return float(pair[0]) / float(pair[1])

    degrees = _ratio(value[0])
    minutes = _ratio(value[1])
    seconds = _ratio(value[2])

    return degrees + (minutes / 60.0) + (seconds / 3600.0)

class DottingInfo:
    """
    Access and use the dotting information

    The tables of every ``*.accdb`` file found in the input folder are loaded
    into ``self.tables`` (table name -> pandas DataFrame, duplicated rows dropped).
    """

    # strptime formats tried, in order, on the date part of a photo name:
    # full month name ("01June2020") first, abbreviated one ("01Jun2020") second.
    _DATE_FORMATS = ("%d%B%Y", "%d%b%Y")

    def __init__(self, folder=None):
        """
        Load all the tables of the Access databases found in ``folder``.

        Read errors are printed instead of raised, so the object may end up
        with no tables or only part of them.

        :param folder: directory searched for ``*.accdb`` files; when omitted
            the notebook-level ``input_folder`` variable is used
        """
        # Local import: only needed to name the exception types caught below.
        import subprocess

        folder = input_folder if folder is None else Path(folder)
        self.tables = {}
        try:
            # I think we can assume we have only one database file per group of photos,
            # However, if we have more than one, we can concatenate the values of the tables,
            # iff they have the same schema.
            for db in folder.glob("*.accdb"):
                for table_name in pandas_access.read_schema(db):
                    table = pandas_access.read_table(db, table_name)
                    if table_name in self.tables:
                        table = pd.concat([self.tables[table_name], table])
                    self.tables[table_name] = table.drop_duplicates()
        # NOTE(review): pandas_access appears to run the mdbtools executables and
        # parse their output, so OS / subprocess / parsing errors are caught here.
        # pyodbc is not imported in this notebook, so naming pyodbc.Error in this
        # handler would itself fail with a NameError. Confirm the exception list.
        except (OSError, subprocess.SubprocessError, ValueError) as e:
            print(e)

        # Only convert the dates when the table was actually loaded (no
        # database found, or a failed read, leaves it out of self.tables).
        if "tblSpeciesData" in self.tables:
            self.tables["tblSpeciesData"]["Date"] = pd.to_datetime(self.tables["tblSpeciesData"]["Date"])

    @staticmethod
    def _parse_date(text):
        """Parse the date part of a photo name; return None when no format fits."""
        # NOTE: %B / %b follow the current locale's month names.
        for date_format in DottingInfo._DATE_FORMATS:
            try:
                return datetime.strptime(text, date_format)
            except ValueError:
                continue
        return None

    def get_info_by_name(self, image_name):
        """
        Return the dotting rows recorded for a photo, joined with the species,
        colony location and colony notes tables.

        image_name must be DDMonthYYYYCameraX-Photo#.tiff

        :param image_name: file name of the photo
        :return: a DataFrame (possibly empty) with the matching rows, or None
            when the name does not follow the expected scheme or its date
            cannot be parsed
        :raises Exception: if the table tblSpeciesData was not loaded
        :raises KeyError: if one of the joined tables was not loaded
        """
        name_re = r"([0-9]{2}[A-Za-z]+[0-9]{4})\s*Camera(\d+)-(?:Photo|)(\d+)\.*"
        matches = re.match(name_re, image_name)
        if not matches:
            # Names outside the scheme are skipped rather than treated as fatal.
            return None
        date_text, camera_number, photo_number = matches.groups()
        photo_date = self._parse_date(date_text)
        if photo_date is None:
            return None
        tbl_sd = self.tables.get("tblSpeciesData")
        if tbl_sd is None:
            raise Exception ("Please make sure that the table tblSpeciesData exists in the access database")
        # NOTE(review): CameraNumber is compared as text while PhotoNumber is
        # compared as an integer; this presumably mirrors the column types of
        # the Access table - confirm against the database schema.
        data = tbl_sd.loc[
            (tbl_sd["Date"] == photo_date)
            & (tbl_sd["CameraNumber"] == camera_number)
            & (tbl_sd["PhotoNumber"] == int(photo_number))
        ]
        data = data.merge(self.tables["tblSpeciesCodes"], how ="left",left_on=["SpeciesCode"], right_on=["SpeciesCode"]).reset_index(drop=True)
        data = data.merge(self.tables["tblColonyLocationInfo"], how="left", left_on="ColonyID", right_on="ColonyID")
        data = data.merge(self.tables["tblColonySiteNotes"], how="left", on=["ColonyID","SpeciesCode" ], suffixes=(None,"_notes"))
        return data
class KMLInfo:
    """
    Look up image locations in the features read from KML files.

    ``kmldf`` holds the features of all the files stacked in one frame, or
    None when no file was given.
    """
    def __init__(self, kmls_paths):
        """
        Read every KML file and stack their features.

        :param kmls_paths: iterable with the paths of the KML files
        """
        stacked = None
        for kml_file in kmls_paths:
            features = gpd.read_file(kml_file)
            if stacked is None:
                stacked = features
            else:
                stacked = pd.concat([stacked, features])
        self.kmldf = stacked

    def get_info_by_name(self, name, change_extension = None)->tuple:
        """
        Find the geometry stored in the KML files for a given image.

        params:
            - name: image name
            - change_extension: extension used in the names stored in the kml files
                                None if no change is necessary.
        returns:
            the geometry of the first feature whose Name equals the (possibly
            re-suffixed) image name; None when nothing matches, when no KML
            was loaded or when the expected columns are missing.
            NOTE: the value comes straight from the geometry column, despite
            the ``tuple`` annotation.
        """
        if change_extension:
            lookup_name = Path(name).with_suffix(f".{change_extension}").name
        else:
            lookup_name = name
        try:
            hits = self.kmldf.loc[self.kmldf.Name == lookup_name].geometry
            if len(hits) > 0:
                return hits.values[0]
            return None
        except (AttributeError, KeyError):
            # kmldf is None (nothing loaded) or lacks the Name / geometry columns
            return None

In [None]:
# Size, in pixels, handed to Image.thumbnail() when the previews are generated.
thumbnail_size = (200, 200)

In [None]:
# Working folder typed in the widget, plus the two output sub-folders
# (created on the spot when they do not exist yet).
input_folder = Path(input_path.value)
access_databases_paths = input_folder.glob("*.accdb")  # lazy, one-shot iterator

thumb_folder = input_folder / "thumbnails"
thumb_folder.mkdir(parents=True, exist_ok=True)

metadata_folder = input_folder / "metadata"
metadata_folder.mkdir(parents=True, exist_ok=True)


In [None]:
# One metadata record (dict) per processed image is appended here.
data = []
# EXIF tag names left out of the metadata
# (presumably because they carry bulky binary payloads - confirm).
exclude_tags = [
    "InterColorProfile",
    "StripOffsets",
    "StripByteCounts",
    "XMLPacket",
]

In [None]:
# TIFF images of the input folder, in a stable (sorted) order.
# The suffix is compared explicitly because in a glob pattern "?" stands for
# exactly one character: "*.tif?" matches ".tiff" but skips plain ".tif"
# files, and it is case-sensitive on most file systems.
images = sorted(
    p for p in input_folder.iterdir()
    if p.is_file() and p.suffix.lower() in (".tif", ".tiff")
)

In [None]:
# Build one metadata record per image: flattened EXIF tags, a PNG thumbnail
# and, when available, the dotting information of the photo.
dotting_info = DottingInfo()
for image_path in images:
    with Image.open(image_path) as img:

        thumbnail_name  = image_path.with_suffix(".png").name
        img_meta = {"name": image_path.name, "thumbnail": thumbnail_name}

        # piexif returns the tags grouped in one dict per section; flatten
        # them into a single {tag name: value} dict.
        piex = piexif.load(image_path.as_posix())
        plain_exif = {}
        for k in piex:
            if k == "thumbnail":
                continue
            # GPS tag ids are resolved with their own name table
            tag_names = ExifTags.GPSTAGS if k == "GPS" else ExifTags.TAGS
            for exif_id, exif_value in piex[k].items():
                t = tag_names.get(exif_id)
                if t in exclude_tags:
                    continue
                if t is None:
                    print( k, exif_id, exif_value)
                    # Unknown tags get a "<section>:<id>" key; storing them all
                    # under None would make each one overwrite the previous.
                    t = f"{k}:{exif_id}"
                if t in ("GPSLongitude", "GPSLatitude"):
                    exif_value = _convert_to_degress(exif_value)
                plain_exif[t] = exif_value
        img_meta["exif"] = plain_exif

        # thumbnail() shrinks the image in place before it is saved as PNG
        img.thumbnail(thumbnail_size)
        img.save(thumb_folder.joinpath(thumbnail_name), "PNG")

        info = dotting_info.get_info_by_name(image_path.name)
        # None: the name does not follow the expected scheme.
        # Empty: no dotting rows for this photo - the grouped result would have
        # no records and indexing [0] below would raise IndexError.
        if info is not None and not info.empty:
            _a = info.groupby(["CameraNumber","PhotoNumber", "Date"])[["SpeciesCode", "SpeciesName", "Date", "ColonyID","Latitude", "Longitude"]].apply(lambda x: x.to_dict('records')).reset_index()
            _meta = _a.to_dict(orient="records")[0]
            # reset_index() names the column with the per-group record lists 0
            _meta["species_colonies"] = _meta.pop(0)
            img_meta.update(_meta)
        data.append(img_meta)


In [None]:
#Example metadata


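# Read the KMZ
If there are KML paths and the photo is listed there, use its location; otherwise fall back to the GPS position stored in the EXIF and, failing that, to the average of the colony coordinates.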

In [None]:
def _as_float(value):
    """Return ``value`` as a float, or None when it is missing, NaN or not numeric."""
    try:
        number = float(value)
    except (TypeError, ValueError):
        return None
    # NaN is the only float that differs from itself
    return None if number != number else number


def _exif_ref(value):
    """Normalise an EXIF reference letter (``b'W'`` or ``'W'``) to an upper-case str."""
    if isinstance(value, bytes):
        value = value.decode("ascii", "ignore")
    if not isinstance(value, str):
        return None
    return value.strip("\x00 ").upper()


def _exif_altitude(exif):
    """Return the EXIF altitude as a float (negative below sea level) or None."""
    # Pillow names the tag "GPSAltitude"; "Altitude" is kept as a fallback key.
    raw = exif.get("GPSAltitude", exif.get("Altitude"))
    if isinstance(raw, (tuple, list)) and len(raw) == 2:
        # rational stored as (numerator, denominator)
        numerator, denominator = _as_float(raw[0]), _as_float(raw[1])
        altitude = numerator / denominator if numerator is not None and denominator else None
    else:
        altitude = _as_float(raw)
    # GPSAltitudeRef 1 means "below sea level" in the EXIF specification
    if altitude is not None and exif.get("GPSAltitudeRef") in (1, b"\x01"):
        altitude = -altitude
    return altitude


def _select_coordinates(point, data):
    """
    Pick the best available location for an image.

    Sources are tried in this order:
      1. ``point``, the geometry found in the KML files;
      2. the GPS tags of ``data["exif"]``;
      3. the average position of the colonies in ``data["species_colonies"]``.

    :param point: geometry exposing ``coords`` (or None when there is none)
    :param data: metadata record of the image
    :return: ``((longitude, latitude, altitude), source)`` where altitude may
        be None and source is 'kml_point', 'exif' or 'colonies_average';
        ``(None, None)`` when no source provides a location
    """
    if point is not None and not getattr(point, "is_empty", False):
        xyz = tuple(point.coords[0])
        # 2D points carry no altitude: pad so callers always get three values
        return xyz + (None,) * (3 - len(xyz)), 'kml_point'

    exif = data.get("exif") or {}
    lon = _as_float(exif.get("GPSLongitude"))
    lat = _as_float(exif.get("GPSLatitude"))
    if lon is not None and lat is not None:
        # Longitude is negative in the western hemisphere, latitude in the
        # southern one. piexif returns the reference letters as bytes.
        if _exif_ref(exif.get("GPSLongitudeRef")) == "W":
            lon = -lon
        if _exif_ref(exif.get("GPSLatitudeRef")) == "S":
            lat = -lat
        return (lon, lat, _exif_altitude(exif)), 'exif'

    # Average only the colonies that have both coordinates: rows without a
    # location (None / NaN) would otherwise distort or poison the mean.
    located = []
    for colony in data.get("species_colonies") or []:
        col_lon = _as_float(colony.get("Longitude"))
        col_lat = _as_float(colony.get("Latitude"))
        if col_lon is not None and col_lat is not None:
            located.append((col_lon, col_lat))
    if located:
        lon = sum(pair[0] for pair in located) / len(located)
        lat = sum(pair[1] for pair in located) / len(located)
        # a zero average is treated as "no location", as zeros stand for missing values
        if lon and lat:
            return (lon, lat, None), 'colonies_average'
    return None, None

In [None]:
# Collect the KML files of the input folder and load them into a single lookup object.
# glob() returns a lazy iterator, so kmls_paths is exhausted once KMLInfo has read it.
kmls_paths =  input_folder.glob("*.kml")
kml_info = KMLInfo(kmls_paths)

In [None]:
# Attach a location, and the source it came from, to every metadata record.
for d in data:
    # The names looked up in the KML files use the .jpg extension
    _point = kml_info.get_info_by_name(d.get("name"), change_extension="jpg")

    coord, _from = _select_coordinates(_point, d)
    if coord:
        d["longitude"] = coord[0]
        d["latitude"] = coord[1]
        # A KML point without altitude only has two values
        d["altitude"] = coord[2] if len(coord) > 2 else None
        d["location_from"] = _from

In [None]:
# JSON round trip of the records: values json cannot encode are turned into
# strings (default=str), giving a plain, JSON-compatible copy of `data`.
ser_data = json.loads(json.dumps(data, default=str))

In [None]:

# One JSON file per image, named after the image with a .json suffix.
for document in data:
    _target = metadata_folder / Path(document["name"]).with_suffix(".json")
    with open(_target, 'w') as _file:
        json.dump(document, _file, default=str)

In [None]:
# Single file holding the metadata records of all the images.
_all_path = metadata_folder / "all.json"
with open(_all_path, "w") as _all:
    json.dump(data, _all, default=str)