In [None]:
# Disable the jedi-based autocompleter for this IPython session
# (sets the Completer.use_jedi option to False).
%config Completer.use_jedi = False

# This notebook is the foundation of the Avian Data ingester
The ingester component should
  - Generate the thumbnail
  - Get the EXIF metadata from the image
  - Add the dotting information to the image metadata (currently in an MS Access database)
  - Read the KMZ/KML file with the routes to assign the GPS location to each of the photos if needed. 
    <img src="./img/AvianDataIngestor.png" width=800px/>
  

In [None]:
# Parameters: the folder to ingest is typed into a text widget and read later
# through ``input_path.value``.
import ipywidgets as widgets

input_path = widgets.Text(
    description='Input folder (folder containing the jpg images, the dotting database and the KMLs):',
    placeholder='InputData',
    value='/mnt/d/2018',
    disabled=False,
)
display(input_path)

In [None]:
# Bounding box handed to ``Image.thumbnail`` when thumbnails are generated.
thumbnail_size = 518, 345
# When True, a PNG thumbnail is written for every processed image.
create_thumbnails = True
# When True, images whose metadata JSON already exists are processed again;
# when False, the existing metadata file is reused as-is.
replace = True

In [None]:
from PIL import Image, ExifTags
import os
import re
import json
import pandas as pd
import geopandas as gpd
from pathlib import Path
import platform
import pandas_access
from datetime import datetime
import pyodbc

# Pillow has methods to read EXIF data; however, in the tests we ran
# it returned an empty dict for images that do contain EXIF data.
import piexif
import fiona
import numpy as np
from multiprocessing import Pool
from functools import partial
import sqlite3
import tempfile
from pandas.io.json import json_normalize


from SPARQLWrapper import SPARQLWrapper, JSON

# KML support is disabled by default in fiona: register both spellings of the
# driver name in read ("r") mode.
fiona.drvsupport.supported_drivers.update({"kml": "r", "KML": "r"})


def _convert_to_degress(value):
    """
    Helper function to convert the GPS coordinates stored in the EXIF to degress in float format
    Borrowed from: https://gist.github.com/snakeye/fdc372dbf11370fe29eb
    Modified to recieve a tuple instead of a exifread.utils.Ratio
    :param value:
    :type value: tuple
    :rtype: float
    """
    d = float(value[0][0]) / float(value[0][1])
    m = float(value[1][0]) / float(value[1][1])
    s = float(value[2][0]) / float(value[2][1])

    return d + (m / 60.0) + (s / 3600.0)


class DottingInfo:
    """
    Access and use the dotting information.

    On construction, every ``*.accdb`` file found in the module-level
    ``input_folder`` is read with ``pandas_access``. The species data is joined
    with the species codes, colony locations and colony site notes, and the
    merged frame is written to a temporary SQLite database whose path is kept
    in ``self.db_name``. ``get_info_by_name`` then queries that database for a
    single photo.

    ``self.db_name`` stays ``None`` when the database could not be built
    (``pyodbc.Error`` branch); lookups then return ``None``.
    """

    # DDMonthYYYY[ ]Camera<N>[Card<N>]-[Photo]<number>. The optional ``Card<N>``
    # part is accepted so that names such as ``25May2021Camera1Card3-10010.jpg``
    # (the schema also accepted by the KML fallback lookup) are recognised.
    _NAME_RE = re.compile(
        r"([0-9]{2}[A-Za-z]+[0-9]{4})\s*Camera(\d+)(?:Card\d){0,1}-(?:Photo|)(\d+)\.*"
    )

    def __init__(self, add_birds_info=True):
        """
        :param add_birds_info: when True, a ``bird_info`` column (JSON text with
            the DBpedia thumbnail and page of each species) is added and the
            lookup cache is saved to ``birds_data.json``
        :raises Exception: when no ``tblSpeciesData`` table is found
        """
        self.__add_info = add_birds_info
        self.db_name = None
        self.birds_data = {}
        if Path("birds_data.json").exists():
            with open("birds_data.json") as _bdf:
                self.birds_data = json.load(_bdf)
        access_databases_paths = list(input_folder.glob("*.accdb"))
        tables = {}
        try:
            # I think we can assume we have only one database file per group of photos,
            # However, if we have more than one, we can concatenate the values of the tables,
            # iif they have the same schema.
            for db in access_databases_paths:
                schema = pandas_access.read_schema(db)
                for table_name in schema:

                    _t = pandas_access.read_table(db, table_name)
                    if table_name == "tblColonyNameInfo":
                        # Same information under another table/column name.
                        table_name = "tblColonyLocationInfo"
                        _t = _t.rename(columns={"ColonyUID": "ColonyID"})
                    tables[table_name] = (
                        _t
                        if table_name not in tables
                        else pd.concat([tables[table_name], _t])
                    ).drop_duplicates()
            data = tables.get("tblSpeciesData")
            if data is None:
                raise Exception(
                    "Please make sure that the table tblSpeciesData exists in the access database"
                )
            data["Date"] = pd.to_datetime(data["Date"])
            data.PhotoNumber = data.PhotoNumber.astype(int)

            ### If there is not ColonyID, use the LongVersionColonyUID
            # to find the correct value
            if tables["tblSpeciesData"].ColonyID.isnull().values.any():
                _loc = tables["tblColonyLocationInfo"]
                _index = data.ColonyID.isnull()
                _view = data.loc[_index]
                print(len(_view))
                data.loc[_index, "ColonyID"] = _view.apply(
                    lambda row: _loc.loc[
                        _loc.LongVersionColonyUID == str(row.LongVersionColonyUID)
                    ].ColonyID.values, axis=1
                ).apply(lambda x:x[0] if len(x) else None)
                print(data.loc[_index, "ColonyID"].unique())
            data = (
                data.set_index("SpeciesCode")
                .join(tables["tblSpeciesCodes"].set_index("SpeciesCode"), how="left")
                .reset_index()
            )
            data = (
                data.set_index("ColonyID")
                .join(
                    tables["tblColonyLocationInfo"].set_index("ColonyID"),
                    how="left",
                    rsuffix="from_colonies",
                )
                .reset_index()
            )
            data = (
                data.set_index(["ColonyID", "SpeciesCode"])
                .join(
                    tables["tblColonySiteNotes"].set_index(["ColonyID", "SpeciesCode"]),
                    how="left",
                    rsuffix="_notes",
                    on=["ColonyID", "SpeciesCode"],
                )
                .reset_index()
            )

            if self.__add_info:
                # Rows without a species name (unmatched left join) are skipped:
                # str(NaN) would otherwise be sent to DBpedia as the regex "nan".
                b_info = {
                    k: json.dumps(self.__get_data(k))
                    for k in data["SpeciesName"].dropna().unique()
                }
                data["bird_info"] = data["SpeciesName"].map(b_info)
                data.bird_info = data.bird_info.fillna("null")
                with open(Path("birds_data.json"), "w") as _jd:
                    json.dump(self.birds_data, _jd)

            # mkstemp returns an open OS-level descriptor; close it so it is
            # not leaked (sqlite3 opens the file by name).
            _fd, self.db_name = tempfile.mkstemp()
            os.close(_fd)

            con = sqlite3.connect(self.db_name)
            try:
                data.to_sql("merged_data", con)
                con.execute(
                    "CREATE INDEX find_data ON merged_data (Date, CameraNumber, PhotoNumber);"
                )
                con.commit()
            finally:
                con.close()
        except pyodbc.Error as e:
            print(e)

    def get_info_by_name(self, image_name):
        """
        Return the merged dotting rows of one photo.

        image_name must be DDMonthYYYYCameraX-Photo#.tiff; the ``Photo`` prefix
        is optional and an optional ``Card#`` may follow the camera number.

        :param image_name: file name of the image
        :return: DataFrame with the matching rows (possibly empty), or None when
            the name does not follow the schema, its date cannot be parsed, or
            the merged database is not available
        """
        matches = self._NAME_RE.match(image_name)
        db_name = getattr(self, "db_name", None)
        if not matches or db_name is None:
            return None
        raw_date, camera, photo = matches.groups()
        try:
            photo_date = datetime.strptime(raw_date, "%d%B%Y")
        except ValueError:
            # The date part matched the pattern but is not a real date.
            return None
        con = sqlite3.connect(db_name)
        try:
            # str(photo_date) gives "YYYY-MM-DD HH:MM:SS", the same text the
            # previous string-formatted query compared against.
            return pd.read_sql_query(
                "select * from merged_data where Date = ? AND CameraNumber = ? AND PhotoNumber = ?",
                con,
                params=(str(photo_date), int(camera), int(photo)),
            )
        finally:
            con.close()

    def __get_data(self, bird_name):
        """
        Return the DBpedia thumbnail and page of a bird, querying
        http://dbpedia.org/sparql only when the name is not cached in
        ``self.birds_data``.

        :param bird_name: species name, used as a case-insensitive regex on the label
        :return: dict with ``bird_thumbnail`` and ``bird_wikipage``, or None
            when the query returned no binding
        """
        if bird_name not in self.birds_data:
            sparql = SPARQLWrapper("http://dbpedia.org/sparql")
            sparql.setQuery(
                """
               PREFIX rdfs:<http://www.w3.org/2000/01/rdf-schema#> 
                PREFIX yago:<http://dbpedia.org/class/yago/> 
                PREFIX owl: <http://www.w3.org/2002/07/owl#> 
                PREFIX dbo: <http://dbpedia.org/ontology/> 
                PREFIX umbelrc: <http://umbel.org/umbel/rc/>
                PREFIX dbr: <http://dbpedia.org/resource/>

                select distinct ?Bird, ?label, ?thumbnail, ?page  where { ?Class rdfs:subClassOf|owl:sameAs yago:Bird101503061 .
                {{?Bird a ?Class} UNION  
                {?Bird a yago:Bird101503061}UNION
{?Bird a umbelrc:Bird} UNION
{dbr:List_of_birds_by_common_name dbo:wikiPageWikiLink ?Bird}} 
                ?Bird rdfs:label ?label .
                ?Bird dbo:thumbnail ?thumbnail .
                ?page  foaf:primaryTopic ?Bird .
                FILTER(REGEX(?label, "THEBIRDNAME", "i"))
                } LIMIT 1
            """.replace(
                    "THEBIRDNAME", str(bird_name)
                )
            )
            sparql.setReturnFormat(JSON)
            results = sparql.query().convert()
            b = {}
            for result in results["results"]["bindings"]:
                b["bird_thumbnail"] = result["thumbnail"]["value"]
                b["bird_wikipage"] = result["page"]["value"]
                self.birds_data[bird_name] = b
        return self.birds_data.get(bird_name)


In [None]:
            
class KMLInfo:
    """
    Use KML info to tag images.

    All the KML files are read with geopandas and concatenated into
    ``self.kmldf`` (``None`` when no file was given). Features are looked up
    through their ``Name`` column.
    """

    # DDMonthYYYY[ ]Camera<N>[Card<N>]-[Photo]<number>
    _NAME_RE = r"(?P<date>[0-9]{2}[A-Za-z]+[0-9]{4})\s*Camera(?P<camera>\d+)(?:Card\d){0,1}-(?:Photo|)(?P<photo>\d+)\.*"

    def __init__(self, kmls_paths):
        """
        :param kmls_paths: iterable with the paths of the KML files to load
        """
        frames = []
        for kml_file in kmls_paths:
            _kmldf = gpd.read_file(kml_file)
            if "name" in _kmldf and "Name" not in _kmldf:
                # rename returns a new frame and needs ``columns=``; the bare
                # call renamed nothing.
                _kmldf = _kmldf.rename(columns={"name": "Name"})
            frames.append(_kmldf)
        if not frames:
            self.kmldf = None
        elif len(frames) == 1:
            self.kmldf = frames[0]
        else:
            self.kmldf = pd.concat(frames)

    @staticmethod
    def _parse_name(feature_name):
        """Split a feature name into date/camera/photo; all None when it does not follow the schema."""
        parts = re.match(KMLInfo._NAME_RE, feature_name) if isinstance(feature_name, str) else None
        if parts is None:
            return pd.Series({"date": None, "camera": None, "photo": None})
        return pd.Series(parts.groupdict())

    def get_info_by_name(self, name, change_extension = None):
        """
        Get information from the KML files about a give image
        params: 
            - name: image name
            - change_extension: extension used in the names stored in the kml files
                                None if no change is necessary. 
        returns: geometry of the first feature whose Name equals the image name
                 (spaces ignored), or None when there is none or the lookup fails.
        """
        _sn = name if not change_extension else Path(name).with_suffix(f".{change_extension}").name
        try:
            geom = self.kmldf.loc[self.kmldf.Name.str.replace(" ","") == _sn.replace(" ","")].geometry
            return geom.values[0] if len(geom)>0 else None
        except (AttributeError, KeyError) as e:
            print(e)
            return None

    def fallback(self, name, change_extension = None):
        """
        Approximate the location of an image that has no feature of its own.

        The features taken with the same date and camera whose photo number is
        at most 10 away from the image's one are dissolved, and the centroid
        (computed in EPSG:3857, returned in EPSG:4326) is used.

        params:
            - name: image name
            - change_extension: same meaning as in get_info_by_name
        returns: the centroid geometry, or None when the name does not follow
                 the schema, no nearby feature exists or the lookup fails.
        """
        name = name if not change_extension else Path(name).with_suffix(f".{change_extension}").name
        target = re.match(self._NAME_RE, name)
        if target is None:
            # An unexpected image name means "no fallback", not a crash.
            return None
        matches_name = target.groupdict()
        try:
            # Feature names that do not follow the schema yield an all-None row
            # and are simply never selected.
            ndf = self.kmldf.Name.apply(self._parse_name)
            photo_gap = (pd.to_numeric(ndf["photo"], errors="coerce") - int(matches_name["photo"])).abs()
            nearby = (ndf["date"] == matches_name["date"]) & (ndf["camera"] == matches_name["camera"]) & (photo_gap <= 10)
            _view = self.kmldf.loc[nearby].copy()
            if _view.empty:
                return None
            _centroid = _view.to_crs("EPSG:3857").dissolve().geometry.centroid.to_crs("EPSG:4326").values
            return _centroid[-1] if len(_centroid) else None
        except (KeyError, AttributeError) as e:
            print(e)
            return None

In [None]:
# Working folder: whatever path is currently typed into the widget.
input_folder = Path(input_path.value)
access_databases_paths = input_folder.glob("*.accdb")

# Thumbnails and metadata are written inside the input folder; both
# directories are created when missing.
thumb_folder = input_folder / "thumbnails"
thumb_folder.mkdir(parents=True, exist_ok=True)
metadata_folder = input_folder / "metadata"
metadata_folder.mkdir(parents=True, exist_ok=True)

# Images to process: the *.jpg files directly inside the input folder.
images = [*input_folder.glob("*.jpg")]


In [None]:
# Placeholder for the processing results; reassigned with the pool output later on.
data = list()
# EXIF tags that are left out of the flattened metadata.
exclude_tags = [
    "InterColorProfile",
    "StripOffsets",
    "StripByteCounts",
    "XMLPacket",
]

In [None]:
# Build the dotting lookup. Any failure is printed and leaves ``dotting_info``
# set to None so the following cells can still run.
try:
    dotting_info = DottingInfo()
except Exception as err:
    print(err)
    dotting_info = None

In [None]:
def _select_coordinates(point, data, fallback=None):
    if point:
        return point.coords[0], 'kml_point'
    if "exif" in data and "GPSLongitude" in data.get("exif"):
        _exif = data.get("exif")
        return  ((-1 if _exif.get("GPSLongitudeRef").decode() == "W" else 1)*_exif.get("GPSLongitude"), (-1 if _exif.get("GPSLatitudeRef").decode() == "S" else 1) * _exif.get("GPSLatitude"), _exif.get("GPSAltitude", None) ),'exif'
    if  "species_colonies" in data:
        sc_list = data["species_colonies"]
        count = len (sc_list)
        lat = 0
        lon = 0
        for col in sc_list:
            lat += float(col.get("Latitude") or "0")
            lon += float(col.get("Longitude") or "0")
        if lon and lat:
            return (lon/count, lat/count, None), 'colonies_average'
    if fallback:
        return fallback.coords[0], 'kml_fallback'
    return None, None
class ImageProcessor(object):
    """
    Callable that writes the metadata JSON (and optionally the PNG thumbnail)
    of one image; instances are what is mapped over the images by the pool.

    Besides its own attributes it relies on the module-level ``thumbnail_size``,
    ``thumb_folder``, ``exclude_tags``, ``_convert_to_degress`` and
    ``_select_coordinates``.
    """

    def __init__(self, dotting_info, kml_info, metadata_folder,create_thumbnails=True, replace=True):
        """
        :param dotting_info: object with ``get_info_by_name``, or None when the
            dotting database could not be loaded
        :param kml_info: object with ``get_info_by_name`` and ``fallback``
        :param metadata_folder: folder (Path) where the JSON files are written
        :param create_thumbnails: write a PNG thumbnail for each image
        :param replace: process images whose metadata file already exists
        """
        self.dotting_info =  dotting_info
        self.kml_info =kml_info
        self.metadata_folder = metadata_folder
        self.create_thumbnails = create_thumbnails
        self.replace = replace

    @staticmethod
    def _read_plain_exif(image_path):
        """Flatten the piexif result of an image into a single {tag name: value} dict."""
        piex = piexif.load(image_path.as_posix())
        plain_exif = {}
        for ifd_name in piex:
            if ifd_name == "thumbnail":
                continue
            tag_names = ExifTags.GPSTAGS if ifd_name == "GPS" else ExifTags.TAGS
            for exif_id in piex[ifd_name]:
                t = tag_names.get(exif_id)
                if not t or t in exclude_tags:
                    continue
                raw = piex[ifd_name][exif_id]
                # Coordinates are stored as rationals; keep them as decimal degrees.
                plain_exif[t] = _convert_to_degress(raw) if t in ("GPSLongitude", "GPSLatitude") else raw
        return plain_exif

    def _dotting_metadata(self, image_name):
        """Return the dotting metadata of an image as a dict (empty when there is none)."""
        if self.dotting_info is None:
            # The dotting database failed to load: carry on without it.
            return {}
        info = self.dotting_info.get_info_by_name(image_name)
        if info is None or info.empty:
            return {}
        _det_cols = ["SpeciesCode", "SpeciesName", "Date", "ColonyID","Latitude", "Longitude"]
        info = info.drop_duplicates(["CameraNumber","PhotoNumber", "Date"]+_det_cols)
        if "bird_info" in info:
            # bird_info is stored as JSON text in the merged database.
            info = info.assign(bird_info=info.bird_info.apply(json.loads))
            _det_cols.append("bird_info")

        _a = info.groupby(["CameraNumber","PhotoNumber", "Date"])[_det_cols].apply(lambda x: x.to_dict('records')).reset_index()
        _meta = _a.to_dict(orient="records")[0]
        # The unnamed column produced by apply holds the per-species records.
        _meta["species_colonies"] = _meta.pop(0)
        return _meta

    def __call__(self, image_path):
        """
        Process one image.

        :param image_path: Path of the image
        :return: Path of the metadata JSON file (the existing one, untouched,
            when ``replace`` is False and the file is already there)
        """
        metadata_file = self.metadata_folder.joinpath(image_path.with_suffix(".json").name)
        if (not self.replace) and metadata_file.exists():
            return metadata_file
        thumbnail_name  = image_path.with_suffix(".png").name
        img_meta = {"name": image_path.name, "thumbnail": thumbnail_name}
        img_meta["exif"] = self._read_plain_exif(image_path)
        if self.create_thumbnails:
            # The image is only decoded when a thumbnail is actually needed.
            with Image.open(image_path) as img:
                img.thumbnail(thumbnail_size)
                img.save(thumb_folder.joinpath(thumbnail_name), "PNG")
        img_meta.update(self._dotting_metadata(image_path.name))

        _point = self.kml_info.get_info_by_name(img_meta.get("name"), change_extension="jpg")
        coord, _from = _select_coordinates(_point, img_meta)
        if not coord:
            # The KML fallback scans every feature, so it is only computed when
            # no better source (KML point, EXIF, colonies) gave a location.
            _fallback = self.kml_info.fallback(img_meta.get("name"), change_extension="jpg")
            coord, _from = _select_coordinates(_point, img_meta, _fallback)
        if coord:
            img_meta["longitude"] = coord[0]
            img_meta["latitude"] = coord[1]
            img_meta["altitude"] = coord[2] if len(coord)>2 else 0
            img_meta["location_from"] = _from
        with open(metadata_file,'w') as _file:
            json.dump(img_meta, _file, default=str)
        return metadata_file


In [None]:
# Load every *.kml file of the input folder into a single lookup object.
kmls_paths = input_folder.glob("*.kml")
kml_info = KMLInfo(kmls_paths=kmls_paths)

In [None]:

# One callable shared by all the tasks; the images are spread over 7 worker processes.
image_processor = ImageProcessor(
    dotting_info=dotting_info,
    kml_info=kml_info,
    metadata_folder=metadata_folder,
    create_thumbnails=create_thumbnails,
    replace=replace,
)
with Pool(processes=7) as pool:
    data = pool.map(image_processor, images)


In [None]:
# Save the list returned by the pool (one metadata file path per image) as all.json;
# ``default=str`` serialises whatever json cannot handle natively.
with (metadata_folder / "all.json").open("w") as _all:
    json.dump(data, _all, default=str)

In [None]:
# Scratch check: load the raw EXIF dictionary of one sample image
# (the path is relative to the notebook's working directory).
exifdata = piexif.load("25May2021Camera1Card3-10010.jpg")

In [None]:
# Flatten the sample EXIF data into one {tag name: value} dict, the same way
# ImageProcessor.__call__ does: the embedded thumbnail and the excluded tags
# are skipped and the GPS coordinates are converted to decimal degrees.
piex = exifdata
plain_exif = {}
for ifd_name in piex:
    if ifd_name == "thumbnail":
        continue
    tag_lookup = ExifTags.GPSTAGS if ifd_name == "GPS" else ExifTags.TAGS
    for exif_id in piex[ifd_name]:
        tag = tag_lookup.get(exif_id)
        if not tag or tag in exclude_tags:
            continue
        raw_value = piex[ifd_name][exif_id]
        if tag in ("GPSLongitude", "GPSLatitude"):
            plain_exif[tag] = _convert_to_degress(raw_value)
        else:
            plain_exif[tag] = raw_value


In [None]:
# Display the flattened EXIF dictionary of the sample image.
plain_exif