Merge 3cc77d1 into d5f9c08

whylabs · Jan 27, 2021 · d589157 · d589157
2 parents d5f9c08 + 3cc77d1
commit d589157
Show file tree

Hide file tree

Showing 30 changed files with 821 additions and 61 deletions.
diff --git a/.bumpversion.cfg b/.bumpversion.cfg
@@ -1,5 +1,5 @@
 [bumpversion]
-current_version = 0.1.13-dev8
+current_version = 0.2.0-dev0
 commit = True
 tag = False
 parse = (?P<major>\d+)\.(?P<minor>\d+)\.(?P<patch>\d+)(\-(?P<release>[a-z]+)(?P<build>\d+))?

diff --git a/docs/conf.py b/docs/conf.py
@@ -119,7 +119,7 @@ def setup(app):
 # built documents.
 #
 # The short X.Y version.
-version = "0.1.13-dev8"
+version = "0.2.0-dev0"
 # The full version, including alpha/beta/rc tags.
 release = ""  # Is set by calling `setup.py docs`
 

diff --git a/requirements-dev.txt b/requirements-dev.txt
@@ -59,7 +59,7 @@ nodeenv==1.5.0
 numpy==1.19.4
 oauthlib==3.1.0
 packaging==20.7
-pandas==1.1.4
+pandas==1.2.1
 parso==0.7.1
 pathspec==0.8.1
 pexpect==4.8.0
@@ -116,3 +116,6 @@ websocket-client==0.57.0
 Werkzeug==1.0.1
 whylabs-datasketches==2.0.0b7
 zipp==3.4.0
+puremagic == 1.10
+xlrd==2.0.1
+openpyxl==3.0.6
diff --git a/requirements-test.txt b/requirements-test.txt
@@ -7,7 +7,7 @@ jmespath==0.10.0
 marshmallow==3.9.1
 matplotlib==3.3.3
 numpy==1.19.4
-pandas==1.1.4
+pandas<=1.2.1
 protobuf==4.0.0rc2
 python-dateutil==2.8.1
 pytz==2020.4
@@ -24,4 +24,7 @@ scikit-learn==0.24.0
 Pillow==8.1.0
 moto==1.3.16
 pytest-cov>=2.11.1
-mlflow==1.13.1
+mlflow==1.13.1
+puremagic ==1.10
+xlrd==2.0.1
+openpyxl==3.0.6
diff --git a/requirements.txt b/requirements.txt
@@ -6,7 +6,7 @@ fsspec==0.8.4
 jmespath==0.10.0
 marshmallow==3.9.1
 numpy==1.19.4
-pandas==1.1.4
+pandas<=1.2.1
 protobuf==3.14.0
 python-dateutil==2.8.1
 pytz==2020.4
@@ -16,3 +16,6 @@ s3transfer==0.3.3
 six==1.15.0
 urllib3==1.26.2
 whylabs-datasketches==2.0.0b7
+xlrd==2.0.1
+openpyxl==3.0.6
+puremagic == 1.10
diff --git a/setup.cfg b/setup.cfg
@@ -4,7 +4,7 @@
 
 [metadata]
 name = whylogs
-version = 0.1.13-dev8
+version = 0.2.0-dev0
 description = Profile and monitor your ML data pipeline end-to-end
 author = WhyLabs.ai
 author-email = support@whylabs.ai

diff --git a/src/whylogs/_version.py b/src/whylogs/_version.py
@@ -1,3 +1,3 @@
 """WhyLabs version number."""
 
-__version__ = "0.1.13-dev8"
+__version__ = "0.2.0-dev0"
diff --git a/src/whylogs/app/logger.py b/src/whylogs/app/logger.py
@@ -6,13 +6,14 @@
 import json
 from pathlib import Path
 from typing import List, Optional, Dict, Union, Callable, AnyStr
+from tqdm import tqdm
 
 import pandas as pd
 from typing.io import IO
 
 from whylogs.app.writers import Writer
-from whylogs.core import DatasetProfile, TrackImage, METADATA_DEFAULT_ATTRIBUTES
-
+from whylogs.core import DatasetProfile, TrackImage, METADATA_DEFAULT_ATTRIBUTES, TrackBB
+from whylogs.io import LocalDataset
 TIME_ROTATION_VALUES = ["s", "m", "h", "d"]
 
 # TODO upgrade to Classes
@@ -99,6 +100,15 @@ def profile(self, ) -> DatasetProfile:
         """
         return self._profiles[-1]["full_profile"]
 
+    def tracking_checks(self):
+        """
+        Run the checks shared by the logging methods before tracking data.
+
+        When the logger is active, ``_rotate_time()`` is called first if
+        ``should_rotate()`` is truthy.
+
+        :return: ``False`` when the logger is not active, ``True`` otherwise
+        """
+        if self._active:
+            if self.should_rotate():
+                self._rotate_time()
+            return True
+        return False
+
     @property
     def segmented_profiles(self, ) -> Dict[str, DatasetProfile]:
         """
@@ -287,12 +297,9 @@ def log(
         :param value: value of as single feature. Cannot be specified if 'features' is specified
 
         """
-        if not self._active:
+        if not self.tracking_checks():
             return None
 
-        if self.should_rotate():
-            self._rotate_time()
-
         if features is None and feature_name is None:
             return
 
@@ -341,10 +348,8 @@ def log_image(self,
         :param feature_transforms: a list of callables to transform the input into metrics
         :type image: Union[str, PIL.image]
         """
-        if not self._active:
-            return
-        if self.should_rotate():
-            self._rotate_time()
+        if not self.tracking_checks():
+            return None
 
         if isinstance(image, str):
             track_image = TrackImage(image, feature_transforms=feature_transforms,
@@ -355,6 +360,30 @@ def log_image(self,
 
         track_image(self._profiles[-1]["full_profile"])
 
+    def log_local_dataset(self, root_dir, folder_feature_name="folder_feature"):
+        """
+        Log every item of a ``LocalDataset`` built from ``root_dir``.
+
+        For each item the file format (feature ``file_format``), the segment
+        value (feature ``folder_feature_name``) and the magic-data features
+        are logged. The loaded payload is then dispatched on its type:
+        dataframes go to ``log_dataframe``, dicts and lists to
+        ``log_annotation`` and PIL images to ``log_image``.
+
+        :param root_dir: passed to ``LocalDataset`` to build the dataset
+        :param folder_feature_name: feature name used for each item's segment value
+        :raises NotImplementedError: when a payload is none of the supported types
+        """
+        # Function-scope import: PIL is only imported when this method runs.
+        from PIL.Image import Image as ImageType
+
+        dataset = LocalDataset(root_dir)
+        for position in tqdm(range(len(dataset))):
+            item, segment_value = dataset[position]
+            (data, magic_data), fmt = item
+
+            self.log(feature_name="file_format", value=fmt)
+            self.log(feature_name=folder_feature_name, value=segment_value)
+            self.log(features=magic_data)
+
+            if isinstance(data, pd.DataFrame):
+                self.log_dataframe(data)
+                continue
+            if isinstance(data, Dict) or isinstance(data, list):
+                self.log_annotation(annotation_data=data)
+                continue
+            if isinstance(data, ImageType):
+                self.log_image(data)
+                continue
+
+            raise NotImplementedError(
+                "File format not supported {}, format:{}".format(type(data), fmt))
+
+    def log_annotation(self, annotation_data):
+        """
+        Track bounding-box annotation metrics in the latest full profile.
+
+        :param annotation_data: annotation data handed to ``TrackBB`` as ``obj``
+        :return: ``None``; nothing is tracked when ``tracking_checks()`` is falsy
+        """
+        if self.tracking_checks():
+            tracker = TrackBB(obj=annotation_data)
+            tracker(self._profiles[-1]["full_profile"])
+        return None
+
     def log_csv(self,
                 filepath_or_buffer: Union[str, Path, IO[AnyStr]],
                 segments: Optional[Union[List[Segment], List[str]]] = None,
@@ -369,10 +398,6 @@ def log_csv(self,
         :param profile_full_dataset: when segmenting dataset, an option to keep the full unsegmented profile of the
         dataset.
         """
-        if not self._active:
-            return
-        if self.should_rotate():
-            self._rotate_time()
 
         self.profile_full_dataset = profile_full_dataset
         if segments is not None:
@@ -392,10 +417,8 @@ def log_dataframe(self, df,
         :param segments: specify the tag key value pairs for segments
         :param df: the Pandas dataframe to log
         """
-        if not self._active:
+        if not self.tracking_checks():
             return None
-        if self.should_rotate():
-            self._rotate_time()
 
         # segment check  in case segments are just keys
         self.profile_full_dataset = profile_full_dataset

diff --git a/src/whylogs/core/__init__.py b/src/whylogs/core/__init__.py
@@ -1,10 +1,13 @@
 from .columnprofile import ColumnProfile
 from .datasetprofile import DatasetProfile
 from .image_profiling import TrackImage, _METADATA_DEFAULT_ATTRIBUTES as METADATA_DEFAULT_ATTRIBUTES
+from .annotation_profiling import TrackBB, BB_ATTRIBUTES
 
 __ALL__ = [
     ColumnProfile,
     DatasetProfile,
     TrackImage,
     METADATA_DEFAULT_ATTRIBUTES,
+    TrackBB,
+    BB_ATTRIBUTES
 ]
diff --git a/src/whylogs/core/annotation_profiling.py b/src/whylogs/core/annotation_profiling.py
@@ -0,0 +1,158 @@
+from typing import Callable, Optional, Dict, List
+
+import pandas as pd
+import numpy as np
+
+from whylogs.io.file_loader import file_loader
+
+
+class Rectangle:
+
+    """
+    Helper class describing an axis-aligned bounding box.
+
+    Computes basic statistics of the box and supports intersection and
+    intersection-over-union (IoU) with another rectangle. ``intersection``
+    compares ``x1``/``y1`` with ``max`` and ``x2``/``y2`` with ``min``, so
+    corners are expected as ``[[xmin, ymin], [xmax, ymax]]``.
+
+    Attributes:
+        area (float): ``width * height``
+        aspect_ratio (float): ``width / height``, or 0.0 when height is 0
+        boundingBox (list): the corner points ``[[x1, y1], [x2, y2]]`` as given
+        centroid (list): ``[x1 + width / 2, y1 + height / 2]``
+        confidence: value passed to the constructor (default ``None``)
+        height (float): ``abs(y2 - y1)``
+        labels: value passed to the constructor (default ``None``)
+        width (float): ``abs(x2 - x1)``
+    """
+
+    # replace with shapely functions and methods
+    # or move to cpp/cpython
+    def __init__(self, boundingBox, confidence=None, labels=None):
+        """
+        :param boundingBox: two corner points, ``[[x1, y1], [x2, y2]]``
+        :param confidence: optional confidence stored on the instance
+        :param labels: optional labels stored on the instance
+        """
+        self.boundingBox = boundingBox
+        self._x1 = boundingBox[0][0]
+        self._y1 = boundingBox[0][1]
+        self._x2 = boundingBox[1][0]
+        # y2 is the second coordinate of the second corner (index [1][1]);
+        # reading [1][0] would reuse x2 and corrupt height, area and centroid.
+        self._y2 = boundingBox[1][1]
+        self.confidence = confidence
+        self.labels = labels
+        self.width = abs(self.x2 - self.x1)
+        self.height = abs(self.y2 - self.y1)
+        self.area = self.width * self.height
+        # Degenerate (zero-height) boxes get an aspect ratio of 0.0
+        # instead of raising ZeroDivisionError.
+        self.aspect_ratio = self.width / self.height if self.height > 0 else 0.0
+        self.centroid = [self.x1 + self.width/2, self.y1 + self.height/2]
+
+    @property
+    def x1(self):
+        """x coordinate of the first corner."""
+        return self._x1
+
+    @property
+    def x2(self):
+        """x coordinate of the second corner."""
+        return self._x2
+
+    @property
+    def y1(self):
+        """y coordinate of the first corner."""
+        return self._y1
+
+    @property
+    def y2(self):
+        """y coordinate of the second corner."""
+        return self._y2
+
+    def intersection(self, Rectangle_2):
+        """
+        Return the area of overlap with ``Rectangle_2``.
+
+        :param Rectangle_2: the other rectangle
+        :return: overlap area, or 0.0 when the rectangles do not overlap
+        """
+        x_left = max(self.x1, Rectangle_2.x1)
+        y_top = max(self.y1, Rectangle_2.y1)
+        x_right = min(self.x2, Rectangle_2.x2)
+        y_bottom = min(self.y2, Rectangle_2.y2)
+        if x_right < x_left or y_bottom < y_top:
+            return 0.0
+        return (x_right - x_left) * (y_bottom - y_top)
+
+    def iou(self, Rectangle_2):
+        """
+        Return the intersection-over-union with ``Rectangle_2``.
+
+        :param Rectangle_2: the other rectangle
+        :return: ``intersection / union``, or 0.0 when either area is not positive
+        """
+        if Rectangle_2.area <= 0 or self.area <= 0:
+            return 0.0
+        intersection_area = self.intersection(Rectangle_2)
+        union_area = self.area + Rectangle_2.area - intersection_area
+        return intersection_area / union_area
+
+
+# Names of the annotation metrics; each one matches a key written by
+# TrackBB.calculate_metrics, either per image (annotation_count,
+# annotation_density, area_coverage) or per bounding box (the rest).
+BB_ATTRIBUTES = ("annotation_count", "annotation_density",
+                 "area_coverage", "bb_width", "bb_height",
+                 "bb_area", "bb_aspect_ratio", "confidence", "dist_to_center"
+                 )
+
+
+class TrackBB:
+    """
+    Compute bounding-box annotation metrics and track them in profiles.
+
+    Metrics are computed once at construction time and stored in
+    ``per_image_stats`` (one dict per record that has an ``"annotation"``
+    entry) and ``all_bboxes`` (one dict per object that has a ``"bndbox"``
+    entry). Calling the instance sends both tables to one or more profiles
+    through ``track_dataframe``.
+
+    Attributes:
+        obj: the annotation data being profiled
+        fmt: format returned by ``file_loader``; ``None`` when the data was
+            passed in directly through ``obj``
+        per_image_stats (list): per-image metric dicts
+        all_bboxes (list): per-bounding-box metric dicts
+    """
+
+    def __init__(self, filepath: str = None,
+                 obj: Dict = None,
+                 feature_transforms: Optional[List[Callable]] = None,
+                 feature_names: str = ""
+                 ):
+        """
+        :param filepath: path of an annotation file, loaded with ``file_loader``
+        :param obj: already-loaded annotation data, used when ``filepath`` is None
+        :param feature_transforms: accepted but not used by this class
+        :param feature_names: accepted but not used by this class
+        :raises ValueError: when neither ``filepath`` nor ``obj`` is given
+        """
+        if filepath is None and obj is None:
+            raise ValueError("Need  filepath or object data")
+        if filepath is not None:
+            (self.obj, magic_data), self.fmt = file_loader(filepath)
+        else:
+            self.obj = obj
+            # Define ``fmt`` on this path too so the attribute always exists.
+            self.fmt = None
+        self.per_image_stats = []
+        self.all_bboxes = []
+        self.calculate_metrics()
+
+    def calculate_metrics(self,):
+        """
+        Fill ``per_image_stats`` and ``all_bboxes`` from ``self.obj``.
+
+        Each record is read as ``record["annotation"]`` with a ``"size"``
+        (``"height"``, ``"width"``) and an ``"object"`` list; objects with a
+        ``"bndbox"`` (``xmin``/``ymin``/``xmax``/``ymax``) contribute one
+        bounding-box row. Records without an ``"annotation"`` are skipped.
+        """
+        # The ``obj`` parameter is typed ``Dict``: a single record may arrive
+        # as a bare dict. Iterating it directly would yield its keys (strings)
+        # and fail on ``.get``, so treat it as a one-record sequence.
+        records = [self.obj] if isinstance(self.obj, dict) else self.obj
+
+        for record in records:
+            annotations = record.get("annotation", None)
+            if annotations is None:
+                continue
+
+            img_height_pixel = annotations["size"]["height"]
+            img_width_pixel = annotations["size"]["width"]
+            img_rect = Rectangle([[0, 0], [img_width_pixel, img_height_pixel]])
+            img_area = img_rect.area
+            annotation_count = len(annotations["object"])
+
+            annotation_metrics = {}
+            annotation_metrics["annotation_count"] = annotation_count
+            # A zero-area image yields a density of 0.0 instead of raising
+            # ZeroDivisionError, mirroring the guards in Rectangle.
+            annotation_metrics["annotation_density"] = (
+                annotation_count / img_area if img_area > 0 else 0.0)
+            annotation_metrics["area_coverage"] = 0
+
+            # Get individual bbox metrics
+            for bb_obj in filter(lambda x: "bndbox" in x,
+                                 annotations["object"]):
+                bndbox = bb_obj["bndbox"]
+                # NOTE(review): "confidence" is required on every object here;
+                # annotations without it raise KeyError - confirm this is intended.
+                rect1 = Rectangle([[bndbox["xmin"], bndbox["ymin"]],
+                                   [bndbox["xmax"], bndbox["ymax"]]],
+                                  confidence=bb_obj["confidence"])
+
+                bounding_box_metric = {}
+                bounding_box_metric["confidence"] = rect1.confidence
+                bounding_box_metric["bb_width"] = rect1.width
+                bounding_box_metric["bb_height"] = rect1.height
+                bounding_box_metric["bb_area"] = rect1.area
+                bounding_box_metric["bb_aspect_ratio"] = rect1.aspect_ratio
+
+                # Euclidean distance from the box centroid to the image center.
+                bounding_box_metric["dist_to_center"] = np.linalg.norm([
+                    rect1.centroid[0] - (img_width_pixel / 2.0),
+                    rect1.centroid[1] - (img_height_pixel / 2.0)], ord=2)
+
+                # Each box adds its in-image area as a fraction of the image
+                # area, divided by the number of annotated objects.
+                if img_area > 0:
+                    annotation_metrics["area_coverage"] += rect1.intersection(
+                        img_rect) / (img_area * annotation_count)
+
+                self.all_bboxes.append(bounding_box_metric)
+
+            # Send object to metrics
+            self.per_image_stats.append(annotation_metrics)
+
+    def __call__(self, profiles):
+        """
+        Track the computed metrics in the given profile(s).
+
+        :param profiles: a single profile or a list of profiles; each must
+            provide ``track_dataframe``
+        """
+        if not isinstance(profiles, list):
+            profiles = [profiles]
+
+        per_image_dataframe = pd.DataFrame(self.per_image_stats)
+        bounding_boxes = pd.DataFrame(self.all_bboxes)
+        for each_profile in profiles:
+            each_profile.track_dataframe(per_image_dataframe)
+            each_profile.track_dataframe(bounding_boxes)
diff --git a/src/whylogs/core/image_profiling.py b/src/whylogs/core/image_profiling.py
@@ -13,7 +13,7 @@
 except ImportError as e:
     ImageType = None
     logger.debug(str(e))
-    logger.debug("Unable to load PIL; install pillow for image support")
+    logger.debug("Unable to load PIL; install Pillow for image support")
 
 DEFAULT_IMAGE_FEATURES = [Hue(), Saturation(), Brightness()]
 

diff --git a/src/whylogs/io/__init__.py b/src/whylogs/io/__init__.py
@@ -1,3 +1,7 @@
-_file_plugins = [
 
+
+from .local_dataset import LocalDataset
+
+__ALL__ = [
+    LocalDataset
 ]