
Commit 984bbe8

Outsource the input definition (can be either screen or video) to separate classes and let the user specify it in the provided config YAML file.

1 parent 451a314 · commit 984bbe8

4 files changed: +113 −33 lines


config/config.obj_detect.sample.yml

Lines changed: 11 additions & 0 deletions
@@ -6,3 +6,14 @@
 model_name: 'ssd_mobilenet_v1_coco_11_06_2017'
 model_dl_base_path: 'http://download.tensorflow.org/models/object_detection/'
 model_dl_file_format: '.tar.gz'
+
+# re-calculates and displays FPS rate every x seconds
+fps_interval: 3
+
+# choose your input: screen, video
+input_type: screen
+
+# for video input: choose either the device id (camera index) or a filename
+## will be passed to OpenCV VideoCapture
+#input_video: '../opencv_extra/testdata/highgui/video/big_buck_bunny.mp4'
+input_video: 0
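
As context for the new keys, here is a minimal sketch of how the sample config plus an optional user-specific override (the config.obj_detect.yml mentioned in obj_detect.py) could be loaded and merged with PyYAML; the exact paths and merge logic are assumptions for illustration, not necessarily the repository's code:

import os
import yaml

# Load defaults from the sample file, then overlay user-specific values if such a file exists.
with open('config/config.obj_detect.sample.yml') as f:
    cfg = yaml.safe_load(f)

if os.path.isfile('config/config.obj_detect.yml'):
    with open('config/config.obj_detect.yml') as f:
        user_cfg = yaml.safe_load(f) or {}
    cfg = {**cfg, **user_cfg}  # user values override the sample defaults

print(cfg['input_type'], cfg['fps_interval'], cfg['input_video'])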

obj_detect.py

Lines changed: 27 additions & 33 deletions
@@ -6,17 +6,14 @@
 import tensorflow as tf
 import zipfile
 import time
+from Xlib import display
 import cv2
 import yaml
 
-from Xlib import display, X
 
 from collections import defaultdict
 from io import StringIO
-from PIL import Image
-
-#cap = cv2.VideoCapture(0)
-#cap = cv2.VideoCapture('../opencv_extra/testdata/highgui/video/big_buck_bunny.mp4')
+#from PIL import Image
 
 sys.path.append('../tensorflow_models/research')
 sys.path.append('../tensorflow_models/research/slim')
@@ -25,6 +22,8 @@
 from utils import label_map_util
 from utils import visualization_utils as vis_util
 
+from stuff.helper import FPS
+from stuff.input import ScreenInput, VideoInput
 
 # Load config values from config.obj_detect.sample.yml (as default values) updated by optional user-specific config.obj_detect.yml
 ## see also http://treyhunner.com/2016/02/how-to-merge-dictionaries-in-python/
@@ -35,7 +34,15 @@
 #for section in cfg:
 #    print(section, ":", cfg[section])
 
-
+# Define input
+screen = display.Display().screen().root.get_geometry()
+if cfg['input_type'] == 'screen':
+    input = ScreenInput(0, 0, int(screen.width/2), int(screen.height/2))
+elif cfg['input_type'] == 'video':
+    input = VideoInput(cfg['input_video'])
+else:
+    print('No valid input type given. Exit.')
+    sys.exit()
 
 # Any model exported using the `export_inference_graph.py` tool can be loaded here simply by changing `PATH_TO_CKPT` to point to a new .pb file.
 # See the [detection model zoo](https://github.com/tensorflow/models/blob/master/research/object_detection/g3doc/detection_model_zoo.md) for a list of other models that can be run out-of-the-box with varying speeds and accuracies.
@@ -97,33 +104,20 @@
 detection_classes = detection_graph.get_tensor_by_name('detection_classes:0')
 num_detections = detection_graph.get_tensor_by_name('num_detections:0')
 
-# for frame rate calculation
-start_time = time.time()
-x = 3 # displays the frame rate every x seconds
-counter = 0
+# TODO: Usually FPS calculation lives in a separate thread. As is now, the interval is a minimum value for each iteration.
+fps = FPS(cfg['fps_interval']).start()
 
 windowPlacedYet = False
 
-# while(cap.isOpened()):
-while(True):
-
-  dsp = display.Display()
-  root = dsp.screen().root
-  reso = root.get_geometry()
-  W,H = int(reso.width/2),int(reso.height/2)
-  #W,H = 600,600
-  raw = root.get_image(0, 0, W, H, X.ZPixmap, 0xffffffff)
-  image = Image.frombytes("RGB", (W, H), raw.data, "raw", "RGBX")
-  image_np = np.array(image);
+while(input.isActive()):
+  ret, image_np = input.getImage()
+  if not ret:
+    print("No frames grabbed from input (anymore)! Exit.")
+    break
 
   # image_np_bgr = np.array(ImageGrab.grab(bbox=(0,0,600,600))) # grab(bbox=(10,10,500,500)) or just grab()
   # image_np = cv2.cvtColor(image_np_bgr, cv2.COLOR_BGR2RGB)
 
-  # ret, image_np = cap.read()
-  # if not ret:
-  #   print("Video finished!")
-  #   break
-
   # for image_path in TEST_IMAGE_PATHS:
   # image = Image.open(image_path)
   # the array based representation of the image will be used later in order to prepare the
@@ -150,14 +144,14 @@
   if cv2.waitKey(1) & 0xFF == ord('q'):
     break
   if not windowPlacedYet:
-    cv2.moveWindow('object detection', (int)(reso.width/3), (int)(reso.height/3))
+    cv2.moveWindow('object detection', (int)(screen.width/3), (int)(screen.height/3))
     windowPlacedYet = True
 
-  counter+=1
-  if (time.time() - start_time) > x :
-    print("FPS: ", counter / (time.time() - start_time))
-    counter = 0
-    start_time = time.time()
+  fps.update()
+
+fps.stop()
+print('[INFO] elapsed time (total): {:.2f}'.format(fps.elapsed()))
+print('[INFO] approx. FPS: {:.2f}'.format(fps.fps()))
 
-#cap.release()
+input.cleanup()
 cv2.destroyAllWindows()
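
To see how the new abstractions fit together without the TensorFlow parts, here is a small stand-alone capture loop; it is only an illustrative sketch that reuses the config keys and classes introduced in this commit (the window name and screen region size are arbitrary choices):

import sys
import cv2
import yaml
from stuff.helper import FPS
from stuff.input import ScreenInput, VideoInput

with open('config/config.obj_detect.sample.yml') as f:
    cfg = yaml.safe_load(f)

# Pick the input source based on cfg['input_type'], as obj_detect.py now does.
if cfg['input_type'] == 'screen':
    inp = ScreenInput(0, 0, 640, 480)        # top-left 640x480 screen region
elif cfg['input_type'] == 'video':
    inp = VideoInput(cfg['input_video'])     # camera index or video file path
else:
    sys.exit('No valid input type given. Exit.')

fps = FPS(cfg['fps_interval']).start()
while inp.isActive():
    ret, frame = inp.getImage()
    if not ret:
        break
    cv2.imshow('input preview', frame)       # note: ScreenInput yields RGB, VideoInput yields BGR
    if cv2.waitKey(1) & 0xFF == ord('q'):
        break
    fps.update()
fps.stop()
inp.cleanup()
cv2.destroyAllWindows()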

stuff/helper.py

Lines changed: 34 additions & 0 deletions
@@ -0,0 +1,34 @@
+import datetime
+
+class FPS:
+    def __init__(self, interval):
+        self._glob_start = None
+        self._glob_end = None
+        self._glob_numFrames = 0
+        self._local_start = None
+        self._local_numFrames = 0
+        self._interval = interval
+
+    def start(self):
+        self._glob_start = datetime.datetime.now()
+        self._local_start = self._glob_start
+        return self
+
+    def stop(self):
+        self._glob_end = datetime.datetime.now()
+
+    def update(self):
+        curr_time = datetime.datetime.now()
+        curr_local_elapsed = (curr_time - self._local_start).total_seconds()
+        self._glob_numFrames += 1
+        self._local_numFrames += 1
+        if curr_local_elapsed > self._interval:
+            print("FPS: ", self._local_numFrames / curr_local_elapsed)
+            self._local_numFrames = 0
+            self._local_start = curr_time
+
+    def elapsed(self):
+        return (self._glob_end - self._glob_start).total_seconds()
+
+    def fps(self):
+        return self._glob_numFrames / self.elapsed()
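
The FPS helper is self-contained, so it can be exercised outside of the detection loop; a minimal usage sketch (the sleep just stands in for per-frame work):

import time
from stuff.helper import FPS

fps = FPS(2).start()          # print a rolling FPS value roughly every 2 seconds
for _ in range(100):
    time.sleep(0.01)          # stand-in for grabbing and processing one frame
    fps.update()
fps.stop()
print('elapsed: {:.2f}s, overall FPS: {:.2f}'.format(fps.elapsed(), fps.fps()))

Note that elapsed() and fps() are only meaningful after stop(), since they rely on the recorded global end time.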

stuff/input.py

Lines changed: 41 additions & 0 deletions
@@ -0,0 +1,41 @@
+import numpy as np
+from Xlib import display, X
+from PIL import Image
+import cv2
+
+
+class ScreenInput:
+    def __init__(self, startX, startY, endX, endY):
+        self.root = display.Display().screen().root
+        self.reso = self.root.get_geometry()
+
+        self.startX = startX
+        self.startY = startY
+        self.width = endX-startX
+        self.height = endY-startY
+
+    def isActive(self):
+        return True
+
+    def getImage(self):
+        raw = self.root.get_image(self.startX, self.startY, self.width, self.height, X.ZPixmap, 0xffffffff)
+        image = Image.frombytes("RGB", (self.width, self.height), raw.data, "raw", "RGBX")
+        image_np = np.array(image);
+        return True, image_np
+
+    def cleanup(self):
+        pass
+
+
+class VideoInput:
+    def __init__(self, input):
+        self.cap = cv2.VideoCapture(input)
+
+    def isActive(self):
+        return self.cap.isOpened()
+
+    def getImage(self):
+        return self.cap.read()
+
+    def cleanup(self):
+        self.cap.release()
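
ScreenInput and VideoInput expose the same duck-typed interface (isActive, getImage, cleanup), so calling code never has to know which source it is reading from. A hypothetical helper to illustrate this:

from stuff.input import ScreenInput, VideoInput

def grab_frames(source, n=5):
    # Collect up to n frames from any input object that follows the interface above.
    frames = []
    while source.isActive() and len(frames) < n:
        ret, img = source.getImage()
        if not ret:
            break
        frames.append(img)
    source.cleanup()
    return frames

# frames = grab_frames(VideoInput(0))                 # default webcam
# frames = grab_frames(ScreenInput(0, 0, 800, 600))   # 800x600 region at the top-left corner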
