Merge branch 'bit-masked-integers' into develop

whitews · May 17, 2023 · 344b32b · 344b32b
2 parents ad1ad74 + e481694
commit 344b32b
Show file tree

Hide file tree

Showing 4 changed files with 120 additions and 26 deletions.
diff --git a/flowio/_version.py b/flowio/_version.py
@@ -1,4 +1,4 @@
 """
 FlowIO version
 """
-__version__ = "1.2.0"
+__version__ = "1.2.1b0"
diff --git a/flowio/flowdata.py b/flowio/flowdata.py
@@ -16,6 +16,13 @@
     basestring = str
 
 
+def _next_power_of_2(x):
+    if x == 0:
+        return 1
+    else:
+        return 2 ** (x - 1).bit_length()
+
+
 class FlowData(object):
     """
     Object representing a Flow Cytometry Standard (FCS) file.
@@ -101,7 +108,10 @@ def __init__(
         )
 
         if int(self.text.get("nextdata", "0")) != 0 and nextdata_offset is None:
-            raise MultipleDataSetsError()
+            self._fh.close()
+            raise MultipleDataSetsError(
+                "%s contains multiple data sets, use read_multiple_data_sets function" % self.name
+            )
 
         self.channel_count = int(self.text['par'])
         self.event_count = int(self.text['tot'])
@@ -307,16 +317,27 @@ def __parse_data(self, offset, start, stop, text):
             order = '@'
             # from here on out we assume mode "l" (list)
 
-        bit_width = []
-        for i in range(1, int(text['par']) + 1):
-            bit_width.append(int(text['p%db' % i]))
-
         if data_type.lower() == 'i':
+            # For int data we need to check the bit width and range values.
+            # The PnR value specifies the max value for the channel. This
+            # value is exclusive, e.g. a value of 1024 means the highest
+            # integer value allowed is 1023. Integer data needs to be
+            # bit-masked according to this max range value.
+            bit_width_by_channel = {}
+            max_range_by_channel = {}
+            for i in range(1, int(text['par']) + 1):
+                bit_width_by_channel[i] = int(text['p%db' % i])
+
+                # Need to verify the value is a power of 2
+                tmp_max_range = int(text['p%dr' % i])
+                max_range_by_channel[i] = _next_power_of_2(tmp_max_range)
+
             data = self.__parse_int_data(
                 offset,
                 start,
                 stop,
-                bit_width,
+                bit_width_by_channel,
+                max_range_by_channel,
                 order
             )
         else:
@@ -368,42 +389,89 @@ def __calc_data_item_count(self, start, stop, data_type_size):
 
         return num_items, stop
 
-    def __parse_int_data(self, offset, start, stop, bit_width, order):
+    def __parse_int_data(
+            self,
+            offset,
+            start,
+            stop,
+            bit_width_lut,
+            max_range_lut,
+            order
+    ):
         """Parse out and return integer list data from FCS file"""
 
-        if reduce(and_, [item in [8, 16, 32] for item in bit_width]):
-            # We have a uniform bit width for all parameters,
-            # use the first value to determine the number of actual events
-            if len(set(bit_width)) == 1:
-                data_type_size = bit_width[0] / 8
+        if reduce(and_, [item in [8, 16, 32] for item in bit_width_lut.values()]):
+            # Determine if we have uniform bit width values for all parameters.
+            # If so, use array.array for much faster parsing
+            if len(set(bit_width_lut.values())) == 1:
+                # We do have a uniform bit width, grab the 1st value to
+                # determine the number of actual events
+                bit_width = list(bit_width_lut.values())[0]
+                data_type_size = bit_width / 8
                 num_items, stop = self.__calc_data_item_count(start, stop, data_type_size)
 
+                # Here, we're reading the initial data array, but some channel
+                # data may still need bit-masking correction using max range
                 self._fh.seek(offset + start)
-                tmp = array.array(self.__format_integer(bit_width[0]))
+                tmp = array.array(self.__format_integer(bit_width))
                 tmp.fromfile(self._fh, int(num_items))
                 if order == '>':
                     tmp.byteswap()
 
-            # parameter sizes are different
-            # e.g. 8, 8, 16, 8, 32 ...
+                # If any bits higher shall be
+                # ignored using a bit mask. If the PnR value is not a power
+                # of 2, then the next power of 2 shall be used.
+                if any(2 ** bit_width_lut[c] > max_range_lut[c] for
+                       c in bit_width_lut.keys()) :
+
+                    amount_data_points = int(num_items / len(max_range_lut))
+
+                    # Create bit mask array matching length of our data array,
+                    # with values for every position being the max range value.
+                    bit_mask = array.array(
+                        self.__format_integer(bit_width),
+                        [mr - 1 for mr in max_range_lut.values()] * amount_data_points
+                    )
+                    new_tmp = array.array(self.__format_integer(bit_width))
+                    new_tmp.frombytes(bytes(map(lambda a,b: a&b, tmp.tobytes(), bit_mask.tobytes())))
+                    tmp = new_tmp
             else:
+                # parameter sizes are different
+                # e.g. 8, 8, 16, 8, 32 ...
                 # can't use array for heterogeneous bit widths
-                tmp = self.__extract_var_length_int(bit_width, offset, order, start, stop)
+                tmp = self.__extract_var_length_int(
+                    bit_width_lut,
+                    max_range_lut,
+                    offset,
+                    order,
+                    start,
+                    stop
+                )
 
         else:  # non standard bit width...  Does this happen?
             warn('Non-standard bit width for data segments')
             return None
+
         return tmp
 
-    def __extract_var_length_int(self, bit_width, offset, order, start, stop):
+    def __extract_var_length_int(self, bit_width_by_channel, max_range_by_channel, 
+                                 offset, order, start, stop):
         data_format = order
-        for cur_width in bit_width:
+        for cur_width in bit_width_by_channel.values():
             data_format += '%s' % self.__format_integer(cur_width)
 
         # array module doesn't have a function to heterogeneous bit widths,
         # so fall back to the slower unpack approach
         tuple_tmp = iter_unpack(data_format, self.__read_bytes(offset, start, stop))
-        tmp = [ti for t in tuple_tmp for ti in t]
+        if any(2**bit_width_by_channel[c] > max_range_by_channel[c] for 
+               c in bit_width_by_channel.keys()) :
+            tmp = []
+            for data_tuple in tuple_tmp:
+                for channel, max_range in max_range_by_channel.items():
+                    tmp.append(data_tuple[channel-1] % max_range)
+        else:
+            tmp = [ti for t in tuple_tmp for ti in t]
+
         return tmp
 
     def __parse_non_int_data(self, offset, start, stop, data_type, order):

diff --git a/flowio/tests/flowdata_lmd_tests.py b/flowio/tests/flowdata_lmd_tests.py
@@ -1,17 +1,17 @@
 import unittest
 import warnings
-from flowio import FlowData
+import array
+from flowio import FlowData, read_multiple_data_sets
 from flowio.exceptions import FCSParsingError
 
 
 class FlowDataLMDTestCase(unittest.TestCase):
     def setUp(self):
         with warnings.catch_warnings():
             warnings.simplefilter('ignore')
-            self.flow_data = FlowData(
+            self.flow_data, self.fcs3_data = read_multiple_data_sets(
                 'examples/fcs_files/coulter.lmd',
-                ignore_offset_error=True,
-                nextdata_offset=0
+                ignore_offset_error=True
             )
 
     def test_event_count(self):
@@ -26,3 +26,11 @@ def test_get_text(self):
     def test_fail_data_offset_error(self):
         with self.assertRaises(FCSParsingError):     
             FlowData('examples/fcs_files/coulter.lmd', nextdata_offset=0)
+
+    def test_right_integer_reading(self):
+        self.assertEqual(
+            self.fcs3_data.events[:24], 
+            array.array("I", [61056, 131840, 46, 324, 10309, 104, 11912, 0,
+                            257280, 378656, 139, 1728, 58688, 354, 58720, 0,
+                            164128, 305376, 159, 924, 29024, 208, 29728, 0])
+        )
diff --git a/flowio/tests/flowdata_tests.py b/flowio/tests/flowdata_tests.py
@@ -65,12 +65,30 @@ def test_parse_var_int_data(self):
         event_values = [
             49135, 61373, 48575, 49135, 61373, 48575, 7523, 598, 49135, 61373,
             48575, 49135, 61373, 48575, 28182, 61200, 48575, 49135, 32445, 30797,
-            19057, 49135, 61373, 48575, 5969, 142482809,
+            19057, 49135, 61373, 48575, 5969, 8265081,
             61266, 48575, 49135, 20925, 61265, 48575, 27961, 25200, 61287, 48575, 9795,
             49135, 29117, 49135, 61373, 48575, 61228, 48575, 22, 21760, 49135,
-            20413, 49135, 23997, 19807, 3220139858
+            20413, 49135, 23997, 19807, 15691602
         ]
 
+        # To double-check our logic, let's use the 2 values from
+        # channel 26 where:
+        # PnB is 32
+        # PnR is 11209599
+        # The 1st value for chan 26, 32-bit, unmasked:  142482809
+        # The 2nd value for chan 26, 32-bit, unmasked: 3220139858
+        # The next power of 2 above 11209599 (PnR) is:   16777216
+        # Subtracting 1 from this power of 2, we can see what the
+        # new values should be from the binary:
+        #   1st value:
+        #   0000 1000 0111 1110 0001 1101 0111 1001     142482809 (orig 32-bit value)
+        #   0000 0000 1111 1111 1111 1111 1111 1111      16777215 (2 ** 24 - 1)
+        #   0000 0000 0111 1110 0001 1101 0111 1001       8265081 (new value)
+        #   2nd value:
+        #   1011 1111 1110 1111 0110 1111 0101 0010    3220139858 (orig 32-bit value)
+        #   0000 0000 1111 1111 1111 1111 1111 1111      16777215 (2 ** 24 - 1)
+        #   0000 0000 1110 1111 0110 1111 0101 0010      15691602 (new value)
+
         fcs_file = "examples/fcs_files/variable_int_example.fcs"
         sample = FlowData(fcs_file)