diff --git a/flowio/_version.py b/flowio/_version.py index ad194f9..1f5be26 100644 --- a/flowio/_version.py +++ b/flowio/_version.py @@ -1,4 +1,4 @@ """ FlowIO version """ -__version__ = "1.2.0" +__version__ = "1.2.1b0" diff --git a/flowio/flowdata.py b/flowio/flowdata.py index 60b7ec2..ab9945d 100644 --- a/flowio/flowdata.py +++ b/flowio/flowdata.py @@ -16,6 +16,13 @@ basestring = str +def _next_power_of_2(x): + if x == 0: + return 1 + else: + return 2 ** (x - 1).bit_length() + + class FlowData(object): """ Object representing a Flow Cytometry Standard (FCS) file. @@ -101,7 +108,10 @@ def __init__( ) if int(self.text.get("nextdata", "0")) != 0 and nextdata_offset is None: - raise MultipleDataSetsError() + self._fh.close() + raise MultipleDataSetsError( + "%s contains multiple data sets, use read_multiple_data_sets function" % self.name + ) self.channel_count = int(self.text['par']) self.event_count = int(self.text['tot']) @@ -307,16 +317,27 @@ def __parse_data(self, offset, start, stop, text): order = '@' # from here on out we assume mode "l" (list) - bit_width = [] - for i in range(1, int(text['par']) + 1): - bit_width.append(int(text['p%db' % i])) - if data_type.lower() == 'i': + # For int data we need to check the bit width and range values. + # The PnR value specifies the max value for the channel. This + # value is exclusive, e.g. a value of 1024 means the highest + # integer value allowed is 1023. Integer data needs to be + # bit-masked according to this max range value. + bit_width_by_channel = {} + max_range_by_channel = {} + for i in range(1, int(text['par']) + 1): + bit_width_by_channel[i] = int(text['p%db' % i]) + + # Need to verify the value is a power of 2 + tmp_max_range = int(text['p%dr' % i]) + max_range_by_channel[i] = _next_power_of_2(tmp_max_range) + data = self.__parse_int_data( offset, start, stop, - bit_width, + bit_width_by_channel, + max_range_by_channel, order ) else: @@ -368,42 +389,89 @@ def __calc_data_item_count(self, start, stop, data_type_size): return num_items, stop - def __parse_int_data(self, offset, start, stop, bit_width, order): + def __parse_int_data( + self, + offset, + start, + stop, + bit_width_lut, + max_range_lut, + order + ): """Parse out and return integer list data from FCS file""" - if reduce(and_, [item in [8, 16, 32] for item in bit_width]): - # We have a uniform bit width for all parameters, - # use the first value to determine the number of actual events - if len(set(bit_width)) == 1: - data_type_size = bit_width[0] / 8 + if reduce(and_, [item in [8, 16, 32] for item in bit_width_lut.values()]): + # Determine if we have uniform bit width values for all parameters. + # If so, use array.array for much faster parsing + if len(set(bit_width_lut.values())) == 1: + # We do have a uniform bit width, grab the 1st value to + # determine the number of actual events + bit_width = list(bit_width_lut.values())[0] + data_type_size = bit_width / 8 num_items, stop = self.__calc_data_item_count(start, stop, data_type_size) + # Here, we're reading the initial data array, but some channel + # data may still need bit-masking correction using max range self._fh.seek(offset + start) - tmp = array.array(self.__format_integer(bit_width[0])) + tmp = array.array(self.__format_integer(bit_width)) tmp.fromfile(self._fh, int(num_items)) if order == '>': tmp.byteswap() - # parameter sizes are different - # e.g. 8, 8, 16, 8, 32 ... + # If any bits higher shall be + # ignored using a bit mask. If the PnR value is not a power + # of 2, then the next power of 2 shall be used. + if any(2 ** bit_width_lut[c] > max_range_lut[c] for + c in bit_width_lut.keys()) : + + amount_data_points = int(num_items / len(max_range_lut)) + + # Create bit mask array matching length of our data array, + # with values for every position being the max range value. + bit_mask = array.array( + self.__format_integer(bit_width), + [mr - 1 for mr in max_range_lut.values()] * amount_data_points + ) + new_tmp = array.array(self.__format_integer(bit_width)) + new_tmp.frombytes(bytes(map(lambda a,b: a&b, tmp.tobytes(), bit_mask.tobytes()))) + tmp = new_tmp else: + # parameter sizes are different + # e.g. 8, 8, 16, 8, 32 ... # can't use array for heterogeneous bit widths - tmp = self.__extract_var_length_int(bit_width, offset, order, start, stop) + tmp = self.__extract_var_length_int( + bit_width_lut, + max_range_lut, + offset, + order, + start, + stop + ) else: # non standard bit width... Does this happen? warn('Non-standard bit width for data segments') return None + return tmp - def __extract_var_length_int(self, bit_width, offset, order, start, stop): + def __extract_var_length_int(self, bit_width_by_channel, max_range_by_channel, + offset, order, start, stop): data_format = order - for cur_width in bit_width: + for cur_width in bit_width_by_channel.values(): data_format += '%s' % self.__format_integer(cur_width) # array module doesn't have a function to heterogeneous bit widths, # so fall back to the slower unpack approach tuple_tmp = iter_unpack(data_format, self.__read_bytes(offset, start, stop)) - tmp = [ti for t in tuple_tmp for ti in t] + if any(2**bit_width_by_channel[c] > max_range_by_channel[c] for + c in bit_width_by_channel.keys()) : + tmp = [] + for data_tuple in tuple_tmp: + for channel, max_range in max_range_by_channel.items(): + tmp.append(data_tuple[channel-1] % max_range) + else: + tmp = [ti for t in tuple_tmp for ti in t] + return tmp def __parse_non_int_data(self, offset, start, stop, data_type, order): diff --git a/flowio/tests/flowdata_lmd_tests.py b/flowio/tests/flowdata_lmd_tests.py index a1393e7..24c512a 100644 --- a/flowio/tests/flowdata_lmd_tests.py +++ b/flowio/tests/flowdata_lmd_tests.py @@ -1,6 +1,7 @@ import unittest import warnings -from flowio import FlowData +import array +from flowio import FlowData, read_multiple_data_sets from flowio.exceptions import FCSParsingError @@ -8,10 +9,9 @@ class FlowDataLMDTestCase(unittest.TestCase): def setUp(self): with warnings.catch_warnings(): warnings.simplefilter('ignore') - self.flow_data = FlowData( + self.flow_data, self.fcs3_data = read_multiple_data_sets( 'examples/fcs_files/coulter.lmd', - ignore_offset_error=True, - nextdata_offset=0 + ignore_offset_error=True ) def test_event_count(self): @@ -26,3 +26,11 @@ def test_get_text(self): def test_fail_data_offset_error(self): with self.assertRaises(FCSParsingError): FlowData('examples/fcs_files/coulter.lmd', nextdata_offset=0) + + def test_right_integer_reading(self): + self.assertEqual( + self.fcs3_data.events[:24], + array.array("I", [61056, 131840, 46, 324, 10309, 104, 11912, 0, + 257280, 378656, 139, 1728, 58688, 354, 58720, 0, + 164128, 305376, 159, 924, 29024, 208, 29728, 0]) + ) \ No newline at end of file diff --git a/flowio/tests/flowdata_tests.py b/flowio/tests/flowdata_tests.py index f22902e..97dffce 100644 --- a/flowio/tests/flowdata_tests.py +++ b/flowio/tests/flowdata_tests.py @@ -65,12 +65,30 @@ def test_parse_var_int_data(self): event_values = [ 49135, 61373, 48575, 49135, 61373, 48575, 7523, 598, 49135, 61373, 48575, 49135, 61373, 48575, 28182, 61200, 48575, 49135, 32445, 30797, - 19057, 49135, 61373, 48575, 5969, 142482809, + 19057, 49135, 61373, 48575, 5969, 8265081, 61266, 48575, 49135, 20925, 61265, 48575, 27961, 25200, 61287, 48575, 9795, 49135, 29117, 49135, 61373, 48575, 61228, 48575, 22, 21760, 49135, - 20413, 49135, 23997, 19807, 3220139858 + 20413, 49135, 23997, 19807, 15691602 ] + # To double-check our logic, let's use the 2 values from + # channel 26 where: + # PnB is 32 + # PnR is 11209599 + # The 1st value for chan 26, 32-bit, unmasked: 142482809 + # The 2nd value for chan 26, 32-bit, unmasked: 3220139858 + # The next power of 2 above 11209599 (PnR) is: 16777216 + # Subtracting 1 from this power of 2, we can see what the + # new values should be from the binary: + # 1st value: + # 0000 1000 0111 1110 0001 1101 0111 1001 142482809 (orig 32-bit value) + # 0000 0000 1111 1111 1111 1111 1111 1111 16777215 (2 ** 24 - 1) + # 0000 0000 0111 1110 0001 1101 0111 1001 8265081 (new value) + # 2nd value: + # 1011 1111 1110 1111 0110 1111 0101 0010 3220139858 (orig 32-bit value) + # 0000 0000 1111 1111 1111 1111 1111 1111 16777215 (2 ** 24 - 1) + # 0000 0000 1110 1111 0110 1111 0101 0010 15691602 (new value) + fcs_file = "examples/fcs_files/variable_int_example.fcs" sample = FlowData(fcs_file)