From 29294fe33fc302ab4f030cae87a80c87d9b7089c Mon Sep 17 00:00:00 2001 From: Maxmilian Bertsch Date: Mon, 15 May 2023 15:12:48 +0200 Subject: [PATCH 01/10] use bitmask to only read meaning full bits for integer values --- flowio/flowdata.py | 57 +++++++++++++++++++++++++++++++++++----------- 1 file changed, 44 insertions(+), 13 deletions(-) diff --git a/flowio/flowdata.py b/flowio/flowdata.py index 60b7ec2..6b6e2ee 100644 --- a/flowio/flowdata.py +++ b/flowio/flowdata.py @@ -4,6 +4,7 @@ from warnings import warn import os import re +import math from functools import reduce from .create_fcs import create_fcs from .exceptions import FCSParsingError, DataOffsetDiscrepancyError, MultipleDataSetsError @@ -307,16 +308,19 @@ def __parse_data(self, offset, start, stop, text): order = '@' # from here on out we assume mode "l" (list) - bit_width = [] + bit_width_by_channel = {} + max_range_by_channel = {} for i in range(1, int(text['par']) + 1): - bit_width.append(int(text['p%db' % i])) + bit_width_by_channel[i] = int(text['p%db' % i]) + max_range_by_channel[i] = int(text['p%dr' % i]) if data_type.lower() == 'i': data = self.__parse_int_data( offset, start, stop, - bit_width, + bit_width_by_channel, + max_range_by_channel, order ) else: @@ -368,42 +372,69 @@ def __calc_data_item_count(self, start, stop, data_type_size): return num_items, stop - def __parse_int_data(self, offset, start, stop, bit_width, order): + def __parse_int_data(self, offset, start, stop, bit_width_by_channel, + max_range_by_channel, order): """Parse out and return integer list data from FCS file""" - if reduce(and_, [item in [8, 16, 32] for item in bit_width]): + if reduce(and_, [item in [8, 16, 32] for item in bit_width_by_channel.values()]): # We have a uniform bit width for all parameters, # use the first value to determine the number of actual events - if len(set(bit_width)) == 1: - data_type_size = bit_width[0] / 8 + if len(set(bit_width_by_channel.values())) == 1: + bit_width = list(bit_width_by_channel.values())[0] + data_type_size = bit_width / 8 num_items, stop = self.__calc_data_item_count(start, stop, data_type_size) self._fh.seek(offset + start) - tmp = array.array(self.__format_integer(bit_width[0])) + tmp = array.array(self.__format_integer(bit_width)) tmp.fromfile(self._fh, int(num_items)) if order == '>': tmp.byteswap() - + # acording to the FCS standard the PnR value of Integer values + # determines how many bit of the max bit_width are actually used + # for the data + if any(2**bit_width_by_channel[c] > max_range_by_channel[c] for + c in bit_width_by_channel.keys()) : + # for i in range(len(tmp)): + # value = tmp.pop(0) + # channel = i % len(bit_width_by_channel) + 1 + # tmp.append(value % max_range_by_channel[channel]) + amount_data_points = int(num_items / len(max_range_by_channel)) + #create bit mask for extracting the right amount of bits + bit_mask = array.array(self.__format_integer(bit_width), + [mr -1 for mr in max_range_by_channel.values()]*amount_data_points) + new_tmp = array.array(self.__format_integer(bit_width)) + new_tmp.frombytes(bytes(map(lambda a,b: a&b, tmp.tobytes(), bit_mask.tobytes()))) + tmp = new_tmp # parameter sizes are different # e.g. 8, 8, 16, 8, 32 ... else: # can't use array for heterogeneous bit widths - tmp = self.__extract_var_length_int(bit_width, offset, order, start, stop) + tmp = self.__extract_var_length_int(bit_width_by_channel, max_range_by_channel, + offset, order, start, stop) else: # non standard bit width... Does this happen? warn('Non-standard bit width for data segments') return None return tmp - def __extract_var_length_int(self, bit_width, offset, order, start, stop): + def __extract_var_length_int(self, bit_width_by_channel, max_range_by_channel, + offset, order, start, stop): data_format = order - for cur_width in bit_width: + for cur_width in bit_width_by_channel.values(): data_format += '%s' % self.__format_integer(cur_width) # array module doesn't have a function to heterogeneous bit widths, # so fall back to the slower unpack approach tuple_tmp = iter_unpack(data_format, self.__read_bytes(offset, start, stop)) - tmp = [ti for t in tuple_tmp for ti in t] + if any(2**bit_width_by_channel[c] > max_range_by_channel[c] for + c in bit_width_by_channel.keys()) : + tmp = [] + for data_tuple in tuple_tmp: + for channel, max_range in max_range_by_channel.items(): + tmp.append(data_tuple[channel-1] % max_range) + else: + tmp = [ti for t in tuple_tmp for ti in t] + return tmp def __parse_non_int_data(self, offset, start, stop, data_type, order): From 50caf6ad6877312b6c9e698c0458665b45281bd0 Mon Sep 17 00:00:00 2001 From: Maxmilian Bertsch Date: Mon, 15 May 2023 15:13:01 +0200 Subject: [PATCH 02/10] add test for fcs3.0 data of lmd file --- flowio/tests/flowdata_lmd_tests.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/flowio/tests/flowdata_lmd_tests.py b/flowio/tests/flowdata_lmd_tests.py index a1393e7..24c512a 100644 --- a/flowio/tests/flowdata_lmd_tests.py +++ b/flowio/tests/flowdata_lmd_tests.py @@ -1,6 +1,7 @@ import unittest import warnings -from flowio import FlowData +import array +from flowio import FlowData, read_multiple_data_sets from flowio.exceptions import FCSParsingError @@ -8,10 +9,9 @@ class FlowDataLMDTestCase(unittest.TestCase): def setUp(self): with warnings.catch_warnings(): warnings.simplefilter('ignore') - self.flow_data = FlowData( + self.flow_data, self.fcs3_data = read_multiple_data_sets( 'examples/fcs_files/coulter.lmd', - ignore_offset_error=True, - nextdata_offset=0 + ignore_offset_error=True ) def test_event_count(self): @@ -26,3 +26,11 @@ def test_get_text(self): def test_fail_data_offset_error(self): with self.assertRaises(FCSParsingError): FlowData('examples/fcs_files/coulter.lmd', nextdata_offset=0) + + def test_right_integer_reading(self): + self.assertEqual( + self.fcs3_data.events[:24], + array.array("I", [61056, 131840, 46, 324, 10309, 104, 11912, 0, + 257280, 378656, 139, 1728, 58688, 354, 58720, 0, + 164128, 305376, 159, 924, 29024, 208, 29728, 0]) + ) \ No newline at end of file From 9bb1f86f63ed3fe5ffe53bc8323e703a541f0566 Mon Sep 17 00:00:00 2001 From: Maxmilian Bertsch Date: Mon, 15 May 2023 15:13:19 +0200 Subject: [PATCH 03/10] dirty fix for var int file test --- flowio/tests/flowdata_tests.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/flowio/tests/flowdata_tests.py b/flowio/tests/flowdata_tests.py index f22902e..a46aac5 100644 --- a/flowio/tests/flowdata_tests.py +++ b/flowio/tests/flowdata_tests.py @@ -65,10 +65,10 @@ def test_parse_var_int_data(self): event_values = [ 49135, 61373, 48575, 49135, 61373, 48575, 7523, 598, 49135, 61373, 48575, 49135, 61373, 48575, 28182, 61200, 48575, 49135, 32445, 30797, - 19057, 49135, 61373, 48575, 5969, 142482809, + 19057, 49135, 61373, 48575, 5969, 7967621, 61266, 48575, 49135, 20925, 61265, 48575, 27961, 25200, 61287, 48575, 9795, 49135, 29117, 49135, 61373, 48575, 61228, 48575, 22, 21760, 49135, - 20413, 49135, 23997, 19807, 3220139858 + 20413, 49135, 23997, 19807, 2984945 ] fcs_file = "examples/fcs_files/variable_int_example.fcs" From efd4727a171c9666d003195d39566298ab9fd46b Mon Sep 17 00:00:00 2001 From: whitews Date: Tue, 16 May 2023 17:35:07 -0400 Subject: [PATCH 04/10] bump version to beta patch v1.2.1b0 --- flowio/_version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/flowio/_version.py b/flowio/_version.py index ad194f9..1f5be26 100644 --- a/flowio/_version.py +++ b/flowio/_version.py @@ -1,4 +1,4 @@ """ FlowIO version """ -__version__ = "1.2.0" +__version__ = "1.2.1b0" From e003e32d7abf21aa5d60723e71b521920351bce4 Mon Sep 17 00:00:00 2001 From: whitews Date: Tue, 16 May 2023 17:36:33 -0400 Subject: [PATCH 05/10] remove unused import --- flowio/flowdata.py | 1 - 1 file changed, 1 deletion(-) diff --git a/flowio/flowdata.py b/flowio/flowdata.py index 6b6e2ee..798096a 100644 --- a/flowio/flowdata.py +++ b/flowio/flowdata.py @@ -4,7 +4,6 @@ from warnings import warn import os import re -import math from functools import reduce from .create_fcs import create_fcs from .exceptions import FCSParsingError, DataOffsetDiscrepancyError, MultipleDataSetsError From fca169eccf6ce22a39e198fdb8b26fb04cfaca7c Mon Sep 17 00:00:00 2001 From: whitews Date: Tue, 16 May 2023 17:45:35 -0400 Subject: [PATCH 06/10] update comments & minor reformatting --- flowio/flowdata.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/flowio/flowdata.py b/flowio/flowdata.py index 798096a..b0b9679 100644 --- a/flowio/flowdata.py +++ b/flowio/flowdata.py @@ -383,6 +383,8 @@ def __parse_int_data(self, offset, start, stop, bit_width_by_channel, data_type_size = bit_width / 8 num_items, stop = self.__calc_data_item_count(start, stop, data_type_size) + # Here, we're reading the initial data array, but some channel + # data may still need bit-masking correction using max range self._fh.seek(offset + start) tmp = array.array(self.__format_integer(bit_width)) tmp.fromfile(self._fh, int(num_items)) @@ -404,9 +406,9 @@ def __parse_int_data(self, offset, start, stop, bit_width_by_channel, new_tmp = array.array(self.__format_integer(bit_width)) new_tmp.frombytes(bytes(map(lambda a,b: a&b, tmp.tobytes(), bit_mask.tobytes()))) tmp = new_tmp - # parameter sizes are different - # e.g. 8, 8, 16, 8, 32 ... else: + # parameter sizes are different + # e.g. 8, 8, 16, 8, 32 ... # can't use array for heterogeneous bit widths tmp = self.__extract_var_length_int(bit_width_by_channel, max_range_by_channel, offset, order, start, stop) @@ -414,6 +416,7 @@ def __parse_int_data(self, offset, start, stop, bit_width_by_channel, else: # non standard bit width... Does this happen? warn('Non-standard bit width for data segments') return None + return tmp def __extract_var_length_int(self, bit_width_by_channel, max_range_by_channel, From 8b6721ac18476858f952ac731023e649178c866d Mon Sep 17 00:00:00 2001 From: whitews Date: Tue, 16 May 2023 17:48:11 -0400 Subject: [PATCH 07/10] refactor a few vars w/shorter names, remove commented code, comments & PEP8 changes --- flowio/flowdata.py | 61 +++++++++++++++++++++++++++++----------------- 1 file changed, 39 insertions(+), 22 deletions(-) diff --git a/flowio/flowdata.py b/flowio/flowdata.py index b0b9679..5259e7d 100644 --- a/flowio/flowdata.py +++ b/flowio/flowdata.py @@ -371,15 +371,24 @@ def __calc_data_item_count(self, start, stop, data_type_size): return num_items, stop - def __parse_int_data(self, offset, start, stop, bit_width_by_channel, - max_range_by_channel, order): + def __parse_int_data( + self, + offset, + start, + stop, + bit_width_lut, + max_range_lut, + order + ): """Parse out and return integer list data from FCS file""" - if reduce(and_, [item in [8, 16, 32] for item in bit_width_by_channel.values()]): - # We have a uniform bit width for all parameters, - # use the first value to determine the number of actual events - if len(set(bit_width_by_channel.values())) == 1: - bit_width = list(bit_width_by_channel.values())[0] + if reduce(and_, [item in [8, 16, 32] for item in bit_width_lut.values()]): + # Determine if we have uniform bit width values for all parameters. + # If so, use array.array for much faster parsing + if len(set(bit_width_lut.values())) == 1: + # We do have a uniform bit width, grab the 1st value to + # determine the number of actual events + bit_width = list(bit_width_lut.values())[0] data_type_size = bit_width / 8 num_items, stop = self.__calc_data_item_count(start, stop, data_type_size) @@ -390,19 +399,21 @@ def __parse_int_data(self, offset, start, stop, bit_width_by_channel, tmp.fromfile(self._fh, int(num_items)) if order == '>': tmp.byteswap() - # acording to the FCS standard the PnR value of Integer values - # determines how many bit of the max bit_width are actually used - # for the data - if any(2**bit_width_by_channel[c] > max_range_by_channel[c] for - c in bit_width_by_channel.keys()) : - # for i in range(len(tmp)): - # value = tmp.pop(0) - # channel = i % len(bit_width_by_channel) + 1 - # tmp.append(value % max_range_by_channel[channel]) - amount_data_points = int(num_items / len(max_range_by_channel)) - #create bit mask for extracting the right amount of bits - bit_mask = array.array(self.__format_integer(bit_width), - [mr -1 for mr in max_range_by_channel.values()]*amount_data_points) + + # If any bits higher shall be + # ignored using a bit mask. If the PnR value is not a power + # of 2, then the next power of 2 shall be used. + if any(2 ** bit_width_lut[c] > max_range_lut[c] for + c in bit_width_lut.keys()) : + + amount_data_points = int(num_items / len(max_range_lut)) + + # Create bit mask array matching length of our data array, + # with values for every position being the max range value. + bit_mask = array.array( + self.__format_integer(bit_width), + [mr - 1 for mr in max_range_lut.values()] * amount_data_points + ) new_tmp = array.array(self.__format_integer(bit_width)) new_tmp.frombytes(bytes(map(lambda a,b: a&b, tmp.tobytes(), bit_mask.tobytes()))) tmp = new_tmp @@ -410,8 +421,14 @@ def __parse_int_data(self, offset, start, stop, bit_width_by_channel, # parameter sizes are different # e.g. 8, 8, 16, 8, 32 ... # can't use array for heterogeneous bit widths - tmp = self.__extract_var_length_int(bit_width_by_channel, max_range_by_channel, - offset, order, start, stop) + tmp = self.__extract_var_length_int( + bit_width_lut, + max_range_lut, + offset, + order, + start, + stop + ) else: # non standard bit width... Does this happen? warn('Non-standard bit width for data segments') From 9d2754df8762dc6d8cb4cac7580856da4b63107d Mon Sep 17 00:00:00 2001 From: whitews Date: Tue, 16 May 2023 17:52:30 -0400 Subject: [PATCH 08/10] only do bit width/range collection for int data type, calc next power of 2 for PnR values --- flowio/flowdata.py | 27 +++++++++++++++++++++------ 1 file changed, 21 insertions(+), 6 deletions(-) diff --git a/flowio/flowdata.py b/flowio/flowdata.py index 5259e7d..c782900 100644 --- a/flowio/flowdata.py +++ b/flowio/flowdata.py @@ -16,6 +16,13 @@ basestring = str +def _next_power_of_2(x): + if x == 0: + return 1 + else: + return 2 ** (x - 1).bit_length() + + class FlowData(object): """ Object representing a Flow Cytometry Standard (FCS) file. @@ -307,13 +314,21 @@ def __parse_data(self, offset, start, stop, text): order = '@' # from here on out we assume mode "l" (list) - bit_width_by_channel = {} - max_range_by_channel = {} - for i in range(1, int(text['par']) + 1): - bit_width_by_channel[i] = int(text['p%db' % i]) - max_range_by_channel[i] = int(text['p%dr' % i]) - if data_type.lower() == 'i': + # For int data we need to check the bit width and range values. + # The PnR value specifies the max value for the channel. This + # value is exclusive, e.g. a value of 1024 means the highest + # integer value allowed is 1023. Integer data needs to be + # bit-masked according to this max range value. + bit_width_by_channel = {} + max_range_by_channel = {} + for i in range(1, int(text['par']) + 1): + bit_width_by_channel[i] = int(text['p%db' % i]) + + # Need to verify the value is a power of 2 + tmp_max_range = int(text['p%dr' % i]) + max_range_by_channel[i] = _next_power_of_2(tmp_max_range) + data = self.__parse_int_data( offset, start, From 89869be9308bb31c0c287b8b4c747aea032d100f Mon Sep 17 00:00:00 2001 From: whitews Date: Tue, 16 May 2023 17:55:08 -0400 Subject: [PATCH 09/10] update test for int data type w/correct bit-masked values & extensive comment explanation --- flowio/tests/flowdata_tests.py | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/flowio/tests/flowdata_tests.py b/flowio/tests/flowdata_tests.py index a46aac5..97dffce 100644 --- a/flowio/tests/flowdata_tests.py +++ b/flowio/tests/flowdata_tests.py @@ -65,12 +65,30 @@ def test_parse_var_int_data(self): event_values = [ 49135, 61373, 48575, 49135, 61373, 48575, 7523, 598, 49135, 61373, 48575, 49135, 61373, 48575, 28182, 61200, 48575, 49135, 32445, 30797, - 19057, 49135, 61373, 48575, 5969, 7967621, + 19057, 49135, 61373, 48575, 5969, 8265081, 61266, 48575, 49135, 20925, 61265, 48575, 27961, 25200, 61287, 48575, 9795, 49135, 29117, 49135, 61373, 48575, 61228, 48575, 22, 21760, 49135, - 20413, 49135, 23997, 19807, 2984945 + 20413, 49135, 23997, 19807, 15691602 ] + # To double-check our logic, let's use the 2 values from + # channel 26 where: + # PnB is 32 + # PnR is 11209599 + # The 1st value for chan 26, 32-bit, unmasked: 142482809 + # The 2nd value for chan 26, 32-bit, unmasked: 3220139858 + # The next power of 2 above 11209599 (PnR) is: 16777216 + # Subtracting 1 from this power of 2, we can see what the + # new values should be from the binary: + # 1st value: + # 0000 1000 0111 1110 0001 1101 0111 1001 142482809 (orig 32-bit value) + # 0000 0000 1111 1111 1111 1111 1111 1111 16777215 (2 ** 24 - 1) + # 0000 0000 0111 1110 0001 1101 0111 1001 8265081 (new value) + # 2nd value: + # 1011 1111 1110 1111 0110 1111 0101 0010 3220139858 (orig 32-bit value) + # 0000 0000 1111 1111 1111 1111 1111 1111 16777215 (2 ** 24 - 1) + # 0000 0000 1110 1111 0110 1111 0101 0010 15691602 (new value) + fcs_file = "examples/fcs_files/variable_int_example.fcs" sample = FlowData(fcs_file) From e481694d43695edb14343ce9a1557f1dc15e7d05 Mon Sep 17 00:00:00 2001 From: whitews Date: Tue, 16 May 2023 17:56:02 -0400 Subject: [PATCH 10/10] close file before throwing error, better error message --- flowio/flowdata.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/flowio/flowdata.py b/flowio/flowdata.py index c782900..ab9945d 100644 --- a/flowio/flowdata.py +++ b/flowio/flowdata.py @@ -108,7 +108,10 @@ def __init__( ) if int(self.text.get("nextdata", "0")) != 0 and nextdata_offset is None: - raise MultipleDataSetsError() + self._fh.close() + raise MultipleDataSetsError( + "%s contains multiple data sets, use read_multiple_data_sets function" % self.name + ) self.channel_count = int(self.text['par']) self.event_count = int(self.text['tot'])