Skip to content

Commit

Permalink
Merge branch 'bit-masked-integers' into develop
Browse files Browse the repository at this point in the history
  • Loading branch information
whitews committed May 17, 2023
2 parents ad1ad74 + e481694 commit 344b32b
Show file tree
Hide file tree
Showing 4 changed files with 120 additions and 26 deletions.
2 changes: 1 addition & 1 deletion flowio/_version.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
"""
FlowIO version
"""
__version__ = "1.2.0"
__version__ = "1.2.1b0"
106 changes: 87 additions & 19 deletions flowio/flowdata.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,13 @@
basestring = str


def _next_power_of_2(x):
if x == 0:
return 1
else:
return 2 ** (x - 1).bit_length()


class FlowData(object):
"""
Object representing a Flow Cytometry Standard (FCS) file.
Expand Down Expand Up @@ -101,7 +108,10 @@ def __init__(
)

if int(self.text.get("nextdata", "0")) != 0 and nextdata_offset is None:
raise MultipleDataSetsError()
self._fh.close()
raise MultipleDataSetsError(
"%s contains multiple data sets, use read_multiple_data_sets function" % self.name
)

self.channel_count = int(self.text['par'])
self.event_count = int(self.text['tot'])
Expand Down Expand Up @@ -307,16 +317,27 @@ def __parse_data(self, offset, start, stop, text):
order = '@'
# from here on out we assume mode "l" (list)

bit_width = []
for i in range(1, int(text['par']) + 1):
bit_width.append(int(text['p%db' % i]))

if data_type.lower() == 'i':
# For int data we need to check the bit width and range values.
# The PnR value specifies the max value for the channel. This
# value is exclusive, e.g. a value of 1024 means the highest
# integer value allowed is 1023. Integer data needs to be
# bit-masked according to this max range value.
bit_width_by_channel = {}
max_range_by_channel = {}
for i in range(1, int(text['par']) + 1):
bit_width_by_channel[i] = int(text['p%db' % i])

# The bit mask needs a power of 2, so round the PnR value up to the next one
tmp_max_range = int(text['p%dr' % i])
max_range_by_channel[i] = _next_power_of_2(tmp_max_range)

data = self.__parse_int_data(
offset,
start,
stop,
bit_width,
bit_width_by_channel,
max_range_by_channel,
order
)
else:
Expand Down Expand Up @@ -368,42 +389,89 @@ def __calc_data_item_count(self, start, stop, data_type_size):

return num_items, stop

def __parse_int_data(self, offset, start, stop, bit_width, order):
def __parse_int_data(
self,
offset,
start,
stop,
bit_width_lut,
max_range_lut,
order
):
"""Parse out and return integer list data from FCS file"""

if reduce(and_, [item in [8, 16, 32] for item in bit_width]):
# We have a uniform bit width for all parameters,
# use the first value to determine the number of actual events
if len(set(bit_width)) == 1:
data_type_size = bit_width[0] / 8
if reduce(and_, [item in [8, 16, 32] for item in bit_width_lut.values()]):
# Determine if we have uniform bit width values for all parameters.
# If so, use array.array for much faster parsing
if len(set(bit_width_lut.values())) == 1:
# We do have a uniform bit width, grab the 1st value to
# determine the number of actual events
bit_width = list(bit_width_lut.values())[0]
data_type_size = bit_width / 8
num_items, stop = self.__calc_data_item_count(start, stop, data_type_size)

# Here, we're reading the initial data array, but some channel
# data may still need bit-masking correction using max range
self._fh.seek(offset + start)
tmp = array.array(self.__format_integer(bit_width[0]))
tmp = array.array(self.__format_integer(bit_width))
tmp.fromfile(self._fh, int(num_items))
if order == '>':
tmp.byteswap()

# parameter sizes are different
# e.g. 8, 8, 16, 8, 32 ...
# Any bits above a channel's PnR max range shall be
# ignored using a bit mask. If the PnR value is not a power
# of 2, then the next power of 2 shall be used.
if any(2 ** bit_width_lut[c] > max_range_lut[c] for
c in bit_width_lut.keys()) :

amount_data_points = int(num_items / len(max_range_lut))

# Create bit mask array matching length of our data array, with the
# value at every position being that channel's max range minus 1.
bit_mask = array.array(
self.__format_integer(bit_width),
[mr - 1 for mr in max_range_lut.values()] * amount_data_points
)
new_tmp = array.array(self.__format_integer(bit_width))
new_tmp.frombytes(bytes(map(lambda a,b: a&b, tmp.tobytes(), bit_mask.tobytes())))
tmp = new_tmp
else:
# parameter sizes are different
# e.g. 8, 8, 16, 8, 32 ...
# can't use array for heterogeneous bit widths
tmp = self.__extract_var_length_int(bit_width, offset, order, start, stop)
tmp = self.__extract_var_length_int(
bit_width_lut,
max_range_lut,
offset,
order,
start,
stop
)

else: # non standard bit width... Does this happen?
warn('Non-standard bit width for data segments')
return None

return tmp

def __extract_var_length_int(self, bit_width, offset, order, start, stop):
def __extract_var_length_int(self, bit_width_by_channel, max_range_by_channel,
offset, order, start, stop):
data_format = order
for cur_width in bit_width:
for cur_width in bit_width_by_channel.values():
data_format += '%s' % self.__format_integer(cur_width)

# array module has no way to handle heterogeneous bit widths,
# so fall back to the slower unpack approach
tuple_tmp = iter_unpack(data_format, self.__read_bytes(offset, start, stop))
tmp = [ti for t in tuple_tmp for ti in t]
if any(2**bit_width_by_channel[c] > max_range_by_channel[c] for
c in bit_width_by_channel.keys()) :
tmp = []
for data_tuple in tuple_tmp:
for channel, max_range in max_range_by_channel.items():
tmp.append(data_tuple[channel-1] % max_range)
else:
tmp = [ti for t in tuple_tmp for ti in t]

return tmp

def __parse_non_int_data(self, offset, start, stop, data_type, order):
Expand Down
16 changes: 12 additions & 4 deletions flowio/tests/flowdata_lmd_tests.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,17 @@
import unittest
import warnings
from flowio import FlowData
import array
from flowio import FlowData, read_multiple_data_sets
from flowio.exceptions import FCSParsingError


class FlowDataLMDTestCase(unittest.TestCase):
def setUp(self):
with warnings.catch_warnings():
warnings.simplefilter('ignore')
self.flow_data = FlowData(
self.flow_data, self.fcs3_data = read_multiple_data_sets(
'examples/fcs_files/coulter.lmd',
ignore_offset_error=True,
nextdata_offset=0
ignore_offset_error=True
)

def test_event_count(self):
Expand All @@ -26,3 +26,11 @@ def test_get_text(self):
def test_fail_data_offset_error(self):
with self.assertRaises(FCSParsingError):
FlowData('examples/fcs_files/coulter.lmd', nextdata_offset=0)

def test_right_integer_reading(self):
self.assertEqual(
self.fcs3_data.events[:24],
array.array("I", [61056, 131840, 46, 324, 10309, 104, 11912, 0,
257280, 378656, 139, 1728, 58688, 354, 58720, 0,
164128, 305376, 159, 924, 29024, 208, 29728, 0])
)
22 changes: 20 additions & 2 deletions flowio/tests/flowdata_tests.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,12 +65,30 @@ def test_parse_var_int_data(self):
event_values = [
49135, 61373, 48575, 49135, 61373, 48575, 7523, 598, 49135, 61373,
48575, 49135, 61373, 48575, 28182, 61200, 48575, 49135, 32445, 30797,
19057, 49135, 61373, 48575, 5969, 142482809,
19057, 49135, 61373, 48575, 5969, 8265081,
61266, 48575, 49135, 20925, 61265, 48575, 27961, 25200, 61287, 48575, 9795,
49135, 29117, 49135, 61373, 48575, 61228, 48575, 22, 21760, 49135,
20413, 49135, 23997, 19807, 3220139858
20413, 49135, 23997, 19807, 15691602
]

# To double-check our logic, let's use the 2 values from
# channel 26 where:
# PnB is 32
# PnR is 11209599
# The 1st value for chan 26, 32-bit, unmasked: 142482809
# The 2nd value for chan 26, 32-bit, unmasked: 3220139858
# The next power of 2 above 11209599 (PnR) is: 16777216
# Subtracting 1 from this power of 2, we can see what the
# new values should be from the binary:
# 1st value:
# 0000 1000 0111 1110 0001 1101 0111 1001 142482809 (orig 32-bit value)
# 0000 0000 1111 1111 1111 1111 1111 1111 16777215 (2 ** 24 - 1)
# 0000 0000 0111 1110 0001 1101 0111 1001 8265081 (new value)
# 2nd value:
# 1011 1111 1110 1111 0110 1111 0101 0010 3220139858 (orig 32-bit value)
# 0000 0000 1111 1111 1111 1111 1111 1111 16777215 (2 ** 24 - 1)
# 0000 0000 1110 1111 0110 1111 0101 0010 15691602 (new value)

fcs_file = "examples/fcs_files/variable_int_example.fcs"
sample = FlowData(fcs_file)

Expand Down

0 comments on commit 344b32b

Please sign in to comment.