Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

value_counts of column that contains masked values and empty strings. #434

Merged
merged 3 commits into from
Oct 9, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
80 changes: 62 additions & 18 deletions packages/vaex-core/src/strings.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -307,15 +307,32 @@ class StringSequenceBase : public StringSequence {
auto m = matches.mutable_unchecked<1>();
{
py::gil_scoped_release release;
for(size_t i = 0; i < length; i++) {
#if defined(_MSC_VER)
auto str = get(i);
bool match = str == other;
#else
auto str = view(i);
bool match = str == other;
#endif
m(i) = match;
if(has_null()){
for(size_t i = 0; i < length; i++) {
if(is_null(i)) {
m(i) = false;
} else {
#if defined(_MSC_VER)
auto str = get(i);
bool match = str == other;
#else
auto str = view(i);
bool match = str == other;
#endif
m(i) = match;
}
}
} else {
for(size_t i = 0; i < length; i++) {
#if defined(_MSC_VER)
auto str = get(i);
bool match = str == other;
#else
auto str = view(i);
bool match = str == other;
#endif
m(i) = match;
}
}
}
return std::move(matches);
Expand All @@ -328,11 +345,24 @@ class StringSequenceBase : public StringSequence {
auto m = matches.mutable_unchecked<1>();
{
py::gil_scoped_release release;
for(size_t i = 0; i < length; i++) {
auto str = view(i);
auto other = others->view(i);
bool match = str == other;
m(i) = match;
if(has_null() || others->has_null()) {
for(size_t i = 0; i < length; i++) {
if(is_null(i) || others->is_null(i)) {
m(i) = false;
} else {
auto str = view(i);
auto other = others->view(i);
bool match = str == other;
m(i) = match;
}
}
} else {
for(size_t i = 0; i < length; i++) {
auto str = view(i);
auto other = others->view(i);
bool match = str == other;
m(i) = match;
}
}
}
return std::move(matches);
Expand Down Expand Up @@ -1586,7 +1616,7 @@ const char* empty = "";

class StringArray : public StringSequenceBase {
public:
StringArray(PyObject** object_array, size_t length) : StringSequenceBase(length), _byte_size(0), _has_null(false) {
StringArray(PyObject** object_array, size_t length, uint8_t* byte_mask=nullptr) : StringSequenceBase(length), _byte_size(0), _has_null(false) {
#if PY_MAJOR_VERSION == 2
utf8_objects = (PyObject**)malloc(length * sizeof(void*));
#endif
Expand All @@ -1597,7 +1627,7 @@ class StringArray : public StringSequenceBase {
objects[i] = object_array[i];
Py_IncRef(objects[i]);
#if PY_MAJOR_VERSION == 3
if(PyUnicode_CheckExact(object_array[i])) {
if(PyUnicode_CheckExact(object_array[i]) && ((byte_mask == nullptr) || (byte_mask[i] == 0))) {
// python37 declares as const
strings[i] = (char*)PyUnicode_AsUTF8AndSize(object_array[i], &sizes[i]);
} else {
Expand All @@ -1606,12 +1636,12 @@ class StringArray : public StringSequenceBase {
sizes[i] = 0;
}
#else
if(PyUnicode_CheckExact(object_array[i])) {
if(PyUnicode_CheckExact(object_array[i]) && ((byte_mask == nullptr) || (byte_mask[i] == 0))) {
// if unicode, first convert to utf8
utf8_objects[i] = PyUnicode_AsUTF8String(object_array[i]);
sizes[i] = PyString_Size(utf8_objects[i]);
strings[i] = PyString_AsString(utf8_objects[i]);
} else if(PyString_CheckExact(object_array[i])) {
} else if(PyString_CheckExact(object_array[i]) && ((byte_mask == nullptr) || (byte_mask[i] == 0))) {
// otherwise directly use
utf8_objects[i] = 0;
sizes[i] = PyString_Size(object_array[i]);
Expand Down Expand Up @@ -2079,6 +2109,20 @@ PYBIND11_MODULE(superstrings, m) {
new StringArray((PyObject**)info.ptr, info.shape[0]));
}) // no need to keep a reference to the ndarrays
)
.def(py::init([](py::buffer string_array, py::buffer mask_array) {
        // Construct a StringArray from a 1-d object array plus a per-element byte mask.
        // Elements whose mask byte is non-zero are not read as strings by the constructor.
        py::buffer_info info = string_array.request();
        py::buffer_info mask_info = mask_array.request();
        if(info.ndim != 1) {
            throw std::runtime_error("Expected a 1d byte buffer");
        }
        if(info.format != "O") {
            throw std::runtime_error("Expected an object array");
        }
        // The constructor receives the mask as a bare uint8_t* and indexes it per element,
        // so the buffer must really be one byte per string, of the same length, and
        // contiguous; otherwise it would read the wrong bytes or run past the end.
        if(mask_info.ndim != 1) {
            throw std::runtime_error("Expected a 1d mask");
        }
        if(mask_info.itemsize != 1) {
            throw std::runtime_error("Expected a mask with 1 byte per element");
        }
        if(mask_info.shape[0] != info.shape[0]) {
            throw std::runtime_error("Expected the mask to have the same length as the string array");
        }
        // Strides only matter when there is more than one element to step over.
        if(mask_info.shape[0] > 1 && mask_info.strides[0] != mask_info.itemsize) {
            throw std::runtime_error("Expected a contiguous mask");
        }
        return std::unique_ptr<StringArray>(
            new StringArray(static_cast<PyObject**>(info.ptr), info.shape[0], static_cast<uint8_t*>(mask_info.ptr)));
    }) // no need to keep a reference to the ndarrays
)
.def("to_arrow", &StringArray::to_arrow) // nothing to keep alive, all a copy
// .def("get", &StringArray::get_)
// .def("get", (const std::string (StringArray::*)(int64_t))&StringArray::get)
Expand Down
8 changes: 6 additions & 2 deletions packages/vaex-core/vaex/column.py
Original file line number Diff line number Diff line change
Expand Up @@ -282,11 +282,15 @@ def _to_string_sequence(x):
if isinstance(x, ColumnString):
return x.string_sequence
elif isinstance(x, np.ndarray):
mask = None
if np.ma.isMaskedArray(x):
mask = np.ma.getmaskarray(x)
x = x.data
if x.dtype == 'O':
return vaex.strings.StringArray(x)
return vaex.strings.StringArray(x) if mask is None else vaex.strings.StringArray(x, mask)
elif x.dtype.kind in 'US':
x = x.astype('O')
return vaex.strings.StringArray(x)
return vaex.strings.StringArray(x) if mask is None else vaex.strings.StringArray(x, mask)
else:
raise ValueError('unsupported dtype ' +str(x.dtype))
else:
Expand Down
10 changes: 0 additions & 10 deletions packages/vaex-core/vaex/functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -796,21 +796,11 @@ def str_equals(x, y):
"""
xmask = None
ymask = None
if np.ma.isMaskedArray(x):
x, xmask = x.data, np.ma.getmaskarray(x)
if np.ma.isMaskedArray(y):
y, ymask = x.data, np.ma.getmaskarray(y)

if not isinstance(x, six.string_types):
x = _to_string_sequence(x)
if not isinstance(y, six.string_types):
y = _to_string_sequence(y)
equals_mask = x.equals(y)
# take out masked values
if xmask is not None:
equals_mask = equals_mask & ~xmask
if ymask is not None:
equals_mask = equals_mask & ~ymask
return equals_mask


Expand Down
11 changes: 10 additions & 1 deletion tests/internal/strings_module_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,21 @@ def test_regex():
assert sl.search('aa', True).tolist() == [True, False, False]
# assert False


def test_regex_array():
    """StringArray.search('aa', flag) matches only the first word, for flag False and True."""
    words = np.array(["aap", "noot", "mies"], dtype='object')
    strings = vaex.strings.StringArray(words)
    expected = [True, False, False]
    for flag in (False, True):
        assert strings.search('aa', flag).tolist() == expected


def test_masked_array():
    """A StringArray built with a mask reports masked slots as None and never equal."""
    values = np.array(['dog', 'dog', 'cat', 'cat', 'mouse'], dtype=object)
    masked = np.array([False, False, True, False, True], dtype=bool)
    strings = vaex.strings.StringArray(values, masked)

    # Masked positions (2 and 4) come back as None even though the data holds real strings.
    assert strings.tolist() == ['dog', 'dog', None, 'cat', None]

    # Comparing against a literal: the masked 'cat' at index 2 must not match.
    against_literal = strings.equals('cat').tolist()
    assert against_literal == [False, False, False, True, False]

    # Comparing against itself: masked positions are not equal to themselves.
    against_self = strings.equals(strings).tolist()
    assert against_self == [True, True, False, True, False]

def test_string_array():
ar = np.array(["aap", "noot", None, "mies"], dtype='object')
Expand Down
24 changes: 24 additions & 0 deletions tests/value_counts_test.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,3 +78,27 @@ def test_value_counts_object_missing():
assert len(df.x.value_counts(dropnan=False, dropmissing=False)) == 8
assert len(df.x.value_counts(dropnan=True, dropmissing=True)) == 6


def test_value_counts_masked_str():
    """value_counts on a masked string column keeps empty strings apart from masked values."""
    data = ['A', 'A', 'A', 'B', 'B', 'B', '', '', '']
    mask = [False, True, False, False, True, True, False, True, False]
    df = vaex.from_arrays(x=np.ma.MaskedArray(data=data, mask=mask))

    # Default call: the four masked entries are reported under the 'missing' key.
    counts = df.x.value_counts()
    assert len(counts) == 4
    for key, expected in (('A', 2), ('B', 1), ('', 2), ('missing', 4)):
        assert counts[key] == expected

    # With dropmissing=True, and then with dropna=True, only the three unmasked values remain.
    for options in ({'dropmissing': True}, {'dropna': True}):
        counts = df.x.value_counts(**options)
        assert len(counts) == 3
        for key, expected in (('A', 2), ('B', 1), ('', 2)):
            assert counts[key] == expected