Skip to content

Commit

Permalink
Added more tests and one correction for format_str
Browse files: browse the repository at this point in the history
  • Loading branch information
AlenkaF committed Sep 7, 2021
1 parent 5e64b36 commit 14fa9a4
Show file tree
Hide file tree
Showing 2 changed files with 61 additions and 5 deletions.
5 changes: 4 additions & 1 deletion packages/vaex-core/vaex/dataframe_protocol.py
Original file line number Diff line number Diff line change
Expand Up @@ -377,7 +377,10 @@ def _dtype_from_vaexdtype(self, dtype) -> Tuple[enum.IntEnum, int, str, str]:
raise NotImplementedError(f"Data type {dtype} not handled yet")

bitwidth = dtype.numpy.itemsize * 8
format_str = dtype.numpy.str
if not isinstance(self._col.values, np.ndarray) and isinstance(self._col.values.type, pa.DictionaryType):
format_str = self._col.index_values().dtype.numpy.str
else:
format_str = dtype.numpy.str
endianness = dtype.byteorder if not kind == _k.CATEGORICAL else '='
return (kind, bitwidth, format_str, endianness)

Expand Down
61 changes: 57 additions & 4 deletions tests/dataframe_protocol_test.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,10 @@
import numpy as np
import pyarrow as pa
import pytest
from typing import Any, Optional, Tuple, Dict, Iterable, Sequence

DataFrameObject = Any
ColumnObject = Any

import vaex
from common import *
Expand All @@ -10,25 +14,32 @@
def test_float_only(df_factory):
    """Round-trip a float-only dataframe through ``__dataframe__``.

    Checks that the values of both columns survive the round trip, that
    neither column reports missing values, and that the interchange object
    agrees with the source dataframe.
    """
    df = df_factory(x=[1.5, 2.5, 3.5], y=[9.2, 10.5, 11.8])
    restored = _from_dataframe_to_vaex(df.__dataframe__())

    # Values must be unchanged after the round trip.
    assert restored.x.tolist() == df.x.tolist()
    assert restored.y.tolist() == df.y.tolist()

    # No missing values are expected in either column.
    assert restored.__dataframe__().get_column_by_name('x').null_count == 0
    assert restored.__dataframe__().get_column_by_name('y').null_count == 0

    assert_dataframe_equal(df.__dataframe__(), df)

def test_mixed_intfloat(df_factory):
    """Round-trip a dataframe with one int and one float column.

    Checks that the values of both columns survive the round trip, that
    neither column reports missing values, and that the interchange object
    agrees with the source dataframe.
    """
    df = df_factory(x=[1, 2, 0], y=[9.2, 10.5, 11.8])
    restored = _from_dataframe_to_vaex(df.__dataframe__())

    # Values must be unchanged after the round trip.
    assert restored.x.tolist() == df.x.tolist()
    assert restored.y.tolist() == df.y.tolist()

    # No missing values are expected in either column.
    assert restored.__dataframe__().get_column_by_name('x').null_count == 0
    assert restored.__dataframe__().get_column_by_name('y').null_count == 0

    assert_dataframe_equal(df.__dataframe__(), df)

def test_mixed_intfloatbool(df_factory):
df = df_factory(
x=np.array([True, True, False]),
y=np.array([1, 2, 0]),
z=np.array([9.2, 10.5, 11.8]))
df2 = _from_dataframe_to_vaex(df.__dataframe__())

assert df2.x.tolist() == df.x.tolist()
assert df2.y.tolist() == df.y.tolist()
assert df2.z.tolist() == df.z.tolist()
Expand All @@ -49,6 +60,8 @@ def test_mixed_intfloatbool(df_factory):
assert df2.__dataframe__().get_column_by_name('y').describe_categorical
assert df2.__dataframe__().get_column_by_name('y').describe_null == (3, 1)

assert_dataframe_equal(df.__dataframe__(), df)

def test_mixed_missing(df_factory_arrow):
df = df_factory_arrow(
x=np.array([True, None, False, None, True]),
Expand All @@ -74,6 +87,8 @@ def test_mixed_missing(df_factory_arrow):
assert df2.__dataframe__().get_column_by_name('z').null_count == 2
assert df['z'].dtype == df2['z'].dtype

assert_dataframe_equal(df.__dataframe__(), df)

def test_missing_from_masked(df_factory_numpy):
df = df_factory_numpy(
x=np.ma.array([1, 2, 3, 4, 0], mask=[0, 0, 0, 1, 1], dtype=int),
Expand All @@ -99,6 +114,8 @@ def test_missing_from_masked(df_factory_numpy):
assert df2.__dataframe__().get_column_by_name('z').null_count == 2
assert df['z'].dtype == df2['z'].dtype

assert_dataframe_equal(df.__dataframe__(), df)

def test_categorical_ordinal():
colors = ['red', 'blue', 'green', 'blue']
ds = vaex.from_arrays(
Expand All @@ -110,22 +127,29 @@ def test_categorical_ordinal():
df = df.categorize('weekday', labels=['Mon', 'Tue', 'Wed', 'Thu', 'Fri', 'Sat', 'Sun'])

# Some detailed testing for correctness of dtype and null handling:
col = df.__dataframe__().get_column_by_name('weekday')
col = df.__dataframe__().get_column_by_name('colors')
assert col.dtype[0] == _DtypeKind.CATEGORICAL
assert col.describe_categorical == (False, True, {0: 'Mon', 1: 'Tue', 2: 'Wed', 3: 'Thu', 4: 'Fri', 5: 'Sat', 6: 'Sun'})
assert col.describe_categorical == (False, True, {0: 'red', 1: 'green', 2: 'blue'})
assert col.describe_null == (3, 1)
assert col.dtype == (23, 64, 'u', '=')
col2 = df.__dataframe__().get_column_by_name('colors')
col2 = df.__dataframe__().get_column_by_name('year')
assert col2.dtype[0] == _DtypeKind.CATEGORICAL
assert col2.describe_categorical == (False, True, {0: 'red', 1: 'green', 2: 'blue'})
assert col2.describe_categorical == (False, True, {0: 2012, 1: 2013, 2: 2014, 3: 2015, 4: 2016, 5: 2017, 6: 2018, 7: 2019})
assert col2.describe_null == (3, 1)
assert col2.dtype == (23, 64, 'u', '=')
col3 = df.__dataframe__().get_column_by_name('weekday')
assert col3.dtype[0] == _DtypeKind.CATEGORICAL
assert col3.describe_categorical == (False, True, {0: 'Mon', 1: 'Tue', 2: 'Wed', 3: 'Thu', 4: 'Fri', 5: 'Sat', 6: 'Sun'})
assert col3.describe_null == (3, 1)
assert col3.dtype == (23, 64, 'u', '=')

df2 = _from_dataframe_to_vaex(df.__dataframe__())
assert df2['colors'].tolist() == ['red', 'blue', 'green', 'blue']
assert df2['year'].tolist() == [2012, 2013, 2015, 2019]
assert df2['weekday'].tolist() == ['Mon', 'Tue', 'Fri', 'Sun']

assert_dataframe_equal(df.__dataframe__(), df)

def test_arrow_dictionary():
indices = pa.array([0, 1, 0, 1, 2, 0, 1, 2])
dictionary = pa.array(['foo', 'bar', 'baz'])
Expand All @@ -136,11 +160,15 @@ def test_arrow_dictionary():
col = df.__dataframe__().get_column_by_name('x')
assert col.dtype[0] == _DtypeKind.CATEGORICAL
assert col.describe_categorical == (False, True, {0: 'foo', 1: 'bar', 2: 'baz'})
assert col.describe_null == (3, 1)
assert col.dtype == (23, 64, 'u', '=')

df2 = _from_dataframe_to_vaex(df.__dataframe__())
assert df2.x.tolist() == df.x.tolist()
assert df2.__dataframe__().get_column_by_name('x').null_count == 0

assert_dataframe_equal(df.__dataframe__(), df)

def test_arrow_dictionary_missing():
indices = pa.array([0, 1, 2, 0, 1], mask=np.array([0, 1, 1, 0, 0], dtype=bool))
dictionary = pa.array(['aap', 'noot', 'mies'])
Expand All @@ -157,6 +185,8 @@ def test_arrow_dictionary_missing():
assert df2.__dataframe__().get_column_by_name('x').null_count == 2
assert df['x'].dtype.index_type == df2['x'].dtype.index_type

assert_dataframe_equal(df.__dataframe__(), df)

def test_string():
df = vaex.from_dict({"A": ["a", "b", "cdef", "", "g"]})
col = df.__dataframe__().get_column_by_name('A')
Expand Down Expand Up @@ -224,3 +254,26 @@ def test_VaexDataFrame():

assert df2.select_columns((0,2))._df[:,0].tolist() == df2.select_columns_by_name(('x','z'))._df[:,0].tolist()
assert df2.select_columns((0,2))._df[:,1].tolist() == df2.select_columns_by_name(('x','z'))._df[:,1].tolist()

def assert_buffer_equal(buffer_dtype: Tuple[_VaexBuffer, Any], vaexcol: vaex.expression.Expression):
    """Check a ``(buffer, dtype)`` pair against the vaex column it describes.

    Asserts that:

    * ``__dlpack__`` raises ``NotImplementedError``;
    * ``__dlpack_device__()`` returns ``(1, None)``;
    * the second dtype entry equals the column's numpy item size in bits;
    * the third dtype entry equals the numpy ``str`` of the index values for
      Arrow dictionary-encoded columns, and of the column itself otherwise.
    """
    buffer, dtype_info = buffer_dtype

    with pytest.raises(NotImplementedError):
        buffer.__dlpack__()
    assert buffer.__dlpack_device__() == (1, None)

    assert dtype_info[1] == vaexcol.dtype.numpy.itemsize * 8

    # Dictionary-encoded Arrow columns report the format of their indices,
    # every other column reports the format of its own dtype.
    expected_format = (
        vaexcol.index_values().dtype.numpy.str
        if not isinstance(vaexcol.values, np.ndarray) and isinstance(vaexcol.values.type, pa.DictionaryType)
        else vaexcol.dtype.numpy.str
    )
    assert dtype_info[2] == expected_format

def assert_column_equal(col: _VaexColumn, vaexcol: vaex.expression.Expression):
    """Check an interchange column against the vaex expression it wraps.

    Asserts that the column size equals the row count of the expression's
    dataframe, that the offset is zero, that the null count equals the number
    of missing values, and that the data buffer passes
    ``assert_buffer_equal``.
    """
    row_count = vaexcol.df.count("*")
    assert col.size == row_count

    assert col.offset == 0

    missing_count = vaexcol.countmissing()
    assert col.null_count == missing_count

    data_buffer = col._get_data_buffer()
    assert_buffer_equal(data_buffer, vaexcol)

def assert_dataframe_equal(dfo: DataFrameObject, df: vaex.dataframe.DataFrame):
    """Check an interchange dataframe object against its source vaex dataframe.

    Asserts that the number of columns, the number of rows and the column
    names agree, then runs ``assert_column_equal`` on every column.
    """
    expected_column_count = len(df.columns)
    assert dfo.num_columns() == expected_column_count

    expected_row_count = len(df)
    assert dfo.num_rows() == expected_row_count

    expected_names = list(df.get_column_names())
    assert dfo.column_names() == expected_names

    for name in df.get_column_names():
        interchange_column = dfo.get_column_by_name(name)
        assert_column_equal(interchange_column, df[name])

0 comments on commit 14fa9a4

Please sign in to comment.