From 2ebdd5e4ca1b3230e7c59c9deeeb23349944a5a7 Mon Sep 17 00:00:00 2001 From: Wes McKinney Date: Tue, 11 Feb 2020 23:51:28 -0600 Subject: [PATCH] Add Python unit test to verify that field_id's are correctly filtered through to the Arrow schema in Python --- python/pyarrow/tests/test_parquet.py | 49 +++++++++++++++++++++++++--- python/pyarrow/types.pxi | 4 +++ 2 files changed, 49 insertions(+), 4 deletions(-) diff --git a/python/pyarrow/tests/test_parquet.py b/python/pyarrow/tests/test_parquet.py index c8f555e5ef954..f045855b4dcd8 100644 --- a/python/pyarrow/tests/test_parquet.py +++ b/python/pyarrow/tests/test_parquet.py @@ -3251,6 +3251,13 @@ def test_categorical_order_survives_roundtrip(): tm.assert_frame_equal(result, df) +def _simple_table_write_read(table): + bio = pa.BufferOutputStream() + pq.write_table(table, bio) + contents = bio.getvalue() + return pq.read_table(pa.BufferReader(contents)) + + def test_dictionary_array_automatically_read(): # ARROW-3246 @@ -3269,16 +3276,50 @@ def test_dictionary_array_automatically_read(): dict_values)) table = pa.table([pa.chunked_array(chunks)], names=['f0']) + result = _simple_table_write_read(table) + + assert_tables_equal(result, table) + + # The only key in the metadata was the Arrow schema key + assert result.schema.metadata is None + + +def test_field_id_metadata(): + # ARROW-7080 + table = pa.table([pa.array([1], type='int32'), + pa.array([[]], type=pa.list_(pa.int32())), + pa.array([b'boo'], type='binary')], + ['f0', 'f1', 'f2']) bio = pa.BufferOutputStream() pq.write_table(table, bio) contents = bio.getvalue() - result = pq.read_table(pa.BufferReader(contents)) - assert_tables_equal(result, table) + pf = pq.ParquetFile(pa.BufferReader(contents)) + schema = pf.schema_arrow - # The only key in the metadata was the Arrow schema key - assert result.schema.metadata is None + # Expected Parquet schema for reference + # + # required group field_id=0 schema { + # optional int32 field_id=1 f0; + # optional group field_id=2 f1 (List) { + # repeated group field_id=3 list { + # optional int32 field_id=4 item; + # } + # } + # optional binary field_id=5 f2; + # } + + field_name = b'PARQUET::field_id' + assert schema[0].metadata[field_name] == b'1' + + list_field = schema[1] + assert list_field.metadata[field_name] == b'2' + + list_item_field = list_field.type.value_field + assert list_item_field.metadata[field_name] == b'4' + + assert schema[2].metadata[field_name] == b'5' @pytest.mark.pandas diff --git a/python/pyarrow/types.pxi b/python/pyarrow/types.pxi index 1099d43841835..c63051d129b20 100644 --- a/python/pyarrow/types.pxi +++ b/python/pyarrow/types.pxi @@ -253,6 +253,10 @@ cdef class ListType(DataType): def __reduce__(self): return list_, (self.value_type,) + @property + def value_field(self): + return pyarrow_wrap_field(self.list_type.value_field()) + @property def value_type(self): """