From d2f34dbea6ffb01976e1c6a1a357ad60ebf4e42b Mon Sep 17 00:00:00 2001 From: gfyoung Date: Tue, 6 Nov 2018 05:08:13 -0800 Subject: [PATCH] BUG: Fix of handle missing CSV MI column names (#23484) --- doc/source/whatsnew/v0.24.0.txt | 1 + pandas/_libs/parsers.pyx | 19 +++++++--- pandas/io/parsers.py | 54 +++++++++++++++++++---------- pandas/tests/io/parser/index_col.py | 28 +++++++++++++++ 4 files changed, 80 insertions(+), 22 deletions(-) diff --git a/doc/source/whatsnew/v0.24.0.txt b/doc/source/whatsnew/v0.24.0.txt index c6721cac82a2d6..f7010742566a66 100644 --- a/doc/source/whatsnew/v0.24.0.txt +++ b/doc/source/whatsnew/v0.24.0.txt @@ -1284,6 +1284,7 @@ Notice how we now instead output ``np.nan`` itself instead of a stringified form - Bug in :func:`to_string()` that broke column alignment when ``index=False`` and width of first column's values is greater than the width of first column's header (:issue:`16839`, :issue:`13032`) - Bug in :func:`DataFrame.to_csv` where a single level MultiIndex incorrectly wrote a tuple. Now just the value of the index is written (:issue:`19589`). 
- Bug in :meth:`HDFStore.append` when appending a :class:`DataFrame` with an empty string column and ``min_itemsize`` < 8 (:issue:`12242`) +- Bug in :func:`read_csv` in which :class:`MultiIndex` index names were improperly handled when they were not provided (:issue:`23484`) Plotting ^^^^^^^^ diff --git a/pandas/_libs/parsers.pyx b/pandas/_libs/parsers.pyx index a2a718aa8b5917..391de339ad60e1 100644 --- a/pandas/_libs/parsers.pyx +++ b/pandas/_libs/parsers.pyx @@ -302,6 +302,7 @@ cdef class TextReader: object tupleize_cols object usecols list dtype_cast_order + set unnamed_cols set noconvert def __cinit__(self, source, @@ -536,7 +537,7 @@ cdef class TextReader: self.header = [ header ] self.names = names - self.header, self.table_width = self._get_header() + self.header, self.table_width, self.unnamed_cols = self._get_header() if not self.table_width: raise EmptyDataError("No columns to parse from file") @@ -720,13 +721,15 @@ cdef class TextReader: cdef: Py_ssize_t i, start, field_count, passed_count, unnamed_count # noqa char *word - object name + object name, old_name int status int64_t hr, data_line char *errors = "strict" cdef StringPath path = _string_path(self.c_encoding) header = [] + unnamed_cols = set() + if self.parser.header_start >= 0: # Header is in the file @@ -759,6 +762,7 @@ cdef class TextReader: counts = {} unnamed_count = 0 + for i in range(field_count): word = self.parser.words[start + i] @@ -770,6 +774,9 @@ cdef class TextReader: name = PyUnicode_Decode(word, strlen(word), self.c_encoding, errors) + # We use this later when collecting placeholder names. 
+ old_name = name + if name == '': if self.has_mi_columns: name = ('Unnamed: {i}_level_{lvl}' @@ -786,6 +793,9 @@ cdef class TextReader: name = '%s.%d' % (name, count) count = counts.get(name, 0) + if old_name == '': + unnamed_cols.add(name) + this_header.append(name) counts[name] = count + 1 @@ -798,6 +808,7 @@ cdef class TextReader: lc = len(this_header) ic = (len(self.index_col) if self.index_col is not None else 0) + if lc != unnamed_count and lc - ic > unnamed_count: hr -= 1 self.parser_start -= 1 @@ -830,7 +841,7 @@ cdef class TextReader: if self.parser.lines < 1: self._tokenize_rows(1) - return None, self.parser.line_fields[0] + return None, self.parser.line_fields[0], unnamed_cols # Corner case, not enough lines in the file if self.parser.lines < data_line + 1: @@ -864,7 +875,7 @@ cdef class TextReader: elif self.allow_leading_cols and passed_count < field_count: self.leading_cols = field_count - passed_count - return header, field_count + return header, field_count, unnamed_cols def read(self, rows=None): """ diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index cd9d3ccb79af84..12914c10e06555 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -1265,6 +1265,7 @@ def __init__(self, kwds): self.prefix = kwds.pop('prefix', None) self.index_col = kwds.get('index_col', None) + self.unnamed_cols = set() self.index_names = None self.col_names = None @@ -1374,7 +1375,8 @@ def _extract_multi_indexer_columns(self, header, index_names, col_names, # clean the index_names index_names = header.pop(-1) index_names, names, index_col = _clean_index_names(index_names, - self.index_col) + self.index_col, + self.unnamed_cols) # extract the columns field_count = len(header[0]) @@ -1454,7 +1456,8 @@ def _make_index(self, data, alldata, columns, indexnamerow=False): if not self._name_processed: (self.index_names, _, self.index_col) = _clean_index_names(list(columns), - self.index_col) + self.index_col, + self.unnamed_cols) self._name_processed = True index = 
self._get_complex_date_index(data, columns) index = self._agg_index(index, try_parse_dates=False) @@ -1732,6 +1735,7 @@ def __init__(self, src, **kwds): kwds['usecols'] = self.usecols self._reader = parsers.TextReader(src, **kwds) + self.unnamed_cols = self._reader.unnamed_cols passed_names = self.names is None @@ -1792,7 +1796,8 @@ def __init__(self, src, **kwds): self._name_processed = True (index_names, self.names, self.index_col) = _clean_index_names(self.names, - self.index_col) + self.index_col, + self.unnamed_cols) if self.index_names is None: self.index_names = index_names @@ -1966,7 +1971,8 @@ def _get_index_names(self): if self._reader.leading_cols == 0 and self.index_col is not None: (idx_names, names, - self.index_col) = _clean_index_names(names, self.index_col) + self.index_col) = _clean_index_names(names, self.index_col, + self.unnamed_cols) return names, idx_names @@ -2112,7 +2118,8 @@ def __init__(self, f, **kwds): # Get columns in two steps: infer from data, then # infer column indices from self.usecols if it is specified. self._col_indices = None - self.columns, self.num_original_columns = self._infer_columns() + (self.columns, self.num_original_columns, + self.unnamed_cols) = self._infer_columns() # Now self.columns has the set of columns that we will process. # The original set is stored in self.original_columns. 
@@ -2367,6 +2374,8 @@ def _infer_columns(self): names = self.names num_original_columns = 0 clear_buffer = True + unnamed_cols = set() + if self.header is not None: header = self.header @@ -2400,7 +2409,7 @@ def _infer_columns(self): if clear_buffer: self._clear_buffer() columns.append([None] * len(columns[-1])) - return columns, num_original_columns + return columns, num_original_columns, unnamed_cols if not self.names: raise EmptyDataError( @@ -2408,16 +2417,19 @@ def _infer_columns(self): line = self.names[:] - unnamed_count = 0 this_columns = [] + this_unnamed_cols = [] + for i, c in enumerate(line): if c == '': if have_mi_columns: - this_columns.append('Unnamed: %d_level_%d' - % (i, level)) + col_name = ("Unnamed: {i}_level_{level}" + .format(i=i, level=level)) else: - this_columns.append('Unnamed: %d' % i) - unnamed_count += 1 + col_name = "Unnamed: {i}".format(i=i) + + this_unnamed_cols.append(i) + this_columns.append(col_name) else: this_columns.append(c) @@ -2443,12 +2455,17 @@ def _infer_columns(self): lc = len(this_columns) ic = (len(self.index_col) if self.index_col is not None else 0) + unnamed_count = len(this_unnamed_cols) + if lc != unnamed_count and lc - ic > unnamed_count: clear_buffer = False this_columns = [None] * lc self.buf = [self.buf[-1]] columns.append(this_columns) + unnamed_cols.update({this_columns[i] + for i in this_unnamed_cols}) + if len(columns) == 1: num_original_columns = len(this_columns) @@ -2513,7 +2530,7 @@ def _infer_columns(self): columns = [names] num_original_columns = ncols - return columns, num_original_columns + return columns, num_original_columns, unnamed_cols def _handle_usecols(self, columns, usecols_key): """ @@ -2879,7 +2896,8 @@ def _get_index_name(self, columns): else: # Case 2 (index_name, columns_, - self.index_col) = _clean_index_names(columns, self.index_col) + self.index_col) = _clean_index_names(columns, self.index_col, + self.unnamed_cols) return index_name, orig_names, columns @@ -3178,7 +3196,7 @@ def 
_clean_na_values(na_values, keep_default_na=True): return na_values, na_fvalues -def _clean_index_names(columns, index_col): +def _clean_index_names(columns, index_col, unnamed_cols): if not _is_index_col(index_col): return None, columns, index_col @@ -3203,10 +3221,10 @@ def _clean_index_names(columns, index_col): columns.remove(name) index_names.append(name) - # hack - if (isinstance(index_names[0], compat.string_types) and - 'Unnamed' in index_names[0]): - index_names[0] = None + # Only clean index names that were placeholders. + for i, name in enumerate(index_names): + if isinstance(name, compat.string_types) and name in unnamed_cols: + index_names[i] = None return index_names, columns, index_col diff --git a/pandas/tests/io/parser/index_col.py b/pandas/tests/io/parser/index_col.py index 2909ef6214e621..ba54ed4620199b 100644 --- a/pandas/tests/io/parser/index_col.py +++ b/pandas/tests/io/parser/index_col.py @@ -141,3 +141,31 @@ def test_empty_with_index_col_false(self): result = self.read_csv(StringIO(data), index_col=False) expected = DataFrame([], columns=['x', 'y']) tm.assert_frame_equal(result, expected) + + @pytest.mark.parametrize("index_names", [ + ["", ""], + ["foo", ""], + ["", "bar"], + ["foo", "bar"], + ["NotReallyUnnamed", "Unnamed: 0"], + ]) + def test_multi_index_naming(self, index_names): + # We don't want empty index names being replaced with "Unnamed: 0" + data = ",".join(index_names + ["col\na,c,1\na,d,2\nb,c,3\nb,d,4"]) + result = self.read_csv(StringIO(data), index_col=[0, 1]) + + expected = DataFrame({"col": [1, 2, 3, 4]}, + index=MultiIndex.from_product([["a", "b"], + ["c", "d"]])) + expected.index.names = [name if name else None for name in index_names] + tm.assert_frame_equal(result, expected) + + def test_multi_index_naming_not_all_at_beginning(self): + data = ",Unnamed: 2,\na,c,1\na,d,2\nb,c,3\nb,d,4" + result = self.read_csv(StringIO(data), index_col=[0, 2]) + + expected = DataFrame({"Unnamed: 2": ["c", "d", "c", "d"]}, + 
index=MultiIndex( + levels=[['a', 'b'], [1, 2, 3, 4]], + labels=[[0, 0, 1, 1], [0, 1, 2, 3]])) + tm.assert_frame_equal(result, expected)