Skip to content

Commit

Permalink
BUG: Fix handling of missing CSV MI column names (pandas-dev#23484)
Browse files Browse the repository at this point in the history
  • Loading branch information
gfyoung authored and tm9k1 committed Nov 19, 2018
1 parent 4c08672 commit d2f34db
Show file tree
Hide file tree
Showing 4 changed files with 80 additions and 22 deletions.
1 change: 1 addition & 0 deletions doc/source/whatsnew/v0.24.0.txt
Expand Up @@ -1284,6 +1284,7 @@ Notice how we now instead output ``np.nan`` itself instead of a stringified form
- Bug in :func:`to_string()` that broke column alignment when ``index=False`` and width of first column's values is greater than the width of first column's header (:issue:`16839`, :issue:`13032`)
- Bug in :func:`DataFrame.to_csv` where a single level MultiIndex incorrectly wrote a tuple. Now just the value of the index is written (:issue:`19589`).
- Bug in :meth:`HDFStore.append` when appending a :class:`DataFrame` with an empty string column and ``min_itemsize`` < 8 (:issue:`12242`)
- Bug in :meth:`read_csv()` in which :class:`MultiIndex` index names were being improperly handled when they were not provided (:issue:`23484`)

Plotting
^^^^^^^^
Expand Down
19 changes: 15 additions & 4 deletions pandas/_libs/parsers.pyx
Expand Up @@ -302,6 +302,7 @@ cdef class TextReader:
object tupleize_cols
object usecols
list dtype_cast_order
set unnamed_cols
set noconvert

def __cinit__(self, source,
Expand Down Expand Up @@ -536,7 +537,7 @@ cdef class TextReader:
self.header = [ header ]

self.names = names
self.header, self.table_width = self._get_header()
self.header, self.table_width, self.unnamed_cols = self._get_header()

if not self.table_width:
raise EmptyDataError("No columns to parse from file")
Expand Down Expand Up @@ -720,13 +721,15 @@ cdef class TextReader:
cdef:
Py_ssize_t i, start, field_count, passed_count, unnamed_count # noqa
char *word
object name
object name, old_name
int status
int64_t hr, data_line
char *errors = "strict"
cdef StringPath path = _string_path(self.c_encoding)

header = []
unnamed_cols = set()

if self.parser.header_start >= 0:

# Header is in the file
Expand Down Expand Up @@ -759,6 +762,7 @@ cdef class TextReader:

counts = {}
unnamed_count = 0

for i in range(field_count):
word = self.parser.words[start + i]

Expand All @@ -770,6 +774,9 @@ cdef class TextReader:
name = PyUnicode_Decode(word, strlen(word),
self.c_encoding, errors)

# We use this later when collecting placeholder names.
old_name = name

if name == '':
if self.has_mi_columns:
name = ('Unnamed: {i}_level_{lvl}'
Expand All @@ -786,6 +793,9 @@ cdef class TextReader:
name = '%s.%d' % (name, count)
count = counts.get(name, 0)

if old_name == '':
unnamed_cols.add(name)

this_header.append(name)
counts[name] = count + 1

Expand All @@ -798,6 +808,7 @@ cdef class TextReader:
lc = len(this_header)
ic = (len(self.index_col) if self.index_col
is not None else 0)

if lc != unnamed_count and lc - ic > unnamed_count:
hr -= 1
self.parser_start -= 1
Expand Down Expand Up @@ -830,7 +841,7 @@ cdef class TextReader:
if self.parser.lines < 1:
self._tokenize_rows(1)

return None, self.parser.line_fields[0]
return None, self.parser.line_fields[0], unnamed_cols

# Corner case, not enough lines in the file
if self.parser.lines < data_line + 1:
Expand Down Expand Up @@ -864,7 +875,7 @@ cdef class TextReader:
elif self.allow_leading_cols and passed_count < field_count:
self.leading_cols = field_count - passed_count

return header, field_count
return header, field_count, unnamed_cols

def read(self, rows=None):
"""
Expand Down
54 changes: 36 additions & 18 deletions pandas/io/parsers.py
Expand Up @@ -1265,6 +1265,7 @@ def __init__(self, kwds):
self.prefix = kwds.pop('prefix', None)

self.index_col = kwds.get('index_col', None)
self.unnamed_cols = set()
self.index_names = None
self.col_names = None

Expand Down Expand Up @@ -1374,7 +1375,8 @@ def _extract_multi_indexer_columns(self, header, index_names, col_names,
# clean the index_names
index_names = header.pop(-1)
index_names, names, index_col = _clean_index_names(index_names,
self.index_col)
self.index_col,
self.unnamed_cols)

# extract the columns
field_count = len(header[0])
Expand Down Expand Up @@ -1454,7 +1456,8 @@ def _make_index(self, data, alldata, columns, indexnamerow=False):
if not self._name_processed:
(self.index_names, _,
self.index_col) = _clean_index_names(list(columns),
self.index_col)
self.index_col,
self.unnamed_cols)
self._name_processed = True
index = self._get_complex_date_index(data, columns)
index = self._agg_index(index, try_parse_dates=False)
Expand Down Expand Up @@ -1732,6 +1735,7 @@ def __init__(self, src, **kwds):
kwds['usecols'] = self.usecols

self._reader = parsers.TextReader(src, **kwds)
self.unnamed_cols = self._reader.unnamed_cols

passed_names = self.names is None

Expand Down Expand Up @@ -1792,7 +1796,8 @@ def __init__(self, src, **kwds):
self._name_processed = True
(index_names, self.names,
self.index_col) = _clean_index_names(self.names,
self.index_col)
self.index_col,
self.unnamed_cols)

if self.index_names is None:
self.index_names = index_names
Expand Down Expand Up @@ -1966,7 +1971,8 @@ def _get_index_names(self):

if self._reader.leading_cols == 0 and self.index_col is not None:
(idx_names, names,
self.index_col) = _clean_index_names(names, self.index_col)
self.index_col) = _clean_index_names(names, self.index_col,
self.unnamed_cols)

return names, idx_names

Expand Down Expand Up @@ -2112,7 +2118,8 @@ def __init__(self, f, **kwds):
# Get columns in two steps: infer from data, then
# infer column indices from self.usecols if it is specified.
self._col_indices = None
self.columns, self.num_original_columns = self._infer_columns()
(self.columns, self.num_original_columns,
self.unnamed_cols) = self._infer_columns()

# Now self.columns has the set of columns that we will process.
# The original set is stored in self.original_columns.
Expand Down Expand Up @@ -2367,6 +2374,8 @@ def _infer_columns(self):
names = self.names
num_original_columns = 0
clear_buffer = True
unnamed_cols = set()

if self.header is not None:
header = self.header

Expand Down Expand Up @@ -2400,24 +2409,27 @@ def _infer_columns(self):
if clear_buffer:
self._clear_buffer()
columns.append([None] * len(columns[-1]))
return columns, num_original_columns
return columns, num_original_columns, unnamed_cols

if not self.names:
raise EmptyDataError(
"No columns to parse from file")

line = self.names[:]

unnamed_count = 0
this_columns = []
this_unnamed_cols = []

for i, c in enumerate(line):
if c == '':
if have_mi_columns:
this_columns.append('Unnamed: %d_level_%d'
% (i, level))
col_name = ("Unnamed: {i}_level_{level}"
.format(i=i, level=level))
else:
this_columns.append('Unnamed: %d' % i)
unnamed_count += 1
col_name = "Unnamed: {i}".format(i=i)

this_unnamed_cols.append(i)
this_columns.append(col_name)
else:
this_columns.append(c)

Expand All @@ -2443,12 +2455,17 @@ def _infer_columns(self):
lc = len(this_columns)
ic = (len(self.index_col)
if self.index_col is not None else 0)
unnamed_count = len(this_unnamed_cols)

if lc != unnamed_count and lc - ic > unnamed_count:
clear_buffer = False
this_columns = [None] * lc
self.buf = [self.buf[-1]]

columns.append(this_columns)
unnamed_cols.update({this_columns[i]
for i in this_unnamed_cols})

if len(columns) == 1:
num_original_columns = len(this_columns)

Expand Down Expand Up @@ -2513,7 +2530,7 @@ def _infer_columns(self):
columns = [names]
num_original_columns = ncols

return columns, num_original_columns
return columns, num_original_columns, unnamed_cols

def _handle_usecols(self, columns, usecols_key):
"""
Expand Down Expand Up @@ -2879,7 +2896,8 @@ def _get_index_name(self, columns):
else:
# Case 2
(index_name, columns_,
self.index_col) = _clean_index_names(columns, self.index_col)
self.index_col) = _clean_index_names(columns, self.index_col,
self.unnamed_cols)

return index_name, orig_names, columns

Expand Down Expand Up @@ -3178,7 +3196,7 @@ def _clean_na_values(na_values, keep_default_na=True):
return na_values, na_fvalues


def _clean_index_names(columns, index_col):
def _clean_index_names(columns, index_col, unnamed_cols):
if not _is_index_col(index_col):
return None, columns, index_col

Expand All @@ -3203,10 +3221,10 @@ def _clean_index_names(columns, index_col):
columns.remove(name)
index_names.append(name)

# hack
if (isinstance(index_names[0], compat.string_types) and
'Unnamed' in index_names[0]):
index_names[0] = None
# Only clean index names that were placeholders.
for i, name in enumerate(index_names):
if isinstance(name, compat.string_types) and name in unnamed_cols:
index_names[i] = None

return index_names, columns, index_col

Expand Down
28 changes: 28 additions & 0 deletions pandas/tests/io/parser/index_col.py
Expand Up @@ -141,3 +141,31 @@ def test_empty_with_index_col_false(self):
result = self.read_csv(StringIO(data), index_col=False)
expected = DataFrame([], columns=['x', 'y'])
tm.assert_frame_equal(result, expected)

@pytest.mark.parametrize("index_names", [
    ["", ""],
    ["foo", ""],
    ["", "bar"],
    ["foo", "bar"],
    ["NotReallyUnnamed", "Unnamed: 0"],
])
def test_multi_index_naming(self, index_names):
    # Empty index names must come back as None, not be replaced
    # with "Unnamed: 0"-style placeholder names.
    csv_text = ",".join(index_names + ["col\na,c,1\na,d,2\nb,c,3\nb,d,4"])
    result = self.read_csv(StringIO(csv_text), index_col=[0, 1])

    mi = MultiIndex.from_product([["a", "b"], ["c", "d"]])
    expected = DataFrame({"col": [1, 2, 3, 4]}, index=mi)
    expected.index.names = [n or None for n in index_names]
    tm.assert_frame_equal(result, expected)

def test_multi_index_naming_not_all_at_beginning(self):
    # Index columns need not be contiguous at the start of the row:
    # here index_col picks columns 0 and 2, with a named data column
    # ("Unnamed: 2") sitting between them in the header.
    csv_text = ",Unnamed: 2,\na,c,1\na,d,2\nb,c,3\nb,d,4"
    result = self.read_csv(StringIO(csv_text), index_col=[0, 2])

    idx = MultiIndex(levels=[['a', 'b'], [1, 2, 3, 4]],
                     labels=[[0, 0, 1, 1], [0, 1, 2, 3]])
    expected = DataFrame({"Unnamed: 2": ["c", "d", "c", "d"]}, index=idx)
    tm.assert_frame_equal(result, expected)

0 comments on commit d2f34db

Please sign in to comment.