Skip to content

Commit

Permalink
BUG: io: Stop guessing the data delimiter in ARFF files.
Browse files Browse the repository at this point in the history
In the ARFF reader, there were several dozen lines of code that
determined whether the delimiter in the @data section of the ARFF
file was a comma or a space.  This code has been removed.  The ARFF
file format specification says the delimiter must be a comma.

As a side effect, this closes gh-5276.  'loadarff' can now handle
a file with no data in the @data section.
  • Loading branch information
WarrenWeckesser authored and sumitbinnani committed Oct 9, 2015
1 parent 2ac949b commit 802a07f
Show file tree
Hide file tree
Showing 3 changed files with 27 additions and 54 deletions.
55 changes: 1 addition & 54 deletions scipy/io/arff/arffread.py
Expand Up @@ -398,39 +398,6 @@ def safe_date(value, date_format, datetime_unit):
return np.datetime64(dt).astype("datetime64[%s]" % datetime_unit)


def get_delim(line):
    """Determine the field delimiter used in a line of ARFF data.

    A comma takes precedence over a space when both occur in the line.

    Parameters
    ----------
    line : str
        A single line from the @data section of an ARFF file.

    Returns
    -------
    delim : {',', ' '}
        The detected delimiter character.

    Raises
    ------
    ValueError
        If the line contains neither a comma nor a space.

    Examples
    --------
    >>> get_delim('1,2,3')
    ','
    >>> get_delim('1 2 3')
    ' '
    >>> get_delim(', ')
    ','
    """
    # Check candidates in priority order: comma wins over space.
    for candidate in (',', ' '):
        if candidate in line:
            return candidate
    raise ValueError("delimiter not understood: " + line)


class MetaData(object):
"""Small container to keep useful informations on a ARFF dataset.
Expand Down Expand Up @@ -630,26 +597,6 @@ def _loadarff(ofile):

ni = len(convertors)

# Get the delimiter from the first line of data:
def next_data_line(row_iter):
    """Return the next line that holds actual data.

    Assumes the iterator is already positioned inside the data
    section (i.e. past the @data marker).  Blank lines and comment
    lines are skipped; StopIteration propagates if the iterator is
    exhausted before a data line is found.
    """
    line = next(row_iter)
    while r_comment.match(line) or r_empty.match(line):
        line = next(row_iter)
    return line

try:
try:
dtline = next_data_line(ofile)
delim = get_delim(dtline)
except ValueError as e:
raise ParseArffError("Error while parsing delimiter: " + str(e))
finally:
ofile.seek(0, 0)
ofile = go_data(ofile)
# skip the @data line
next(ofile)

def generator(row_iter, delim=','):
# TODO: this is where we are spending times (~80%). I think things
# could be made more efficiently:
Expand Down Expand Up @@ -681,7 +628,7 @@ def generator(row_iter, delim=','):
row = raw.split(delim)
yield tuple([convertors[i](row[i]) for i in elems])

a = generator(ofile, delim=delim)
a = generator(ofile)
# No error should happen here: it is a bug otherwise
data = np.fromiter(a, descr)
return data, meta
Expand Down
11 changes: 11 additions & 0 deletions scipy/io/arff/tests/data/nodata.arff
@@ -0,0 +1,11 @@
@RELATION iris

@ATTRIBUTE sepallength REAL
@ATTRIBUTE sepalwidth REAL
@ATTRIBUTE petallength REAL
@ATTRIBUTE petalwidth REAL
@ATTRIBUTE class {Iris-setosa,Iris-versicolor,Iris-virginica}

@DATA

% This file has no data
15 changes: 15 additions & 0 deletions scipy/io/arff/tests/test_arffread.py
Expand Up @@ -83,6 +83,21 @@ def test_missing(self):
assert_array_almost_equal(data[i], expect_missing[i])


class NoDataTest(TestCase):
    def test_nodata(self):
        # nodata.arff declares five attributes but its @DATA section is
        # empty; loadarff should return a zero-length structured array
        # whose dtype is built from the header alone.
        filename = os.path.join(data_path, 'nodata.arff')
        data, meta = loadarff(filename)
        fields = [('sepallength', '<f8'),
                  ('sepalwidth', '<f8'),
                  ('petallength', '<f8'),
                  ('petalwidth', '<f8'),
                  ('class', 'S15')]
        assert_equal(data.dtype, np.dtype(fields))
        assert_equal(data.size, 0)


class HeaderTest(TestCase):
def test_type_parsing(self):
# Test parsing type of attribute from their value.
Expand Down

0 comments on commit 802a07f

Please sign in to comment.