Skip to content

Commit

Permalink
BUG: io: Stop guessing the data delimiter in ARFF files.
Browse files Browse the repository at this point in the history
In the ARFF reader, there were several dozen lines of code that
determined whether the delimiter in the @data section of the ARFF
file was a comma or a space.  This code has been removed.  The ARFF
file format specification says the delimiter must be a comma.

As a side effect, this closes gh-5276.  'loadarff' can now handle
a file with no data in the @data section.
  • Loading branch information
WarrenWeckesser authored and sumitbinnani committed Oct 9, 2015
1 parent 2ac949b commit 802a07f
Show file tree
Hide file tree
Showing 3 changed files with 27 additions and 54 deletions.
55 changes: 1 addition & 54 deletions scipy/io/arff/arffread.py
Expand Up @@ -398,39 +398,6 @@ def safe_date(value, date_format, datetime_unit):
return np.datetime64(dt).astype("datetime64[%s]" % datetime_unit)


def get_delim(line):
    """Determine the field delimiter used in a line of ARFF data.

    A comma takes precedence over a space when both occur in the line.

    Parameters
    ----------
    line : str
        A single line from the @data section of an ARFF file.

    Returns
    -------
    delim : {',', ' '}
        The detected delimiter character.

    Raises
    ------
    ValueError
        If the line contains neither a comma nor a space.

    Examples
    --------
    >>> get_delim('1,2,3')
    ','
    >>> get_delim('1 2 3')
    ' '
    >>> get_delim(', ')
    ','
    """
    # Check candidates in priority order: comma wins over space.
    for candidate in (',', ' '):
        if candidate in line:
            return candidate
    raise ValueError("delimiter not understood: " + line)


class MetaData(object):
"""Small container to keep useful informations on a ARFF dataset.
Expand Down Expand Up @@ -630,26 +597,6 @@ def _loadarff(ofile):

ni = len(convertors)

# Get the delimiter from the first line of data:
def next_data_line(row_iter):
    """Return the next line that holds actual data.

    Assumes the iterator is already positioned inside the data
    section (i.e. past the @data marker).  Blank lines and comment
    lines are skipped; StopIteration propagates if the iterator is
    exhausted before a data line is found.
    """
    line = next(row_iter)
    while r_comment.match(line) or r_empty.match(line):
        line = next(row_iter)
    return line

try:
try:
dtline = next_data_line(ofile)
delim = get_delim(dtline)
except ValueError as e:
raise ParseArffError("Error while parsing delimiter: " + str(e))
finally:
ofile.seek(0, 0)
ofile = go_data(ofile)
# skip the @data line
next(ofile)

def generator(row_iter, delim=','):
# TODO: this is where we are spending times (~80%). I think things
# could be made more efficiently:
Expand Down Expand Up @@ -681,7 +628,7 @@ def generator(row_iter, delim=','):
row = raw.split(delim)
yield tuple([convertors[i](row[i]) for i in elems])

a = generator(ofile, delim=delim)
a = generator(ofile)
# No error should happen here: it is a bug otherwise
data = np.fromiter(a, descr)
return data, meta
Expand Down
11 changes: 11 additions & 0 deletions scipy/io/arff/tests/data/nodata.arff
@@ -0,0 +1,11 @@
@RELATION iris

@ATTRIBUTE sepallength REAL
@ATTRIBUTE sepalwidth REAL
@ATTRIBUTE petallength REAL
@ATTRIBUTE petalwidth REAL
@ATTRIBUTE class {Iris-setosa,Iris-versicolor,Iris-virginica}

@DATA

% This file has no data
15 changes: 15 additions & 0 deletions scipy/io/arff/tests/test_arffread.py
Expand Up @@ -83,6 +83,21 @@ def test_missing(self):
assert_array_almost_equal(data[i], expect_missing[i])


class NoDataTest(TestCase):
    def test_nodata(self):
        # nodata.arff declares five attributes but its @DATA section is
        # empty; loadarff should return a zero-length structured array
        # whose dtype is built from the header alone.
        filename = os.path.join(data_path, 'nodata.arff')
        data, meta = loadarff(filename)
        fields = [('sepallength', '<f8'),
                  ('sepalwidth', '<f8'),
                  ('petallength', '<f8'),
                  ('petalwidth', '<f8'),
                  ('class', 'S15')]
        assert_equal(data.dtype, np.dtype(fields))
        assert_equal(data.size, 0)


class HeaderTest(TestCase):
def test_type_parsing(self):
# Test parsing type of attribute from their value.
Expand Down

0 comments on commit 802a07f

Please sign in to comment.