Skip to content
This repository has been archived by the owner on May 7, 2023. It is now read-only.

Commit

Permalink
Merge 1a8a1b3 into 1ef2d46
Browse files Browse the repository at this point in the history
  • Loading branch information
caseycrogers committed Dec 11, 2020
2 parents 1ef2d46 + 1a8a1b3 commit 8347f88
Show file tree
Hide file tree
Showing 5 changed files with 254 additions and 18 deletions.
1 change: 1 addition & 0 deletions README.md
Expand Up @@ -13,6 +13,7 @@ From csv:
```dart
final df = await DataFrame.fromCsv("dataset/stocks.csv");
```
`fromCsv` parses files according to the csv standard, including support for escaped double quotes (see: [RFC4180](https://tools.ietf.org/html/rfc4180)).

Note: the types of the records are inferred from the data. The first line of the csv must contain the headers for the column names. Optional parameters:

Expand Down
52 changes: 34 additions & 18 deletions lib/src/df.dart
Expand Up @@ -3,6 +3,7 @@ import 'dart:collection';
import 'dart:convert';
import 'dart:io';

import 'package:df/src/util/csv_parser.dart';
import 'package:jiffy/jiffy.dart';
import 'package:meta/meta.dart';

Expand Down Expand Up @@ -58,7 +59,7 @@ class DataFrame {
DataFrame.fromRows(List<Map<String, dynamic>> rows)
: assert(rows != null),
assert(rows.isNotEmpty) {
// create _columns from the first datapint
// create _columns from the first datapoint
rows[0].forEach((k, dynamic v) {
final t = v.runtimeType as Type;
_columns.add(DataFrameColumn(name: k, type: t));
Expand All @@ -67,7 +68,7 @@ class DataFrame {
rows.forEach((row) => _matrix.addRow(row, _columnsIndices()));
}

static List<dynamic> _parseLine(
static List<dynamic> _parseVals(
List<dynamic> vals, List<DataFrameColumn> columnsNames,
{String dateFormat,
String timestampCol,
Expand Down Expand Up @@ -113,26 +114,18 @@ class DataFrame {
return colValues;
}

/// Build a dataframe from a csv file
static Future<DataFrame> fromCsv(String path,
/// Build a dataframe from a utf8 encoded stream of comma separated strings
static Future<DataFrame> fromStream(Stream<String> stream,
{String dateFormat,
String timestampCol,
TimestampFormat timestampFormat = TimestampFormat.milliseconds,
bool verbose = false}) async {
final file = File(path);
if (!file.existsSync()) {
throw FileNotFoundException("File not found: $path");
}
String timestampCol,
TimestampFormat timestampFormat = TimestampFormat.milliseconds,
bool verbose = false}) async {
final df = DataFrame();
var i = 1;
List<String> _colNames;
await file
.openRead()
.transform<String>(utf8.decoder)
.transform<String>(const LineSplitter())
.forEach((line) {
await stream.forEach((line) {
//print('line $i: $line');
final vals = line.split(",");
final vals = CsvParser.parseLine(line);
if (i == 1) {
// set columns names
_colNames = vals;
Expand All @@ -151,7 +144,7 @@ class DataFrame {
++vi;
});
}
final colValues = _parseLine(vals, df._columns,
final colValues = _parseVals(vals, df._columns,
dateFormat: dateFormat,
timestampCol: timestampCol,
timestampFormat: timestampFormat);
Expand All @@ -165,6 +158,29 @@ class DataFrame {
return df;
}

/// Reads the csv file at [path] and builds a dataframe from its lines
///
/// The file is decoded as utf8, split into lines and handed to [fromStream]
/// together with every optional parameter, unchanged.
///
/// Throws a [FileNotFoundException] if no file exists at [path].
static Future<DataFrame> fromCsv(String path,
    {String dateFormat,
    String timestampCol,
    TimestampFormat timestampFormat = TimestampFormat.milliseconds,
    bool verbose = false}) async {
  final csvFile = File(path);
  if (csvFile.existsSync()) {
    // decode the raw bytes and feed them to the parser one line at a time
    final lines = csvFile
        .openRead()
        .transform<String>(utf8.decoder)
        .transform<String>(const LineSplitter());
    return fromStream(
      lines,
      dateFormat: dateFormat,
      timestampCol: timestampCol,
      timestampFormat: timestampFormat,
      verbose: verbose,
    );
  }
  throw FileNotFoundException("File not found: $path");
}

DataFrame._copyWithMatrix(DataFrame df, List<List<dynamic>> matrix) {
_columns = df._columns;
_matrix.data = matrix;
Expand Down
76 changes: 76 additions & 0 deletions lib/src/util/csv_parser.dart
@@ -0,0 +1,76 @@
/// CsvParser parses a single line of csv text into a list of field values
/// compliant with the CSV standard
///
/// See RFC4180 for details on the CSV standard. Input is handled one line at
/// a time, so a quoted field is expected to open and close on the same line.
class CsvParser {
  /// Takes a single line and parses it into a list of values according to the
  /// csv standard (see RFC4180)
  ///
  /// A line with `n` separating commas always yields `n + 1` values: an empty
  /// line yields one empty value and a trailing comma yields a trailing empty
  /// value.
  ///
  /// Throws a [FormatException] if an unquoted field contains a double quote,
  /// if a quoted field is never closed, or if a closing quote is followed by
  /// anything other than a comma or the end of the line.
  static List<String> parseLine(String line) {
    final records = <String>[];
    var i = 0;
    var hasMoreFields = true;
    while (hasMoreFields) {
      final record = StringBuffer();
      if (i < line.length && line[i] == "\"") {
        // if the csv field begins with a double quote, parse it with
        // proper character escaping - see RFC4180 2.5-2.7
        i = parseEscapedField(line, record, i);
        if (i < line.length && line[i] != ",") {
          // anything between the closing quote and the next comma would
          // otherwise be silently dropped
          throw FormatException(
              "A quoted field was followed by a character other than a comma. "
              "See section 2.5 of https://tools.ietf.org/html/rfc4180.\n"
              "character $i of line:\n$line\n");
        }
      } else {
        // also reached with i == line.length (empty line or trailing comma),
        // in which case parseField writes nothing and the field is blank
        i = parseField(line, record, i);
      }
      records.add(record.toString());
      // i is now on a comma or at the end of the line; a comma means that
      // another (possibly blank) field follows
      hasMoreFields = i < line.length;
      // increment past the current char (a comma or EOL)
      i++;
    }
    return records;
  }

  /// Parse and write chars to [record] until a comma or the end of [line] is
  /// reached, then return the index after the last char consumed
  ///
  /// Throws a [FormatException] if a double quote is found in the field.
  static int parseField(String line, StringBuffer record, int startIndex) {
    var i = startIndex;
    while (i < line.length && line[i] != ",") {
      if (line[i] == "\"") {
        throw FormatException("A field contained an unescaped double quote. "
            "See section 2.5 of https://tools.ietf.org/html/rfc4180.\n"
            "character $i of line:\n$line\n");
      }
      record.write(line[i]);
      i++;
    }
    return i;
  }

  /// Like [parseField], but with support for character escaping
  ///
  /// [startIndex] must point at the opening double quote. The enclosing
  /// quotes are not written to [record]; a pair of double quotes inside the
  /// field is written as a single double quote. Returns the index right after
  /// the closing double quote.
  ///
  /// Throws a [FormatException] if the end of [line] is reached before the
  /// closing double quote.
  static int parseEscapedField(
      String line, StringBuffer record, int startIndex) {
    var i = startIndex;
    assert(
        line[i] == "\"",
        "parseEscapedField was called on an unescaped field at"
        " char $i of line $line");
    // increment past the first char (a double quote)
    i++;
    while (i < line.length) {
      if (line[i] == "\"") {
        if (i + 1 < line.length && line[i + 1] == "\"") {
          // A double quote followed by a double quote is escaped - increment
          // past this double quote and write the next one to record
          i++;
        } else {
          // Single double quote, this is the end of the escaped sequence
          return i + 1;
        }
      }
      record.write(line[i]);
      i++;
    }
    // reached end of line without closing the escape quote
    throw FormatException(
        "A field contained an escape quote without a closing escape quote. "
        "See section 2.5 of https://tools.ietf.org/html/rfc4180.\n"
        "character $i of line:\n$line\n");
  }
}
94 changes: 94 additions & 0 deletions test/csv_parser_test.dart
@@ -0,0 +1,94 @@
import 'package:df/src/util/csv_parser.dart';
import 'package:test/test.dart';

/// Unit tests for the static parsing helpers of [CsvParser].
void main() {
  test("test parseField", () {
    const line = "a,bc,def";

    // parse first field
    final first = StringBuffer();
    expect(CsvParser.parseField(line, first, 0), 1);
    expect(first.toString(), "a");

    // parse second field with two chars
    final second = StringBuffer();
    expect(CsvParser.parseField(line, second, 2), 4);
    expect(second.toString(), "bc");

    // parse final field
    final last = StringBuffer();
    expect(CsvParser.parseField(line, last, 5), 8);
    expect(last.toString(), "def");

    // a double quote in an unescaped field throws an error
    expect(() => CsvParser.parseField("a,b\",c", StringBuffer(), 2),
        throwsA(isA<FormatException>()));
  });

  test("test parseEscapedField", () {
    const quotedLine = "\"a\",\"b,c\"";

    // escape quotes aren't added to record
    final plain = StringBuffer();
    expect(CsvParser.parseEscapedField(quotedLine, plain, 0), 3);
    expect(plain.toString(), "a");

    // parse an escaped field with a comma
    final withComma = StringBuffer();
    expect(CsvParser.parseEscapedField(quotedLine, withComma, 4), 9);
    expect(withComma.toString(), "b,c");

    // A properly escaped double quote is added to record
    final withQuote = StringBuffer();
    expect(CsvParser.parseEscapedField("a,\"b\"\"\",c", withQuote, 2), 7);
    expect(withQuote.toString(), "b\"");

    // A FormatException is thrown if there's a hanging escape quote
    expect(() => CsvParser.parseEscapedField("a,\"b,c", StringBuffer(), 2),
        throwsA(isA<FormatException>()));
  });

  test("test parseLine", () {
    // parse a generic line with no escaping
    expect(CsvParser.parseLine("a,bc,def"), <dynamic>["a", "bc", "def"]);

    // parse a generic line with a blank final field
    expect(CsvParser.parseLine("a,b,"), <dynamic>["a", "b", ""]);

    // parse a line with basic escaping
    expect(CsvParser.parseLine("a,\"bc\",\"def\""),
        <dynamic>["a", "bc", "def"]);

    // parse a line with escaped commas
    expect(CsvParser.parseLine("a,\"b,c\",\"d,e,f\""),
        <dynamic>["a", "b,c", "d,e,f"]);

    // parse a line with escaped double quotes and commas
    expect(CsvParser.parseLine("a,\"b\"\"c\",\"d,e,f\"\"\""),
        <dynamic>["a", "b\"c", "d,e,f\""]);

    // parse a line with an unclosed escape quote
    expect(() => CsvParser.parseLine("a,\"b\"\",c"),
        throwsA(isA<FormatException>()));
  });
}
49 changes: 49 additions & 0 deletions test/df_test.dart
Expand Up @@ -85,6 +85,13 @@ void main() {
expect(e.runtimeType.toString() == "FileNotFoundException", true);
expect(e.message, 'File not found: /wrong/path');
});

df = await DataFrame.fromCsv("test/data/data_timestamp_s.csv",
timestampCol: "timestamp",
timestampFormat: TimestampFormat.seconds,
verbose: true)
..show();
expect(df.columnsNames, <String>["symbol", "price", "n", "timestamp"]);
});

test("subset", () async {
Expand Down Expand Up @@ -245,6 +252,48 @@ void main() {
edf.dataset = dataset;
expect(edf.dataset, dataset);
});

test("from stream", () async {
final inputStream = Stream<String>.fromIterable([
"a,b",
"1,2"
]);
df = await DataFrame.fromStream(inputStream);
expect(df.columnsNames, ["a","b"]);
expect(df.rows.toList(), [{"a": 1, "b": 2}]);
});

test("escape quotes are consumed", () async {
final inputStream = Stream<String>.fromIterable([
"a,\"b\"",
"1,\"2\""
]);
df = await DataFrame.fromStream(inputStream);
// Escape quotes should be consumed during parsing
expect(df.columnsNames, ["a","b"]);
expect(df.rows.toList(), [{"a": 1, "b": 2}]);
});

test("commas and double quotes are properly escaped", () async {
var inputStream = Stream<String>.fromIterable([
"a,\"b,c\"",
"1,\"2,3\""
]);
df = await DataFrame.fromStream(inputStream);
// Escape quotes should be consumed during parsing
expect(df.columnsNames, ["a","b,c"]);
expect(df.rows.toList(), [{"a": 1, "b,c": "2,3"}]);

inputStream = Stream<String>.fromIterable([
"a,\"b,c\"",
"\"\"\"They may say I'm a dreamer, but I'm not\"\"\",\"2,3\""
]);
df = await DataFrame.fromStream(inputStream);
// within an escaped sequence, double quotes can be included by replacing
// them with two double quotes - RFC4180-2.7
expect(df.columnsNames, ["a","b,c"]);
expect(df.rows.toList(), [{"a": "\"They may say I'm a dreamer, but I'm not\"", "b,c": "2,3"}]);
});
}

/// Empty subclass of [DataFrame] that declares no members of its own.
class ExtendedDf extends DataFrame {}

0 comments on commit 8347f88

Please sign in to comment.