use header option in batch loader
Xiaoji Chen committed Aug 6, 2019
1 parent 5b80b1f commit cadef9f
Showing 5 changed files with 99 additions and 19 deletions.
docs/api-reference/table-loaders/csv-loader.md (2 changes: 1 addition & 1 deletion)

```diff
@@ -19,6 +19,7 @@ The following options are passed on to [papaparse](https://www.papaparse.com/doc
 | Option            | Description |
 | ----------------- | ----------- |
 | `delimiter`=      | The delimiting character. By default auto-detects from a list of common delimiters (or `delimitersToGuess`). |
+| `header`=         | If `true`, the first row of parsed data will be interpreted as field names. If `false`, the first row is interpreted as data. By default auto-detects. |
 | `newline`=        | The newline sequence. By default auto-detects. Must be `\r`, `\n`, or `\r\n`. |
 | `quoteChar`=`"`   | The character used to quote fields. (Note: unquoted fields are parsed correctly). |
 | `escapeChar`=`"`  | The character used to escape the quote character within a field. |
@@ -37,7 +38,6 @@ Note that the following `papaparse` options are NOT supported by `CSVLoader` (th
 | Option             | Description                                                                  | Reason/Replacement |
 | ------------------ | ---------------------------------------------------------------------------- | ------------------------------------------- |
-| `header`=`false`   | If true, the first row of parsed data will be interpreted as field names. \* | Header is detected and parsed by `CSVLoader` |
 | `transformHeader`= | Function to apply on each header.                                            | (Only available in version 5.0)             |
 | `worker`           | Whether to use a worker thread.                                              | Use `CSVWorkerLoader` instead.              |
 | `step`             | Callback function for streaming.                                             | Use `loadInBatches` instead.                |
```
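For context, the new option is passed straight through the loader options. A minimal sketch, assuming the flat option style shown in the docs table above; the file name is illustrative:

```js
import {loadInBatches} from '@loaders.gl/core';
import {CSVLoader} from '@loaders.gl/csv';

async function main() {
  // header: false forces the first row to be treated as data;
  // omit the option to keep CSVLoader's auto-detection.
  const batches = await loadInBatches('sample.csv', CSVLoader, {header: false});
  for await (const batch of batches) {
    console.log(batch.length, batch.data);
  }
}
```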
modules/csv/src/csv-loader.js (11 changes: 6 additions & 5 deletions)

```diff
@@ -63,7 +63,8 @@ function parseCSVInBatches(asyncIterator, options) {

       // Check if we need to save a header row
       if (isFirstRow && !headerRow) {
-        if (isHeaderRow(row)) {
+        const {header = isHeaderRow(row)} = options;
+        if (header) {
           headerRow = row;
           return;
         }
@@ -128,19 +129,19 @@ function hasHeader(csvText, options) {
 }

 function deduceSchema(row, headerRow) {
-  const schema = {};
+  const schema = headerRow ? {} : [];
   for (let i = 0; i < row.length; i++) {
-    const columnName = (headerRow && headerRow[i]) || String(i);
+    const columnName = (headerRow && headerRow[i]) || i;
     const value = row[i];
     switch (typeof value) {
       case 'number':
       case 'boolean':
         // TODO - booleans could be handled differently...
-        schema[columnName] = {name: columnName, type: Float32Array};
+        schema[columnName] = {name: String(columnName), index: i, type: Float32Array};
         break;
       case 'string':
       default:
-        schema[columnName] = {name: columnName, type: Array};
+        schema[columnName] = {name: String(columnName), index: i, type: Array};
         // We currently only handle numeric rows
         // TODO we could offer a function to map strings to numbers?
     }
```
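To make the two schema shapes concrete, here is a standalone replica of the updated `deduceSchema` logic (illustration only, not the library export; the switch is folded into a ternary):

```js
function deduceSchema(row, headerRow) {
  // With a header row the schema is keyed by column name;
  // without one it is an array keyed by column index.
  const schema = headerRow ? {} : [];
  for (let i = 0; i < row.length; i++) {
    const columnName = (headerRow && headerRow[i]) || i;
    const value = row[i];
    const type =
      typeof value === 'number' || typeof value === 'boolean' ? Float32Array : Array;
    schema[columnName] = {name: String(columnName), index: i, type};
  }
  return schema;
}

deduceSchema(['A', 'B', 1], ['x', 'y', 'z']);
// => {x: {name: 'x', index: 0, type: Array}, ..., z: {name: 'z', index: 2, type: Float32Array}}

deduceSchema(['A', 'B', 1], null);
// => [{name: '0', index: 0, type: Array}, ..., {name: '2', index: 2, type: Float32Array}]
```

The new `index` field is what lets the table-batch classes below address columns positionally even when the CSV has no header row.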
modules/csv/test/csv-loader.spec.js (43 changes: 43 additions & 0 deletions)

```diff
@@ -8,6 +8,27 @@ const CSV_SAMPLE_URL = '@loaders.gl/csv/test/data/sample.csv';
 // const CSV_SAMLE_LONG_URL = '@loaders.gl/csv/test/data/sample-long.csv';
 const CSV_SAMPLE_VERY_LONG_URL = '@loaders.gl/csv/test/data/sample-very-long.csv';

+function validateColumn(column, length, type) {
+  if (column.length !== length) {
+    return `column length should be ${length}`;
+  }
+  let validator = null;
+  switch (type) {
+    case 'string':
+      validator = d => typeof d === 'string';
+      break;
+
+    case 'float':
+      validator = d => Number.isFinite(d);
+      break;
+
+    default:
+      return null;
+  }
+
+  return column.every(validator) ? true : `column elements are not all ${type}s`;
+}
+
 test('CSVLoader#load', async t => {
   const rows = await load(CSV_SAMPLE_URL, CSVLoader);
   t.is(rows.length, 2, 'Got correct table size');
@@ -38,6 +59,11 @@ test('CSVLoader#loadInBatches(sample.csv, columns)', async t => {
   for await (const batch of iterator) {
     t.comment(`BATCH ${batch.count}: ${batch.length} ${JSON.stringify(batch.data).slice(0, 200)}`);
     t.equal(batch.length, 2, 'Got correct batch size');
+
+    t.ok(validateColumn(batch.data[0], batch.length, 'string'), 'column 0 valid');
+    t.ok(validateColumn(batch.data[1], batch.length, 'string'), 'column 1 valid');
+    t.ok(validateColumn(batch.data[2], batch.length, 'float'), 'column 2 valid');
+
     batchCount++;
   }
   t.equal(batchCount, 1, 'Correct number of batches received');
@@ -56,6 +82,17 @@ test('CSVLoader#loadInBatches(sample-very-long.csv, columns)', async t => {
   for await (const batch of iterator) {
     t.comment(`BATCH ${batch.count}: ${batch.length} ${JSON.stringify(batch.data).slice(0, 200)}`);
     t.equal(batch.length, batchSize, 'Got correct batch size');
+
+    t.ok(validateColumn(batch.data.TLD, batch.length, 'string'), 'column TLD valid');
+    t.ok(
+      validateColumn(batch.data['meaning of life'], batch.length, 'float'),
+      'column meaning of life valid'
+    );
+    t.ok(
+      validateColumn(batch.data.placeholder, batch.length, 'string'),
+      'column placeholder valid'
+    );
+
     batchCount++;
     if (batchCount === 5) {
       break;
@@ -74,6 +111,7 @@ test('CSVLoader#loadInBatches(sample.csv, rows)', async t => {
   for await (const batch of iterator) {
     t.comment(`BATCH ${batch.count}: ${batch.length} ${JSON.stringify(batch.data).slice(0, 200)}`);
     t.equal(batch.length, 2, 'Got correct batch size');
+    t.deepEqual(batch.data[0], ['A', 'B', 1], 'Got correct first row');
     batchCount++;
   }
   t.equal(batchCount, 1, 'Correct number of batches received');
@@ -90,6 +128,11 @@ test('CSVLoader#loadInBatches(sample-very-long.csv, rows)', async t => {
   for await (const batch of iterator) {
     t.comment(`BATCH ${batch.count}: ${batch.length} ${JSON.stringify(batch.data).slice(0, 200)}`);
     t.equal(batch.length, batchSize, 'Got correct batch size');
+
+    t.ok(batch.data[0].TLD, 'first row has TLD value');
+    t.ok(batch.data[0]['meaning of life'], 'first row has meaning of life value');
+    t.ok(batch.data[0].placeholder, 'first row has placeholder value');
+
    batchCount++;
    if (batchCount === 5) {
      break;
```
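The `columns`/`rows` suffixes in the test names refer to the two batch layouts the tests exercise. A rough sketch inferred from the assertions above, with made-up values:

```js
// 'rows' format: batch.data is an array of rows; rows are plain arrays when
// the CSV has no header, or objects keyed by header name when it does.
const rowsBatch = {data: [['A', 'B', 1]], length: 1};

// 'columns' format: batch.data holds one array per column, addressed by
// index (headerless) or by column name (with a header row).
const columnsBatch = {data: {TLD: ['A'], placeholder: ['B']}, length: 1};
```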
modules/experimental/src/categories/table/columnar-table-batch.js (40 changes: 28 additions & 12 deletions)

```diff
@@ -14,7 +14,7 @@ export default class ColumnarTableBatch {
     // If user keeps pushing rows beyond batch size, reallocate
     this.reallocateColumns();
     for (const fieldName in row) {
-      this.columns[fieldName] = row[fieldName];
+      this.columns[fieldName][this.length] = row[fieldName];
     }
     this.length++;
   }
@@ -25,10 +25,18 @@
   }

   getNormalizedBatch() {
-    const columns = this.columns;
-    this.pruneColumns();
+    const columns = Array.isArray(this.schema) ? this.columns : {};
+
+    if (!Array.isArray(this.schema)) {
+      for (const fieldName in this.schema) {
+        const field = this.schema[fieldName];
+        columns[field.name] = this.columns[field.index];
+      }
+    }
+
     this.columns = null;
+    // TODO - Ensure column lengths are set to the actual loaded size
+    // this.pruneColumns();
+
     return {data: columns, schema: this.schema, length: this.length};
   }

@@ -40,22 +48,30 @@
     }

     this.allocated = this.allocated > 0 ? (this.allocated *= 2) : this.batchSize;
-    this.columns = {};
+    this.columns = [];

     for (const fieldName in this.schema) {
       const field = this.schema[fieldName];
       const ArrayType = field.type || Float32Array;
-      // const oldColumn = this.columns[fieldName];
-      this.columns[fieldName] = new ArrayType(this.allocated);
-
-      // Copy the old data to the new array
-      // if (oldColumn) {
-      //   copy(this.columns[fieldName], oldColumn);
-      // }
+      const oldColumn = this.columns[field.index];
+
+      if (oldColumn && ArrayBuffer.isView(oldColumn)) {
+        // Copy the old data to the new array
+        const typedArray = new ArrayType(this.allocated);
+        typedArray.set(oldColumn);
+        this.columns[field.index] = typedArray;
+      } else if (oldColumn) {
+        // Plain array
+        oldColumn.length = this.allocated;
+        this.columns[field.index] = oldColumn;
+      } else {
+        // Create new
+        this.columns[field.index] = new ArrayType(this.allocated);
+      }
     }
   }

   pruneColumns() {
-    return this.columns;
+    this.columns = this.columns.map(column => column.slice(0, this.length));
   }
 }
```
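The reallocation above is the usual capacity-doubling growth strategy, adapted for typed arrays, which have a fixed length and must be reallocated and copied. A minimal standalone sketch of that pattern (illustrative only, not the class API):

```js
// Grow a column to a new capacity, preserving its contents.
// Typed arrays cannot be resized, so growth is allocate-and-copy;
// plain JS arrays can simply have their length extended in place.
function growColumn(column, newCapacity) {
  if (ArrayBuffer.isView(column)) {
    const grown = new column.constructor(newCapacity);
    grown.set(column); // bulk-copy the old values into the new buffer
    return grown;
  }
  column.length = newCapacity;
  return column;
}

let column = new Float32Array(4).fill(1);
column = growColumn(column, 8); // Float32Array(8): [1, 1, 1, 1, 0, 0, 0, 0]
```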
modules/experimental/src/categories/table/row-table-batch.js (22 changes: 21 additions & 1 deletion)

```diff
@@ -4,6 +4,13 @@ export default class RowTableBatch {
     this.batchSize = batchSize;
     this.rows = null;
     this.length = 0;
+
+    if (!Array.isArray(schema)) {
+      this._headers = [];
+      for (const key in schema) {
+        this._headers[schema[key].index] = schema[key].name;
+      }
+    }
   }

   addRow(row) {
@@ -21,10 +28,23 @@

   getNormalizedBatch() {
     if (this.rows) {
-      const rows = this.rows.slice(0, this.length);
+      let rows = this.rows.slice(0, this.length);
+
+      if (this._headers) {
+        rows = rows.map(row => convertRowToObject(row, this._headers));
+      }
+
       this.rows = null;
       return {data: rows, schema: this.schema, length: rows.length};
     }
     return null;
   }
 }
+
+function convertRowToObject(row, headers) {
+  const result = {};
+  for (let i = 0; i < headers.length; i++) {
+    result[headers[i]] = row[i];
+  }
+  return result;
+}
```
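Because the schema is keyed by column name while `_headers` must be positional, the constructor inverts the mapping. A quick sketch, using a hypothetical schema shaped like `deduceSchema`'s output above:

```js
// Hypothetical schema; names and values are illustrative.
const schema = {
  id: {name: 'id', index: 0, type: Array},
  value: {name: 'value', index: 1, type: Float32Array}
};

// Invert the name-keyed schema into a positional header list.
const headers = [];
for (const key in schema) {
  headers[schema[key].index] = schema[key].name;
}
// headers => ['id', 'value']
// convertRowToObject(['A', 42], headers) => {id: 'A', value: 42}
```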
