use header option in batch loader
Xiaoji Chen committed Aug 6, 2019
1 parent 5b80b1f commit cadef9f
Showing 5 changed files with 99 additions and 19 deletions.
docs/api-reference/table-loaders/csv-loader.md (2 changes: 1 addition & 1 deletion)

```diff
@@ -19,6 +19,7 @@ The following options are passed on to [papaparse](https://www.papaparse.com/doc
 | Option            | Description |
 | ----------------- | ----------- |
 | `delimiter`=      | The delimiting character. By default auto-detects from a list of common delimiters (or `delimitersToGuess`). |
+| `header`=         | If `true`, the first row of parsed data will be interpreted as field names. If `false`, the first row is interpreted as data. By default auto-detects. |
 | `newline`=        | The newline sequence. By default auto-detects. Must be `\r`, `\n`, or `\r\n`. |
 | `quoteChar`=`"`   | The character used to quote fields. (Note: unquoted fields are parsed correctly). |
 | `escapeChar`=`"`  | The character used to escape the quote character within a field. |
@@ -37,7 +38,6 @@ Note that the following `papaparse` options are NOT supported by `CSVLoader` (th
 | Option             | Description                                                                  | Reason/Replacement |
 | ------------------ | ---------------------------------------------------------------------------- | ------------------------------------------- |
-| `header`=`false`   | If true, the first row of parsed data will be interpreted as field names. \* | Header is detected and parsed by `CSVLoader` |
 | `transformHeader`= | Function to apply on each header.                                            | (Only available in version 5.0)             |
 | `worker`           | Whether to use a worker thread.                                              | Use `CSVWorkerLoader` instead.              |
 | `step`             | Callback function for streaming.                                             | Use `loadInBatches` instead.                |
```
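For context, the new option is passed straight through the loader options. A minimal sketch, assuming the flat option style shown in the docs table above; the file name is illustrative:

```js
import {loadInBatches} from '@loaders.gl/core';
import {CSVLoader} from '@loaders.gl/csv';

async function main() {
  // header: false forces the first row to be treated as data;
  // omit the option to keep CSVLoader's auto-detection.
  const batches = await loadInBatches('sample.csv', CSVLoader, {header: false});
  for await (const batch of batches) {
    console.log(batch.length, batch.data);
  }
}
```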
modules/csv/src/csv-loader.js (11 changes: 6 additions & 5 deletions)

```diff
@@ -63,7 +63,8 @@ function parseCSVInBatches(asyncIterator, options) {

       // Check if we need to save a header row
       if (isFirstRow && !headerRow) {
-        if (isHeaderRow(row)) {
+        const {header = isHeaderRow(row)} = options;
+        if (header) {
           headerRow = row;
           return;
         }
@@ -128,19 +129,19 @@ function hasHeader(csvText, options) {
 }

 function deduceSchema(row, headerRow) {
-  const schema = {};
+  const schema = headerRow ? {} : [];
   for (let i = 0; i < row.length; i++) {
-    const columnName = (headerRow && headerRow[i]) || String(i);
+    const columnName = (headerRow && headerRow[i]) || i;
     const value = row[i];
     switch (typeof value) {
       case 'number':
       case 'boolean':
         // TODO - booleans could be handled differently...
-        schema[columnName] = {name: columnName, type: Float32Array};
+        schema[columnName] = {name: String(columnName), index: i, type: Float32Array};
         break;
       case 'string':
       default:
-        schema[columnName] = {name: columnName, type: Array};
+        schema[columnName] = {name: String(columnName), index: i, type: Array};
         // We currently only handle numeric rows
         // TODO we could offer a function to map strings to numbers?
     }
```
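To make the two schema shapes concrete, here is a standalone replica of the updated `deduceSchema` logic (illustration only, not the library export; the switch is folded into a ternary):

```js
function deduceSchema(row, headerRow) {
  // With a header row the schema is keyed by column name;
  // without one it is an array keyed by column index.
  const schema = headerRow ? {} : [];
  for (let i = 0; i < row.length; i++) {
    const columnName = (headerRow && headerRow[i]) || i;
    const value = row[i];
    const type =
      typeof value === 'number' || typeof value === 'boolean' ? Float32Array : Array;
    schema[columnName] = {name: String(columnName), index: i, type};
  }
  return schema;
}

deduceSchema(['A', 'B', 1], ['x', 'y', 'z']);
// => {x: {name: 'x', index: 0, type: Array}, ..., z: {name: 'z', index: 2, type: Float32Array}}

deduceSchema(['A', 'B', 1], null);
// => [{name: '0', index: 0, type: Array}, ..., {name: '2', index: 2, type: Float32Array}]
```

The new `index` field is what lets the table-batch classes below address columns positionally even when the CSV has no header row.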
modules/csv/test/csv-loader.spec.js (43 changes: 43 additions & 0 deletions)

```diff
@@ -8,6 +8,27 @@ const CSV_SAMPLE_URL = '@loaders.gl/csv/test/data/sample.csv';
 // const CSV_SAMLE_LONG_URL = '@loaders.gl/csv/test/data/sample-long.csv';
 const CSV_SAMPLE_VERY_LONG_URL = '@loaders.gl/csv/test/data/sample-very-long.csv';

+function validateColumn(column, length, type) {
+  if (column.length !== length) {
+    return `column length should be ${length}`;
+  }
+  let validator = null;
+  switch (type) {
+    case 'string':
+      validator = d => typeof d === 'string';
+      break;
+
+    case 'float':
+      validator = d => Number.isFinite(d);
+      break;
+
+    default:
+      return null;
+  }
+
+  return column.every(validator) ? true : `column elements are not all ${type}s`;
+}
+
 test('CSVLoader#load', async t => {
   const rows = await load(CSV_SAMPLE_URL, CSVLoader);
   t.is(rows.length, 2, 'Got correct table size');
@@ -38,6 +59,11 @@ test('CSVLoader#loadInBatches(sample.csv, columns)', async t => {
   for await (const batch of iterator) {
     t.comment(`BATCH ${batch.count}: ${batch.length} ${JSON.stringify(batch.data).slice(0, 200)}`);
     t.equal(batch.length, 2, 'Got correct batch size');
+
+    t.ok(validateColumn(batch.data[0], batch.length, 'string'), 'column 0 valid');
+    t.ok(validateColumn(batch.data[1], batch.length, 'string'), 'column 1 valid');
+    t.ok(validateColumn(batch.data[2], batch.length, 'float'), 'column 2 valid');
+
     batchCount++;
   }
   t.equal(batchCount, 1, 'Correct number of batches received');
@@ -56,6 +82,17 @@ test('CSVLoader#loadInBatches(sample-very-long.csv, columns)', async t => {
   for await (const batch of iterator) {
     t.comment(`BATCH ${batch.count}: ${batch.length} ${JSON.stringify(batch.data).slice(0, 200)}`);
     t.equal(batch.length, batchSize, 'Got correct batch size');
+
+    t.ok(validateColumn(batch.data.TLD, batch.length, 'string'), 'column TLD valid');
+    t.ok(
+      validateColumn(batch.data['meaning of life'], batch.length, 'float'),
+      'column meaning of life valid'
+    );
+    t.ok(
+      validateColumn(batch.data.placeholder, batch.length, 'string'),
+      'column placeholder valid'
+    );
+
     batchCount++;
     if (batchCount === 5) {
       break;
@@ -74,6 +111,7 @@ test('CSVLoader#loadInBatches(sample.csv, rows)', async t => {
   for await (const batch of iterator) {
     t.comment(`BATCH ${batch.count}: ${batch.length} ${JSON.stringify(batch.data).slice(0, 200)}`);
     t.equal(batch.length, 2, 'Got correct batch size');
+    t.deepEqual(batch.data[0], ['A', 'B', 1], 'Got correct first row');
     batchCount++;
   }
   t.equal(batchCount, 1, 'Correct number of batches received');
@@ -90,6 +128,11 @@ test('CSVLoader#loadInBatches(sample-very-long.csv, rows)', async t => {
   for await (const batch of iterator) {
     t.comment(`BATCH ${batch.count}: ${batch.length} ${JSON.stringify(batch.data).slice(0, 200)}`);
     t.equal(batch.length, batchSize, 'Got correct batch size');
+
+    t.ok(batch.data[0].TLD, 'first row has TLD value');
+    t.ok(batch.data[0]['meaning of life'], 'first row has meaning of life value');
+    t.ok(batch.data[0].placeholder, 'first row has placeholder value');
+
    batchCount++;
    if (batchCount === 5) {
      break;
```
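The `columns`/`rows` suffixes in the test names refer to the two batch layouts the tests exercise. A rough sketch inferred from the assertions above, with made-up values:

```js
// 'rows' format: batch.data is an array of rows; rows are plain arrays when
// the CSV has no header, or objects keyed by header name when it does.
const rowsBatch = {data: [['A', 'B', 1]], length: 1};

// 'columns' format: batch.data holds one array per column, addressed by
// index (headerless) or by column name (with a header row).
const columnsBatch = {data: {TLD: ['A'], placeholder: ['B']}, length: 1};
```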
modules/experimental/src/categories/table/columnar-table-batch.js (40 changes: 28 additions & 12 deletions)

```diff
@@ -14,7 +14,7 @@ export default class ColumnarTableBatch {
     // If user keeps pushing rows beyond batch size, reallocate
     this.reallocateColumns();
     for (const fieldName in row) {
-      this.columns[fieldName] = row[fieldName];
+      this.columns[fieldName][this.length] = row[fieldName];
     }
     this.length++;
   }
@@ -25,10 +25,18 @@
   }

   getNormalizedBatch() {
-    const columns = this.columns;
-    this.pruneColumns();
+    const columns = Array.isArray(this.schema) ? this.columns : {};
+
+    if (!Array.isArray(this.schema)) {
+      for (const fieldName in this.schema) {
+        const field = this.schema[fieldName];
+        columns[field.name] = this.columns[field.index];
+      }
+    }
+
     this.columns = null;
+    // TODO - Ensure column lengths are set to the actual loaded size
+    // this.pruneColumns();
+
     return {data: columns, schema: this.schema, length: this.length};
   }

@@ -40,22 +48,30 @@
     }

     this.allocated = this.allocated > 0 ? (this.allocated *= 2) : this.batchSize;
-    this.columns = {};
+    this.columns = [];

     for (const fieldName in this.schema) {
       const field = this.schema[fieldName];
       const ArrayType = field.type || Float32Array;
-      // const oldColumn = this.columns[fieldName];
-      this.columns[fieldName] = new ArrayType(this.allocated);
-
-      // Copy the old data to the new array
-      // if (oldColumn) {
-      //   copy(this.columns[fieldName], oldColumn);
-      // }
+      const oldColumn = this.columns[field.index];
+
+      if (oldColumn && ArrayBuffer.isView(oldColumn)) {
+        // Copy the old data to the new array
+        const typedArray = new ArrayType(this.allocated);
+        typedArray.set(oldColumn);
+        this.columns[field.index] = typedArray;
+      } else if (oldColumn) {
+        // Plain array
+        oldColumn.length = this.allocated;
+        this.columns[field.index] = oldColumn;
+      } else {
+        // Create new
+        this.columns[field.index] = new ArrayType(this.allocated);
+      }
     }
   }

   pruneColumns() {
-    return this.columns;
+    this.columns = this.columns.map(column => column.slice(0, this.length));
   }
 }
```
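The reallocation above is the usual capacity-doubling growth strategy, adapted for typed arrays, which have a fixed length and must be reallocated and copied. A minimal standalone sketch of that pattern (illustrative only, not the class API):

```js
// Grow a column to a new capacity, preserving its contents.
// Typed arrays cannot be resized, so growth is allocate-and-copy;
// plain JS arrays can simply have their length extended in place.
function growColumn(column, newCapacity) {
  if (ArrayBuffer.isView(column)) {
    const grown = new column.constructor(newCapacity);
    grown.set(column); // bulk-copy the old values into the new buffer
    return grown;
  }
  column.length = newCapacity;
  return column;
}

let column = new Float32Array(4).fill(1);
column = growColumn(column, 8); // Float32Array(8): [1, 1, 1, 1, 0, 0, 0, 0]
```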
modules/experimental/src/categories/table/row-table-batch.js (22 changes: 21 additions & 1 deletion)

```diff
@@ -4,6 +4,13 @@ export default class RowTableBatch {
     this.batchSize = batchSize;
     this.rows = null;
     this.length = 0;
+
+    if (!Array.isArray(schema)) {
+      this._headers = [];
+      for (const key in schema) {
+        this._headers[schema[key].index] = schema[key].name;
+      }
+    }
   }

   addRow(row) {
@@ -21,10 +28,23 @@

   getNormalizedBatch() {
     if (this.rows) {
-      const rows = this.rows.slice(0, this.length);
+      let rows = this.rows.slice(0, this.length);
+
+      if (this._headers) {
+        rows = rows.map(row => convertRowToObject(row, this._headers));
+      }
+
       this.rows = null;
       return {data: rows, schema: this.schema, length: rows.length};
     }
     return null;
   }
 }
+
+function convertRowToObject(row, headers) {
+  const result = {};
+  for (let i = 0; i < headers.length; i++) {
+    result[headers[i]] = row[i];
+  }
+  return result;
+}
```
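Because the schema is keyed by column name while `_headers` must be positional, the constructor inverts the mapping. A quick sketch, using a hypothetical schema shaped like `deduceSchema`'s output above:

```js
// Hypothetical schema; names and values are illustrative.
const schema = {
  id: {name: 'id', index: 0, type: Array},
  value: {name: 'value', index: 1, type: Float32Array}
};

// Invert the name-keyed schema into a positional header list.
const headers = [];
for (const key in schema) {
  headers[schema[key].index] = schema[key].name;
}
// headers => ['id', 'value']
// convertRowToObject(['A', 42], headers) => {id: 'A', value: 42}
```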
