From 3dc9fb078fc2be4ec23fb1f85fd27045df74d318 Mon Sep 17 00:00:00 2001 From: Nick Peihl Date: Fri, 28 Jan 2022 15:05:47 -0500 Subject: [PATCH] Add `bytesUsed` to Shapefile `parseInBatches`. This modifies `zipBatchIterators` to return an object with keys `progress` and `data` rather than an array. --- .../shapefile/src/lib/parsers/parse-dbf.ts | 12 ++++--- .../src/lib/parsers/parse-shapefile.ts | 11 ++++-- .../shapefile/src/lib/parsers/parse-shp.ts | 20 ++++++++--- .../src/lib/streaming/zip-batch-iterators.ts | 36 ++++++++++++------- 4 files changed, 54 insertions(+), 25 deletions(-) diff --git a/modules/shapefile/src/lib/parsers/parse-dbf.ts b/modules/shapefile/src/lib/parsers/parse-dbf.ts index e34b88bd7c..e6cf71a39b 100644 --- a/modules/shapefile/src/lib/parsers/parse-dbf.ts +++ b/modules/shapefile/src/lib/parsers/parse-dbf.ts @@ -5,7 +5,7 @@ type DBFRowsOutput = object[]; interface DBFTableOutput { schema?: Schema; - rows: DBFRowsOutput; + data: DBFRowsOutput; } type DBFHeader = { @@ -111,7 +111,7 @@ export function parseDBF( switch (options.tables && options.tables.format) { case 'table': // TODO - parse columns - return {schema, rows: data}; + return {schema, data}; case 'rows': default: @@ -139,13 +139,17 @@ export async function* parseDBFInBatches( } if (parser.result.data.length > 0) { - yield parser.result.data; + yield { + data: parser.result.data + }; parser.result.data = []; } } parser.end(); if (parser.result.data.length > 0) { - yield parser.result.data; + yield { + data: parser.result.data + }; } } /** diff --git a/modules/shapefile/src/lib/parsers/parse-shapefile.ts b/modules/shapefile/src/lib/parsers/parse-shapefile.ts index d574b83b41..cd89c875e4 100644 --- a/modules/shapefile/src/lib/parsers/parse-shapefile.ts +++ b/modules/shapefile/src/lib/parsers/parse-shapefile.ts @@ -17,6 +17,8 @@ interface ShapefileOutput { shx?: SHXOutput; header: SHPHeader; data: object[]; + bytesUsed?: number; + bytesTotal?: number; } /** * Parsing of file in batches @@ 
-77,10 +79,11 @@ export async function* parseShapefileInBatches( for await (const item of iterator) { let geometries: any; let properties: any; + const {bytesUsed, bytesTotal} = item.progress; if (!propertyIterable) { - geometries = item; + geometries = item.data; } else { - [geometries, properties] = item; + [geometries, properties] = item.data; } const geojsonGeometries = parseGeometries(geometries); @@ -94,7 +97,9 @@ export async function* parseShapefileInBatches( prj, shx, header: shapeHeader, - data: features + data: features, + bytesUsed, + bytesTotal }; } } diff --git a/modules/shapefile/src/lib/parsers/parse-shp.ts b/modules/shapefile/src/lib/parsers/parse-shp.ts index 24973a527c..6a12fe350d 100644 --- a/modules/shapefile/src/lib/parsers/parse-shp.ts +++ b/modules/shapefile/src/lib/parsers/parse-shp.ts @@ -23,6 +23,7 @@ type SHPResult = { geometries: []; header?: {}; error?: string; + progress: {bytesUsed?: number; bytesTotal?: number; rows?: number}; }; class SHPParser { @@ -30,7 +31,8 @@ class SHPParser { binaryReader = new BinaryChunkReader({maxRewindBytes: SHP_RECORD_HEADER_SIZE}); state = STATE.EXPECTING_HEADER; result: SHPResult = { - geometries: [] + geometries: [], + progress: {} }; constructor(options?: LoaderOptions) { @@ -45,7 +47,6 @@ class SHPParser { end() { this.binaryReader.end(); this.state = parseState(this.state, this.result, this.binaryReader, this.options); - // this.result.progress.bytesUsed = this.binaryReader.bytesUsed(); if (this.state !== STATE.END) { this.state = STATE.ERROR; this.result.error = 'SHP incomplete file'; @@ -81,13 +82,19 @@ export async function* parseSHPInBatches( } if (parser.result.geometries.length > 0) { - yield parser.result.geometries; + yield { + data: parser.result.geometries, + progress: parser.result.progress + }; parser.result.geometries = []; } } parser.end(); if (parser.result.geometries.length > 0) { - yield parser.result.geometries; + yield { + data: parser.result.geometries, + progress: parser.result.progress + }; 
} return; @@ -130,7 +137,7 @@ function parseState( result.header = parseSHPHeader(dataView); result.progress = { - bytesUsed: 0, + bytesUsed: SHP_HEADER_SIZE, bytesTotal: result.header.length, rows: 0 }; @@ -178,6 +185,9 @@ function parseState( result.currentIndex++; result.progress.rows = result.currentIndex - 1; + // +8 because the content length field in the record's header + // excludes the 8-byte record header itself + result.progress.bytesUsed += recordHeader.byteLength + 8; } } diff --git a/modules/shapefile/src/lib/streaming/zip-batch-iterators.ts b/modules/shapefile/src/lib/streaming/zip-batch-iterators.ts index 90686af019..d5c1239e99 100644 --- a/modules/shapefile/src/lib/streaming/zip-batch-iterators.ts +++ b/modules/shapefile/src/lib/streaming/zip-batch-iterators.ts @@ -1,3 +1,8 @@ +type Batch = { + data: number[] | number[][]; + progress?: {bytesUsed: number; bytesTotal: number; rows: number}; +}; + /** * Zip two iterators together * @@ -7,23 +12,23 @@ export async function* zipBatchIterators( iterator1: AsyncIterator, iterator2: AsyncIterator -): AsyncGenerator { - let batch1 = []; - let batch2 = []; +): AsyncGenerator { + let batch1 = {data: []}; + let batch2 = {data: []}; let iterator1Done: boolean = false; let iterator2Done: boolean = false; // TODO - one could let all iterators flow at full speed using `Promise.race` // however we might end up with a big temporary buffer while (!iterator1Done && !iterator2Done) { - if (batch1.length === 0 && !iterator1Done) { + if (batch1.data.length === 0 && !iterator1Done) { const {value, done} = await iterator1.next(); if (done) { iterator1Done = true; } else { batch1 = value; } - } else if (batch2.length === 0 && !iterator2Done) { + } else if (batch2.data.length === 0 && !iterator2Done) { const {value, done} = await iterator2.next(); if (done) { iterator2Done = true; @@ -46,17 +51,22 @@ * @param batch2 * @return array | null */ -function extractBatch(batch1: number[], 
batch2: number[]): number[][] | null { - const batchLength: number = Math.min(batch1.length, batch2.length); - if (batchLength === 0) { +function extractBatch(batch1: Batch, batch2: Batch): Batch | null { + const {data: data1, progress} = batch1; + const {data: data2} = batch2; + const dataLength: number = Math.min(data1.length, data2.length); + if (dataLength === 0) { return null; } // Non interleaved arrays - const batch: number[][] = [batch1.slice(0, batchLength), batch2.slice(0, batchLength)]; + const result: any = { + progress, + data: [data1.slice(0, dataLength), data2.slice(0, dataLength)] + }; - // Modify the 2 batches - batch1.splice(0, batchLength); - batch2.splice(0, batchLength); - return batch; + // Modify the 2 data arrays + data1.splice(0, dataLength); + data2.splice(0, dataLength); + return result; }