feat(csv): add/update CSVOpts, cell transforms, docs

- allow arrays for `cols` option
- update initIndex(), add column autonaming fallback
- refactor parseCSV()
- rename CoercionFn => CellTransform
- add upper/lower() transforms
- add/update tests
thi-ng · Nov 18, 2020 · 282e85c
1 parent 28cac18
commit 282e85c
Show file tree

Hide file tree

Showing 6 changed files with 239 additions and 67 deletions.
diff --git a/packages/csv/src/api.ts b/packages/csv/src/api.ts
@@ -1,8 +1,8 @@
-import type { Fn2 } from "@thi.ng/api";
+import type { Fn2, Nullable } from "@thi.ng/api";
 
 export type CSVRow = Record<string, any>;
 
-export type CoercionFn = Fn2<string, CSVRow, any>;
+export type CellTransform = Fn2<string, CSVRow, any>;
 
 export interface ColumnSpec {
     /**
@@ -14,20 +14,26 @@ export interface ColumnSpec {
      * and (incomplete) result object of current row. Return value is used as
      * actual value for the cell.
      */
-    coerce?: CoercionFn;
+    tx?: CellTransform;
 }
 
 export interface CSVOpts {
     /**
      * Field delimiter character.
      *
-     * @defaultValue ","
+     * @defaultValue `,`
      */
     delim: string;
+    /**
+     * Field value quote character.
+     *
+     * @defaultValue `"`
+     */
+    quote: string;
     /**
      * Line comment prefix.
      *
-     * @defaultValue "#"
+     * @defaultValue `#`
      */
     comment: string;
     /**
@@ -44,9 +50,20 @@ export interface CSVOpts {
      */
     all: boolean;
     /**
-     * Object of column specific options/transformations.
+     * Array or object of column specific options/transformations.
+     *
+     * If given as array:
+     *
+     * - each item will be related to its respective column (array order)
+     * - any nullish {@link ColumnSpec} values will be skipped
+     * - if a spec provides no `alias` and no column name is made available
+     *   otherwise (i.e. either via 1st data row or the `header` option), then
+     *   that column will be named numerically
+     *
+     * If given as object, each key must match an existing/original column name
+     * (either as per 1st data row or the `header` option).
      */
-    cols: Record<string, ColumnSpec>;
+    cols: Nullable<ColumnSpec>[] | Record<string, ColumnSpec>;
     /**
      * If true, all leading and trailing whitespace for each field value will be
      * trimmed.

diff --git a/packages/csv/src/coerce.ts b/packages/csv/src/coerce.ts
diff --git a/packages/csv/src/index.ts b/packages/csv/src/index.ts
@@ -1,3 +1,3 @@
 export * from "./api";
-export * from "./coerce";
 export * from "./parse";
+export * from "./transforms";
diff --git a/packages/csv/src/parse.ts b/packages/csv/src/parse.ts
@@ -1,8 +1,76 @@
-import { isIterable } from "@thi.ng/checks";
-import { ESCAPES, split } from "@thi.ng/strings";
 import { compR, iterator1, Reducer, Transducer } from "@thi.ng/transducers";
+import { ESCAPES, split } from "@thi.ng/strings";
+import { isArray, isIterable } from "@thi.ng/checks";
+import type { Nullable } from "@thi.ng/api";
 import type { ColumnSpec, CSVOpts, CSVRow } from "./api";
 
+/** @internal */
+type IndexEntry = { i: number; spec: ColumnSpec };
+
+/**
+ * Default parser options.
+ *
+ * @internal
+ */
+const DEFAULT_OPTS: Partial<CSVOpts> = {
+    all: true,
+    delim: ",",
+    quote: '"',
+    comment: "#",
+    trim: false,
+};
+
+/**
+ * Configurable CSV parsing transducer, operating on line-based string iterables
+ * and yielding tuple objects of CSV records. If called with input, returns ES6
+ * iterator instead.
+ *
+ * @remarks
+ * Parsing behavior can be customized via given {@link CSVOpts}. The default
+ * behavior is:
+ *
+ * - comma delimiter
+ * - field names are obtained from first line/row
+ * - all columns are processed, but no coercions
+ * - untrimmed cell values
+ * - line comment prefix `#`
+ *
+ * Using the `cols` option, specific columns can be renamed and their values
+ * coerced/transformed. Additionally, if `all` option is `false`, then the
+ * result objects will only contain values of the columns specified in `cols`.
+ *
+ * Also see {@link parseCSVString}.
+ *
+ * @example
+ * ```ts
+ * import { parseCSV, upper, float } from "@thi.ng/csv";
+ *
+ * [...parseCSV(
+ *   {
+ *     all: false,
+ *     cols: {
+ *       "country": { tx: upper },
+ *       "latitude": { alias: "lat", tx: float() },
+ *       "longitude": { alias: "lon", tx: float() },
+ *     }
+ *   },
+ *   [
+ *      `"country","country group","name (en)","latitude","longitude"`,
+ *      `"at","eu","Austria","47.6965545","13.34598005"`,
+ *      `"be","eu","Belgium","50.501045","4.47667405"`,
+ *      `"bg","eu","Bulgaria","42.72567375","25.4823218"`,
+ *   ]
+ * )]
+ *
+ * // [
+ * //   { country: 'AT', lat: 47.6965545, lon: 13.34598005 },
+ * //   { country: 'BE', lat: 50.501045, lon: 4.47667405 },
+ * //   { country: 'BG', lat: 42.72567375, lon: 25.4823218 }
+ * // ]
+ * ```
+ *
+ * @param opts
+ */
 export function parseCSV(opts?: Partial<CSVOpts>): Transducer<string, CSVRow>;
 export function parseCSV(
     opts: Partial<CSVOpts>,
@@ -13,67 +81,73 @@ export function parseCSV(opts?: Partial<CSVOpts>, src?: Iterable<string>): any {
         ? iterator1(parseCSV(opts), src)
         : (rfn: Reducer<any, CSVRow>) => {
               const { all, cols, delim, quote, comment, trim, header } = {
-                  all: true,
-                  delim: ",",
-                  quote: '"',
-                  comment: "#",
-                  trim: false,
+                  ...DEFAULT_OPTS,
                   ...opts,
               };
               const reduce = rfn[2];
-              let index: Record<string, number>;
+              let index: Record<string, IndexEntry>;
               let revIndex: Record<number, string>;
               let first = true;
               let isQuoted = false;
               let record: string[] = [];
-              if (header) {
+
+              const init = (header: string[]) => {
                   cols && (index = initIndex(header, cols));
                   all && (revIndex = initRevIndex(header));
                   first = false;
-              }
+              };
+
+              const collectAll = (row: CSVRow) =>
+                  record.reduce(
+                      (acc, x, i) => (
+                          (acc[revIndex[i]] = trim ? x.trim() : x), acc
+                      ),
+                      row
+                  );
+
+              const collectIndexed = (row: CSVRow) =>
+                  Object.entries(index).reduce((acc, [id, { i, spec }]) => {
+                      let val = record[i];
+                      if (val !== undefined) {
+                          trim && (val = val.trim());
+                          all && spec.alias && delete acc[id];
+                          acc[spec.alias || id] = spec.tx
+                              ? spec.tx(val, acc)
+                              : val;
+                      }
+                      return acc;
+                  }, row);
+
+              header && init(header);
+
               return compR(rfn, (acc, line: string) => {
-                  if ((!line.length || line.startsWith(comment)) && !isQuoted)
+                  if ((!line.length || line.startsWith(comment!)) && !isQuoted)
                       return acc;
                   if (!first) {
-                      isQuoted = tokenizeLine(
+                      isQuoted = parseLine(
                           line,
                           record,
                           isQuoted,
-                          delim,
-                          quote
+                          delim!,
+                          quote!
                       );
                       if (isQuoted) return acc;
+
                       const row: CSVRow = {};
-                      all &&
-                          record.reduce(
-                              (acc, x, i) => (
-                                  (acc[revIndex[i]] = trim ? x.trim() : x), acc
-                              ),
-                              row
-                          );
-                      cols &&
-                          Object.entries(cols).reduce((acc, [id, spec]) => {
-                              let val = record[index[id]];
-                              trim && (val = val.trim());
-                              acc[spec.alias || id] = spec.coerce
-                                  ? spec.coerce(val, acc)
-                                  : val;
-                              return acc;
-                          }, row);
+                      all && collectAll(row);
+                      index && collectIndexed(row);
                       record = [];
                       return reduce(acc, row);
                   } else {
-                      isQuoted = tokenizeLine(
+                      isQuoted = parseLine(
                           line,
                           record,
                           isQuoted,
-                          delim,
-                          quote
+                          delim!,
+                          quote!
                       );
                       if (!isQuoted) {
-                          cols && (index = initIndex(record, cols));
-                          all && (revIndex = initRevIndex(record));
-                          first = false;
+                          init(record);
                           record = [];
                       }
                       return acc;
@@ -82,10 +156,30 @@ export function parseCSV(opts?: Partial<CSVOpts>, src?: Iterable<string>): any {
           };
 }
 
+/**
+ * Syntax sugar for iterator version of {@link parseCSV}, efficiently splitting
+ * given source string into a line based input using
+ * {@link @thi.ng/strings#split}.
+ *
+ * @param opts
+ * @param src
+ */
 export const parseCSVString = (opts: Partial<CSVOpts>, src: string) =>
     parseCSV(opts, split(src));
 
 /**
+ * Parses line into `acc`, taking quoted cell values and linebreaks into
+ * account.
+ *
+ * @remarks
+ * If `isQuoted` is true, the previous line ended with a quoted cell value,
+ * which might only end in the new or a future line. If that's the case, then
+ * the current line's contents will be added to the current last value of `acc`
+ * until the quoted cell is complete.
+ *
+ * Function returns current state of `isQuoted` (i.e. if line terminated in a
+ * quoted cell) and should be (re)called with new lines until it returns false.
+ *
  * @param line
  * @param acc
  * @param isQuoted
@@ -94,7 +188,7 @@ export const parseCSVString = (opts: Partial<CSVOpts>, src: string) =>
  *
  * @internal
  */
-export const tokenizeLine = (
+export const parseLine = (
     line: string,
     acc: string[],
     isQuoted: boolean,
@@ -142,11 +236,23 @@ export const tokenizeLine = (
     return isQuoted;
 };
 
-const initIndex = (line: string[], cols: Record<string, ColumnSpec>) =>
-    line.reduce(
-        (acc, id, i) => (cols![id] ? ((acc[id] = i), acc) : acc),
-        <Record<string, number>>{}
-    );
+const initIndex = (
+    line: string[],
+    cols: Nullable<ColumnSpec>[] | Record<string, ColumnSpec>
+) =>
+    isArray(cols)
+        ? cols.reduce((acc, spec, i) => {
+              if (spec) {
+                  const alias = spec.alias || line[i] || String(i);
+                  acc[alias] = { i, spec: { alias, ...spec } };
+              }
+              return acc;
+          }, <Record<string, IndexEntry>>{})
+        : line.reduce(
+              (acc, id, i) =>
+                  cols![id] ? ((acc[id] = { i, spec: cols![id] }), acc) : acc,
+              <Record<string, IndexEntry>>{}
+          );
 
 const initRevIndex = (line: string[]) =>
     line.reduce((acc, x, i) => ((acc[i] = x), acc), <Record<number, string>>{});
diff --git a/packages/csv/src/transforms.ts b/packages/csv/src/transforms.ts
@@ -0,0 +1,15 @@
+import { maybeParseFloat, maybeParseInt } from "@thi.ng/strings";
+import { CellTransform } from "./api";
+
+export const upper: CellTransform = (x) => x.toUpperCase();
+
+export const lower: CellTransform = (x) => x.toLowerCase();
+
+export const float = (defaultVal = 0): CellTransform => (x) =>
+    maybeParseFloat(x, defaultVal);
+
+export const int = (defaultVal = 0): CellTransform => (x) =>
+    maybeParseInt(x, defaultVal, 10);
+
+export const hex = (defaultVal = 0): CellTransform => (x) =>
+    maybeParseInt(x, defaultVal, 16);