Skip to content

Commit

Permalink
feat(csv): add/update CSVOpts, cell transforms, docs
Browse files Browse the repository at this point in the history
- allow arrays for `cols` option
- update initIndex(), add column autonaming fallback
- refactor parseCSV()
- rename CoercionFn => CellTransform
- add upper/lower() transforms
- add/update tests
  • Loading branch information
postspectacular committed Nov 18, 2020
1 parent 28cac18 commit 282e85c
Show file tree
Hide file tree
Showing 6 changed files with 239 additions and 67 deletions.
31 changes: 24 additions & 7 deletions packages/csv/src/api.ts
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
import type { Fn2 } from "@thi.ng/api";
import type { Fn2, Nullable } from "@thi.ng/api";

/**
 * Result object produced for a single parsed CSV record. Keys are column
 * names (or their aliases), values are the (optionally transformed) cell
 * values.
 */
export type CSVRow = Record<string, any>;

export type CoercionFn = Fn2<string, CSVRow, any>;
/**
 * Cell value transformation function. Receives the cell's string value and
 * the (incomplete) result object of the current row. The return value is
 * used as the actual value for the cell.
 */
export type CellTransform = Fn2<string, CSVRow, any>;

export interface ColumnSpec {
/**
Expand All @@ -14,20 +14,26 @@ export interface ColumnSpec {
* and (incomplete) result object of current row. Return value is used as
* actual value for the cell.
*/
coerce?: CoercionFn;
tx?: CellTransform;
}

export interface CSVOpts {
/**
* Field delimiter character.
*
* @defaultValue ","
* @defaultValue `,`
*/
delim: string;
/**
* Field value quote character.
*
* @defaultValue `"`
*/
quote: string;
/**
* Line comment prefix.
*
* @defaultValue "#"
* @defaultValue `#`
*/
comment: string;
/**
Expand All @@ -44,9 +50,20 @@ export interface CSVOpts {
*/
all: boolean;
/**
* Object of column specific options/transformations.
* Array or object of column specific options/transformations.
*
* If given as array:
*
* - each item will be related to its respective column (array order)
* - any nullish {@link ColumnSpec} values will be skipped
* - if a spec provides no `alias` and no column name is made available
* otherwise (i.e. either via 1st data row or the `header` option), then
* that column will be named numerically
*
* If given as object, each key must match an existing/original column name
* (either as per 1st data row or the `header` option).
*/
cols: Record<string, ColumnSpec>;
cols: Nullable<ColumnSpec>[] | Record<string, ColumnSpec>;
/**
* If true, all leading and trailing whitespace for each field value will be
* trimmed.
Expand Down
11 changes: 0 additions & 11 deletions packages/csv/src/coerce.ts

This file was deleted.

2 changes: 1 addition & 1 deletion packages/csv/src/index.ts
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
export * from "./api";
export * from "./coerce";
export * from "./parse";
export * from "./transforms";
190 changes: 148 additions & 42 deletions packages/csv/src/parse.ts
Original file line number Diff line number Diff line change
@@ -1,8 +1,76 @@
import { isIterable } from "@thi.ng/checks";
import { ESCAPES, split } from "@thi.ng/strings";
import { compR, iterator1, Reducer, Transducer } from "@thi.ng/transducers";
import { ESCAPES, split } from "@thi.ng/strings";
import { isArray, isIterable } from "@thi.ng/checks";
import type { Nullable } from "@thi.ng/api";
import type { ColumnSpec, CSVOpts, CSVRow } from "./api";

/**
 * Single entry of the column index built by `initIndex()`: `i` is the
 * zero-based position of the column within a parsed record, `spec` is the
 * {@link ColumnSpec} applied to that column's cell values.
 *
 * @internal
 */
type IndexEntry = { i: number; spec: ColumnSpec };

/**
 * Default parser options. In `parseCSV()` these are spread first and then
 * overridden by any user supplied options.
 *
 * @internal
 */
const DEFAULT_OPTS: Partial<CSVOpts> = {
// include all columns in result rows (not only those specified via `cols`)
all: true,
// field delimiter character
delim: ",",
// field value quote character
quote: '"',
// line comment prefix (lines starting with it are skipped, unless they are
// part of a still open quoted cell)
comment: "#",
// keep leading/trailing whitespace of cell values
trim: false,
};

/**
* Configurable CSV parsing transducer, operating on line-based string iterables
* and yielding tuple objects of CSV records. If called with input, returns ES6
* iterator instead.
*
* @remarks
* Parsing behavior can be customized via given {@link CSVOpts}. The default
* behavior is:
*
* - comma delimiter
* - field names are obtained from first line/row
* - all columns are processed, but no cell transforms are applied
* - untrimmed cell values
* - line comment prefix `#`
*
* Using the `cols` option, specific columns can be renamed and their values
* coerced/transformed. Additionally, if `all` option is `false`, then the
* result objects will only contain values of the columns specified in `cols`.
*
* Also see {@link parseCSVString}.
*
* @example
* ```ts
* import { parseCSV, upper, float } from "@thi.ng/csv";
*
* [...parseCSV(
* {
* all: false,
* cols: {
* "country": { tx: upper },
* "latitude": { alias: "lat", tx: float() },
* "longitude": { alias: "lon", tx: float() },
* }
* },
* [
* `"country","country group","name (en)","latitude","longitude"`,
* `"at","eu","Austria","47.6965545","13.34598005"`,
* `"be","eu","Belgium","50.501045","4.47667405"`,
* `"bg","eu","Bulgaria","42.72567375","25.4823218"`,
* ]
* )]
*
* // [
* // { country: 'AT', lat: 47.6965545, lon: 13.34598005 },
* // { country: 'BE', lat: 50.501045, lon: 4.47667405 },
* // { country: 'BG', lat: 42.72567375, lon: 25.4823218 }
* // ]
* ```
*
* @param opts
*/
export function parseCSV(opts?: Partial<CSVOpts>): Transducer<string, CSVRow>;
export function parseCSV(
opts: Partial<CSVOpts>,
Expand All @@ -13,67 +81,73 @@ export function parseCSV(opts?: Partial<CSVOpts>, src?: Iterable<string>): any {
? iterator1(parseCSV(opts), src)
: (rfn: Reducer<any, CSVRow>) => {
const { all, cols, delim, quote, comment, trim, header } = {
all: true,
delim: ",",
quote: '"',
comment: "#",
trim: false,
...DEFAULT_OPTS,
...opts,
};
const reduce = rfn[2];
let index: Record<string, number>;
let index: Record<string, IndexEntry>;
let revIndex: Record<number, string>;
let first = true;
let isQuoted = false;
let record: string[] = [];
if (header) {

const init = (header: string[]) => {
cols && (index = initIndex(header, cols));
all && (revIndex = initRevIndex(header));
first = false;
}
};

const collectAll = (row: CSVRow) =>
record.reduce(
(acc, x, i) => (
(acc[revIndex[i]] = trim ? x.trim() : x), acc
),
row
);

const collectIndexed = (row: CSVRow) =>
Object.entries(index).reduce((acc, [id, { i, spec }]) => {
let val = record[i];
if (val !== undefined) {
trim && (val = val.trim());
all && spec.alias && delete acc[id];
acc[spec.alias || id] = spec.tx
? spec.tx(val, acc)
: val;
}
return acc;
}, row);

header && init(header);

return compR(rfn, (acc, line: string) => {
if ((!line.length || line.startsWith(comment)) && !isQuoted)
if ((!line.length || line.startsWith(comment!)) && !isQuoted)
return acc;
if (!first) {
isQuoted = tokenizeLine(
isQuoted = parseLine(
line,
record,
isQuoted,
delim,
quote
delim!,
quote!
);
if (isQuoted) return acc;

const row: CSVRow = {};
all &&
record.reduce(
(acc, x, i) => (
(acc[revIndex[i]] = trim ? x.trim() : x), acc
),
row
);
cols &&
Object.entries(cols).reduce((acc, [id, spec]) => {
let val = record[index[id]];
trim && (val = val.trim());
acc[spec.alias || id] = spec.coerce
? spec.coerce(val, acc)
: val;
return acc;
}, row);
all && collectAll(row);
index && collectIndexed(row);
record = [];
return reduce(acc, row);
} else {
isQuoted = tokenizeLine(
isQuoted = parseLine(
line,
record,
isQuoted,
delim,
quote
delim!,
quote!
);
if (!isQuoted) {
cols && (index = initIndex(record, cols));
all && (revIndex = initRevIndex(record));
first = false;
init(record);
record = [];
}
return acc;
Expand All @@ -82,10 +156,30 @@ export function parseCSV(opts?: Partial<CSVOpts>, src?: Iterable<string>): any {
};
}

/**
 * Convenience wrapper around the iterator version of {@link parseCSV} for
 * string input. The source string is first converted into a line based
 * iterable via {@link @thi.ng/strings#split}, which is then handed to the
 * parser together with the given options.
 *
 * @param opts - parser options
 * @param src - entire CSV source as a single string
 */
export const parseCSVString = (opts: Partial<CSVOpts>, src: string) => {
	const lines = split(src);
	return parseCSV(opts, lines);
};

/**
* Parses line into `acc`, taking quoted cell values and linebreaks into
* account.
*
* @remarks
* If `isQuoted` is true, the previous line ended with a quoted cell value,
* which might only end in the new or a future line. If that's the case, then
* the current line's contents will be added to the current last value of `acc`
* until the quoted cell is complete.
*
* Function returns current state of `isQuoted` (i.e. if line terminated in a
* quoted cell) and should be (re)called with new lines until it returns false.
*
* @param line
* @param acc
* @param isQuoted
Expand All @@ -94,7 +188,7 @@ export const parseCSVString = (opts: Partial<CSVOpts>, src: string) =>
*
* @internal
*/
export const tokenizeLine = (
export const parseLine = (
line: string,
acc: string[],
isQuoted: boolean,
Expand Down Expand Up @@ -142,11 +236,23 @@ export const tokenizeLine = (
return isQuoted;
};

const initIndex = (line: string[], cols: Record<string, ColumnSpec>) =>
line.reduce(
(acc, id, i) => (cols![id] ? ((acc[id] = i), acc) : acc),
<Record<string, number>>{}
);
/**
 * Builds the column index used to apply column specs to each parsed record.
 * Each index entry stores the column's position and its {@link ColumnSpec}.
 *
 * @remarks
 * If `cols` is an array, specs are matched to columns by array position and
 * nullish specs are skipped. Each entry is keyed by the spec's `alias`, or
 * else the column name from `line`, or else the stringified column position.
 * The resolved name is also stored as `alias` in the entry's spec.
 *
 * If `cols` is an object, only columns whose name in `line` is an *own* key
 * of `cols` are indexed. Inherited `Object.prototype` members (e.g. a CSV
 * column named `constructor` or `toString`) are deliberately NOT treated as
 * column specs.
 *
 * @param line - column names (header row)
 * @param cols - column specs, as array or object keyed by column name
 *
 * @internal
 */
const initIndex = (
	line: string[],
	cols: Nullable<ColumnSpec>[] | Record<string, ColumnSpec>
) => {
	const index: Record<string, IndexEntry> = {};
	if (isArray(cols)) {
		cols.forEach((spec, i) => {
			if (!spec) return;
			const alias = spec.alias || line[i] || String(i);
			// spread first, so the resolved name always wins over a
			// missing/empty `alias` in the original spec
			index[alias] = { i, spec: { ...spec, alias } };
		});
	} else {
		line.forEach((id, i) => {
			// own-property check: column names come from (untrusted) CSV
			// input and must not resolve to inherited object members
			const spec = Object.prototype.hasOwnProperty.call(cols, id)
				? cols[id]
				: undefined;
			if (spec) index[id] = { i, spec };
		});
	}
	return index;
};

/**
 * Creates a lookup table which maps each column's position to its name, as
 * given by the header `line`.
 *
 * @param line - column names in column order
 *
 * @internal
 */
const initRevIndex = (line: string[]) => {
	const names: Record<number, string> = {};
	line.forEach((name, pos) => {
		names[pos] = name;
	});
	return names;
};
15 changes: 15 additions & 0 deletions packages/csv/src/transforms.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
import { maybeParseFloat, maybeParseInt } from "@thi.ng/strings";
import { CellTransform } from "./api";

/**
 * Cell transform which returns the uppercase version of the cell's string
 * value.
 *
 * @param value - raw cell value
 */
export const upper: CellTransform = (value) => {
	return value.toUpperCase();
};

/**
 * Cell transform which returns the lowercase version of the cell's string
 * value.
 *
 * @param value - raw cell value
 */
export const lower: CellTransform = (value) => {
	return value.toLowerCase();
};

/**
 * Returns a cell transform which passes the cell value and the given
 * `defaultVal` to {@link @thi.ng/strings#maybeParseFloat} and uses its
 * result as cell value.
 *
 * @param defaultVal - fallback value handed to the parser (default: 0)
 */
export const float = (defaultVal = 0): CellTransform => {
	const parse: CellTransform = (value) => maybeParseFloat(value, defaultVal);
	return parse;
};

/**
 * Returns a cell transform which passes the cell value, the given
 * `defaultVal` and radix 10 to {@link @thi.ng/strings#maybeParseInt} and
 * uses its result as cell value.
 *
 * @param defaultVal - fallback value handed to the parser (default: 0)
 */
export const int = (defaultVal = 0): CellTransform => {
	const parse: CellTransform = (value) =>
		maybeParseInt(value, defaultVal, 10);
	return parse;
};

/**
 * Returns a cell transform which passes the cell value, the given
 * `defaultVal` and radix 16 to {@link @thi.ng/strings#maybeParseInt} and
 * uses its result as cell value.
 *
 * @param defaultVal - fallback value handed to the parser (default: 0)
 */
export const hex = (defaultVal = 0): CellTransform => {
	const parse: CellTransform = (value) =>
		maybeParseInt(value, defaultVal, 16);
	return parse;
};

0 comments on commit 282e85c

Please sign in to comment.