feature: implement EngineData parsing

fixes #6
webtoon · Aug 18, 2022 · 8c5f09c · 8c5f09c
1 parent 8885f16
commit 8c5f09c
Show file tree

Hide file tree

Showing 18 changed files with 1,193 additions and 1 deletion.
diff --git a/packages/psd/src/classes/Layer.ts b/packages/psd/src/classes/Layer.ts
@@ -2,7 +2,7 @@
 // Copyright 2021-present NAVER WEBTOON
 // MIT License
 
-import {ImageData} from "../interfaces";
+import {EngineData, ImageData} from "../interfaces";
 import {LayerFrame} from "../sections";
 import {NodeParent} from "./Node";
 import {NodeBase} from "./NodeBase";
@@ -65,6 +65,14 @@ export class Layer
     return this.layerFrame.layerProperties.text;
   }
 
+  /**
+   * Text properties of this layer.
+   * Available only when this layer is a text layer; for any other layer
+   * this property is `undefined`.
+   */
+  get textProperties(): EngineData | undefined {
+    const {textProperties} = this.layerFrame.layerProperties;
+    return textProperties;
+  }
+
   protected get imageData(): ImageData {
     const {red, green, blue, alpha} = this.layerFrame;
 

diff --git a/packages/psd/src/engineData/index.ts b/packages/psd/src/engineData/index.ts
@@ -0,0 +1,7 @@
+// @webtoon/psd
+// Copyright 2021-present NAVER WEBTOON
+// MIT License
+
+export * from "./lexer";
+export * from "./parser";
+export * from "./validator";
diff --git a/packages/psd/src/engineData/lexer.ts b/packages/psd/src/engineData/lexer.ts
@@ -0,0 +1,217 @@
+// @webtoon/psd
+// Copyright 2021-present NAVER WEBTOON
+// MIT License
+
+// Based on PDF grammar: https://web.archive.org/web/20220226063926/https://www.adobe.com/content/dam/acom/en/devnet/pdf/pdfs/PDF32000_2008.pdf
+// Section 7.2 - Lexical Conventions
+
+import {
+  Cursor,
+  InvalidEngineDataBoolean,
+  InvalidEngineDataNumber,
+  InvalidEngineDataTextBOM,
+} from "../utils";
+
+/** Kinds of tokens the EngineData lexer emits. */
+export enum TokenType {
+  /** Text literal, written between `(` and `)`. */
+  String = 0,
+  /** Start of a dictionary (`<<`). */
+  DictBeg = 1,
+  /** End of a dictionary (`>>`). */
+  DictEnd = 2,
+  /** Start of an array (`[`). */
+  ArrBeg = 3,
+  /** End of an array (`]`). */
+  ArrEnd = 4,
+  /** Name, introduced by `/`. */
+  Name = 5,
+  /** Numeric literal. */
+  Number = 6,
+  /** `true` or `false`. */
+  Boolean = 7,
+}
+
+/**
+ * A single lexical unit yielded by `Lexer.tokens()`.
+ * Discriminated by `type`; only String, Name, Number and Boolean tokens
+ * carry a `value`, the structural tokens (dict/array begin/end) do not.
+ */
+export type Token =
+  | {type: TokenType.String; value: string}
+  | {type: TokenType.DictBeg}
+  | {type: TokenType.DictEnd}
+  | {type: TokenType.ArrBeg}
+  | {type: TokenType.ArrEnd}
+  | {type: TokenType.Name; value: string}
+  | {type: TokenType.Number; value: number}
+  | {type: TokenType.Boolean; value: boolean};
+
+// Byte values the lexer treats as whitespace: skipped between tokens and
+// used as a terminator for names, numbers and booleans.
+const WhitespaceCharacters = new Set([
+  0, // NUL
+  9, // \t
+  12, // \f
+  32, // ' '
+  10, // \n
+  13, // \r
+]);
+
+// First bytes of the literals `false` and `true`; the lexer uses them to
+// decide whether a bare token is read as a boolean or as a number.
+const BooleanStartCharacters = new Set([
+  0x66, // f
+  0x74, // t
+]);
+
+// Byte values of the delimiter characters, keyed by the character itself.
+// `<` and `>` appear doubled (`<<`, `>>`) as dictionary delimiters.
+const Delimiters = {
+  "(": 0x28,
+  ")": 0x29,
+  "<": 0x3c,
+  ">": 0x3e,
+  "[": 0x5b,
+  "]": 0x5d,
+  "/": 0x2f,
+  "\\": 0x5c,
+  // NOTE: These have meaning within PDF. Are they used here?
+  // "{": 123,
+  // "}": 125,
+  // "%": 37,
+};
+
+// Set form of the delimiter byte values, for fast membership checks.
+const DelimiterCharacters = new Set(Object.values(Delimiters));
+
+/**
+ * Splits raw EngineData bytes into tokens.
+ *
+ * The grammar follows the PDF lexical conventions referenced at the top of
+ * this file: whitespace separates tokens, `(`...`)` encloses text, `<<`/`>>`
+ * enclose dictionaries, `[`/`]` enclose arrays and `/` introduces a name.
+ * Anything else is read as a boolean (when it starts with `f` or `t`) or as
+ * a number.
+ */
+export class Lexer {
+  /**
+   * @param cursor Cursor positioned at the first byte of the EngineData.
+   *   The lexer advances it while tokens are being pulled.
+   */
+  constructor(private cursor: Cursor) {}
+
+  /**
+   * Lazily yields the tokens found between the cursor's current position
+   * and the end of its data.
+   *
+   * @throws InvalidEngineDataTextBOM when a text literal starts with a
+   *   malformed byte order mark.
+   * @throws InvalidEngineDataBoolean when a token starting with `f`/`t` is
+   *   neither `true` nor `false`.
+   * @throws InvalidEngineDataNumber when a bare token is not a number.
+   */
+  *tokens(): Generator<Token> {
+    while (!this.done()) {
+      const val = this.cursor.read("u8");
+
+      if (WhitespaceCharacters.has(val)) {
+        // Swallow the whole whitespace run in one go.
+        while (!this.done() && WhitespaceCharacters.has(this.cursor.peek()))
+          this.cursor.pass(1);
+        continue;
+      }
+      if (DelimiterCharacters.has(val)) {
+        switch (val) {
+          case Delimiters["("]:
+            yield {type: TokenType.String, value: this.text()};
+            break;
+          case Delimiters["["]:
+            yield {type: TokenType.ArrBeg};
+            break;
+          case Delimiters["]"]:
+            yield {type: TokenType.ArrEnd};
+            break;
+          case Delimiters["<"]:
+            this.passRepeated(val);
+            yield {type: TokenType.DictBeg};
+            break;
+          case Delimiters[">"]:
+            this.passRepeated(val);
+            yield {type: TokenType.DictEnd};
+            break;
+          case Delimiters["/"]:
+            yield {type: TokenType.Name, value: this.string()};
+            break;
+          default:
+            // A stray `)` or `\` outside of a text literal: report and skip.
+            console.assert(
+              false,
+              "Unhandled delimiter: '%s'",
+              String.fromCharCode(val)
+            );
+        }
+        continue;
+      }
+      // Only two kinds left: number or boolean.
+      // `val` is the first character of the literal, so un-read it before
+      // handing over to the literal readers.
+      this.cursor.unpass(1);
+      if (BooleanStartCharacters.has(val)) {
+        yield {type: TokenType.Boolean, value: this.boolean()};
+      } else {
+        yield {type: TokenType.Number, value: this.number()};
+      }
+    }
+  }
+
+  /** Whether the cursor has consumed all of its data. */
+  private done(): boolean {
+    return this.cursor.position >= this.cursor.length;
+  }
+
+  /**
+   * Consumes the second byte of a two-byte delimiter (`<<` or `>>`).
+   * A mismatch is reported through `console.assert` (the same non-throwing
+   * diagnostic used for unhandled delimiters); the byte is skipped either way.
+   */
+  private passRepeated(delimiter: number): void {
+    console.assert(
+      !this.done() && this.cursor.peek() === delimiter,
+      "Expected '%s' to be doubled",
+      String.fromCharCode(delimiter)
+    );
+    this.cursor.pass(1);
+  }
+
+  /**
+   * Reads a text literal; the opening `(` has already been consumed.
+   * The cursor ends up just past the closing `)`.
+   *
+   * Text is UTF-16; a leading byte order mark selects the endianness,
+   * big-endian is assumed when there is none.
+   *
+   * @throws InvalidEngineDataTextBOM when the byte order mark is malformed.
+   */
+  private text(): string {
+    if (this.done()) {
+      return "";
+    }
+    const firstByte = this.cursor.peek();
+    if (firstByte === Delimiters[")"]) {
+      this.cursor.pass(1);
+      return "";
+    }
+    const hasBom = firstByte === 0xff || firstByte === 0xfe;
+    const decoder = hasBom
+      ? this.textDecoderFromBOM()
+      : new TextDecoder("utf-16be");
+    // Escaping works on single BYTES, not on characters: a backslash may sit
+    // in front of either half of a UTF-16 code unit (e.g. U+5C71 is stored
+    // as 5C 5C 71). Therefore the escapes are removed from the byte stream
+    // first and the result is decoded in one piece - decoding fragments
+    // between escapes would split code units and garble the text.
+    const bytes: number[] = [];
+    while (!this.done()) {
+      let byte = this.cursor.read("u8");
+      if (byte === Delimiters[")"]) {
+        break; // unescaped `)` terminates the literal
+      }
+      if (byte === Delimiters["\\"]) {
+        if (this.done()) {
+          break; // dangling backslash at end of input
+        }
+        byte = this.cursor.read("u8"); // escaped byte is taken literally
+      }
+      bytes.push(byte);
+    }
+    return decoder.decode(new Uint8Array(bytes));
+  }
+
+  /**
+   * Consumes a two-byte UTF-16 byte order mark and returns the matching
+   * decoder.
+   *
+   * @throws InvalidEngineDataTextBOM when the two bytes are not a valid BOM.
+   */
+  private textDecoderFromBOM(): TextDecoder {
+    const firstBomPart = this.cursor.read("u8");
+    const sndBomPart = this.cursor.read("u8");
+    // https://en.wikipedia.org/wiki/Byte_order_mark#UTF-16
+    // LE is FF FE
+    if (firstBomPart === 0xff && sndBomPart === 0xfe)
+      return new TextDecoder("utf-16le");
+    // BE is FE FF
+    if (firstBomPart === 0xfe && sndBomPart === 0xff)
+      return new TextDecoder("utf-16be");
+    throw new InvalidEngineDataTextBOM(
+      `Unknown BOM value: [${firstBomPart}, ${sndBomPart}]`
+    );
+  }
+
+  /**
+   * Reads bytes up to (not including) the next whitespace or delimiter, or
+   * the end of input, and decodes them with the "ascii" decoder label.
+   * Shared by names, numbers and booleans.
+   */
+  private string(): string {
+    const decoder = new TextDecoder("ascii");
+    // Remember where the token starts, then move the main cursor to its end.
+    const readAhead = this.cursor.clone();
+    while (
+      !this.done() &&
+      !WhitespaceCharacters.has(this.cursor.peek()) &&
+      !DelimiterCharacters.has(this.cursor.peek())
+    ) {
+      this.cursor.pass(1);
+    }
+    const text = decoder.decode(
+      readAhead.take(this.cursor.position - readAhead.position)
+    );
+    return text;
+  }
+
+  /**
+   * Reads a bare token and converts it with `Number()`.
+   *
+   * @throws InvalidEngineDataNumber when the conversion yields `NaN`.
+   */
+  private number(): number {
+    const text = this.string();
+    const value = Number(text);
+    if (Number.isNaN(value)) {
+      throw new InvalidEngineDataNumber(`parsing '${text}' as Number failed`);
+    }
+    return value;
+  }
+
+  /**
+   * Reads a bare token that must be exactly `true` or `false`.
+   *
+   * @throws InvalidEngineDataBoolean for any other token.
+   */
+  private boolean(): boolean {
+    const text = this.string();
+    if (text === "true") {
+      return true;
+    }
+    if (text === "false") {
+      return false;
+    }
+    throw new InvalidEngineDataBoolean(
+      `'${text}' is neither 'true' nor 'false'`
+    );
+  }
+}
diff --git a/packages/psd/src/engineData/parser.ts b/packages/psd/src/engineData/parser.ts
@@ -0,0 +1,106 @@
+// @webtoon/psd
+// Copyright 2021-present NAVER WEBTOON
+// MIT License
+
+import {
+  InvalidEngineDataDictKey,
+  InvalidTopLevelEngineDataValue,
+  UnexpectedEndOfEngineData,
+  UnexpectedEngineDataToken,
+} from "../utils";
+import {Token, TokenType} from "./lexer";
+
+/** Dictionary node of parsed EngineData: string keys mapped to nested values. */
+export type RawEngineData = {
+  [key: string]: RawEngineValue;
+};
+/**
+ * Any value that can occur in parsed EngineData: a scalar (string, number,
+ * boolean), an array of values, or a nested dictionary.
+ */
+export type RawEngineValue =
+  | string
+  | number
+  | boolean
+  | RawEngineValue[]
+  | RawEngineData;
+
+/**
+ * Turns a stream of EngineData tokens into plain JavaScript values
+ * (objects for dictionaries, arrays for arrays, primitives for the rest).
+ */
+export class Parser {
+  /** @param tokens Token stream to consume, e.g. the result of `Lexer.tokens()`. */
+  constructor(private tokens: Generator<Token>) {}
+
+  /**
+   * Parses one value from the token stream and returns it, provided it is
+   * a dictionary.
+   *
+   * NOTE: tokens that follow the top-level value are not inspected; checking
+   * for them would require driving the lexer to its end-of-file.
+   *
+   * @throws InvalidTopLevelEngineDataValue when the top-level value is not
+   *   a dictionary.
+   * @throws UnexpectedEndOfEngineData when the tokens run out mid-value.
+   * @throws UnexpectedEngineDataToken when a value starts with `>>` or `]`.
+   * @throws InvalidEngineDataDictKey when a dictionary key is not a Name.
+   */
+  parse(): RawEngineData {
+    const value = this.value();
+    if (typeof value === "object" && !Array.isArray(value)) {
+      return value;
+    }
+    throw new InvalidTopLevelEngineDataValue(
+      `EngineData top-level value is not a dict; is ${typeof value}`
+    );
+  }
+
+  /**
+   * Parses a single value.
+   *
+   * @param it First token of the value when the caller has already pulled
+   *   it from the stream; when omitted, the next token is pulled here.
+   */
+  private value(it?: Token): RawEngineValue {
+    /**
+     * NOTE: this is recursive descent parser - simplest solution in terms of code complexity
+     * In case we ever start to run into stack-depth issues
+     * ("RangeError: Maximum call stack size exceeded" )
+     * due to parsing data that's too big, this can be re-written into stack-based one.
+     * That's because EngineData can be thought about as reverse-polish notation:
+     * ] - end of array requires popping values from stack until you hit [
+     *  (and pushing new value - an array - onto stack)
+     * same for << and >>.
+     */
+    if (!it) {
+      it = this.advance();
+    }
+    switch (it.type) {
+      case TokenType.Name:
+      case TokenType.Number:
+      case TokenType.Boolean:
+      case TokenType.String:
+        return it.value;
+      case TokenType.DictBeg:
+        return this.dict();
+      case TokenType.ArrBeg:
+        return this.arr();
+    }
+    // Only closing tokens (`>>`, `]`) get here - they cannot start a value.
+    throw new UnexpectedEngineDataToken(
+      `Unexpected token: ${TokenType[it.type]}`
+    );
+  }
+
+  /**
+   * Pulls the next token from the stream.
+   *
+   * @throws UnexpectedEndOfEngineData when the stream is exhausted.
+   */
+  private advance(): Token {
+    const next = this.tokens.next();
+    // `done` is the authoritative end-of-stream signal of a generator; the
+    // truthiness check is kept as a guard against an empty yield.
+    if (next.done || !next.value) {
+      throw new UnexpectedEndOfEngineData("End of stream");
+    }
+    return next.value;
+  }
+
+  /**
+   * Parses dictionary entries up to the closing `>>`; the opening `<<` has
+   * already been consumed. A repeated key overwrites the earlier entry.
+   */
+  private dict(): RawEngineData {
+    const val = {} as RawEngineData;
+    for (;;) {
+      const it = this.advance();
+      if (it.type === TokenType.DictEnd) {
+        return val;
+      }
+      if (it.type !== TokenType.Name) {
+        throw new InvalidEngineDataDictKey(
+          `Dict key is not Name; is ${TokenType[it.type]}`
+        );
+      }
+      const value = this.value();
+      // Keys come from file contents. A plain `val[key] = value` with the
+      // key "__proto__" would replace the object's prototype instead of
+      // storing an entry, so define the property explicitly.
+      Object.defineProperty(val, it.value, {
+        value,
+        writable: true,
+        enumerable: true,
+        configurable: true,
+      });
+    }
+  }
+
+  /**
+   * Parses array items up to the closing `]`; the opening `[` has already
+   * been consumed.
+   */
+  private arr(): RawEngineValue[] {
+    const val = [] as RawEngineValue[];
+    for (;;) {
+      const it = this.advance();
+      if (it.type === TokenType.ArrEnd) {
+        return val;
+      }
+      // The token was pulled to check for `]`, so hand it over to value().
+      const value = this.value(it);
+      val.push(value);
+    }
+  }
+}
diff --git a/packages/psd/src/engineData/validator.ts b/packages/psd/src/engineData/validator.ts
@@ -0,0 +1,40 @@
+// @webtoon/psd
+// Copyright 2021-present NAVER WEBTOON
+// MIT License
+
+import {EngineData} from "../interfaces";
+
+// Top-level keys that `validateEngineData` requires to be present,
+// each holding a non-null, non-array object.
+const REQUIRED_KEYS = new Set([
+  "DocumentResources",
+  "EngineDict",
+  "ResourceDict",
+]);
+
+/**
+ * Type guard telling whether `obj` has `prop` as an own (non-inherited)
+ * property. The method is borrowed from `Object.prototype`, so the check
+ * does not depend on `obj` providing a `hasOwnProperty` of its own.
+ */
+function hasOwnProperty<K extends string>(
+  obj: unknown,
+  prop: K
+): obj is Record<K, unknown> {
+  const {hasOwnProperty: ownsProperty} = Object.prototype;
+  return ownsProperty.call(obj, prop);
+}
+
+/**
+ * Type guard checking that `engineData` has the minimal shape of EngineData:
+ * a non-null object on which every key of `REQUIRED_KEYS` is an own property
+ * holding a non-null, non-array object.
+ *
+ * @returns `false` as soon as a required key is missing; otherwise whether
+ *   all required entries are dictionary-like objects.
+ */
+export function validateEngineData(
+  engineData: unknown
+): engineData is EngineData {
+  if (typeof engineData !== "object" || !engineData) {
+    return false;
+  }
+  let everyEntryIsDict = true;
+  for (const requiredKey of REQUIRED_KEYS) {
+    if (!hasOwnProperty(engineData, requiredKey)) {
+      return false;
+    }
+    const entry = engineData[requiredKey];
+    if (everyEntryIsDict) {
+      const isDict =
+        typeof entry === "object" && !Array.isArray(entry) && Boolean(entry);
+      everyEntryIsDict = isDict;
+    }
+  }
+  return everyEntryIsDict;
+}