Track number of bytes per character code for mappings. Fixes the problem posed in PR #156 in a generic fashion.
unidoc · Jun 3, 2018 · a7abcd0 · a7abcd0
1 parent 0c9502a
commit a7abcd0
Show file tree

Hide file tree

Showing 4 changed files with 249 additions and 15 deletions.
diff --git a/pdf/internal/cmap/cmap.go b/pdf/internal/cmap/cmap.go
@@ -21,7 +21,8 @@ type CMap struct {
 	// Text encoder to look up runes from input glyph names.
 	encoder textencoding.TextEncoder
 
-	codeMap map[uint64]string
+	// codeMap maps character codes to strings (sequences of runes), one map per code length: codeMap[n-1] holds the n-byte codes (n = 1..4).
+	codeMap [4]map[uint64]string
 
 	name       string
 	ctype      int
@@ -30,8 +31,9 @@ type CMap struct {
 
 // codespace represents a single codespace range used in the CMap.
 type codespace struct {
-	low  uint64
-	high uint64
+	numBytes int
+	low      uint64
+	high     uint64
 }
 
 // Name returns the name of the CMap.
@@ -61,7 +63,7 @@ func (cmap *CMap) CharcodeBytesToUnicode(src []byte) string {
 			code <<= 8
 			code |= uint64(b)
 
-			tgt, has := cmap.codeMap[code]
+			tgt, has := cmap.codeMap[j][code]
 			if has {
 				buf.WriteString(tgt)
 				break
@@ -76,9 +78,13 @@ func (cmap *CMap) CharcodeBytesToUnicode(src []byte) string {
 }
 
 // CharcodeToUnicode converts a single character code to unicode string.
+// Note that CharcodeBytesToUnicode is typically more efficient.
 func (cmap *CMap) CharcodeToUnicode(srcCode uint64) string {
-	if c, has := cmap.codeMap[srcCode]; has {
-		return c
+	// The byte length of srcCode is unknown here, so try each code length (1-4 bytes) in turn; the shortest match wins.
+	for numBytes := 1; numBytes <= 4; numBytes++ {
+		if c, has := cmap.codeMap[numBytes-1][srcCode]; has {
+			return c
+		}
 	}
 
 	// Not found.
@@ -89,7 +95,12 @@ func (cmap *CMap) CharcodeToUnicode(srcCode uint64) string {
 func newCMap() *CMap {
 	cmap := &CMap{}
 	cmap.codespaces = []codespace{}
-	cmap.codeMap = map[uint64]string{}
+	cmap.codeMap = [4]map[uint64]string{}
+	// Initialize the maps for all code lengths (1-4 bytes) up front. Unused maps add minimal overhead (1-2 byte codes are the most common).
+	cmap.codeMap[0] = map[uint64]string{}
+	cmap.codeMap[1] = map[uint64]string{}
+	cmap.codeMap[2] = map[uint64]string{}
+	cmap.codeMap[3] = map[uint64]string{}
 	return cmap
 }
 
@@ -208,10 +219,15 @@ func (cmap *CMap) parseCodespaceRange() error {
 			return errors.New("Non-hex high")
 		}
 
+		if hexLow.numBytes != hexHigh.numBytes {
+			return errors.New("Unequal number of bytes in range")
+		}
+
 		low := hexToUint64(hexLow)
 		high := hexToUint64(hexHigh)
+		numBytes := hexLow.numBytes
 
-		cspace := codespace{low, high}
+		cspace := codespace{numBytes: numBytes, low: low, high: high}
 		cmap.codespaces = append(cmap.codespaces, cspace)
 
 		common.Log.Trace("Codespace low: 0x%X, high: 0x%X", low, high)
@@ -232,6 +248,7 @@ func (cmap *CMap) parseBfchar() error {
 			return err
 		}
 		var srcCode uint64
+		var numBytes int
 
 		switch v := o.(type) {
 		case cmapOperand:
@@ -241,6 +258,7 @@ func (cmap *CMap) parseBfchar() error {
 			return errors.New("Unexpected operand")
 		case cmapHexString:
 			srcCode = hexToUint64(v)
+			numBytes = v.numBytes
 		default:
 			return errors.New("Unexpected type")
 		}
@@ -274,7 +292,11 @@ func (cmap *CMap) parseBfchar() error {
 			return errors.New("Unexpected type")
 		}
 
-		cmap.codeMap[srcCode] = toCode
+		if numBytes <= 0 || numBytes > 4 {
+			return errors.New("Invalid code length")
+		}
+
+		cmap.codeMap[numBytes-1][srcCode] = toCode
 	}
 
 	return nil
@@ -289,6 +311,7 @@ func (cmap *CMap) parseBfrange() error {
 
 		// Src code from.
 		var srcCodeFrom uint64
+		var numBytes int
 		{
 			o, err := cmap.parseObject()
 			if err != nil {
@@ -306,6 +329,7 @@ func (cmap *CMap) parseBfrange() error {
 				return errors.New("Unexpected operand")
 			case cmapHexString:
 				srcCodeFrom = hexToUint64(v)
+				numBytes = v.numBytes
 			default:
 				return errors.New("Unexpected type")
 			}
@@ -344,6 +368,10 @@ func (cmap *CMap) parseBfrange() error {
 			return err
 		}
 
+		if numBytes <= 0 || numBytes > 4 {
+			return errors.New("Invalid code length")
+		}
+
 		switch v := o.(type) {
 		case cmapArray:
 			sc := srcCodeFrom
@@ -352,7 +380,7 @@ func (cmap *CMap) parseBfrange() error {
 				if !ok {
 					return errors.New("Non-hex string in array")
 				}
-				cmap.codeMap[sc] = hexToString(hexs)
+				cmap.codeMap[numBytes-1][sc] = hexToString(hexs)
 				sc++
 			}
 			if sc != srcCodeTo+1 {
@@ -365,7 +393,7 @@ func (cmap *CMap) parseBfrange() error {
 			i := uint64(0)
 			for sc := srcCodeFrom; sc <= srcCodeTo; sc++ {
 				r := target + i
-				cmap.codeMap[sc] = string(r)
+				cmap.codeMap[numBytes-1][sc] = string(r)
 				i++
 			}
 		default:

diff --git a/pdf/internal/cmap/cmap_test.go b/pdf/internal/cmap/cmap_test.go
@@ -16,6 +16,7 @@ func init() {
 	common.SetLogger(common.NewConsoleLogger(common.LogLevelTrace))
 }
 
+// cmap1Data represents a basic CMap.
 const cmap1Data = `
 /CIDInit /ProcSet findresource begin
 12 dict begin
@@ -55,6 +56,7 @@ end
 end
 `
 
+// TestCMapParser1 tests basic loading of a simple CMap.
 func TestCMapParser1(t *testing.T) {
 	common.SetLogger(common.NewConsoleLogger(common.LogLevelTrace))
 
@@ -122,3 +124,206 @@ func TestCMapParser1(t *testing.T) {
 		return
 	}
 }
+
+const cmap2Data = `
+/CIDInit /ProcSet findresource begin
+12 dict begin
+begincmap
+/CIDSystemInfo
+<<  /Registry (Adobe)
+/Ordering (UCS)
+/Supplement 0
+>> def
+/CMapName /Adobe-Identity-UCS def
+/CMapType 2 def
+1 begincodespacerange
+<0000> <FFFF>
+endcodespacerange
+7 beginbfrange
+<0080> <00FF> <002C>
+<802F> <902F> <0038>
+endbfrange
+endcmap
+CMapName currentdict /CMap defineresource pop
+end
+end
+`
+
+// TestCMapParser2 is a regression test for a bug where 2-byte character codes with a zero high byte,
+// e.g. 0x0080, were mishandled because the character map did not take the number of bytes of each input code into account.
+func TestCMapParser2(t *testing.T) {
+	common.SetLogger(common.NewConsoleLogger(common.LogLevelTrace))
+
+	cmap, err := LoadCmapFromData([]byte(cmap2Data))
+	if err != nil {
+		t.Error("Failed: ", err)
+		return
+	}
+
+	if cmap.Name() != "Adobe-Identity-UCS" {
+		t.Errorf("CMap name incorrect (%s)", cmap.Name())
+		return
+	}
+
+	if cmap.Type() != 2 {
+		t.Errorf("CMap type incorrect")
+		return
+	}
+
+	if len(cmap.codespaces) != 1 {
+		t.Errorf("len codespace != 1 (%d)", len(cmap.codespaces))
+		return
+	}
+
+	if cmap.codespaces[0].low != 0 {
+		t.Errorf("code space low range != 0 (%d)", cmap.codespaces[0].low)
+		return
+	}
+
+	if cmap.codespaces[0].high != 0xFFFF {
+		t.Errorf("code space high range != 0xffff (%d)", cmap.codespaces[0].high)
+		return
+	}
+
+	expectedMappings := map[uint64]rune{
+		0x0080: 0x002C,
+		0x802F: 0x0038,
+	}
+
+	for k, expected := range expectedMappings {
+		if v := cmap.CharcodeToUnicode(k); v != string(expected) {
+			t.Errorf("incorrect mapping, expecting 0x%X -> 0x%X (got 0x%X)", k, expected, v)
+			return
+		}
+	}
+
+	// Check byte sequence mappings.
+	excpectedSequenceMappings := []struct {
+		bytes    []byte
+		expected string
+	}{
+		{[]byte{0x80, 0x2F, 0x00, 0x80}, string([]rune{0x0038, 0x002C})},
+	}
+
+	for _, exp := range excpectedSequenceMappings {
+		str := cmap.CharcodeBytesToUnicode(exp.bytes)
+		if str != exp.expected {
+			t.Errorf("Incorrect byte sequence mapping -> % X -> % X (got % X)", exp.bytes, []rune(exp.expected), []rune(str))
+			return
+		}
+	}
+}
+
+// cmapData3 is a CMap with a mixture of 1 and 2 byte codespaces.
+const cmapData3 = `
+/CIDInit /ProcSet findresource begin
+12 dict begin begincmap
+/CIDSystemInfo
+3 dict dup begin
+/Registry (Adobe) def
+/Supplement 2 def
+end def
+
+/CMapName /test-1 def
+/CMapType 1 def
+
+4 begincodespacerange
+<00> <80>
+<8100> <9fff>
+<a0> <df>
+<d040> <fbfc>
+endcodespacerange
+7 beginbfrange
+<00> <80> <10>
+<8100> <9f00> <1000>
+<a0> <d0> <90>
+<d140> <f000> <a000>
+endbfrange
+endcmap
+`
+
+// TestCMapParser3 tests a CMap whose codespace ranges mix 1-byte and 2-byte codes.
+func TestCMapParser3(t *testing.T) {
+	common.SetLogger(common.NewConsoleLogger(common.LogLevelTrace))
+
+	cmap, err := LoadCmapFromData([]byte(cmapData3))
+	if err != nil {
+		t.Error("Failed: ", err)
+		return
+	}
+
+	if cmap.Name() != "test-1" {
+		t.Errorf("CMap name incorrect (%s)", cmap.Name())
+		return
+	}
+
+	if cmap.Type() != 1 {
+		t.Errorf("CMap type incorrect")
+		return
+	}
+
+	// Check codespaces.
+	expectedCodespaces := []struct {
+		numBytes int
+		low      uint64
+		high     uint64
+	}{
+		{1, 0x00, 0x80},
+		{2, 0x8100, 0x9fff},
+		{1, 0xa0, 0xdf},
+		{2, 0xd040, 0xfbfc},
+	}
+
+	if len(cmap.codespaces) != len(expectedCodespaces) {
+		t.Errorf("len codespace != %d (%d)", len(expectedCodespaces), len(cmap.codespaces))
+		return
+	}
+
+	for i, cs := range cmap.codespaces {
+		exp := expectedCodespaces[i]
+		if cs.numBytes != exp.numBytes {
+			t.Errorf("code space number of bytes != %d (%d)", exp.numBytes, cs.numBytes)
+			return
+		}
+
+		if cs.low != exp.low {
+			t.Errorf("code space low range != %d (%d)", exp.low, cs.low)
+			return
+		}
+
+		if cs.high != exp.high {
+			t.Errorf("code space high range != 0x%X (0x%X)", exp.high, cs.high)
+			return
+		}
+	}
+
+	// Check mappings.
+	expectedMappings := map[uint64]rune{
+		0x0080: 0x10 + 0x80,
+		0x8100: 0x1000,
+		0x00a0: 0x90,
+		0xd140: 0xa000,
+	}
+	for k, expected := range expectedMappings {
+		if v := cmap.CharcodeToUnicode(k); v != string(expected) {
+			t.Errorf("incorrect mapping, expecting 0x%X -> 0x%X (got 0x%X)", k, expected, v)
+			return
+		}
+	}
+
+	// Check byte sequence mappings.
+	excpectedSequenceMappings := []struct {
+		bytes    []byte
+		expected string
+	}{
+		{[]byte{0x80, 0x81, 0x00, 0xa1, 0xd1, 0x80, 0x00}, string([]rune{0x90, 0x1000, 0x91, 0xa000 + 0x40, 0x10})},
+	}
+
+	for _, exp := range excpectedSequenceMappings {
+		str := cmap.CharcodeBytesToUnicode(exp.bytes)
+		if str != exp.expected {
+			t.Errorf("Incorrect byte sequence mapping -> % X -> % X (got % X)", exp.bytes, []rune(exp.expected), []rune(str))
+			return
+		}
+	}
+}
diff --git a/pdf/internal/cmap/parser.go b/pdf/internal/cmap/parser.go
@@ -279,7 +279,7 @@ func (p *cMapParser) parseHexString() (cmapHexString, error) {
 
 		bb, err := p.reader.Peek(1)
 		if err != nil {
-			return cmapHexString{[]byte("")}, err
+			return cmapHexString{numBytes: 0, b: []byte("")}, err
 		}
 
 		if bb[0] == '>' {
@@ -296,10 +296,10 @@ func (p *cMapParser) parseHexString() (cmapHexString, error) {
 	if buf.Len()%2 == 1 {
 		buf.WriteByte('0')
 	}
+	numBytes := buf.Len() / 2
 
 	hexb, _ := hex.DecodeString(buf.String())
-
-	return cmapHexString{hexb}, nil
+	return cmapHexString{numBytes: numBytes, b: hexb}, nil
 }
 
 // Starts with '[' ends with ']'.  Can contain any kinds of direct objects.