Extractor package with powerful text extraction capabilities and CMap handling. Closes #17
unidoc · Mar 22, 2018 · 817ea40 · 817ea40
1 parent 820be65
commit 817ea40
Show file tree

Hide file tree

Showing 12 changed files with 1,361 additions and 1 deletion.
diff --git a/pdf/contentstream/contentstream.go b/pdf/contentstream/contentstream.go
@@ -103,8 +103,10 @@ func (this *ContentStreamOperations) Bytes() []byte {
 	return buf.Bytes()
 }
 
-// Parses and extracts all text data in content streams and returns as a string.
+// ExtractText parses and extracts all text data in content streams and returns as a string.
 // Does not take into account Encoding table, the output is simply the character codes.
+//
+// Deprecated: More advanced text extraction is offered in package extractor with character encoding support.
 func (this *ContentStreamParser) ExtractText() (string, error) {
 	operations, err := this.Parse()
 	if err != nil {

diff --git a/pdf/extractor/doc.go b/pdf/extractor/doc.go
@@ -0,0 +1,10 @@
+/*
+ * This file is subject to the terms and conditions defined in
+ * file 'LICENSE.md', which is part of this source code package.
+ */
+
+//
+// Package extractor is used for quickly extracting PDF content through a simple interface.
+// Currently offers functionality for extracting textual content.
+//
+package extractor
diff --git a/pdf/extractor/extractor.go b/pdf/extractor/extractor.go
@@ -0,0 +1,23 @@
+package extractor
+
+import "github.com/unidoc/unidoc/pdf/model"
+
+// Extractor stores and offers functionality for extracting content from PDF pages.
+// Instances are created with New, which fills in both fields from a PDF page.
+type Extractor struct {
+	// contents is the string returned by the page's GetAllContentStreams call in New.
+	contents  string
+	// resources is the page's Resources value; ExtractText passes it to the content stream processor.
+	resources *model.PdfPageResources
+}
+
+// New builds an Extractor for the given PDF page. The page's content streams are read up front
+// and stored together with the page resources. If reading the content streams fails, that error
+// is returned unchanged along with a nil Extractor.
+func New(page *model.PdfPage) (*Extractor, error) {
+	streams, err := page.GetAllContentStreams()
+	if err != nil {
+		return nil, err
+	}
+
+	return &Extractor{
+		contents:  streams,
+		resources: page.Resources,
+	}, nil
+}
diff --git a/pdf/extractor/text.go b/pdf/extractor/text.go
@@ -0,0 +1,226 @@
+package extractor
+
+import (
+	"bytes"
+	"errors"
+	"fmt"
+
+	"github.com/unidoc/unidoc/common"
+	"github.com/unidoc/unidoc/pdf/contentstream"
+	"github.com/unidoc/unidoc/pdf/core"
+	"github.com/unidoc/unidoc/pdf/internal/cmap"
+	"github.com/unidoc/unidoc/pdf/model"
+)
+
+// ExtractText processes and extracts all text data in content streams and returns as a string. Takes into
+// account character encoding via CMaps in the PDF file.
+// The text is processed linearly e.g. in the order in which it appears. A best effort is done to add
+// spaces and newlines.
+//
+// Text shown with the Tj, TJ, ' and " operators is collected. Strings are decoded through the
+// ToUnicode CMap of the font selected by the most recent Tf operator when one is available,
+// otherwise the raw string bytes are emitted. The positioning operators T*, Td, TD and Tm are
+// only used as hints for inserting spaces, tabs and newlines.
+//
+// When parsing or processing fails, the text gathered so far is returned together with the error.
+func (e *Extractor) ExtractText() (string, error) {
+	var buf bytes.Buffer
+
+	cstreamParser := contentstream.NewContentStreamParser(e.contents)
+	operations, err := cstreamParser.Parse()
+	if err != nil {
+		return buf.String(), err
+	}
+
+	processor := contentstream.NewContentStreamProcessor(*operations)
+
+	// State shared between handler invocations.
+	var codemap *cmap.CMap                 // ToUnicode CMap of the current font, nil if none was loaded.
+	inText := false                        // True between BT and ET.
+	xPos, yPos := float64(-1), float64(-1) // Last translation recorded by Tm; -1 means not recorded yet.
+
+	// showString appends a string operand to the output, mapping its character codes through the
+	// current CMap when there is one.
+	showString := func(s *core.PdfObjectString) {
+		if codemap != nil {
+			buf.WriteString(codemap.CharcodeBytesToUnicode([]byte(*s)))
+			return
+		}
+		buf.WriteString(string(*s))
+	}
+
+	processor.AddHandler(contentstream.HandlerConditionEnumAllOperands, "",
+		func(op *contentstream.ContentStreamOperation, gs contentstream.GraphicsState, resources *model.PdfPageResources) error {
+			operand := op.Operand
+			switch operand {
+			case "BT":
+				inText = true
+			case "ET":
+				inText = false
+			case "Tf":
+				if !inText {
+					common.Log.Debug("Tf operand outside text")
+					return nil
+				}
+
+				if len(op.Params) != 2 {
+					common.Log.Debug("Error Tf should only get 2 input params, got %d", len(op.Params))
+					return errors.New("Incorrect parameter count")
+				}
+
+				// A new font invalidates the previously loaded CMap.
+				codemap = nil
+
+				fontName, ok := op.Params[0].(*core.PdfObjectName)
+				if !ok {
+					common.Log.Debug("Error Tf font input not a name")
+					return errors.New("Tf range error")
+				}
+
+				// Without resources the font cannot be looked up; continue without a CMap.
+				if resources == nil {
+					return nil
+				}
+
+				fontObj, found := resources.GetFontByName(*fontName)
+				if !found {
+					common.Log.Debug("Font not found...")
+					return errors.New("Font not in resources")
+				}
+
+				fontObj = core.TraceToDirectObject(fontObj)
+				if fontDict, isDict := fontObj.(*core.PdfObjectDictionary); isDict {
+					toUnicode := fontDict.Get("ToUnicode")
+					if toUnicode != nil {
+						toUnicode = core.TraceToDirectObject(toUnicode)
+						toUnicodeStream, ok := toUnicode.(*core.PdfObjectStream)
+						if !ok {
+							return errors.New("Invalid ToUnicode entry - not a stream")
+						}
+						decoded, err := core.DecodeStream(toUnicodeStream)
+						if err != nil {
+							return err
+						}
+
+						codemap, err = cmap.LoadCmapFromData(decoded)
+						if err != nil {
+							return err
+						}
+					}
+				}
+			case "T*":
+				if !inText {
+					common.Log.Debug("T* operand outside text")
+					return nil
+				}
+				buf.WriteString("\n")
+			case "Td", "TD":
+				if !inText {
+					common.Log.Debug("Td/TD operand outside text")
+					return nil
+				}
+
+				// Params: [tx ty], corresponds to Tm=Tlm=[1 0 0;0 1 0;tx ty 1]*Tm
+				if len(op.Params) != 2 {
+					common.Log.Debug("Td/TD invalid arguments")
+					return nil
+				}
+				tx, err := getNumberAsFloat(op.Params[0])
+				if err != nil {
+					common.Log.Debug("Td Float parse error")
+					return nil
+				}
+				ty, err := getNumberAsFloat(op.Params[1])
+				if err != nil {
+					common.Log.Debug("Td Float parse error")
+					return nil
+				}
+
+				if tx > 0 {
+					buf.WriteString(" ")
+				}
+				if ty < 0 {
+					// TODO: More flexible space characters?
+					buf.WriteString("\n")
+				}
+			case "Tm":
+				if !inText {
+					common.Log.Debug("Tm operand outside text")
+					return nil
+				}
+
+				// Params: a,b,c,d,e,f as in Tm = [a b 0; c d 0; e f 1].
+				// The last two (e,f) represent translation.
+				if len(op.Params) != 6 {
+					return errors.New("Tm: Invalid number of inputs")
+				}
+				// Non-numeric translation values are ignored rather than treated as an error.
+				x, err := getNumberAsFloat(op.Params[4])
+				if err != nil {
+					return nil
+				}
+				y, err := getNumberAsFloat(op.Params[5])
+				if err != nil {
+					return nil
+				}
+
+				// A smaller y than the recorded one is taken as a move to a new line.
+				if yPos == -1 {
+					yPos = y
+				} else if yPos > y {
+					buf.WriteString("\n")
+					xPos = x
+					yPos = y
+					return nil
+				}
+				// A larger x than the recorded one is rendered as a tab.
+				if xPos == -1 {
+					xPos = x
+				} else if xPos < x {
+					buf.WriteString("\t")
+					xPos = x
+				}
+			case "TJ":
+				if !inText {
+					common.Log.Debug("TJ operand outside text")
+					return nil
+				}
+				if len(op.Params) < 1 {
+					return nil
+				}
+				paramList, ok := op.Params[0].(*core.PdfObjectArray)
+				if !ok {
+					return fmt.Errorf("Invalid parameter type, no array (%T)", op.Params[0])
+				}
+				for _, obj := range *paramList {
+					switch v := obj.(type) {
+					case *core.PdfObjectString:
+						showString(v)
+					case *core.PdfObjectFloat:
+						// Numeric entries below -100 are rendered as a space.
+						if *v < -100 {
+							buf.WriteString(" ")
+						}
+					case *core.PdfObjectInteger:
+						if *v < -100 {
+							buf.WriteString(" ")
+						}
+					}
+				}
+			case "Tj":
+				if !inText {
+					common.Log.Debug("Tj operand outside text")
+					return nil
+				}
+				if len(op.Params) < 1 {
+					return nil
+				}
+				param, ok := op.Params[0].(*core.PdfObjectString)
+				if !ok {
+					return fmt.Errorf("Invalid parameter type, not string (%T)", op.Params[0])
+				}
+				showString(param)
+			case "'":
+				// Params: [string]. Moves to the next line and shows the string.
+				if !inText {
+					common.Log.Debug("' operand outside text")
+					return nil
+				}
+				if len(op.Params) < 1 {
+					return nil
+				}
+				param, ok := op.Params[0].(*core.PdfObjectString)
+				if !ok {
+					return fmt.Errorf("Invalid parameter type, not string (%T)", op.Params[0])
+				}
+				buf.WriteString("\n")
+				showString(param)
+			case `"`:
+				// Params: [aw ac string]. Sets word and character spacing, moves to the next line
+				// and shows the string. The spacing values do not affect the extracted text.
+				if !inText {
+					common.Log.Debug("\" operand outside text")
+					return nil
+				}
+				if len(op.Params) < 3 {
+					return nil
+				}
+				param, ok := op.Params[2].(*core.PdfObjectString)
+				if !ok {
+					return fmt.Errorf("Invalid parameter type, not string (%T)", op.Params[2])
+				}
+				buf.WriteString("\n")
+				showString(param)
+			}
+
+			return nil
+		})
+
+	err = processor.Process(e.resources)
+	if err != nil {
+		common.Log.Error("Error processing: %v", err)
+		return buf.String(), err
+	}
+
+	return buf.String(), nil
+}
diff --git a/pdf/extractor/text_test.go b/pdf/extractor/text_test.go
@@ -0,0 +1,29 @@
+package extractor
+
+import "testing"
+
+// testContents1 is a content stream holding one text object: it selects font /F1, shows
+// "Hello World!", applies a Td offset of (0, -10) and then shows "Doink".
+const testContents1 = `
+BT
+/F1 24 Tf
+(Hello World!)Tj
+0 -10 Td
+(Doink)Tj
+ET
+`
+// testExpected1 is the text expected from extracting testContents1.
+const testExpected1 = "Hello World!\nDoink"
+
+// TestTextExtraction1 runs ExtractText on testContents1 with no page resources set and checks
+// the result against testExpected1.
+func TestTextExtraction1(t *testing.T) {
+	e := Extractor{contents: testContents1}
+
+	got, err := e.ExtractText()
+	if err != nil {
+		t.Errorf("Error extracting text: %v", err)
+		return
+	}
+	if got == testExpected1 {
+		return
+	}
+
+	// Report both the readable text and a hex dump so whitespace differences are visible.
+	t.Errorf("Text mismatch (%s)", got)
+	t.Errorf("Text mismatch (% X vs % X)", got, testExpected1)
+}
diff --git a/pdf/extractor/utils.go b/pdf/extractor/utils.go
@@ -0,0 +1,20 @@
+package extractor
+
+import (
+	"errors"
+
+	"github.com/unidoc/unidoc/pdf/core"
+)
+
+// getNumberAsFloat can retrieve numeric values from PdfObject (both integer/float).
+func getNumberAsFloat(obj core.PdfObject) (float64, error) {
+	if fObj, ok := obj.(*core.PdfObjectFloat); ok {
+		return float64(*fObj), nil
+	}
+
+	if iObj, ok := obj.(*core.PdfObjectInteger); ok {
+		return float64(*iObj), nil
+	}
+
+	return 0, errors.New("Not a number")
+}