Skip to content

Commit

Permalink
Extractor package with powerful text extraction capabilities and CMap…
Browse files Browse the repository at this point in the history
… handling. Closes #17
  • Loading branch information
gunnsth committed Mar 22, 2018
1 parent 820be65 commit 817ea40
Show file tree
Hide file tree
Showing 12 changed files with 1,361 additions and 1 deletion.
4 changes: 3 additions & 1 deletion pdf/contentstream/contentstream.go
Original file line number Diff line number Diff line change
Expand Up @@ -103,8 +103,10 @@ func (this *ContentStreamOperations) Bytes() []byte {
return buf.Bytes()
}

// Parses and extracts all text data in content streams and returns as a string.
// ExtractText parses and extracts all text data in content streams and returns as a string.
// Does not take into account Encoding table, the output is simply the character codes.
//
// Deprecated: More advanced text extraction is offered in package extractor with character encoding support.
func (this *ContentStreamParser) ExtractText() (string, error) {
operations, err := this.Parse()
if err != nil {
Expand Down
10 changes: 10 additions & 0 deletions pdf/extractor/doc.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
/*
* This file is subject to the terms and conditions defined in
* file 'LICENSE.md', which is part of this source code package.
*/

//
// Package extractor is used for quickly extracting PDF content through a simple interface.
// Currently offers functionality for extracting textual content.
//
package extractor
23 changes: 23 additions & 0 deletions pdf/extractor/extractor.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
package extractor

import "github.com/unidoc/unidoc/pdf/model"

// Extractor stores and offers functionality for extracting content from PDF pages.
// Instances are normally created with New, which fills both fields from a page.
type Extractor struct {
// contents is the content stream data that ExtractText parses.
contents string
// resources is handed to the content stream processor when the contents are processed.
resources *model.PdfPageResources
}

// New builds an Extractor for the given PDF page. It collects the page's content
// streams via GetAllContentStreams and keeps a reference to the page resources.
// An error from collecting the content streams is returned unchanged, with a nil Extractor.
func New(page *model.PdfPage) (*Extractor, error) {
	streams, err := page.GetAllContentStreams()
	if err != nil {
		return nil, err
	}

	return &Extractor{
		contents:  streams,
		resources: page.Resources,
	}, nil
}
226 changes: 226 additions & 0 deletions pdf/extractor/text.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,226 @@
package extractor

import (
"bytes"
"errors"
"fmt"

"github.com/unidoc/unidoc/common"
"github.com/unidoc/unidoc/pdf/contentstream"
"github.com/unidoc/unidoc/pdf/core"
"github.com/unidoc/unidoc/pdf/internal/cmap"
"github.com/unidoc/unidoc/pdf/model"
)

// ExtractText processes and extracts all text data in content streams and returns as a string. Takes into
// account character encoding via the ToUnicode CMap of the selected font, when the font has one.
// The text is processed linearly, i.e. in the order in which it appears in the content stream. A best
// effort is made to add spaces, tabs and newlines based on the text positioning operators
// (T*, Td, TD, Tm) and on large negative adjustments inside TJ arrays.
//
// If parsing or processing fails, the error is returned together with whatever text was gathered
// up to that point.
func (e *Extractor) ExtractText() (string, error) {
	var buf bytes.Buffer

	cstreamParser := contentstream.NewContentStreamParser(e.contents)
	operations, err := cstreamParser.Parse()
	if err != nil {
		return buf.String(), err
	}

	processor := contentstream.NewContentStreamProcessor(*operations)

	// codemap is the ToUnicode CMap of the most recently selected font, or nil if that font has none.
	var codemap *cmap.CMap
	inText := false

	// xPos, yPos hold the translation of the most recent Tm operator; havePos reports whether a Tm
	// has been seen yet. A separate flag is used so that no coordinate value acts as a sentinel.
	var xPos, yPos float64
	havePos := false

	// showText appends a shown string to the output, mapping character codes through the
	// current CMap when there is one and copying the raw bytes otherwise.
	showText := func(s string) {
		if codemap != nil {
			buf.WriteString(codemap.CharcodeBytesToUnicode([]byte(s)))
			return
		}
		buf.WriteString(s)
	}

	processor.AddHandler(contentstream.HandlerConditionEnumAllOperands, "",
		func(op *contentstream.ContentStreamOperation, gs contentstream.GraphicsState, resources *model.PdfPageResources) error {
			switch op.Operand {
			case "BT":
				inText = true
			case "ET":
				inText = false
			case "Tf":
				// Tf is a text state operator: it may legally appear outside a BT/ET pair and the
				// font it selects stays in effect for later text objects. It is therefore handled
				// regardless of inText, otherwise later strings would be decoded with a stale CMap.
				if len(op.Params) != 2 {
					common.Log.Debug("Error Tf should only get 2 input params, got %d", len(op.Params))
					return errors.New("Incorrect parameter count")
				}

				// The previous font's mapping no longer applies, whatever happens below.
				codemap = nil

				fontName, ok := op.Params[0].(*core.PdfObjectName)
				if !ok {
					common.Log.Debug("Error Tf font input not a name")
					return errors.New("Tf range error")
				}

				if resources == nil {
					return nil
				}

				fontObj, found := resources.GetFontByName(*fontName)
				if !found {
					common.Log.Debug("Font not found...")
					return errors.New("Font not in resources")
				}

				fontDict, isDict := core.TraceToDirectObject(fontObj).(*core.PdfObjectDictionary)
				if !isDict {
					return nil
				}
				toUnicode := fontDict.Get("ToUnicode")
				if toUnicode == nil {
					return nil
				}
				toUnicodeStream, ok := core.TraceToDirectObject(toUnicode).(*core.PdfObjectStream)
				if !ok {
					return errors.New("Invalid ToUnicode entry - not a stream")
				}
				decoded, err := core.DecodeStream(toUnicodeStream)
				if err != nil {
					return err
				}

				codemap, err = cmap.LoadCmapFromData(decoded)
				if err != nil {
					return err
				}
			case "T*":
				if !inText {
					common.Log.Debug("T* operand outside text")
					return nil
				}
				buf.WriteString("\n")
			case "Td", "TD":
				if !inText {
					common.Log.Debug("Td/TD operand outside text")
					return nil
				}

				// Params: [tx ty], corresponds to Tm=Tlm=[1 0 0;0 1 0;tx ty 1]*Tm
				if len(op.Params) != 2 {
					common.Log.Debug("Td/TD invalid arguments")
					return nil
				}
				tx, err := getNumberAsFloat(op.Params[0])
				if err != nil {
					common.Log.Debug("Td Float parse error")
					return nil
				}
				ty, err := getNumberAsFloat(op.Params[1])
				if err != nil {
					common.Log.Debug("Td Float parse error")
					return nil
				}

				// A move to the right is rendered as a space, a move downwards as a newline.
				if tx > 0 {
					buf.WriteString(" ")
				}
				if ty < 0 {
					// TODO: More flexible space characters?
					buf.WriteString("\n")
				}
			case "Tm":
				if !inText {
					common.Log.Debug("Tm operand outside text")
					return nil
				}

				// Params: a,b,c,d,e,f as in Tm = [a b 0; c d 0; e f 1].
				// The last two (e,f) represent translation.
				if len(op.Params) != 6 {
					return errors.New("Tm: Invalid number of inputs")
				}
				x, err := getNumberAsFloat(op.Params[4])
				if err != nil {
					return nil
				}
				y, err := getNumberAsFloat(op.Params[5])
				if err != nil {
					return nil
				}

				switch {
				case !havePos:
					// First Tm: nothing to compare against yet.
				case y < yPos:
					// Moved down the page: start a new line.
					buf.WriteString("\n")
				case x > xPos:
					// Same line or higher, moved to the right: separate with a tab.
					buf.WriteString("\t")
				}
				// Always remember the latest position so that the next Tm is compared against
				// where the text actually was, not against an older position.
				xPos, yPos, havePos = x, y, true
			case "TJ":
				if !inText {
					common.Log.Debug("TJ operand outside text")
					return nil
				}
				if len(op.Params) < 1 {
					return nil
				}
				paramList, ok := op.Params[0].(*core.PdfObjectArray)
				if !ok {
					return fmt.Errorf("Invalid parameter type, no array (%T)", op.Params[0])
				}
				for _, obj := range *paramList {
					switch v := obj.(type) {
					case *core.PdfObjectString:
						showText(string(*v))
					case *core.PdfObjectFloat:
						// Numbers in a TJ array adjust the position between strings; an adjustment
						// below -100 is treated as a word gap.
						if *v < -100 {
							buf.WriteString(" ")
						}
					case *core.PdfObjectInteger:
						if *v < -100 {
							buf.WriteString(" ")
						}
					}
				}
			case "Tj":
				if !inText {
					common.Log.Debug("Tj operand outside text")
					return nil
				}
				if len(op.Params) < 1 {
					return nil
				}
				param, ok := op.Params[0].(*core.PdfObjectString)
				if !ok {
					return fmt.Errorf("Invalid parameter type, not string (%T)", op.Params[0])
				}
				showText(string(*param))
			}

			return nil
		})

	err = processor.Process(e.resources)
	if err != nil {
		common.Log.Error("Error processing: %v", err)
		return buf.String(), err
	}

	return buf.String(), nil
}
29 changes: 29 additions & 0 deletions pdf/extractor/text_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
package extractor

import "testing"

// testContents1 is a minimal content stream: one text object that selects font F1,
// shows a string, moves with a negative vertical Td offset and shows a second string.
const testContents1 = `
BT
/F1 24 Tf
(Hello World!)Tj
0 -10 Td
(Doink)Tj
ET
`
// testExpected1 is the text expected from testContents1; the Td move with a
// negative y offset is expected to come out as a newline between the two strings.
const testExpected1 = "Hello World!\nDoink"

// TestTextExtraction1 runs ExtractText on testContents1 with no page resources
// and checks that the result equals testExpected1.
func TestTextExtraction1(t *testing.T) {
	e := Extractor{contents: testContents1}

	got, err := e.ExtractText()
	if err != nil {
		t.Fatalf("Error extracting text: %v", err)
	}
	if got == testExpected1 {
		return
	}

	// Report the text both as-is and as hex bytes so whitespace differences are visible.
	t.Errorf("Text mismatch (%s)", got)
	t.Fatalf("Text mismatch (% X vs % X)", got, testExpected1)
}
20 changes: 20 additions & 0 deletions pdf/extractor/utils.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
package extractor

import (
"errors"

"github.com/unidoc/unidoc/pdf/core"
)

// getNumberAsFloat converts a numeric PdfObject to a float64. Both
// *core.PdfObjectFloat and *core.PdfObjectInteger are accepted; any other
// object yields 0 and a "Not a number" error.
func getNumberAsFloat(obj core.PdfObject) (float64, error) {
	switch num := obj.(type) {
	case *core.PdfObjectFloat:
		return float64(*num), nil
	case *core.PdfObjectInteger:
		return float64(*num), nil
	default:
		return 0, errors.New("Not a number")
	}
}

0 comments on commit 817ea40

Please sign in to comment.