-
Notifications
You must be signed in to change notification settings - Fork 87
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Extractor package with powerful text extraction capabilities and CMap…
… handling. Closes #17
- Loading branch information
Showing
12 changed files
with
1,361 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,10 @@ | ||
/* | ||
* This file is subject to the terms and conditions defined in | ||
* file 'LICENSE.md', which is part of this source code package. | ||
*/ | ||
|
||
// | ||
// Package extractor is used for quickly extracting PDF content through a simple interface. | ||
// Currently offers functionality for extracting textual content. | ||
// | ||
package extractor |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,23 @@ | ||
package extractor | ||
|
||
import "github.com/unidoc/unidoc/pdf/model" | ||
|
||
// Extractor stores and offers functionality for extracting content from PDF pages. | ||
type Extractor struct { | ||
// contents is the content stream data that ExtractText parses; New fills it from the page's GetAllContentStreams result. | ||
contents string | ||
// resources is handed to the content stream processor when ExtractText runs; New copies it from page.Resources. | ||
resources *model.PdfPageResources | ||
} | ||
|
||
// New returns an Extractor instance for extracting content from the input PDF page. | ||
func New(page *model.PdfPage) (*Extractor, error) { | ||
contents, err := page.GetAllContentStreams() | ||
if err != nil { | ||
return nil, err | ||
} | ||
|
||
e := &Extractor{} | ||
e.contents = contents | ||
e.resources = page.Resources | ||
|
||
return e, nil | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,226 @@ | ||
package extractor | ||
|
||
import ( | ||
"bytes" | ||
"errors" | ||
"fmt" | ||
|
||
"github.com/unidoc/unidoc/common" | ||
"github.com/unidoc/unidoc/pdf/contentstream" | ||
"github.com/unidoc/unidoc/pdf/core" | ||
"github.com/unidoc/unidoc/pdf/internal/cmap" | ||
"github.com/unidoc/unidoc/pdf/model" | ||
) | ||
|
||
// ExtractText processes and extracts all text data in content streams and returns as a string. Takes into | ||
// account character encoding via CMaps in the PDF file. | ||
// The text is processed linearly e.g. in the order in which it appears. A best effort is done to add | ||
// spaces and newlines. | ||
func (e *Extractor) ExtractText() (string, error) { | ||
var buf bytes.Buffer | ||
|
||
cstreamParser := contentstream.NewContentStreamParser(e.contents) | ||
operations, err := cstreamParser.Parse() | ||
if err != nil { | ||
return buf.String(), err | ||
} | ||
|
||
processor := contentstream.NewContentStreamProcessor(*operations) | ||
|
||
var codemap *cmap.CMap | ||
inText := false | ||
xPos, yPos := float64(-1), float64(-1) | ||
|
||
processor.AddHandler(contentstream.HandlerConditionEnumAllOperands, "", | ||
func(op *contentstream.ContentStreamOperation, gs contentstream.GraphicsState, resources *model.PdfPageResources) error { | ||
operand := op.Operand | ||
switch operand { | ||
case "BT": | ||
inText = true | ||
case "ET": | ||
inText = false | ||
case "Tf": | ||
if !inText { | ||
common.Log.Debug("Tf operand outside text") | ||
return nil | ||
} | ||
|
||
if len(op.Params) != 2 { | ||
common.Log.Debug("Error Tf should only get 2 input params, got %d", len(op.Params)) | ||
return errors.New("Incorrect parameter count") | ||
} | ||
|
||
codemap = nil | ||
|
||
fontName, ok := op.Params[0].(*core.PdfObjectName) | ||
if !ok { | ||
common.Log.Debug("Error Tf font input not a name") | ||
return errors.New("Tf range error") | ||
} | ||
|
||
if resources == nil { | ||
return nil | ||
} | ||
|
||
fontObj, found := resources.GetFontByName(*fontName) | ||
if !found { | ||
common.Log.Debug("Font not found...") | ||
return errors.New("Font not in resources") | ||
} | ||
|
||
fontObj = core.TraceToDirectObject(fontObj) | ||
if fontDict, isDict := fontObj.(*core.PdfObjectDictionary); isDict { | ||
toUnicode := fontDict.Get("ToUnicode") | ||
if toUnicode != nil { | ||
toUnicode = core.TraceToDirectObject(toUnicode) | ||
toUnicodeStream, ok := toUnicode.(*core.PdfObjectStream) | ||
if !ok { | ||
return errors.New("Invalid ToUnicode entry - not a stream") | ||
} | ||
decoded, err := core.DecodeStream(toUnicodeStream) | ||
if err != nil { | ||
return err | ||
} | ||
|
||
codemap, err = cmap.LoadCmapFromData(decoded) | ||
if err != nil { | ||
return err | ||
} | ||
} | ||
} | ||
case "T*": | ||
if !inText { | ||
common.Log.Debug("T* operand outside text") | ||
return nil | ||
} | ||
buf.WriteString("\n") | ||
case "Td", "TD": | ||
if !inText { | ||
common.Log.Debug("Td/TD operand outside text") | ||
return nil | ||
} | ||
|
||
// Params: [tx ty], corresponeds to Tm=Tlm=[1 0 0;0 1 0;tx ty 1]*Tm | ||
if len(op.Params) != 2 { | ||
common.Log.Debug("Td/TD invalid arguments") | ||
return nil | ||
} | ||
tx, err := getNumberAsFloat(op.Params[0]) | ||
if err != nil { | ||
common.Log.Debug("Td Float parse error") | ||
return nil | ||
} | ||
ty, err := getNumberAsFloat(op.Params[1]) | ||
if err != nil { | ||
common.Log.Debug("Td Float parse error") | ||
return nil | ||
} | ||
|
||
if tx > 0 { | ||
buf.WriteString(" ") | ||
} | ||
if ty < 0 { | ||
// TODO: More flexible space characters? | ||
buf.WriteString("\n") | ||
} | ||
case "Tm": | ||
if !inText { | ||
common.Log.Debug("Tm operand outside text") | ||
return nil | ||
} | ||
|
||
// Params: a,b,c,d,e,f as in Tm = [a b 0; c d 0; e f 1]. | ||
// The last two (e,f) represent translation. | ||
if len(op.Params) != 6 { | ||
return errors.New("Tm: Invalid number of inputs") | ||
} | ||
xfloat, ok := op.Params[4].(*core.PdfObjectFloat) | ||
if !ok { | ||
xint, ok := op.Params[4].(*core.PdfObjectInteger) | ||
if !ok { | ||
return nil | ||
} | ||
xfloat = core.MakeFloat(float64(*xint)) | ||
} | ||
yfloat, ok := op.Params[5].(*core.PdfObjectFloat) | ||
if !ok { | ||
yint, ok := op.Params[5].(*core.PdfObjectInteger) | ||
if !ok { | ||
return nil | ||
} | ||
yfloat = core.MakeFloat(float64(*yint)) | ||
} | ||
if yPos == -1 { | ||
yPos = float64(*yfloat) | ||
} else if yPos > float64(*yfloat) { | ||
buf.WriteString("\n") | ||
xPos = float64(*xfloat) | ||
yPos = float64(*yfloat) | ||
return nil | ||
} | ||
if xPos == -1 { | ||
xPos = float64(*xfloat) | ||
} else if xPos < float64(*xfloat) { | ||
buf.WriteString("\t") | ||
xPos = float64(*xfloat) | ||
} | ||
case "TJ": | ||
if !inText { | ||
common.Log.Debug("TJ operand outside text") | ||
return nil | ||
} | ||
if len(op.Params) < 1 { | ||
return nil | ||
} | ||
paramList, ok := op.Params[0].(*core.PdfObjectArray) | ||
if !ok { | ||
return fmt.Errorf("Invalid parameter type, no array (%T)", op.Params[0]) | ||
} | ||
for _, obj := range *paramList { | ||
switch v := obj.(type) { | ||
case *core.PdfObjectString: | ||
if codemap != nil { | ||
buf.WriteString(codemap.CharcodeBytesToUnicode([]byte(*v))) | ||
} else { | ||
buf.WriteString(string(*v)) | ||
} | ||
case *core.PdfObjectFloat: | ||
if *v < -100 { | ||
buf.WriteString(" ") | ||
} | ||
case *core.PdfObjectInteger: | ||
if *v < -100 { | ||
buf.WriteString(" ") | ||
} | ||
} | ||
} | ||
case "Tj": | ||
if !inText { | ||
common.Log.Debug("Tj operand outside text") | ||
return nil | ||
} | ||
if len(op.Params) < 1 { | ||
return nil | ||
} | ||
param, ok := op.Params[0].(*core.PdfObjectString) | ||
if !ok { | ||
return fmt.Errorf("Invalid parameter type, not string (%T)", op.Params[0]) | ||
} | ||
if codemap != nil { | ||
buf.WriteString(codemap.CharcodeBytesToUnicode([]byte(*param))) | ||
} else { | ||
buf.WriteString(string(*param)) | ||
} | ||
} | ||
|
||
return nil | ||
}) | ||
|
||
err = processor.Process(e.resources) | ||
if err != nil { | ||
common.Log.Error("Error processing: %v", err) | ||
return buf.String(), err | ||
} | ||
|
||
return buf.String(), nil | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,29 @@ | ||
package extractor | ||
|
||
import "testing" | ||
|
||
// testContents1 is the content stream fed to ExtractText by TestTextExtraction1: a single BT/ET text | ||
// object that selects font /F1, shows a string, applies "0 -10 Td" and shows a second string. | ||
const testContents1 = ` | ||
BT | ||
/F1 24 Tf | ||
(Hello World!)Tj | ||
0 -10 Td | ||
(Doink)Tj | ||
ET | ||
` | ||
// testExpected1 is the exact text TestTextExtraction1 expects ExtractText to return for testContents1. | ||
const testExpected1 = "Hello World!\nDoink" | ||
|
||
func TestTextExtraction1(t *testing.T) { | ||
e := Extractor{} | ||
e.contents = testContents1 | ||
|
||
s, err := e.ExtractText() | ||
if err != nil { | ||
t.Errorf("Error extracting text: %v", err) | ||
return | ||
} | ||
if s != testExpected1 { | ||
t.Errorf("Text mismatch (%s)", s) | ||
t.Errorf("Text mismatch (% X vs % X)", s, testExpected1) | ||
return | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,20 @@ | ||
package extractor | ||
|
||
import ( | ||
"errors" | ||
|
||
"github.com/unidoc/unidoc/pdf/core" | ||
) | ||
|
||
// getNumberAsFloat can retrieve numeric values from PdfObject (both integer/float). | ||
func getNumberAsFloat(obj core.PdfObject) (float64, error) { | ||
if fObj, ok := obj.(*core.PdfObjectFloat); ok { | ||
return float64(*fObj), nil | ||
} | ||
|
||
if iObj, ok := obj.(*core.PdfObjectInteger); ok { | ||
return float64(*iObj), nil | ||
} | ||
|
||
return 0, errors.New("Not a number") | ||
} |
Oops, something went wrong.