Skip to content

Commit

Permalink
Add a new renderer to create box files from images for LSTM training
Browse files Browse the repository at this point in the history
  • Loading branch information
Shreeshrii committed Jan 31, 2019
1 parent 94b8988 commit 921da6b
Show file tree
Hide file tree
Showing 9 changed files with 150 additions and 1 deletion.
1 change: 1 addition & 0 deletions src/api/Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ endif
libtesseract_api_la_SOURCES = baseapi.cpp capi.cpp
libtesseract_api_la_SOURCES += altorenderer.cpp
libtesseract_api_la_SOURCES += hocrrenderer.cpp
libtesseract_api_la_SOURCES += lstmboxrenderer.cpp
libtesseract_api_la_SOURCES += pdfrenderer.cpp
libtesseract_api_la_SOURCES += renderer.cpp

Expand Down
8 changes: 8 additions & 0 deletions src/api/baseapi.h
Original file line number Diff line number Diff line change
Expand Up @@ -613,6 +613,14 @@ class TESS_API TessBaseAPI {
* Returned string must be freed with the delete [] operator.
*/
char* GetTSVText(int page_number);

/**
* Make a box file for LSTM training from the internal data structures.
* Constructs coordinates in the original image - not just the rectangle.
* page_number is a 0-based page index that will appear in the box file.
* Returned string must be freed with the delete [] operator.
*/
char* GetLSTMBOXText(int page_number);

/**
* The recognized text is returned as a char* which is coded in the same
Expand Down
2 changes: 1 addition & 1 deletion src/api/hocrrenderer.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -268,7 +268,7 @@ char* TessBaseAPI::GetHOCRText(ETEXT_DESC* monitor, int page_number) {
if (grapheme && grapheme[0] != 0) {
if (hocr_boxes) {
res_it->BoundingBox(RIL_SYMBOL, &left, &top, &right, &bottom);
hocr_str << "<span class='ocrx_cinfo' title='x_bboxes "
hocr_str << "\n <span class='ocrx_cinfo' title='x_bboxes "
<< left << " " << top << " " << right << " " << bottom
<< "; x_conf " << res_it->Confidence(RIL_SYMBOL) << "'>";
}
Expand Down
111 changes: 111 additions & 0 deletions src/api/lstmboxrenderer.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
/**********************************************************************
* File: lstmboxrenderer.cpp
* Description: Renderer for creating box file for LSTM training.
* Based on the tsv renderer.
*
* (C) Copyright 2006, Google Inc.
** Licensed under the Apache License, Version 2.0 (the "License");
** you may not use this file except in compliance with the License.
** You may obtain a copy of the License at
** http://www.apache.org/licenses/LICENSE-2.0
** Unless required by applicable law or agreed to in writing, software
** distributed under the License is distributed on an "AS IS" BASIS,
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
** See the License for the specific language governing permissions and
** limitations under the License.
*
**********************************************************************/


#include <locale> // for std::locale::classic
#include <memory> // for std::unique_ptr
#include <sstream> // for std::stringstream
#include "baseapi.h" // for TessBaseAPI
#include "renderer.h"
#include "tesseractclass.h" // for Tesseract

namespace tesseract {

/**
* Create a UTF8 box file for LSTM training from the internal data structures.
* page_number is a 0-base page index that will appear in the box file.
* Returned string must be freed with the delete [] operator.
*/

char* TessBaseAPI::GetLSTMBOXText(int page_number) {
if (tesseract_ == nullptr || (page_res_ == nullptr && Recognize(nullptr) < 0))
return nullptr;

STRING lstm_box_str("");

int page_num = page_number;
bool first_word = true;

LTRResultIterator* res_it = GetLTRIterator();
while (!res_it->Empty(RIL_BLOCK)) {
if (res_it->Empty(RIL_SYMBOL)) {
res_it->Next(RIL_SYMBOL);
continue;
}

int left, top, right, bottom;

if (!first_word) {
if (res_it->IsAtBeginningOf(RIL_WORD)) {
lstm_box_str.add_str_int(" ", left);
lstm_box_str.add_str_int(" ", image_height_ - bottom);
lstm_box_str.add_str_int(" ", right + 2);
lstm_box_str.add_str_int(" ", image_height_ - top);
lstm_box_str.add_str_int(" ", page_num); // level 5 - word
lstm_box_str += "\n"; // end of row for word
}
if (res_it->IsAtBeginningOf(RIL_TEXTLINE)) {
lstm_box_str.add_str_int("\t ", left);
lstm_box_str.add_str_int(" ", image_height_ - bottom);
lstm_box_str.add_str_int(" ", right + 5);
lstm_box_str.add_str_int(" ", image_height_ - top);
lstm_box_str.add_str_int(" ", page_num); // level 4 - line
lstm_box_str += "\n"; // end of row for line
}
}
first_word=false;
res_it->BoundingBox(RIL_SYMBOL, &left, &top, &right, &bottom);

do {
lstm_box_str +=std::unique_ptr<const char[]>(res_it->GetUTF8Text(RIL_SYMBOL)).get();
res_it->Next(RIL_SYMBOL);
} while (!res_it->Empty(RIL_BLOCK) && !res_it->IsAtBeginningOf(RIL_SYMBOL));

lstm_box_str.add_str_int(" ", left);
lstm_box_str.add_str_int(" ", image_height_ - bottom);
lstm_box_str.add_str_int(" ", right);
lstm_box_str.add_str_int(" ", image_height_ - top);
lstm_box_str.add_str_int(" ", page_num); // level 6 - symbol
lstm_box_str += "\n"; // end of row


}

char* ret = new char[lstm_box_str.length() + 1];
strcpy(ret, lstm_box_str.string());
delete res_it;
return ret;
}

/**********************************************************************
* LSTMBOX Renderer interface implementation
**********************************************************************/
// Forwards outputbase to the base renderer together with the fixed "box"
// argument; the constructor itself does no further work.
TessLSTMBOXRenderer::TessLSTMBOXRenderer(const char* outputbase)
    : TessResultRenderer(outputbase, "box") {}

// Fetches the LSTM box text for the current image from the API and appends it
// to the renderer output. Returns false when the API yields no text.
bool TessLSTMBOXRenderer::AddImageHandler(TessBaseAPI* api) {
  // The API returns a new[]-allocated buffer; the unique_ptr frees it.
  const std::unique_ptr<const char[]> box_text(
      api->GetLSTMBOXText(imagenum()));
  const bool have_text = (box_text != nullptr);
  if (have_text) {
    AppendString(box_text.get());
  }
  return have_text;
}

} // namespace tesseract.
11 changes: 11 additions & 0 deletions src/api/renderer.h
Original file line number Diff line number Diff line change
Expand Up @@ -247,6 +247,17 @@ class TESS_API TessUnlvRenderer : public TessResultRenderer {
virtual bool AddImageHandler(TessBaseAPI* api);
};

/**
* Renders tesseract output as UTF-8 box-file text for LSTM training.
*/
class TESS_API TessLSTMBOXRenderer : public TessResultRenderer {
public:
// outputbase is forwarded to the TessResultRenderer base class.
explicit TessLSTMBOXRenderer(const char *outputbase);

protected:
// Per-image hook; returns a bool success flag.
virtual bool AddImageHandler(TessBaseAPI* api);
};

/**
* Renders tesseract output into a plain UTF-8 text string
*/
Expand Down
14 changes: 14 additions & 0 deletions src/api/tesseractmain.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -494,6 +494,20 @@ static void PreloadRenderers(
}
}

api->GetBoolVariable("tessedit_create_lstmbox", &b);
if (b) {
tesseract::TessLSTMBOXRenderer* renderer =
new tesseract::TessLSTMBOXRenderer(outputbase);
if (renderer->happy()) {
renderers->push_back(renderer);
} else {
delete renderer;
tprintf("Error, could not create LSTM BOX output file: %s\n",
strerror(errno));
error = true;
}
}

api->GetBoolVariable("tessedit_create_boxfile", &b);
if (b) {
tesseract::TessBoxTextRenderer* renderer =
Expand Down
2 changes: 2 additions & 0 deletions src/ccmain/tesseractclass.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -391,6 +391,8 @@ Tesseract::Tesseract()
this->params()),
BOOL_MEMBER(tessedit_create_alto, false, "Write .xml ALTO file",
this->params()),
BOOL_MEMBER(tessedit_create_lstmbox, false, "Write .box file for LSTM training",
this->params()),
BOOL_MEMBER(tessedit_create_tsv, false, "Write .tsv output file",
this->params()),
BOOL_MEMBER(tessedit_create_pdf, false, "Write .pdf output file",
Expand Down
1 change: 1 addition & 0 deletions src/ccmain/tesseractclass.h
Original file line number Diff line number Diff line change
Expand Up @@ -1040,6 +1040,7 @@ class Tesseract : public Wordrec {
BOOL_VAR_H(tessedit_create_txt, false, "Write .txt output file");
BOOL_VAR_H(tessedit_create_hocr, false, "Write .html hOCR output file");
BOOL_VAR_H(tessedit_create_alto, false, "Write .xml ALTO output file");
BOOL_VAR_H(tessedit_create_lstmbox, false, "Write .box file for LSTM training");
BOOL_VAR_H(tessedit_create_tsv, false, "Write .tsv output file");
BOOL_VAR_H(tessedit_create_pdf, false, "Write .pdf output file");
BOOL_VAR_H(textonly_pdf, false,
Expand Down
1 change: 1 addition & 0 deletions tessdata/configs/lstmbox
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
tessedit_create_lstmbox 1

1 comment on commit 921da6b

@Shreeshrii
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@stweil and @amitdo Please review.

This is based on the tsv renderer and uses LTRResultIterator similar to the regular box files.
The resulting lstmbox files add space between words and add a Tab at end of each line. The file extension is .box.

There is no line with tab at end of file.

Please sign in to comment.