Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Non-linear grayscale normalization for layout analyse and/or text recognition #3857

Draft
wants to merge 8 commits into
base: main
Choose a base branch
from
11 changes: 11 additions & 0 deletions include/tesseract/baseapi.h
Original file line number Diff line number Diff line change
Expand Up @@ -326,6 +326,17 @@ class TESS_API TessBaseAPI {
*/
void SetImage(Pix *pix);

/**
 * Preprocess the input image with a non-linear grayscale normalization
 * based on nlbin (Thomas Breuel).
 * An input image must already be available (e.g. set via SetImage()),
 * otherwise nothing is done.
 * Current modes:
 * - 0 = No normalization
 * - 1 = Thresholding+Recognition
 * - 2 = Thresholding
 * - 3 = Recognition
 *
 * @param mode One of the modes listed above.
 * @return true if the normalized image was applied (modes 1-3); false if
 * no input image is available or for any other mode value (including 0).
 */
bool NormalizeImage(int mode);

/**
* Set the resolution of the source image in pixels per inch so font size
* information can be calculated in results. Call this after SetImage().
Expand Down
49 changes: 48 additions & 1 deletion src/api/baseapi.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -926,6 +926,25 @@ Pix *TessBaseAPI::GetInputImage() {
return tesseract_->pix_original();
}

// Grayscale normalization (preprocessing).
// Replaces the input image and/or the thresholder's image with the result of
// thresholder_->GetPixNormRectGrey(), depending on mode:
//   1 = both (input image first, then the thresholder is fed from it),
//   2 = thresholder only,
//   3 = input image only.
// Returns false when no input image is set or when mode is not 1, 2 or 3
// (so mode 0 also yields false); returns true otherwise.
bool TessBaseAPI::NormalizeImage(int mode){
// Nothing to normalize without an input image.
// NOTE(review): the message below has no trailing newline.
if (!GetInputImage()){
tprintf("Please use SetImage before applying the image pre-processing steps.");
return false;
}
if (mode == 1) {
// The normalized image becomes the input image, and that same image is
// then handed to the thresholder.
SetInputImage(thresholder_->GetPixNormRectGrey());
thresholder_->SetImage(GetInputImage());
} else if (mode == 2) {
// Only the thresholder sees the normalized image.
// NOTE(review): the Image returned by GetPixNormRectGrey() is a temporary
// here - confirm whether SetImage() takes ownership or makes its own copy;
// in the latter case the temporary may be leaked.
thresholder_->SetImage(thresholder_->GetPixNormRectGrey());
} else if (mode == 3) {
// Only the input image is replaced; the thresholder keeps its image.
SetInputImage(thresholder_->GetPixNormRectGrey());
} else {
// Any other mode is reported as "not applied".
return false;
}
Comment on lines +935 to +944
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Some considerations regarding where this should be placed in the code base:

  1. Using a separate entry point NormalizeImage called in ProcessPages instead of merely modifying the thresholder prevents applying this on any PSM other than full pages. And on the API, you would need to add NormalizeImage to the calling code instead of merely setting the configuration parameter.
  2. Recognition (SetupForRecognitionBestPix) does not always use pix_original_: after SetRectangle(), it uses pix_grey_ or even pix_binary_.
  3. Layout analysis mostly uses pix_binary_, but LineFinder also tries to use pix_grey_ and pix_thresholds_.
  4. DPI information (which influences LA in various ways) is taken from pix_ (i.e. the thresholder's SetImage), and that might not work on the output of Leptonica because the metadata might be lost. We still have fallback DPI estimation (which is based on the CC statistics from pix_binary_), but that might not be as accurate.

return true;
}

const char *TessBaseAPI::GetInputName() {
if (!input_file_.empty()) {
return input_file_.c_str();
Expand Down Expand Up @@ -1265,8 +1284,31 @@ bool TessBaseAPI::ProcessPagesInternal(const char *filename, const char *retry_c
bool TessBaseAPI::ProcessPage(Pix *pix, int page_index, const char *filename,
const char *retry_config, int timeout_millisec,
TessResultRenderer *renderer) {

SetInputName(filename);

SetImage(pix);

// Image preprocessing on image
// Grayscale normalization
int graynorm_mode;
GetIntVariable("preprocess_graynorm_mode", &graynorm_mode);
if (graynorm_mode > 0 && NormalizeImage(graynorm_mode) && tesseract_->tessedit_write_images) {
// Write normalized image
std::string output_filename = output_file_ + ".preprocessed";
if (page_index > 0) {
output_filename += std::to_string(page_index);
}
output_filename += ".tif";
if (graynorm_mode == 2) {
pixWrite(output_filename.c_str(), thresholder_->GetPixRect(), IFF_TIFF_G4);
} else {
pixWrite(output_filename.c_str(), GetInputImage(), IFF_TIFF_G4);
}
}

// Recognition

bool failed = false;

if (tesseract_->tessedit_pageseg_mode == PSM_AUTO_ONLY) {
Expand Down Expand Up @@ -1313,6 +1355,11 @@ bool TessBaseAPI::ProcessPage(Pix *pix, int page_index, const char *filename,
// Switch to alternate mode for retry.
ReadConfigFile(retry_config);
SetImage(pix);

// Apply image preprocessing
NormalizeImage(graynorm_mode);

//if (normalize_grayscale) thresholder_->SetImage(thresholder_->GetPixNormRectGrey());
Recognize(nullptr);
// Restore saved config variables.
ReadConfigFile(kOldVarsFile);
Expand All @@ -1321,7 +1368,7 @@ bool TessBaseAPI::ProcessPage(Pix *pix, int page_index, const char *filename,
if (renderer && !failed) {
failed = !renderer->AddImage(this);
}

//pixDestroy(&pixs);
return !failed;
}

Expand Down
5 changes: 5 additions & 0 deletions src/ccmain/tesseractclass.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,11 @@ Tesseract::Tesseract()
"11=sparse_text, 12=sparse_text+osd, 13=raw_line"
" (Values from PageSegMode enum in tesseract/publictypes.h)",
this->params())
, INT_MEMBER(preprocess_graynorm_mode, 0,
"Grayscale normalization mode: 0=no normalization, 1=tresholding+recognition, "
"2=tresholding_only, 3=recognition_only "
"The modes 1–3 are applied on the fullimage",
this->params())
, INT_MEMBER(thresholding_method,
static_cast<int>(ThresholdMethod::Otsu),
"Thresholding method: 0 = Otsu, 1 = LeptonicaOtsu, 2 = "
Expand Down
1 change: 1 addition & 0 deletions src/ccmain/tesseractclass.h
Original file line number Diff line number Diff line change
Expand Up @@ -759,6 +759,7 @@ class TESS_API Tesseract : public Wordrec {
BOOL_VAR_H(tessedit_do_invert);
double_VAR_H(invert_threshold);
INT_VAR_H(tessedit_pageseg_mode);
INT_VAR_H(preprocess_graynorm_mode);
INT_VAR_H(thresholding_method);
BOOL_VAR_H(thresholding_debug);
double_VAR_H(thresholding_window_size);
Expand Down
110 changes: 109 additions & 1 deletion src/ccmain/thresholder.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -183,6 +183,103 @@ void ImageThresholder::SetImage(const Image pix) {
Init();
}

/*----------------------------------------------------------------------*
* Non-linear contrast normalization *
*----------------------------------------------------------------------*/
/*!
 * \brief   pixNLNorm()
 *
 * \param[in]    pixs     8 or 32 bpp (inputs below 8 bpp are rejected)
 * \param[out]   pthresh  [optional] receives an l_int32 global threshold
 *                        value; may be nullptr
 * \return  pixd 8 bpp grayscale (owned by the caller), or NULL on error
 *
 * <pre>
 * Notes:
 *      (1) This composite operation is good for adaptively removing
 *          dark background. Adaptation of Thomas Breuel's nlbin version
 *          from ocropus.
 *      (2) A good thresholder to use together with NLNorm is WAN.
 *      (3) Steps: convert to 8 bpp gray, stretch to the full dynamic
 *          range, add the inverted rank-filtered ("flat") background
 *          estimate, then stretch again between the mean foreground and
 *          (boosted) mean background values.
 *      (4) If no usable foreground/background split can be computed
 *          (e.g. a uniform image), the flattened image is returned
 *          without the final contrast stretch.
 * </pre>
 */
Pix *ImageThresholder::pixNLNorm(Pix *pixs, int *pthresh) {
  l_int32 d = 0, thresh = 0, w1 = 0, h1 = 0, w2 = 0, h2 = 0;
  l_float32 avefg = 0.0f, avebg = 0.0f;

  PROCNAME("pixNLNorm");

  if (!pixs || (d = pixGetDepth(pixs)) < 8) {
    return (PIX *)ERROR_PTR("pixs undefined or d < 8 bpp", procName, NULL);
  }

  PIX *pixg = nullptr;
  if (d == 32) {
    // ITU-R 601-2 luma
    pixg = pixConvertRGBToGray(pixs, 0.299, 0.587, 0.114);
  } else {
    pixg = pixConvertTo8(pixs, 0);
  }
  if (!pixg) {
    return (PIX *)ERROR_PTR("pixg not made", procName, NULL);
  }

  /// Normalize contrast
  // The stretched image simply replaces pixg; no extra copy is needed.
  PIX *pixd = pixMaxDynamicRange(pixg, L_LINEAR_SCALE);
  pixDestroy(&pixg);
  if (!pixd) {
    return (PIX *)ERROR_PTR("dynamic range scaling failed", procName, NULL);
  }
  pixg = pixd;
  pixd = nullptr;

  /// Calculate flat version
  // Background estimate: rank filter (horizontal, then vertical) on a
  // half-size copy, scaled back up to the original dimensions.
  pixGetDimensions(pixg, &w1, &h1, nullptr);
  pixd = pixScaleGeneral(pixg, 0.5, 0.5, 0.0, 0);
  PIX *pixd2 = pixRankFilter(pixd, 20, 2, 0.8);
  pixDestroy(&pixd);
  pixd = pixRankFilter(pixd2, 2, 20, 0.8);
  pixDestroy(&pixd2);
  if (!pixd) {
    pixDestroy(&pixg);
    return (PIX *)ERROR_PTR("background estimate not made", procName, NULL);
  }
  pixGetDimensions(pixd, &w2, &h2, nullptr);
  if (w2 <= 0 || h2 <= 0) {
    pixDestroy(&pixd);
    pixDestroy(&pixg);
    return (PIX *)ERROR_PTR("invalid background size", procName, NULL);
  }
  pixd2 = pixScaleGrayLI(pixd, static_cast<l_float32>(w1) / static_cast<l_float32>(w2),
                         static_cast<l_float32>(h1) / static_cast<l_float32>(h2));
  pixDestroy(&pixd);
  if (!pixd2) {
    pixDestroy(&pixg);
    return (PIX *)ERROR_PTR("background upscaling failed", procName, NULL);
  }
  pixInvert(pixd2, pixd2);
  pixAddGray(pixg, pixg, pixd2);
  pixDestroy(&pixd2);

  /// Local contrast enhancement
  // Ignore a border of 10 % and get a mean threshold,
  // background and foreground value.
  // boxCreate() takes (x, y, width, height): a 10 % margin on every side
  // leaves 80 % of each dimension.
  bool have_stats = false;
  BOX *pixbox = boxCreate(static_cast<l_int32>(w1 * 0.1), static_cast<l_int32>(h1 * 0.1),
                          static_cast<l_int32>(w1 * 0.8), static_cast<l_int32>(h1 * 0.8));
  NUMA *na = pixGetGrayHistogramInRect(pixg, pixbox, 1);
  if (na) {
    have_stats =
        numaSplitDistribution(na, 0.1, &thresh, &avefg, &avebg, nullptr, nullptr, nullptr) == 0;
  }
  boxDestroy(&pixbox);
  numaDestroy(&na);

  /// Subtract by a foreground value and multiply by factor to
  // set a background value to 255
  l_int32 fgval = static_cast<l_int32>(avefg + 0.5);
  l_int32 bgval = static_cast<l_int32>(avebg + 0.5);
  if (!have_stats || bgval <= fgval) {
    // No separable foreground/background: stretching would divide by zero,
    // so hand back the flattened image as is.
    if (pthresh) {
      *pthresh = thresh;
    }
    return pixg;
  }
  const l_float32 threshpos = static_cast<l_float32>(thresh - fgval) / (bgval - fgval);
  // Todo: fgval or fgval + slightly offset
  // (e.g. fgval + (l_int32)((thresh - fgval) * .25))
  // Push the background level up by half its distance to the threshold,
  // but never beyond 255.
  bgval = bgval + static_cast<l_int32>(
                      std::min(static_cast<l_int32>((bgval - thresh) * .5), (255 - bgval)));
  const l_float32 factor = 255. / (bgval - fgval);
  if (pthresh) {
    // NOTE(review): the cast binds to threshpos only, so for threshpos in
    // [0, 1) the first term is 0. Kept unchanged because the intended
    // formula is not evident - please confirm.
    *pthresh = (l_int32)threshpos * factor - threshpos * .1;
  }
  pixAddConstantGray(pixg, -1 * fgval);
  pixMultConstantGray(pixg, factor);

  return pixg;
}

/*----------------------------------------------------------------------*
* Thresholding *
*----------------------------------------------------------------------*/

std::tuple<bool, Image, Image, Image> ImageThresholder::Threshold(
TessBaseAPI *api,
ThresholdMethod method) {
Expand All @@ -203,7 +300,7 @@ std::tuple<bool, Image, Image, Image> ImageThresholder::Threshold(
int r;

l_int32 pix_w, pix_h;
pixGetDimensions(pix_grey, &pix_w, &pix_h, nullptr);
pixGetDimensions(pix_, &pix_w, &pix_h, nullptr);

bool thresholding_debug;
api->GetBoolVariable("thresholding_debug", &thresholding_debug);
Expand Down Expand Up @@ -381,6 +478,17 @@ Image ImageThresholder::GetPixRectGrey() {
return pix;
}

// Fetch the source image rectangle via GetPixRect() and run it through
// pixNLNorm(), yielding a non-linearly normalized greyscale version of it.
// The intermediate rectangle image is released here; the returned Pix must
// be pixDestroyed by the caller.
Image ImageThresholder::GetPixNormRectGrey() {
  Image rect_pix = GetPixRect();
  // No threshold output is requested from the normalization.
  Image normalized = pixNLNorm(rect_pix, nullptr);
  rect_pix.destroy();
  return normalized;
}

// Otsu thresholds the rectangle, taking the rectangle from *this.
void ImageThresholder::OtsuThresholdRectToPix(Image src_pix, Image *out_pix) const {
std::vector<int> thresholds;
Expand Down
9 changes: 9 additions & 0 deletions src/ccmain/thresholder.h
Original file line number Diff line number Diff line change
Expand Up @@ -154,6 +154,12 @@ class TESS_API ImageThresholder {
// Provided to the classifier to extract features from the greyscale image.
virtual Image GetPixRectGrey();

// Get a clone/copy of the source image rectangle, reduced to non-linearly
// normalized greyscale (the rectangle from GetPixRect() passed through
// pixNLNorm()).
// The returned Pix must be pixDestroyed.
// NOTE(review): the result is presumably null when normalization fails,
// since pixNLNorm() documents a NULL return on error - callers should check.
virtual Image GetPixNormRectGrey();

protected:
// ----------------------------------------------------------------------
// Utility functions that may be useful components for other thresholders.
Expand All @@ -170,6 +176,9 @@ class TESS_API ImageThresholder {
// Otsu thresholds the rectangle, taking the rectangle from *this.
void OtsuThresholdRectToPix(Image src_pix, Image *out_pix) const;

// Return a non-linearly normalized 8 bpp grayscale version of pixs
// (8 or 32 bpp input), or NULL on error. If pthresh is non-null it
// receives a global threshold value computed during normalization.
// The caller owns the returned Pix.
Pix *pixNLNorm(Pix *pixs, int *pthresh);

/// Threshold the rectangle, taking everything except the src_pix
/// from the class, using thresholds/hi_values to the output pix.
/// NOTE that num_channels is the size of the thresholds and hi_values
Expand Down