Skip to content

Commit

Permalink
Add support for image or image list by URL
Browse files Browse the repository at this point in the history
This allows OCR of images from the internet without saving them to a local file first:

    tesseract http://IMAGE_URL OUTPUT ...

It uses libcurl.

Signed-off-by: Stefan Weil <sw@weilnetz.de>
  • Loading branch information
stweil committed Oct 1, 2019
1 parent da0fa73 commit 286d827
Show file tree
Hide file tree
Showing 3 changed files with 53 additions and 3 deletions.
8 changes: 8 additions & 0 deletions configure.ac
Original file line number Diff line number Diff line change
Expand Up @@ -428,6 +428,14 @@ AC_CHECK_TYPES([mbstate_t],,, [#include "wchar.h"])
# Test auxiliary packages
# ----------------------------------------
# Optional libcurl support (used to fetch an image or image list by URL).
PKG_CHECK_MODULES([libcurl], [libcurl], [have_libcurl=true], [have_libcurl=false])
# AM_CONDITIONAL must be executed on every configure run, so it is invoked
# unconditionally here with the shell condition as its second argument
# instead of being wrapped in a shell if/else.
AM_CONDITIONAL([HAVE_LIBCURL], [$have_libcurl])
PKG_CHECK_MODULES([LEPTONICA], [lept >= 1.74], [have_lept=true], [have_lept=false])
if $have_lept; then
CPPFLAGS="$CPPFLAGS $LEPTONICA_CFLAGS"
Expand Down
5 changes: 5 additions & 0 deletions src/api/Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,9 @@ libtesseract_api_la_CPPFLAGS = $(AM_CPPFLAGS)
if VISIBILITY
libtesseract_api_la_CPPFLAGS += -DTESS_EXPORTS
endif
if HAVE_LIBCURL
libtesseract_api_la_CPPFLAGS += $(libcurl_CFLAGS) -DHAVE_LIBCURL
endif
libtesseract_api_la_SOURCES = baseapi.cpp capi.cpp
libtesseract_api_la_SOURCES += altorenderer.cpp
libtesseract_api_la_SOURCES += hocrrenderer.cpp
Expand All @@ -42,6 +45,7 @@ libtesseract_api_la_SOURCES += renderer.cpp

lib_LTLIBRARIES += libtesseract.la
libtesseract_la_LDFLAGS = $(LEPTONICA_LIBS) $(OPENCL_LDFLAGS) $(libarchive_LIBS)
libtesseract_la_LDFLAGS += $(libcurl_LIBS)
libtesseract_la_LDFLAGS += $(TENSORFLOW_LIBS)
libtesseract_la_SOURCES =
# Dummy C++ source to cause C++ linking.
Expand Down Expand Up @@ -94,6 +98,7 @@ tesseract_LDADD += $(LEPTONICA_LIBS)
tesseract_LDADD += $(OPENMP_CXXFLAGS)
tesseract_LDADD += $(TENSORFLOW_LIBS)
tesseract_LDADD += $(libarchive_LIBS)
tesseract_LDADD += $(libcurl_LIBS)

if T_WIN
tesseract_LDADD += -ltiff
Expand Down
43 changes: 40 additions & 3 deletions src/api/baseapi.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,9 @@
#include <set> // for std::pair
#include <sstream> // for std::stringstream
#include <vector> // for std::vector
#ifdef HAVE_LIBCURL
#include <curl/curl.h>
#endif
#include "allheaders.h" // for pixDestroy, boxCreate, boxaAddBox, box...
#ifndef DISABLED_LEGACY_ENGINE
#include "blobclass.h" // for ExtractFontName
Expand Down Expand Up @@ -1081,6 +1084,15 @@ bool TessBaseAPI::ProcessPages(const char* filename, const char* retry_config,
return result;
}

// Write callback registered with libcurl via CURLOPT_WRITEFUNCTION.
// Appends one received chunk to the std::string passed as CURLOPT_WRITEDATA.
//
//   contents  start of the received bytes (binary data, not NUL-terminated)
//   size      size of one element
//   nmemb     number of elements; the chunk holds size * nmemb bytes
//   userp     the std::string* which accumulates the data
//
// Returns the number of bytes stored. On success this is size * nmemb.
// If the data cannot be stored, 0 is returned, so the returned count
// differs from the chunk size and the caller can detect the failure.
static size_t
WriteMemoryCallback(void *contents, size_t size, size_t nmemb, void *userp)
{
  const size_t num_bytes = size * nmemb;
  if (num_bytes == 0) {
    // Nothing to store.
    return 0;
  }
  auto* buf = static_cast<std::string*>(userp);
  try {
    buf->append(static_cast<const char*>(contents), num_bytes);
  } catch (...) {
    // This function is called from C code. A C++ exception (for example
    // from a failed allocation) must not propagate through the caller,
    // so report the failure through the return value instead.
    return 0;
  }
  return num_bytes;
}

// In the ideal scenario, Tesseract will start working on data as soon
// as it can. For example, if you stream a filelist through stdin, we
// should start the OCR process as soon as the first filename is
Expand Down Expand Up @@ -1119,6 +1131,31 @@ bool TessBaseAPI::ProcessPagesInternal(const char* filename,
buf.assign((std::istreambuf_iterator<char>(std::cin)),
(std::istreambuf_iterator<char>()));
data = reinterpret_cast<const l_uint8 *>(buf.data());
} else if (strncmp(filename, "http:", 5) == 0 ||
strncmp(filename, "https:", 6) == 0 ) {
// Get image or image list by URL.
#ifdef HAVE_LIBCURL
CURL* curl = curl_easy_init();
if (curl == nullptr) {
fprintf(stderr, "Error, curl_easy_init failed\n");
return false;
} else {
CURLcode curlcode;
curlcode = curl_easy_setopt(curl, CURLOPT_URL, filename);
ASSERT_HOST(curlcode == CURLE_OK);
curlcode = curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, WriteMemoryCallback);
ASSERT_HOST(curlcode == CURLE_OK);
curlcode = curl_easy_setopt(curl, CURLOPT_WRITEDATA, &buf);
ASSERT_HOST(curlcode == CURLE_OK);
curlcode = curl_easy_perform(curl);
ASSERT_HOST(curlcode == CURLE_OK);
curl_easy_cleanup(curl);
data = reinterpret_cast<const l_uint8 *>(buf.data());
}
#else
fprintf(stderr, "Error, this tesseract has no URL support\n");
return false;
#endif
} else {
// Check whether the input file can be read.
if (FILE* file = fopen(filename, "rb")) {
Expand All @@ -1132,14 +1169,14 @@ bool TessBaseAPI::ProcessPagesInternal(const char* filename,

// Here is our autodetection
int format;
int r = (stdInput) ?
int r = (data != nullptr) ?
findFileFormatBuffer(data, &format) :
findFileFormat(filename, &format);

// Maybe we have a filelist
if (r != 0 || format == IFF_UNKNOWN) {
STRING s;
if (stdInput) {
if (data != nullptr) {
s = buf.c_str();
} else {
std::ifstream t(filename);
Expand All @@ -1164,7 +1201,7 @@ bool TessBaseAPI::ProcessPagesInternal(const char* filename,
// Fail early if we can, before producing any output
Pix *pix = nullptr;
if (!tiff) {
pix = (stdInput) ? pixReadMem(data, buf.size()) : pixRead(filename);
pix = (data != nullptr) ? pixReadMem(data, buf.size()) : pixRead(filename);
if (pix == nullptr) {
return false;
}
Expand Down

0 comments on commit 286d827

Please sign in to comment.