Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

RFC: Add initial support for traineddata files in compressed archive formats (don't merge) #911

Closed
wants to merge 8 commits into from
Closed
6 changes: 6 additions & 0 deletions .travis.yml
Expand Up @@ -21,6 +21,12 @@ addons:
sources:
#- ubuntu-toolchain-r-test
packages:
# Try different packages for compressed archives.
- libarchive-dev
# Trusty does not support libminizip-dev (requires Xenial).
#libminizip-dev
- libzip-dev
- libzzip-dev
#- g++-6

#matrix:
Expand Down
24 changes: 24 additions & 0 deletions configure.ac
Expand Up @@ -422,6 +422,30 @@ else
AC_MSG_ERROR([Leptonica 1.74 or higher is required. Try to install libleptonica-dev package.])
fi

# Optional libraries for reading traineddata from compressed archives.
# Each check is non-fatal: when pkg-config finds the module, the matching
# Automake conditional is enabled and the HAVE_* macro is defined (with an
# empty value, so test it with "#if defined(...)").

# libarchive
PKG_CHECK_MODULES([libarchive], [libarchive], [have_libarchive=true], [have_libarchive=false])
AM_CONDITIONAL([HAVE_LIBARCHIVE], [$have_libarchive])
if $have_libarchive; then
AC_DEFINE([HAVE_LIBARCHIVE], [], [Enable libarchive])
fi

# libzip
PKG_CHECK_MODULES([libzip], [libzip], [have_libzip=true], [have_libzip=false])
AM_CONDITIONAL([HAVE_LIBZIP], [$have_libzip])
if $have_libzip; then
AC_DEFINE([HAVE_LIBZIP], [], [Enable libzip])
fi

# minizip
PKG_CHECK_MODULES([minizip], [minizip], [have_minizip=true], [have_minizip=false])
AM_CONDITIONAL([HAVE_MINIZIP], [$have_minizip])
if $have_minizip; then
AC_DEFINE([HAVE_MINIZIP], [], [Enable minizip])
fi

# zziplib
PKG_CHECK_MODULES([zziplib], [zziplib], [have_zziplib=true], [have_zziplib=false])
AM_CONDITIONAL([HAVE_ZZIPLIB], [$have_zziplib])
if $have_zziplib; then
AC_DEFINE([HAVE_ZZIPLIB], [], [Enable zziplib])
fi

AM_CONDITIONAL([ENABLE_TRAINING], true)

# Check availability of ICU packages.
Expand Down
12 changes: 12 additions & 0 deletions src/api/Makefile.am
Expand Up @@ -88,6 +88,18 @@ tesseract_LDFLAGS = $(OPENCL_LDFLAGS)

tesseract_LDADD += $(LEPTONICA_LIBS)
tesseract_LDADD += $(OPENMP_CXXFLAGS)
# Link the tesseract executable against each optional archive library whose
# Automake conditional is enabled. More than one may be enabled at once.
if HAVE_LIBARCHIVE
tesseract_LDADD += $(libarchive_LIBS)
endif
if HAVE_LIBZIP
tesseract_LDADD += $(libzip_LIBS)
endif
if HAVE_MINIZIP
tesseract_LDADD += $(minizip_LIBS)
endif
if HAVE_ZZIPLIB
tesseract_LDADD += $(zziplib_LIBS)
endif

if T_WIN
tesseract_LDADD += -ltiff
Expand Down
3 changes: 1 addition & 2 deletions src/api/baseapi.cpp
Expand Up @@ -477,8 +477,7 @@ int TessBaseAPI::InitLangMod(const char* datapath, const char* language) {
tesseract_ = new Tesseract;
else
ParamUtils::ResetToDefaults(tesseract_->params());
TessdataManager mgr;
return tesseract_->init_tesseract_lm(datapath, nullptr, language, &mgr);
return tesseract_->init_tesseract_lm(datapath, nullptr, language);
}
#endif // ndef DISABLED_LEGACY_ENGINE

Expand Down
7 changes: 4 additions & 3 deletions src/ccmain/tessedit.cpp
Expand Up @@ -460,12 +460,13 @@ void Tesseract::SetupUniversalFontIds() {

// init the LM component
int Tesseract::init_tesseract_lm(const char *arg0, const char *textbase,
const char *language, TessdataManager *mgr) {
const char *language) {
TessdataManager mgr;
if (!init_tesseract_lang_data(arg0, textbase, language, OEM_TESSERACT_ONLY,
nullptr, 0, nullptr, nullptr, false, mgr))
nullptr, 0, nullptr, nullptr, false, &mgr))
return -1;
getDict().SetupForLoad(Dict::GlobalDawgCache());
getDict().Load(lang, mgr);
getDict().Load(lang, &mgr);
getDict().FinishLoad();
return 0;
}
Expand Down
2 changes: 1 addition & 1 deletion src/ccmain/tesseractclass.h
Expand Up @@ -556,7 +556,7 @@ class Tesseract : public Wordrec {
void SetupUniversalFontIds();

int init_tesseract_lm(const char* arg0, const char* textbase,
const char* language, TessdataManager* mgr);
const char* language);

void recognize_page(STRING& image_name);
void end_tesseract();
Expand Down
13 changes: 13 additions & 0 deletions src/ccutil/Makefile.am
Expand Up @@ -40,6 +40,19 @@ libtesseract_ccutil_la_SOURCES = \
unichar.cpp unicharcompress.cpp unicharmap.cpp unicharset.cpp unicodes.cpp \
params.cpp universalambigs.cpp

# Add the compiler flags of each optional archive library whose Automake
# conditional is enabled, so that its headers can be found when building
# the sources of this directory.
if HAVE_LIBARCHIVE
AM_CPPFLAGS += $(libarchive_CFLAGS)
endif
if HAVE_LIBZIP
AM_CPPFLAGS += $(libzip_CFLAGS)
endif
if HAVE_MINIZIP
AM_CPPFLAGS += $(minizip_CFLAGS)
endif
if HAVE_ZZIPLIB
AM_CPPFLAGS += $(zziplib_CFLAGS)
endif

if T_WIN
AM_CPPFLAGS += -DWINDLLNAME=\"lib@GENERIC_LIBRARY_NAME@\"
endif
201 changes: 201 additions & 0 deletions src/ccutil/tessdatamanager.cpp
Expand Up @@ -24,6 +24,26 @@
#include "tessdatamanager.h"

#include <cstdio>
#include <string>

#if defined(HAVE_LIBARCHIVE)
#include <archive.h>
#include <archive_entry.h>
#endif
#if defined(HAVE_LIBZIP)
#if defined(HAVE_MINIZIP)
// libminizip provides minizip/zip.h. Hack to get the right one.
#include "/usr/include/zip.h"
#else
#include <zip.h>
#endif
#endif
#if defined(HAVE_MINIZIP)
#include <unzip.h>
#endif
#if defined(HAVE_ZZIPLIB)
#include <zzip/lib.h>
#endif

#include "errcode.h"
#include "helpers.h"
Expand Down Expand Up @@ -52,9 +72,186 @@ void TessdataManager::LoadFileLater(const char *data_file_name) {
data_file_name_ = data_file_name;
}

#if defined(HAVE_LIBARCHIVE)
// Loads tessdata components from the archive `filename` using libarchive.
//
// Every archive entry whose path name is recognized by
// TessdataTypeFromFileName() is read into the matching slot of entries_.
// Entries with an unrecognized name, without a positive size, or with a size
// that does not fit in an int are skipped.
//
// Returns true (and sets is_loaded_) if at least one component was read
// completely during this call; returns false otherwise, including when the
// file cannot be opened as an archive.
bool TessdataManager::LoadArchiveFile(const char *filename) {
  archive *a = archive_read_new();
  if (a == nullptr) return false;

  bool loaded = false;
  archive_read_support_filter_all(a);
  archive_read_support_format_all(a);
  if (archive_read_open_filename(a, filename, 8192) == ARCHIVE_OK) {
    archive_entry *ae;
    while (archive_read_next_header(a, &ae) == ARCHIVE_OK) {
      const char *component = archive_entry_pathname(ae);
      TessdataType type;
      if (component == nullptr ||
          !TessdataTypeFromFileName(component, &type)) {
        continue;
      }
      // The entry buffer is sized with an int, so reject sizes that would
      // be truncated by the conversion (and empty or unknown sizes).
      const int64_t size = archive_entry_size(ae);
      const int length = static_cast<int>(size);
      if (length <= 0 || length != size) continue;

      entries_[type].resize_no_init(length);
      if (archive_read_data(a, &entries_[type][0], length) == length) {
        loaded = true;
      } else {
        // Do not keep a partially filled, uninitialized buffer around.
        tprintf("archive_read_data(...) failed for %s\n", component);
        entries_[type].truncate(0);
      }
    }
  } else {
    // libarchive error codes are not guaranteed to be errno values, so use
    // the library's own message instead of strerror().
    const char *message = archive_error_string(a);
    tprintf("archive_read_open_filename(...,%s,...) failed, %s\n",
            filename, message != nullptr ? message : "unknown error");
  }
  archive_read_free(a);
  if (loaded) is_loaded_ = true;
  return loaded;
}
#endif

#if defined(HAVE_LIBZIP)
// Loads tessdata components from the zip archive `filename` using libzip.
//
// Every archive member whose name is recognized by
// TessdataTypeFromFileName() is read into the matching slot of entries_.
// Members with an unrecognized name, an empty body, or a size that does not
// fit in an int are skipped.
//
// Returns true (and sets is_loaded_) only if at least one component was read
// completely; returns false if the file cannot be opened as a zip archive or
// contains no readable component, so that the caller can fall back to
// another loader.
bool TessdataManager::LoadZipFile(const char *filename) {
  int err = 0;
  zip_t *uf = zip_open(filename, ZIP_RDONLY, &err);
  if (uf == nullptr) return false;

  bool loaded = false;
  const int64_t num_entries = zip_get_num_entries(uf, ZIP_FL_UNCHANGED);
  for (int64_t i = 0; i < num_entries; ++i) {
    zip_stat_t zipStat;
    if (zip_stat_index(uf, i, ZIP_FL_UNCHANGED, &zipStat) != 0 ||
        !(zipStat.valid & ZIP_STAT_NAME) ||
        !(zipStat.valid & ZIP_STAT_SIZE)) {
      continue;
    }
    TessdataType type;
    if (!TessdataTypeFromFileName(zipStat.name, &type)) continue;

    // The entry buffer is sized with an int and element 0 of an empty
    // vector must not be indexed, so skip empty and oversized members.
    const int length = static_cast<int>(zipStat.size);
    if (length <= 0 || static_cast<uint64_t>(length) != zipStat.size) {
      continue;
    }

    zip_file_t *zipFile = zip_fopen_index(uf, i, ZIP_FL_UNCHANGED);
    if (zipFile == nullptr) {
      tprintf("zip_fopen_index(...) failed\n");
      continue;
    }
    entries_[type].resize_no_init(length);
    if (zip_fread(zipFile, &entries_[type][0], length) == length) {
      loaded = true;
    } else {
      // Do not keep a partially filled, uninitialized buffer around.
      tprintf("zip_fread(...) failed\n");
      entries_[type].truncate(0);
    }
    zip_fclose(zipFile);
  }
  // The archive was opened read-only, so there is nothing to write back;
  // zip_discard() always releases the handle, whereas a failing zip_close()
  // would leave it allocated.
  zip_discard(uf);
  if (loaded) is_loaded_ = true;
  return loaded;
}
#endif

#if defined(HAVE_MINIZIP)
// Loads tessdata components from the zip archive `filename` using minizip.
//
// Walks all archive members; every member whose name is recognized by
// TessdataTypeFromFileName() is read into the matching slot of entries_.
// Members with an overlong name, an empty body, or a size that does not fit
// in an int are skipped.
//
// Returns true (and sets is_loaded_) only if at least one component was read
// completely and passed minizip's check on close; returns false if the file
// cannot be opened or contains no readable component.
bool TessdataManager::LoadMinizipFile(const char *filename) {
  unzFile uf = unzOpen(filename);
  if (uf == nullptr) return false;

  bool loaded = false;
  // Iterate with first/next instead of a counted loop: this needs no global
  // info and ends cleanly on UNZ_END_OF_LIST_OF_FILE after the last member.
  int err = unzGoToFirstFile(uf);
  while (err == UNZ_OK) {
    unz_file_info file_info;
    char component[1024];
    err = unzGetCurrentFileInfo(uf, &file_info,
                                component, sizeof(component),
                                nullptr, 0,
                                nullptr, 0);
    // minizip does not NUL-terminate a name that does not fit the buffer,
    // so only use the name when it is known to be complete.
    if (err == UNZ_OK && file_info.size_filename < sizeof(component)) {
      TessdataType type;
      const int length = static_cast<int>(file_info.uncompressed_size);
      if (TessdataTypeFromFileName(component, &type) && length > 0 &&
          static_cast<uLong>(length) == file_info.uncompressed_size) {
        err = unzOpenCurrentFilePassword(uf, nullptr);
        if (err != UNZ_OK) {
          tprintf("unzOpenCurrentFilePassword(...) failed, err %d\n", err);
        } else {
          entries_[type].resize_no_init(length);
          err = unzReadCurrentFile(uf, &entries_[type][0],
                                   static_cast<unsigned>(length));
          // A negative value is an error, a smaller value is a short read.
          bool ok = (err == length);
          if (!ok) {
            tprintf("unzReadCurrentFile(...) failed, err %d\n", err);
          }
          // Closing reports a CRC mismatch once the member was fully read.
          err = unzCloseCurrentFile(uf);
          if (err != UNZ_OK) {
            tprintf("unzCloseCurrentFile(...) failed\n");
            ok = false;
          }
          if (ok) {
            loaded = true;
          } else {
            // Do not keep a partial or corrupt buffer around.
            entries_[type].truncate(0);
          }
        }
      }
    }
    err = unzGoToNextFile(uf);
  }
  if (err != UNZ_END_OF_LIST_OF_FILE) {
    tprintf("unzGoToNextFile(...) failed\n");
  }

  err = unzClose(uf);
  if (err != UNZ_OK) {
    tprintf("unzClose(...) failed\n");
  }
  if (loaded) is_loaded_ = true;
  return loaded;
}
#endif

#if defined(HAVE_ZZIPLIB)
// Loads tessdata components from the zip archive `filename` using zziplib.
//
// Every directory entry whose name is recognized by
// TessdataTypeFromFileName() is read into the matching slot of entries_.
// Entries with an unrecognized name or without a positive size are skipped.
//
// Returns true (and sets is_loaded_) only if at least one component was read
// completely; returns false if the file cannot be opened as a zip archive or
// contains no readable component.
bool TessdataManager::LoadZzipFile(const char *filename) {
  zzip_error_t err;
  ZZIP_DIR *dir = zzip_dir_open(filename, &err);
  if (dir == nullptr) return false;

  bool loaded = false;
  ZZIP_DIRENT d;
  while (zzip_dir_read(dir, &d)) {
    TessdataType type;
    if (!TessdataTypeFromFileName(d.d_name, &type)) continue;
    // Element 0 of an empty vector must not be indexed.
    if (d.st_size <= 0) continue;

    ZZIP_FILE *f = zzip_file_open(dir, d.d_name, 0);
    if (f == nullptr) {
      tprintf("zzip_file_open(...) failed\n");
      continue;
    }
    entries_[type].resize_no_init(d.st_size);
    const ssize_t len = zzip_file_read(f, &entries_[type][0], d.st_size);
    if (len == d.st_size) {
      loaded = true;
    } else {
      // Do not keep a partially filled, uninitialized buffer around.
      tprintf("zzip_file_read(...) failed\n");
      entries_[type].truncate(0);
    }
    zzip_file_close(f);
  }
  zzip_dir_close(dir);
  if (loaded) is_loaded_ = true;
  return loaded;
}
#endif

bool TessdataManager::Init(const char *data_file_name) {
GenericVector<char> data;
if (reader_ == nullptr) {
const char *tessarchive = getenv("TESSARCHIVE");
#if defined(HAVE_LIBARCHIVE)
if (tessarchive == nullptr || strcmp(tessarchive, "libarchive") == 0)
if (LoadArchiveFile(data_file_name)) return true;
#endif
#if defined(HAVE_MINIZIP)
if (tessarchive == nullptr || strcmp(tessarchive, "libminizip") == 0)
if (LoadZipFile(data_file_name)) return true;
#endif // HAVE_MINIZIP
#if defined(HAVE_LIBZIP)
if (tessarchive == nullptr || strcmp(tessarchive, "libzip") == 0)
if (LoadZipFile(data_file_name)) return true;
#endif // HAVE_MINIZIP
#if defined(HAVE_ZZIPLIB)
if (tessarchive == nullptr || strcmp(tessarchive, "libzzip") == 0)
if (LoadZzipFile(data_file_name)) return true;
#endif
if (!LoadDataFromFile(data_file_name, &data)) return false;
} else {
if (!(*reader_)(data_file_name, &data)) return false;
Expand All @@ -65,6 +262,7 @@ bool TessdataManager::Init(const char *data_file_name) {
// Loads from the given memory buffer as if a file.
bool TessdataManager::LoadMemBuffer(const char *name, const char *data,
int size) {
// TODO: This method supports only the proprietary file format.
Clear();
data_file_name_ = name;
TFile fp;
Expand Down Expand Up @@ -106,6 +304,7 @@ void TessdataManager::OverwriteEntry(TessdataType type, const char *data,
// Saves to the given filename.
bool TessdataManager::SaveFile(const STRING &filename,
FileWriter writer) const {
// TODO: This method supports only the proprietary file format.
ASSERT_HOST(is_loaded_);
GenericVector<char> data;
Serialize(&data);
Expand All @@ -117,6 +316,7 @@ bool TessdataManager::SaveFile(const STRING &filename,

// Serializes to the given vector.
void TessdataManager::Serialize(GenericVector<char> *data) const {
// TODO: This method supports only the proprietary file format.
ASSERT_HOST(is_loaded_);
// Compute the offset_table and total size.
int64_t offset_table[TESSDATA_NUM_ENTRIES];
Expand Down Expand Up @@ -229,6 +429,7 @@ bool TessdataManager::OverwriteComponents(
char **component_filenames,
int num_new_components) {
// Open the files with the new components.
// TODO: This method supports only the proprietary file format.
for (int i = 0; i < num_new_components; ++i) {
TessdataType type;
if (TessdataTypeFromFileName(component_filenames[i], &type)) {
Expand Down