Many bugs in training the legacy engine #3925

SpaceView · 2022-09-22T08:28:42Z

I doubt anybody has successfully trained custom data with tesseract 5.2.0 or 5.1.0; the latest version I succeeded with is 5.0.0-alpha-20201224.
Below are some BUGs I hit when running tesseract 5.2.0 for custom data training. I can say there are TOO MANY BUGS, so I was not able to finish the whole training in the limited time I have at this moment; below are just a few of the BUGs I found, for reference.

// Deletes all samples with zero features marked by KillSample.
void TrainingSampleSet::DeleteDeadSamples() {
  using namespace std::placeholders; // for _1
  auto old_it = samples_.begin();
  for (; old_it < samples_.end(); ++old_it) {
    if (*old_it == nullptr || (*old_it)->class_id() < 0) {
      break;
    }
  }
  auto new_it = old_it;
  for (; old_it < samples_.end(); ++old_it) {
    if (*old_it == nullptr || (*old_it)->class_id() < 0) {
      delete *old_it;
    } else {
      *new_it = *old_it;
      ++new_it;
    }
  }
  //samples_.resize(new_it - samples_.begin() + 1);      //<------------crash the program when samples_.size() is 0
  samples_.resize(new_it - samples_.begin());
  num_raw_samples_ = samples_.size();
  // Samples must be re-organized now we have deleted a few.
}



INT_TEMPLATES_STRUCT *Classify::CreateIntTemplates(CLASSES FloatProtos,
                                           const UNICHARSET &target_unicharset) {
  CLASS_TYPE FClass;
  INT_CLASS_STRUCT *IClass;
  int ProtoId;
  int ConfigId;

  auto IntTemplates = new INT_TEMPLATES_STRUCT;

  for (unsigned ClassId = 0; ClassId < target_unicharset.size(); ClassId++) {
    FClass = &(FloatProtos[ClassId]);
    if (FClass->NumProtos == 0 && FClass->NumConfigs == 0 &&
        strcmp(target_unicharset.id_to_unichar(ClassId), " ") != 0) {
      tprintf("Warning: no protos/configs for %s in CreateIntTemplates()\n",
              target_unicharset.id_to_unichar(ClassId));
    }
    assert(UnusedClassIdIn(IntTemplates, ClassId));
    IClass = new INT_CLASS_STRUCT(FClass->NumProtos, FClass->NumConfigs);
    //FontSet fs{FClass->font_set.size()};             //<---------------------- it will force to push an element in, not size of the vector
    int fsize = FClass->font_set.size();
    FontSet fs(fsize);
	
	
	
	
/**
 * Builds a new set of integer-format templates from prototypes in the
 * old floating point format, visiting every class id of the target
 * unicharset in ascending order. The font set of each class is
 * registered in this->fontset_table_ along the way.
 * @param FloatProtos prototypes in old floating pt format
 * @param target_unicharset the UNICHARSET to use
 * @return New set of training templates in integer format.
 */
INT_TEMPLATES_STRUCT *Classify::CreateIntTemplates(CLASSES FloatProtos,
                                           const UNICHARSET &target_unicharset) {
  auto IntTemplates = new INT_TEMPLATES_STRUCT;

  for (unsigned class_id = 0; class_id < target_unicharset.size(); ++class_id) {
    CLASS_TYPE float_class = &FloatProtos[class_id];

    // Warn about classes that carry no training data at all, except for
    // the space character.
    if (float_class->NumProtos == 0 && float_class->NumConfigs == 0) {
      if (strcmp(target_unicharset.id_to_unichar(class_id), " ") != 0) {
        tprintf("Warning: no protos/configs for %s in CreateIntTemplates()\n",
                target_unicharset.id_to_unichar(class_id));
      }
    }
    assert(UnusedClassIdIn(IntTemplates, class_id));

    auto *int_class = new INT_CLASS_STRUCT(float_class->NumProtos, float_class->NumConfigs);

    // Size the font set with parentheses: brace initialization was reported
    // to create a set holding one element instead of a set of that size.
    int font_count = float_class->font_set.size();
    FontSet fonts(font_count);
    for (unsigned font = 0; font < fonts.size(); ++font) {
      fonts[font] = float_class->font_set.at(font);
    }
    // The id stored here is the index returned by the font set table.
    int_class->font_set_id = this->fontset_table_.push_back(fonts);
    AddIntClass(IntTemplates, class_id, int_class);

    for (int proto_id = 0; proto_id < float_class->NumProtos; ++proto_id) {
      AddIntProto(int_class);
      ConvertProto(ProtoIn(float_class, proto_id), proto_id, int_class);
      AddProtoToProtoPruner(ProtoIn(float_class, proto_id), proto_id, int_class,
                            classify_learning_debug_level >= 2);
      AddProtoToClassPruner(ProtoIn(float_class, proto_id), class_id, IntTemplates);
    }

    for (int config_id = 0; config_id < float_class->NumConfigs; ++config_id) {
      AddIntConfig(int_class);
      ConvertConfig(float_class->Configurations[config_id], config_id, int_class);
    }
  }
  return IntTemplates;
} /* CreateIntTemplates */

//ref. unicity_table.h
  /// Add an element in the table
  ///
  /// If get_index() already finds the object (returns something other
  /// than -1), nothing is inserted and that index is returned. Otherwise
  /// the object is appended and the 0-based index of the new entry is
  /// returned.
  int push_back(T object)  {
    auto idx = get_index(object);
    if (idx == -1) {
      // Read the index BEFORE appending: the size prior to the insertion is
      // the position of the new element. Reading size() after the append
      // yields 1 for the first element (reported to crash callers), and
      // relying on the container's push_back() to return an index does not
      // work with std::vector, whose push_back() returns void.
      idx = size();
      table_.push_back(object);
    }
    return idx;
  }


// Writes a font set to f: first the element count as an int, then the
// elements themselves. Returns false as soon as one of the two
// tesseract::Serialize calls fails.
bool write_set(FILE *f, const FontSet &fs) {
  int size = fs.size();
  if (!tesseract::Serialize(f, &size)) {
    return false;
  }
  // Never form &fs[0] on an empty set (reported to crash the program);
  // pass a null pointer together with a count of zero instead.
  const auto *elements = size ? &fs[0] : nullptr;
  return tesseract::Serialize(f, elements, size);
}



/*---------------------------------------------------------------------------*/
// TODO(rays) This is now used only by cntraining. Convert cntraining to use
// the new method or get rid of it entirely.
/**
 * This routine reads training samples from a file and adds them to
 * the list *training_samples, in which each node is looked up by its
 * character label (a new node is created for a label that is not
 * found). Nothing is returned; the list is updated in place through
 * the training_samples pointer.
 *
 * For every non-empty line read from the file, the second
 * whitespace-separated token is taken as the character label and a
 * character description is then read from the same file. Only the
 * feature set of the type named by feature_name is kept; all other
 * feature sets of the description are deleted.
 *
 * @param feature_definitions feature definitions used to map feature_name
 *        to a feature type and to read character descriptions
 * @param feature_name short name of the feature type to keep
 * @param max_samples maximum number of samples kept per character for
 *        this call; a value <= 0 means no limit
 * @param unicharset if not nullptr, labels not yet contained are inserted;
 *        the program exits with status 1 if its size then exceeds
 *        MAX_NUM_CLASSES
 * @param file open text file to read samples from
 * @param training_samples list of labeled sample lists to extend
 */
void ReadTrainingSamples(const FEATURE_DEFS_STRUCT &feature_definitions, const char *feature_name,
                         int max_samples, UNICHARSET *unicharset, FILE *file,
                         LIST *training_samples) {
  char buffer[2048];
  char unichar[UNICHAR_LEN + 1];
  LABELEDLIST char_sample;
  FEATURE_SET feature_samples;
  uint32_t feature_type = ShortNameToFeatureType(feature_definitions, feature_name);

  // Zero out the font_sample_count for all the classes.
  LIST it = *training_samples;
  iterate(it) {
    char_sample = reinterpret_cast<LABELEDLIST>(it->first_node());
    char_sample->font_sample_count = 0;
  }

  while (fgets(buffer, sizeof(buffer), file) != nullptr) {
    if (buffer[0] == '\n') {
      continue;
    }

    // The label is the second whitespace-separated token of the line.
    // NOTE(review): "%s" has no field width, so a label longer than
    // UNICHAR_LEN would overflow unichar, and the result of sscanf is not
    // checked -- confirm that the input files are trusted.
    sscanf(buffer, "%*s %s", unichar);
    if (unicharset != nullptr && !unicharset->contains_unichar(unichar)) {
      unicharset->unichar_insert(unichar);
      if (unicharset->size() > MAX_NUM_CLASSES) {
        tprintf(
            "Error: Size of unicharset in training is "
            "greater than MAX_NUM_CLASSES\n");
        exit(1);
      }
    }
    char_sample = FindList(*training_samples, unichar);
    if (char_sample == nullptr) {
      char_sample = new LABELEDLISTNODE(unichar);
      *training_samples = push(*training_samples, char_sample);
    }
    auto char_desc = ReadCharDescription(feature_definitions, file);
    feature_samples = char_desc->FeatureSets[feature_type];
    if (char_sample->font_sample_count < max_samples || max_samples <= 0) {
      // The sample list now refers to the selected feature set.
      char_sample->List = push(char_sample->List, feature_samples);
      char_sample->SampleCount++;
      char_sample->font_sample_count++;
    } else {
      delete feature_samples;
    }
    // Release the feature sets which are not wanted and clear EVERY entry,
    // including the selected one: at this point the selected set either
    // belongs to the sample list or has already been deleted above, so a
    // char_desc destructor that frees its feature sets must not see it
    // again (duplicate delete).
    for (size_t i = 0; i < char_desc->NumFeatureSets; i++) {
      if (feature_type != i) {
        delete char_desc->FeatureSets[i];
      }
      char_desc->FeatureSets[i] = nullptr;
    }
    delete char_desc;
  }
} // ReadTrainingSamples

I changed the above code and can get "shapeclustering.exe" and "mftraining.exe" to run smoothly; all training material such as "inttemp" and "pffmtable" is generated correctly.
Currently the cntraining.exe will crash, but I don't have any more time to test.

The text was updated successfully, but these errors were encountered:

stweil · 2022-09-22T10:59:50Z

Thank you for this detailed report. So you are training a legacy model? That is indeed rarely done as most people (including myself) typically train LSTM models.

It would help if you could describe the single steps which are necessary to reproduce the failures. Ideally we should create unit tests then to avoid future regressions.

amitdo · 2022-09-22T14:37:55Z

Unit testing is not enough, we should do real world testing on thousands of pages to test the layout analysis and the two OCR engines.

BTW, there was a report by @tfmorris about a huge drop in speed and accuracy that occurred between version 3.02 and version 3.03 (and some later versions). Nobody did anything to find out the source of the regression.

I also read a report on a drop in accuracy of the layout analysis that occurred between 3.04 and 4.0. I don't have a reference to that report.

There were also some general reports (without much details) about a drop in accuracy that occurred between 4.x and 5.0.

SpaceView · 2022-09-23T03:01:19Z

I'm not sure which method is the legacy one and which is not. I'm working on an industrial application, so I MUST use C++ only. I don't use tesstrain since it seems to work only in a Python environment and cannot be deployed in C++. And I can't find any real step-by-step training guidance for the latest versions.
Please let me know if you have different training methods.
My method is given as follows,

(1) add path to environment (windows 10)
E:\pkg_ocr\tesseract\tesseract520
 
(2) edit your image with jTessBoxEditor
cd  E:\pkg_ocr\tesstrain\jTessBoxEditor231
train.bat ----> jTessBoxEditor  ---> merge TIFF ---> save it as myfontlab.normal.exp0.tif
 
(3) do the following operation，
tesseract  myfontlab.normal.exp0.tif   myfontlab.normal.exp0   batch.nochop   makebox
tesseract   myfontlab.normal.exp0.tif    myfontlab.normal.exp0   nobatch   box.train
NOTE: you have to adjust image contrast or brightness if these reports "empty ..."
 
(4)
unicharset_extractor myfontlab.normal.exp0.box
 
(5)
echo normal 0 0 0 0 0 > font_properties
NOTE: the file name is font_properties (it also works if you use font_properties.txt). Its content is: normal 0 0 0 0 0. Note that the word "normal" must be the same word as in the file name "myfontlab.normal.exp0.tif".
 
(6)
shapeclustering -F font_properties -U unicharset myfontlab.normal.exp0.tr
OR
shapeclustering -F font_properties.txt -U unicharset myfontlab.normal.exp0.tr
 
(7)
mftraining  -F font_properties -U unicharset -O train.unicharset myfontlab.normal.exp0.tr
NOTE: this step will generate inttemp、pffmtable, if it doesn't work, use the below cmd,
mftraining -F font_properties.txt -U unicharset -O train.unicharset myfontlab.normal.exp0.tr
 
(8)
cntraining myfontlab.normal.exp0.tr
 
(9)
combine_tessdata normal
 
(10)generated result is t_7B-normal.txt
tesseract E:\test_images\ocr\t_7B.png  E:\test_images\ocr\t_7B-normal -l normal

amitdo · 2022-09-24T16:57:39Z

(1)

tesseract/src/training/common/trainingsampleset.cpp

Line 557 in 371ee22

samples_.resize(new_it - samples_.begin() + 1);

cac116d

(2)

tesseract/src/classify/intproto.cpp

Lines 507 to 508 in 74e226b

    
           IClass = new INT_CLASS_STRUCT(FClass->NumProtos, FClass->NumConfigs); 
        
           FontSet fs{FClass->font_set.size()};

tesseract/src/ccstruct/fontinfo.cpp

Lines 222 to 224 in 5a36943

    
           bool write_set(FILE *f, const FontSet &fs) { 
        
             int size = fs.size(); 
        
             return tesseract::Serialize(f, &size) && tesseract::Serialize(f, &fs[0], size);

a7f938d

(3)

tesseract/src/ccutil/unicity_table.h

Lines 76 to 77 in 839f528

    
           table_.push_back(object); 
        
           idx = size();

1d3d1fb

amitdo · 2022-09-25T11:55:40Z

@stweil, @egorpugin,

Can you please see if the suggested changes can be applied?

stweil · 2022-09-25T13:34:54Z

I still try to reproduce the bugs locally.

amitdo · 2022-10-27T07:39:26Z

@stweil,

Were you able to reproduce the reported bugs?

stweil · 2022-10-27T09:51:07Z

No, not up to now.

zdenop · 2022-11-24T18:44:44Z

I did a miniature reproduction part of the problem (crash of shapeclustering) for those who want to dig into this problem:
i3925_test_case.zip

I also found an old version of tesseract, 3.05.02, which is able to create a shapetable from this example.

The steps for reproducing are quite simple:

tesseract num.ocra.exp0.png num.ocra.exp0 nobatch box.train
unicharset_extractor num.ocra.exp0.box
shapeclustering -F font_properties -U unicharset num.ocra.exp0.tr

amitdo · 2022-11-25T09:00:59Z

If the training tools for the legacy engine are broken and nobody will fix them in time for the 5.3.0 release, I suggest modifying cmake, sw and autotools to not compile and install the legacy training tools.

stweil · 2022-11-25T10:59:17Z

Thank you, @zdenop, for the test code. git bisect finds commit cac116d which caused the regression.

amitdo · 2022-11-25T11:12:52Z

I already pointed to that commit in #3925 (comment)

Fixes: cac116d ("Replace more PointerVector by std::vector [...]") Signed-off-by: Stefan Weil <sw@weilnetz.de>

stweil · 2022-11-25T11:22:12Z

@SpaceView, pull request #3970 fixes the issue in my test. Perhaps you can try it and confirm whether it works for you, too.

Fixes: cac116d ("Replace more PointerVector by std::vector [...]") Signed-off-by: Stefan Weil <sw@weilnetz.de> Signed-off-by: Stefan Weil <sw@weilnetz.de>

amitdo · 2022-11-30T07:57:45Z

Fixed in #3970.

zdenop · 2022-11-30T16:55:45Z

I am afraid this issue is not solved fully. This set of commands works for me with tesseract 3.05.02 (to be sure how the process should look like):

tesseract num.ocra.exp0.png num.ocra.exp0 nobatch box.train
unicharset_extractor num.ocra.exp0.box
set_unicharset_properties -U unicharset -O num.unicharset --script_dir=langdata/
shapeclustering -F font_properties -U num.unicharset num.ocra.exp0.tr
mftraining -F font_properties -U num.unicharset -O num.unicharset num.ocra.exp0.tr
cntraining num.ocra.exp0.tr
mv inttemp num.inttemp
mv pffmtable num.pffmtable
mv normproto num.normproto
mv shapetable num.shapetable
combine_tessdata num.
mkdir tessdata
mv num.traineddata tessdata
tesseract num.ocra.exp0.png - --psm 7 -l num --tessdata-dir .

However mftraining from the current code has the problem reading created shapetable:

Error: Failed to read shape table shapetable
Reading num.ocra.exp0.tr ...
Flat shape table summary: Number of shapes = 10 max unichars = 1 number with multiple unichars = 0

Unfortunately, I do not have time to test the other version mentioned by the reporter.

zdenop · 2022-12-01T10:04:57Z

I found some spare time for testing, and here are some observations:

tesseract-ocr-w64-setup-v4.1.0-elag2019 and Tesseract-OCR-5.0.0-alpha.20201127 works for me => problem seems to be related to code modernization
Here are the outputs from training: i3925_legacy_training_outputs.zip. I realized that the current version of tesseract produces (significantly) different output (num.ocra.exp0.tr) from box training (first step). It uses different rounding (6 decimal places instead of 8) and a different number type (float instead of integer). I am not sure whether this is a problem. Anyway, the shapetable is smaller (84b vs 184b).

IMO it would be good to also create a small test case for LSTM training to check whether the output is similar to that of 5.0.0-alpha.

stweil · 2022-12-03T22:11:42Z

I'm afraid that the changes 51909d5...36f9131 at least contribute to the regression.

Extract from old num.ocra.exp0.tr:

Extract from new num.ocra.exp0.tr:

if 84
 80.000000 82.000000 192.000000
 80.000000 96.000000 192.000000
 80.000000 109.000000 192.000000
 80.000000 123.000000 192.000000

The old code used add_str_double(), the new code uses std::to_string() which obviously gives a different string. In addition, std::to_string() writes a decimal comma instead of a decimal point with a German locale.

Related functions: tesseract::WriteCharDescription (with Type==2) and tesseract::WriteFeatureSet.

Fixes: 3b07599 ("Replace more STRING by std::string") Signed-off-by: Stefan Weil <sw@weilnetz.de>

mftraining crashed because the returned value was 1 instead of 0 for the first call of UnicityTable::push_back. Signed-off-by: Stefan Weil <sw@weilnetz.de>

It crashed when running mftraining with fs.size() == 0. Signed-off-by: Stefan Weil <sw@weilnetz.de>

It crashed when running mftraining because unicharset_size in file "inttemp" was written with 8 bytes instead of 4 bytes. Signed-off-by: Stefan Weil <sw@weilnetz.de>

This fixes duplicate delete when running cntraining. Signed-off-by: Stefan Weil <sw@weilnetz.de>

…ract-ocr#3925) It is required for mftraining which otherwise writes a wrong shapetable. Signed-off-by: Stefan Weil <sw@weilnetz.de>

The old code did not work correctly if FClass->font_set.size() was 0. It created the FontSet fs with size 1 instead of 0. Signed-off-by: Stefan Weil <sw@weilnetz.de>

It was triggered by mftraining. Signed-off-by: Stefan Weil <sw@weilnetz.de>

mftraining crashed if the search did not find anything. Signed-off-by: Stefan Weil <sw@weilnetz.de>

mftraining crashed because the returned value was 1 instead of 0 for the first call of UnicityTable::push_back. Signed-off-by: Stefan Weil <sw@weilnetz.de>

It crashed when running mftraining with fs.size() == 0. Signed-off-by: Stefan Weil <sw@weilnetz.de>

It crashed when running mftraining because unicharset_size in file "inttemp" was written with 8 bytes instead of 4 bytes. Signed-off-by: Stefan Weil <sw@weilnetz.de>

This fixes duplicate delete when running cntraining. Signed-off-by: Stefan Weil <sw@weilnetz.de>

stweil · 2022-12-12T21:55:21Z

@SpaceView, hopefully the really many bugs which you found and reported are fixed by the many commits in pull request #3977. Some of those commits are nearly identical to your proposed code changes.

It is required for mftraining which otherwise writes a wrong shapetable. Signed-off-by: Stefan Weil <sw@weilnetz.de>

The old code did not work correctly if FClass->font_set.size() was 0. It created the FontSet fs with size 1 instead of 0. Signed-off-by: Stefan Weil <sw@weilnetz.de>

It was triggered by mftraining. Signed-off-by: Stefan Weil <sw@weilnetz.de>

mftraining crashed if the search did not find anything. Signed-off-by: Stefan Weil <sw@weilnetz.de>

mftraining crashed because the returned value was 1 instead of 0 for the first call of UnicityTable::push_back. Signed-off-by: Stefan Weil <sw@weilnetz.de>

It crashed when running mftraining with fs.size() == 0. Signed-off-by: Stefan Weil <sw@weilnetz.de>

It crashed when running mftraining because unicharset_size in file "inttemp" was written with 8 bytes instead of 4 bytes. Signed-off-by: Stefan Weil <sw@weilnetz.de>

This fixes duplicate delete when running cntraining. Signed-off-by: Stefan Weil <sw@weilnetz.de>

…ract-ocr#3925) It is required for mftraining which otherwise writes a wrong shapetable. Signed-off-by: Stefan Weil <sw@weilnetz.de> # Conflicts: # src/ccutil/helpers.h

The old code did not work correctly if FClass->font_set.size() was 0. It created the FontSet fs with size 1 instead of 0. Signed-off-by: Stefan Weil <sw@weilnetz.de>

It was triggered by mftraining. Signed-off-by: Stefan Weil <sw@weilnetz.de>

mftraining crashed if the search did not find anything. Signed-off-by: Stefan Weil <sw@weilnetz.de>

mftraining crashed because the returned value was 1 instead of 0 for the first call of UnicityTable::push_back. Signed-off-by: Stefan Weil <sw@weilnetz.de>

It crashed when running mftraining with fs.size() == 0. Signed-off-by: Stefan Weil <sw@weilnetz.de>

It crashed when running mftraining because unicharset_size in file "inttemp" was written with 8 bytes instead of 4 bytes. Signed-off-by: Stefan Weil <sw@weilnetz.de> # Conflicts: # src/classify/intproto.cpp

This fixes duplicate delete when running cntraining. Signed-off-by: Stefan Weil <sw@weilnetz.de>

SpaceView changed the title ~~BUGs in training~~ TOO MANY MANY BUGs in training with tesseract 5.2.0 Sep 22, 2022

amitdo changed the title ~~TOO MANY MANY BUGs in training with tesseract 5.2.0~~ Many bugs in training the legacy engine Sep 22, 2022

amitdo added training legacy regression labels Sep 22, 2022

amitdo mentioned this issue Sep 23, 2022

segmentation fault shapeclustering -F #3898

Closed

amitdo mentioned this issue Oct 6, 2022

segmentation fault shapeclustering -F font_properties -U unicharset -O chi.unicharset #3939

Closed

stweil added a commit to stweil/tesseract that referenced this issue Nov 25, 2022

Fix training tools for legacy engine (issue tesseract-ocr#3925)

78c2222

Fixes: cac116d ("Replace more PointerVector by std::vector [...]") Signed-off-by: Stefan Weil <sw@weilnetz.de>

stweil added this to In progress in Tesseract next Nov 25, 2022

amitdo pushed a commit that referenced this issue Nov 30, 2022

Fix training tools for legacy engine (issue #3925) (#3970)

af13124

Fixes: cac116d ("Replace more PointerVector by std::vector [...]") Signed-off-by: Stefan Weil <sw@weilnetz.de> Signed-off-by: Stefan Weil <sw@weilnetz.de>

amitdo closed this as completed Nov 30, 2022

amitdo reopened this Dec 1, 2022

stweil added a commit to stweil/tesseract that referenced this issue Dec 3, 2022

Fix function tesseract::WriteFeature (issue tesseract-ocr#3925)

deaeb53

Fixes: 3b07599 ("Replace more STRING by std::string") Signed-off-by: Stefan Weil <sw@weilnetz.de>

stweil added a commit to stweil/tesseract that referenced this issue Dec 11, 2022

Fix function tesseract::write_set (issue tesseract-ocr#3925)

0253897

It crashed when running mftraining with fs.size() == 0. Signed-off-by: Stefan Weil <sw@weilnetz.de>

stweil added a commit to stweil/tesseract that referenced this issue Dec 11, 2022

Fix function ReadTrainingSamples (issue tesseract-ocr#3925)

5a8a4d6

This fixes duplicate delete when running cntraining. Signed-off-by: Stefan Weil <sw@weilnetz.de>

stweil added a commit to stweil/tesseract that referenced this issue Dec 12, 2022

Remove assertion in function NewSimpleProto (issue tesseract-ocr#3925)

8aaf5fe

It was triggered by mftraining. Signed-off-by: Stefan Weil <sw@weilnetz.de>

stweil added a commit to stweil/tesseract that referenced this issue Dec 12, 2022

Fix function ComputeChiSquared (issue tesseract-ocr#3925)

d5e9dd2

mftraining crashed if the search did not find anything. Signed-off-by: Stefan Weil <sw@weilnetz.de>

stweil added a commit to stweil/tesseract that referenced this issue Dec 12, 2022

Fix function tesseract::write_set (issue tesseract-ocr#3925)

2070d28

It crashed when running mftraining with fs.size() == 0. Signed-off-by: Stefan Weil <sw@weilnetz.de>

stweil added a commit to stweil/tesseract that referenced this issue Dec 12, 2022

Fix function ReadTrainingSamples (issue tesseract-ocr#3925)

c7477f5

This fixes duplicate delete when running cntraining. Signed-off-by: Stefan Weil <sw@weilnetz.de>

stweil added a commit that referenced this issue Dec 13, 2022

Add missing serialization to FILE for vector of pointers (issue #3925)

6b7cb1c

It is required for mftraining which otherwise writes a wrong shapetable. Signed-off-by: Stefan Weil <sw@weilnetz.de>

stweil added a commit that referenced this issue Dec 13, 2022

Fix function Classify::CreateIntTemplates (issue #3925)

f969ba9

The old code did not work correctly if FClass->font_set.size() was 0. It created the FontSet fs with size 1 instead of 0. Signed-off-by: Stefan Weil <sw@weilnetz.de>

stweil added a commit that referenced this issue Dec 13, 2022

Remove assertion in function NewSimpleProto (issue #3925)

5591bc0

It was triggered by mftraining. Signed-off-by: Stefan Weil <sw@weilnetz.de>

stweil added a commit that referenced this issue Dec 13, 2022

Fix function ComputeChiSquared (issue #3925)

1d3b410

mftraining crashed if the search did not find anything. Signed-off-by: Stefan Weil <sw@weilnetz.de>

stweil added a commit that referenced this issue Dec 13, 2022

Fix function UnicityTable::push_back (issue #3925)

1fd8f81

mftraining crashed because the returned value was 1 instead of 0 for the first call of UnicityTable::push_back. Signed-off-by: Stefan Weil <sw@weilnetz.de>

stweil added a commit that referenced this issue Dec 13, 2022

Fix function tesseract::write_set (issue #3925)

4fa046b

It crashed when running mftraining with fs.size() == 0. Signed-off-by: Stefan Weil <sw@weilnetz.de>

stweil added a commit that referenced this issue Dec 13, 2022

Fix function Classify::WriteIntTemplates (issue #3925)

23138ab

It crashed when running mftraining because unicharset_size in file "inttemp" was written with 8 bytes instead of 4 bytes. Signed-off-by: Stefan Weil <sw@weilnetz.de>

stweil added a commit that referenced this issue Dec 13, 2022

Fix function ReadTrainingSamples (issue #3925)

a806d21

This fixes duplicate delete when running cntraining. Signed-off-by: Stefan Weil <sw@weilnetz.de>

amitdo closed this as completed Dec 18, 2022

GerHobbelt pushed a commit to GerHobbelt/tesseract that referenced this issue Jan 27, 2023

Remove assertion in function NewSimpleProto (issue tesseract-ocr#3925)

295d7e2

It was triggered by mftraining. Signed-off-by: Stefan Weil <sw@weilnetz.de>

GerHobbelt pushed a commit to GerHobbelt/tesseract that referenced this issue Jan 27, 2023

Fix function ComputeChiSquared (issue tesseract-ocr#3925)

071ddfe

mftraining crashed if the search did not find anything. Signed-off-by: Stefan Weil <sw@weilnetz.de>

GerHobbelt pushed a commit to GerHobbelt/tesseract that referenced this issue Jan 27, 2023

Fix function tesseract::write_set (issue tesseract-ocr#3925)

9c8a15f

It crashed when running mftraining with fs.size() == 0. Signed-off-by: Stefan Weil <sw@weilnetz.de>

GerHobbelt pushed a commit to GerHobbelt/tesseract that referenced this issue Jan 27, 2023

Fix function ReadTrainingSamples (issue tesseract-ocr#3925)

312b7d4

This fixes duplicate delete when running cntraining. Signed-off-by: Stefan Weil <sw@weilnetz.de>

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Many bugs in training the legacy engine #3925

Many bugs in training the legacy engine #3925

SpaceView commented Sep 22, 2022

stweil commented Sep 22, 2022

amitdo commented Sep 22, 2022 •

edited

Loading

SpaceView commented Sep 23, 2022

amitdo commented Sep 24, 2022

amitdo commented Sep 25, 2022

stweil commented Sep 25, 2022

amitdo commented Oct 27, 2022 •

edited

Loading

stweil commented Oct 27, 2022

zdenop commented Nov 24, 2022

amitdo commented Nov 25, 2022

stweil commented Nov 25, 2022

amitdo commented Nov 25, 2022

stweil commented Nov 25, 2022

amitdo commented Nov 30, 2022

zdenop commented Nov 30, 2022 •

edited

Loading

zdenop commented Dec 1, 2022

stweil commented Dec 3, 2022

stweil commented Dec 12, 2022 •

edited

Loading

Many bugs in training the legacy engine #3925

Many bugs in training the legacy engine #3925

Comments

SpaceView commented Sep 22, 2022

stweil commented Sep 22, 2022

amitdo commented Sep 22, 2022 • edited Loading

SpaceView commented Sep 23, 2022

amitdo commented Sep 24, 2022

amitdo commented Sep 25, 2022

stweil commented Sep 25, 2022

amitdo commented Oct 27, 2022 • edited Loading

stweil commented Oct 27, 2022

zdenop commented Nov 24, 2022

amitdo commented Nov 25, 2022

stweil commented Nov 25, 2022

amitdo commented Nov 25, 2022

stweil commented Nov 25, 2022

amitdo commented Nov 30, 2022

zdenop commented Nov 30, 2022 • edited Loading

zdenop commented Dec 1, 2022

stweil commented Dec 3, 2022

stweil commented Dec 12, 2022 • edited Loading

amitdo commented Sep 22, 2022 •

edited

Loading

amitdo commented Oct 27, 2022 •

edited

Loading

zdenop commented Nov 30, 2022 •

edited

Loading

stweil commented Dec 12, 2022 •

edited

Loading