Plotting – again #377

Merged · 30 commits · Apr 10, 2024

Commits:
d8a221b  remove remaining 'default' (missing in 16df5ab98)  (bertsky, Mar 3, 2024)
5d20a8a  net_spec: read from unicharset directly (produces prettier log)  (bertsky, Mar 3, 2024)
a3377c4  Update plotting in Makefile - OCRevalCER and ISRIevalCER NOT used  (Shreeshrii, Feb 4, 2022)
9c0cf86  Remove external tools eval  (Shreeshrii, Feb 16, 2022)
74960e6  tee to log instead of redirect stdout  (bertsky, Mar 4, 2024)
66fcf30  plotting: simplify and explify…  (bertsky, Mar 4, 2024)
218e3af  help: add lstmeval target  (bertsky, Mar 4, 2024)
b39f34e  update/improve readme  (bertsky, Mar 4, 2024)
2157121  training log: also redirect stderr  (bertsky, Mar 4, 2024)
46a2199  do use LOG_FILE variable after all  (bertsky, Mar 5, 2024)
fb4fa83  Fix some typos (found by codespell and typos)  (stweil, Mar 7, 2024)
810830c  use other log file name  (bertsky, Mar 7, 2024)
3805a2d  use LOG_FILE variable where due  (bertsky, Mar 7, 2024)
8609f85  rename target lstmeval→evaluation  (bertsky, Mar 7, 2024)
e912930  readme: explain plotting for ocrd-testset  (bertsky, Mar 7, 2024)
26ab188  gitignore: simplify  (bertsky, Mar 7, 2024)
4d9993d  plot: use PY_CMD here, too  (bertsky, Mar 7, 2024)
3a5926b  plot: simplify file names for intermediate targets  (bertsky, Mar 7, 2024)
53216d2  plot*.py: pass target file name directly  (bertsky, Mar 7, 2024)
aa4718d  remove older (unconnected) plot scripts  (bertsky, Mar 7, 2024)
23c5a99  use pandas indexing properly  (bertsky, Mar 7, 2024)
cab92f6  plot*.py: remove os import no longer needed  (bertsky, Mar 7, 2024)
3c94734  add plot example and show in readme  (bertsky, Mar 7, 2024)
5f93943  update readme  (bertsky, Mar 7, 2024)
b0c37bf  plot: append to training.log  (bertsky, Mar 20, 2024)
7a9c70c  improve help/doc  (bertsky, Mar 20, 2024)
053610c  plot/evaluation: use best instead of fast  (bertsky, Apr 5, 2024)
21ee719  Merge branch 'main' into plotting  (bertsky, Apr 5, 2024)
995a336  Update .gitignore  (stweil, Apr 8, 2024)
d9ee27a  improve readme  (bertsky, Apr 8, 2024)
25 changes: 2 additions & 23 deletions .gitignore
@@ -1,25 +1,4 @@
data/*ground-truth/*
data/langdata/*
!data/ground-truth/.gitkeep
data/all-*
data/list.*
data/unicharset
dta19-reduced
dta19-reduced.tar.gz
/data/
*.built
*.BAK
/usr
data/checkpoints
9ae97508aed1e5508458f1181b08501f984bf4e2.zip
langdata-*
data/test/*
data/*.traineddata
wackenroder_herzensergiessungen_*.gt.txt
wackenroder_herzensergiessungen_*.tif
master.zip
main.zip
plot/*.LOG
plot/ocrd*

# ignore temporary training data
*checkpoints*
*~
105 changes: 89 additions & 16 deletions Makefile
@@ -11,7 +11,7 @@ SHELL := /bin/bash
LOCAL := $(PWD)/usr
PATH := $(LOCAL)/bin:$(PATH)

# Path to the .traineddata directory with traineddata suitable for training
# Path to the .traineddata directory with traineddata suitable for training
# (for example from tesseract-ocr/tessdata_best). Default: $(LOCAL)/share/tessdata
TESSDATA = $(LOCAL)/share/tessdata

@@ -117,6 +117,8 @@ else
PY_CMD := python3
endif

LOG_FILE = $(OUTPUT_DIR)/training.log

# BEGIN-EVAL makefile-parser --make-help Makefile

help:
@@ -126,19 +128,21 @@ help:
@echo " unicharset Create unicharset"
@echo " charfreq Show character histogram"
@echo " lists Create lists of lstmf filenames for training and eval"
@echo " training Start training"
@echo " training Start training (i.e. create .checkpoint files)"
[Review thread]
Collaborator suggested:
-@echo " training Start training (i.e. create .checkpoint files)"
+@echo " training Start training"
Collaborator: I'd prefer a less technical description here and avoid details which might change any time.
bertsky (author): Like I said above, the description of traineddata already mentions .checkpoint files, too. And so does evaluation. So as a matter of clarity and consistency, it should be mentioned here as well.
Since this entire repo is just a small makefile wrapper around the actual training tools, which are only properly documented in tessdoc along with these technical details, I prefer keeping it the way it is.

@echo " traineddata Create best and fast .traineddata files from each .checkpoint file"
@echo " proto-model Build the proto model"
@echo " tesseract-langdata Download stock unicharsets"
@echo " evaluation Evaluate .checkpoint models on eval dataset via lstmeval"
[Review thread]
Collaborator suggested:
-@echo " evaluation Evaluate .checkpoint models on eval dataset via lstmeval"
+@echo " evaluation Evaluate intermediate models on evaluation dataset"
Collaborator: Use help text with less technical details.
bertsky (author): See above.
stweil (Apr 8, 2024): Isn't your help text even wrong? I thought that lstmeval is running on .traineddata files, not on .checkpoint files. If you agree, I'd commit my suggestions and apply your pull request.
bertsky (author): No, I don't agree, hence my comment above.
bertsky (author), replying to the above: No, it's running on .traineddata extracted from .checkpoint files. The help texts only mention the former when relevant to the user (the traineddata target).
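For illustration of that flow, a sketch with hypothetical file names (the flags match the lstmtraining/lstmeval invocations used elsewhere in this Makefile):

    # convert a checkpoint into a standalone best-float .traineddata
    lstmtraining \
      --stop_training \
      --continue_from data/foo/checkpoints/foo_0.742_14615_695400.checkpoint \
      --traineddata data/foo/foo.traineddata \
      --model_output data/foo/tessdata_best/foo_0.742_14615_695400.traineddata
    # evaluate the extracted model on the held-out line list
    lstmeval \
      --model data/foo/tessdata_best/foo_0.742_14615_695400.traineddata \
      --eval_listfile data/foo/list.eval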

@echo " plot Generate train/eval error rate charts from training log"
@echo " clean-box Clean generated .box files"
@echo " clean-lstmf Clean generated .lstmf files"
@echo " clean-output Clean generated output files"
@echo " clean Clean all generated files"
@echo ""
@echo " Variables"
@echo ""
@echo " TESSDATA Path to the .traineddata directory with traineddata suitable for training "
@echo " (for example from tesseract-ocr/tessdata_best). Default: $(TESSDATA)"
@echo " TESSDATA Path to the directory containing START_MODEL.traineddata"
@echo " (for example tesseract-ocr/tessdata_best). Default: $(TESSDATA)"
@echo " MODEL_NAME Name of the model to be built. Default: $(MODEL_NAME)"
@echo " DATA_DIR Data directory for output files, proto model, start model, etc. Default: $(DATA_DIR)"
@echo " LANGDATA_DIR Data directory for langdata (downloaded from Tesseract langdata repo). Default: $(LANGDATA_DIR)"
@@ -147,19 +151,20 @@ help:
@echo " WORDLIST_FILE Optional Wordlist file for Dictionary dawg. Default: $(WORDLIST_FILE)"
@echo " NUMBERS_FILE Optional Numbers file for number patterns dawg. Default: $(NUMBERS_FILE)"
@echo " PUNC_FILE Optional Punc file for Punctuation dawg. Default: $(PUNC_FILE)"
@echo " START_MODEL Name of the model to continue from. Default: '$(START_MODEL)'"
@echo " PROTO_MODEL Name of the proto model. Default: '$(PROTO_MODEL)'"
@echo " START_MODEL Name of the model to continue from (i.e. fine-tune). Default: $(START_MODEL)"
@echo " PROTO_MODEL Name of the prototype model. Default: $(PROTO_MODEL)"
@echo " TESSDATA_REPO Tesseract model repo to use (_fast or _best). Default: $(TESSDATA_REPO)"
@echo " MAX_ITERATIONS Max iterations. Default: $(MAX_ITERATIONS)"
@echo " EPOCHS Set max iterations based on the number of lines for the training. Default: none"
@echo " DEBUG_INTERVAL Debug Interval. Default: $(DEBUG_INTERVAL)"
@echo " LEARNING_RATE Learning rate. Default: $(LEARNING_RATE)"
@echo " NET_SPEC Network specification. Default: $(NET_SPEC)"
@echo " NET_SPEC Network specification (in VGSL) for new model from scratch. Default: $(NET_SPEC)"
[Review thread]
Collaborator suggested:
-@echo " NET_SPEC Network specification (in VGSL) for new model from scratch. Default: $(NET_SPEC)"
+@echo " NET_SPEC Network specification (in VGSL), only used for training without START_MODEL. Default: $(NET_SPEC)"
bertsky (author): I found it more helpful if the relevant terms fine-tune and scratch were already mentioned in their respective variable's short help string. (Since START_MODEL already mentions the former, NET_SPEC now mentions the latter.) Also, scratch already precludes a start model by definition.

@echo " LANG_TYPE Language Type - Indic, RTL or blank. Default: '$(LANG_TYPE)'"
@echo " PSM Page segmentation mode. Default: $(PSM)"
@echo " RANDOM_SEED Random seed for shuffling of the training data. Default: $(RANDOM_SEED)"
@echo " RATIO_TRAIN Ratio of train / eval training data. Default: $(RATIO_TRAIN)"
@echo " TARGET_ERROR_RATE Default Target Error Rate. Default: $(TARGET_ERROR_RATE)"
@echo " LOG_FILE File to copy training output to and read plot figures from. Default: $(LOG_FILE)"
[Review thread]
Collaborator suggested:
-@echo " LOG_FILE File to copy training output to and read plot figures from. Default: $(LOG_FILE)"
+@echo " LOG_FILE File copy of the training protocol (also used for plotting). Default: $(LOG_FILE)"
bertsky (author): I don't like your formulation. LOG_FILE is not a protocol of the training procedure (like Tensorboard), it's just the output of the training recipe.
Collaborator: If you prefer "log" instead of "protocol", that would be fine for me, too. The output of the training is much more than the printed messages: the most important output is the new model file.
bertsky (author): A LOG_FILE is a log file and not a result file.
Collaborator: "Log file of training process (also used as input for plotting)" would also be fine for me.


# END-EVAL

@@ -254,15 +259,14 @@ $(ALL_LSTMF): $(ALL_FILES:%.gt.txt=%.lstmf)
%.lstmf: %.tif %.box
tesseract "$<" $* --psm $(PSM) lstm.train

CHECKPOINT_FILES := $(wildcard $(OUTPUT_DIR)/checkpoints/$(MODEL_NAME)*.checkpoint)
.PHONY: traineddata

CHECKPOINT_FILES = $(wildcard $(OUTPUT_DIR)/checkpoints/$(MODEL_NAME)*.checkpoint)
BESTMODEL_FILES = $(subst checkpoints,tessdata_best,$(CHECKPOINT_FILES:%.checkpoint=%.traineddata))
FASTMODEL_FILES = $(subst checkpoints,tessdata_fast,$(CHECKPOINT_FILES:%.checkpoint=%.traineddata))
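A worked illustration of these substitutions (checkpoint name hypothetical): a wildcard match

    data/foo/checkpoints/foo_0.742_14615_695400.checkpoint

is mapped to

    data/foo/tessdata_best/foo_0.742_14615_695400.traineddata   (BESTMODEL_FILES)
    data/foo/tessdata_fast/foo_0.742_14615_695400.traineddata   (FASTMODEL_FILES)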
# Create best and fast .traineddata files from each .checkpoint file
traineddata: $(OUTPUT_DIR)/tessdata_best $(OUTPUT_DIR)/tessdata_fast

traineddata: $(subst checkpoints,tessdata_best,$(patsubst %.checkpoint,%.traineddata,$(CHECKPOINT_FILES)))
traineddata: $(subst checkpoints,tessdata_fast,$(patsubst %.checkpoint,%.traineddata,$(CHECKPOINT_FILES)))
$(OUTPUT_DIR)/tessdata_best $(OUTPUT_DIR)/tessdata_fast:
traineddata: $(BESTMODEL_FILES)
traineddata: $(FASTMODEL_FILES)
$(OUTPUT_DIR)/tessdata_best $(OUTPUT_DIR)/tessdata_fast $(OUTPUT_DIR)/eval:
@mkdir -p $@
$(OUTPUT_DIR)/tessdata_best/%.traineddata: $(OUTPUT_DIR)/checkpoints/%.checkpoint | $(OUTPUT_DIR)/tessdata_best
lstmtraining \
@@ -314,7 +318,8 @@ $(LAST_CHECKPOINT): unicharset lists $(PROTO_MODEL)
--train_listfile $(OUTPUT_DIR)/list.train \
--eval_listfile $(OUTPUT_DIR)/list.eval \
--max_iterations $(MAX_ITERATIONS) \
--target_error_rate $(TARGET_ERROR_RATE)
--target_error_rate $(TARGET_ERROR_RATE) \
2>&1 | tee -a $(LOG_FILE)
$(OUTPUT_DIR).traineddata: $(LAST_CHECKPOINT)
@echo
lstmtraining \
@@ -335,7 +340,8 @@ $(LAST_CHECKPOINT): unicharset lists $(PROTO_MODEL)
--train_listfile $(OUTPUT_DIR)/list.train \
--eval_listfile $(OUTPUT_DIR)/list.eval \
--max_iterations $(MAX_ITERATIONS) \
--target_error_rate $(TARGET_ERROR_RATE)
--target_error_rate $(TARGET_ERROR_RATE) \
2>&1 | tee -a $(LOG_FILE)
$(OUTPUT_DIR).traineddata: $(LAST_CHECKPOINT)
@echo
lstmtraining \
@@ -345,6 +351,73 @@ $(OUTPUT_DIR).traineddata: $(LAST_CHECKPOINT)
--model_output $@
endif
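The `2>&1 | tee -a $(LOG_FILE)` idiom added to both training recipes above follows this shell sketch (`some_command` is a placeholder):

    # merge stderr into stdout, then append the combined stream to the log
    # while still echoing it to the terminal
    some_command 2>&1 | tee -a data/foo/training.log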

# plotting

# Build lstmeval files list based on respective best traineddata models
BEST_LSTMEVAL_FILES = $(subst tessdata_best,eval,$(BESTMODEL_FILES:%.traineddata=%.eval.log))
$(BEST_LSTMEVAL_FILES): $(OUTPUT_DIR)/eval/%.eval.log: $(OUTPUT_DIR)/tessdata_best/%.traineddata | $(OUTPUT_DIR)/eval
time -p lstmeval \
--verbosity=0 \
--model $< \
--eval_listfile $(OUTPUT_DIR)/list.eval 2>&1 | grep "^BCER eval" > $@
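Each resulting `*.eval.log` then contains a single filtered line shaped like this (values hypothetical, format implied by the `grep "^BCER eval"` above):

    BCER eval=2.507, BWER eval=10.363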
# Make TSV with lstmeval CER and checkpoint filename parts
TSV_LSTMEVAL = $(OUTPUT_DIR)/lstmeval.tsv
.INTERMEDIATE: $(TSV_LSTMEVAL)
$(TSV_LSTMEVAL): $(BEST_LSTMEVAL_FILES)
@echo "Name CheckpointCER LearningIteration TrainingIteration EvalCER IterationCER SubtrainerCER" > "$@"
@{ $(foreach F,$^,echo -n "$F "; grep BCER $F;) } | sort -rn | \
sed -e 's|^$(OUTPUT_DIR)/eval/$(MODEL_NAME)_\([0-9.]*\)_\([0-9]*\)_\([0-9]*\).eval.log BCER eval=\([0-9.]*\).*$$|\t\1\t\2\t\3\t\4\t\t|' >> "$@"
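A worked example of that rewrite (file name hypothetical): the input line

    data/foo/eval/foo_0.742_14615_695400.eval.log BCER eval=2.507, BWER eval=10.363

becomes a TSV row with CheckpointCER=0.742, LearningIteration=14615 and TrainingIteration=695400 (recovered from the file name) and EvalCER=2.507 (taken from the lstmeval output); the remaining columns stay empty.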
# Make TSV with CER at every 100 iterations.
TSV_100_ITERATIONS = $(OUTPUT_DIR)/iteration.tsv
.INTERMEDIATE: $(TSV_100_ITERATIONS)
$(TSV_100_ITERATIONS): $(LOG_FILE)
@echo "Name CheckpointCER LearningIteration TrainingIteration EvalCER IterationCER SubtrainerCER" > "$@"
@grep 'At iteration' $< \
| sed -e '/^Sub/d' \
| sed -e '/^Update/d' \
| sed -e '/^ New worst BCER/d' \
| sed -e 's|At iteration \([0-9]*\)/\([0-9]*\)/.*BCER train=|\t\t\1\t\2\t\t|' \
| sed -e 's/%, BWER.*/\t/' >> "$@"
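For example (log line assumed from the sed patterns above, values hypothetical), a line such as

    At iteration 14615/695400/698614, mean rms=0.158%, delta=0.295%, BCER train=0.742%, BWER train=2.886%, skip ratio=0.1%, wrote checkpoint.

yields a row with LearningIteration=14615, TrainingIteration=695400 and IterationCER=0.742; all other columns stay empty.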
# Make TSV with Checkpoint CER.
TSV_CHECKPOINT = $(OUTPUT_DIR)/checkpoint.tsv
.INTERMEDIATE: $(TSV_CHECKPOINT)
$(TSV_CHECKPOINT): $(LOG_FILE)
@echo "Name CheckpointCER LearningIteration TrainingIteration EvalCER IterationCER SubtrainerCER" > "$@"
@grep 'best model' $< \
| sed -e 's/^.*\///' \
| sed -e 's/\.checkpoint.*$$/\t\t\t/' \
| sed -e 's/_/\t/g' >> "$@"
# Make TSV with Eval CER.
TSV_EVAL = $(OUTPUT_DIR)/eval.tsv
.INTERMEDIATE: $(TSV_EVAL)
$(TSV_EVAL): $(LOG_FILE)
@echo "Name CheckpointCER LearningIteration TrainingIteration EvalCER IterationCER SubtrainerCER" > "$@"
@grep 'BCER eval' $< \
| sed -e 's/^.*[0-9]At iteration //' \
| sed -e 's/,.* BCER eval=/\t\t/' \
| sed -e 's/, BWER.*$$/\t\t/' \
| sed -e 's/^/\t\t/' >> "$@"
# Make TSV with Subtrainer CER.
TSV_SUB = $(OUTPUT_DIR)/sub.tsv
.INTERMEDIATE: $(TSV_SUB)
$(TSV_SUB): $(LOG_FILE)
@echo "Name CheckpointCER LearningIteration TrainingIteration EvalCER IterationCER SubtrainerCER" > "$@"
@grep '^UpdateSubtrainer' $< \
| sed -e 's/^.*At iteration \([0-9]*\)\/\([0-9]*\)\/.*BCER train=/\t\t\1\t\2\t\t\t/' \
| sed -e 's/%, BWER.*//' >> "$@"

$(OUTPUT_DIR)/$(MODEL_NAME).plot_log.png: $(TSV_100_ITERATIONS) $(TSV_CHECKPOINT) $(TSV_EVAL) $(TSV_SUB)
[Review thread]
Collaborator suggested:
-$(OUTPUT_DIR)/$(MODEL_NAME).plot_log.png: $(TSV_100_ITERATIONS) $(TSV_CHECKPOINT) $(TSV_EVAL) $(TSV_SUB)
+$(OUTPUT_DIR)/plot_log.png: $(TSV_100_ITERATIONS) $(TSV_CHECKPOINT) $(TSV_EVAL) $(TSV_SUB)
bertsky (author): No, that would mean the generated image file name does not contain the model name anymore. Once you copy it to another directory, the info is gone and it might clash with other plots.

$(PY_CMD) plot_log.py $@ $(MODEL_NAME) $^
$(OUTPUT_DIR)/$(MODEL_NAME).plot_cer.png: $(TSV_100_ITERATIONS) $(TSV_CHECKPOINT) $(TSV_EVAL) $(TSV_SUB) $(TSV_LSTMEVAL)
[Review thread]
Collaborator suggested:
-$(OUTPUT_DIR)/$(MODEL_NAME).plot_cer.png: $(TSV_100_ITERATIONS) $(TSV_CHECKPOINT) $(TSV_EVAL) $(TSV_SUB) $(TSV_LSTMEVAL)
+$(OUTPUT_DIR)/plot_cer.png: $(TSV_100_ITERATIONS) $(TSV_CHECKPOINT) $(TSV_EVAL) $(TSV_SUB) $(TSV_LSTMEVAL)
bertsky (author): same here.

$(PY_CMD) plot_cer.py $@ $(MODEL_NAME) $^
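The two plot scripts live in the repo; as a rough sketch of the command-line shape these recipes imply (argument order taken from the invocations above; everything inside is an assumption, not the actual plot_cer.py):

    # sketch_plot.py - hypothetical stand-in for plot_cer.py / plot_log.py
    import sys
    import pandas as pd
    import matplotlib
    matplotlib.use("Agg")                     # file output only, no display
    import matplotlib.pyplot as plt

    outfile, model_name, *tsv_files = sys.argv[1:]
    # all TSVs above share the same header line, so they concatenate cleanly
    data = pd.concat((pd.read_csv(f, sep="\t") for f in tsv_files),
                     ignore_index=True)

    fig, ax = plt.subplots()
    for column in ("IterationCER", "CheckpointCER", "EvalCER", "SubtrainerCER"):
        if column in data.columns:
            series = data.dropna(subset=[column])
            ax.plot(series["TrainingIteration"], series[column], label=column)
    ax.set(xlabel="TrainingIteration", ylabel="CER (%)", title=model_name)
    ax.legend()
    fig.savefig(outfile)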

.PHONY: evaluation plot
# run lstmeval on list.eval data for each checkpoint model
evaluation: $(BEST_LSTMEVAL_FILES)
# combine TSV files with all required CER values, generated from training log and validation logs, then plot
plot: $(OUTPUT_DIR)/$(MODEL_NAME).plot_cer.png $(OUTPUT_DIR)/$(MODEL_NAME).plot_log.png
[Review thread]
Collaborator suggested:
-plot: $(OUTPUT_DIR)/$(MODEL_NAME).plot_cer.png $(OUTPUT_DIR)/$(MODEL_NAME).plot_log.png
+plot: $(OUTPUT_DIR)/plot_cer.png $(OUTPUT_DIR)/plot_log.png
bertsky (author): and here.



tesseract-langdata: $(TESSERACT_LANGDATA)

$(TESSERACT_LANGDATA):
94 changes: 68 additions & 26 deletions README.md
@@ -2,7 +2,22 @@

> Training workflow for Tesseract 5 as a Makefile for dependency tracking.

## Install
* [Installation](#installation)
* [Auxiliaries](#auxiliaries)
* [Leptonica, Tesseract](#leptonica-tesseract)
* [Windows](#windows)
* [Python](#python)
* [Language data](#language-data)
* [Usage](#usage)
* [Choose the model name](#choose-the-model-name)
* [Provide ground truth data](#provide-ground-truth-data)
* [Train](#train)
* [Change directory assumptions](#change-directory-assumptions)
* [Make model files (traineddata)](#make-model-files-traineddata)
* [Plotting CER](#plotting-cer)
* [License](#license)

## Installation

### Auxiliaries

@@ -39,8 +54,9 @@ To fetch them:
(While this step is only needed once and implicitly included in the `training` target,
you might want to run it explicitly beforehand.)

## Usage

## Choose the model name
### Choose the model name

Choose a name for your model. By convention, Tesseract stack models including
language-specific resources use (lowercase) three-letter codes defined in
@@ -51,7 +67,7 @@ models use the capitalized name of the script type as an identifier. E.g.,
`Hangul_vert` for Hangul script with vertical typesetting. In the following,
the model name is referenced by `MODEL_NAME`.

## Provide ground truth
### Provide ground truth data

Place ground truth consisting of line images and transcriptions in the folder
`data/MODEL_NAME-ground-truth`. This list of files will be split into training and
@@ -72,7 +88,7 @@ page, see tips in [issue 7](https://github.com/OCR-D/ocrd-train/issues/7) and
in particular [@Shreeshrii's shell
script](https://github.com/OCR-D/ocrd-train/issues/7#issuecomment-419714852).
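For example (file names hypothetical), a pair of ground-truth files would look like:

    data/MODEL_NAME-ground-truth/line_0001.tif
    data/MODEL_NAME-ground-truth/line_0001.gt.txt

Each line image and its transcription share a base name; the Makefile derives the `.box` and `.lstmf` training files from such pairs.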

## Train
### Train

Run

@@ -81,7 +97,7 @@ Run

which is a shortcut for

make unicharset lists proto-model tesseract-langdata training
make unicharset lists proto-model tesseract-langdata training MODEL_NAME=name-of-the-resulting-model


Run `make help` to see all the possible targets and variables:
@@ -94,40 +110,52 @@ Run `make help` to see all the possible targets and variables:
unicharset Create unicharset
charfreq Show character histogram
lists Create lists of lstmf filenames for training and eval
training Start training
training Start training (i.e. create .checkpoint files)
traineddata Create best and fast .traineddata files from each .checkpoint file
proto-model Build the proto model
tesseract-langdata Download stock unicharsets
evaluation Evaluate .checkpoint models on eval dataset via lstmeval
plot Generate train/eval error rate charts from training log
clean Clean all generated files

Variables

MODEL_NAME Name of the model to be built. Default: foo
START_MODEL Name of the model to continue from. Default: ''
PROTO_MODEL Name of the proto model. Default: OUTPUT_DIR/MODEL_NAME.traineddata
START_MODEL Name of the model to continue from (i.e. fine-tune). Default: ''
PROTO_MODEL Name of the prototype model. Default: OUTPUT_DIR/MODEL_NAME.traineddata
WORDLIST_FILE Optional file for dictionary DAWG. Default: OUTPUT_DIR/MODEL_NAME.wordlist
NUMBERS_FILE Optional file for number patterns DAWG. Default: OUTPUT_DIR/MODEL_NAME.numbers
PUNC_FILE Optional file for punctuation DAWG. Default: OUTPUT_DIR/MODEL_NAME.punc
DATA_DIR Data directory for output files, proto model, start model, etc. Default: data
OUTPUT_DIR Output directory for generated files. Default: DATA_DIR/MODEL_NAME
GROUND_TRUTH_DIR Ground truth directory. Default: OUTPUT_DIR-ground-truth
TESSDATA_REPO Tesseract model repo to use (_fast or _best). Default: _best
TESSDATA Path to the .traineddata directory to start finetuning from. Default: ./usr/share/tessdata
TESSDATA Path to the directory containing START_MODEL.traineddata
(for example tesseract-ocr/tessdata_best). Default: ./usr/share/tessdata
MAX_ITERATIONS Max iterations. Default: 10000
EPOCHS Set max iterations based on the number of lines for training. Default: none
DEBUG_INTERVAL Debug Interval. Default: 0
LEARNING_RATE Learning rate. Default: 0.0001 with START_MODEL, otherwise 0.002
NET_SPEC Network specification. Default: [1,36,0,1 Ct3,3,16 Mp3,3 Lfys48 Lfx96 Lrx96 Lfx256 O1c\#\#\#]
FINETUNE_TYPE Finetune Training Type - Impact, Plus, Layer or blank. Default: ''
NET_SPEC Network specification (in VGSL) for new model from scratch. Default: [1,36,0,1 Ct3,3,16 Mp3,3 Lfys48 Lfx96 Lrx96 Lfx256 O1c###]
FINETUNE_TYPE Fine-tune Training Type - Impact, Plus, Layer or blank. Default: ''
LANG_TYPE Language Type - Indic, RTL or blank. Default: ''
PSM Page segmentation mode. Default: 13
RANDOM_SEED Random seed for shuffling of the training data. Default: 0
RATIO_TRAIN Ratio of train / eval training data. Default: 0.90
TARGET_ERROR_RATE Stop training if the character error rate (CER in percent) gets below this value. Default: 0.01
LOG_FILE File to copy training output to and read plot figures from. Default: OUTPUT_DIR/training.log
```

<!-- END-EVAL -->
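As a worked example for `EPOCHS` (assuming it simply multiplies the number of training lines to derive `MAX_ITERATIONS`, as the help text implies): with 1000 ground-truth lines, `EPOCHS=20` would correspond to `MAX_ITERATIONS=20000`.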

### Choose training regime

First, decide what [kind of training](https://tesseract-ocr.github.io/tessdoc/tess5/TrainingTesseract-5.html#introduction)
you want (example invocations for both regimes follow the list below).

* Fine-tuning: select (and install) a `START_MODEL`
* From scratch: specify a `NET_SPEC` (see [documentation](https://tesseract-ocr.github.io/tessdoc/tess4/VGSLSpecs.html))
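For example (model name and paths hypothetical; the `NET_SPEC` shown is the default from the help text above):

    # fine-tuning an existing stock model
    make training MODEL_NAME=foo START_MODEL=deu_latf TESSDATA=~/tessdata_best
    # training from scratch with an explicit network specification
    make training MODEL_NAME=foo NET_SPEC="[1,36,0,1 Ct3,3,16 Mp3,3 Lfys48 Lfx96 Lrx96 Lfx256 O1c###]"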

### Change directory assumptions

To override the default path name requirements, just set the respective variables in the above list:
@@ -168,23 +196,37 @@ It is also possible to create models for selected checkpoints only. Examples:

Add `MODEL_NAME` and `OUTPUT_DIR` and replace `data/foo` with the output directory if needed.

## Plotting CER (experimental)
### Plotting CER

Training and Evaluation CER can be plotted using Matplotlib. A couple of scripts are provided
as a starting point in the `plot` subdirectory for plotting different training scenarios. The training
log is expected to be saved in `plot/TESSTRAIN.LOG`.
Training and Evaluation Character Error Rate (CER) can be plotted using Matplotlib:

As an example, use the training data provided in
[ocrd-testset.zip](./ocrd-testset.zip) to do training and generate the plots.
Plotting can be done while training is running also to depict the training status till then.
```
unzip ocrd-testset.zip -d data/ocrd-ground-truth
nohup make training MODEL_NAME=ocrd START_MODEL=deu_latf TESSDATA=~/tessdata_best MAX_ITERATIONS=10000 > plot/TESSTRAIN.LOG &
```
```
cd ./plot
./plot_cer.sh
```
# Make OUTPUT_DIR/MODEL_FILE.plot_*.png
make plot

All the variables defined above apply, but there is no explicit dependency on `training`.

Still, the target depends on the `LOG_FILE` captured during training (but will not trigger
training itself). Besides analysing the log file, this also directly evaluates the trained models
(for each checkpoint) on the eval dataset. The latter is also available as an independent target
`evaluation`:

# Make OUTPUT_DIR/eval/MODEL_FILE*.*.log
make evaluation

Plotting can even be done while training is still running, and will depict the training status
up to that point. (It can be rerun any time the `LOG_FILE` has changed or new checkpoints written.)
[Review thread on lines +216 to +217]
Collaborator suggested:
-Plotting can even be done while training is still running, and will depict the training status
-up to that point. (It can be rerun any time the `LOG_FILE` has changed or new checkpoints written.)
+Plotting can even be done while training is still running, and will show the training status
+up to that point. (It can be re-run any time the `LOG_FILE` is changed or new checkpoints are written.)
bertsky (author): I like depict better than show here, and has changed better than is changed.
Collaborator: Deepl and I disagree.
bertsky (author): Deepl does not know anything about tesstrain, and I am pretty confident in my assessment of what the English language allows me to express. Besides, IMO it should not be your role as a maintainer to micromanage individual contributions to the letter. This is becoming extremely tedious – I thought this addition was long overdue and would be welcomed.


As an example, use the training data provided in [ocrd-testset.zip](./ocrd-testset.zip) to do some
training and generate the plots:

unzip ocrd-testset.zip -d data/ocrd-ground-truth
make training MODEL_NAME=ocrd START_MODEL=frk TESSDATA=~/tessdata_best MAX_ITERATIONS=10000 &
# Make data/ocrd/ocrd.plot_cer.png and plot_log.png (repeat during/after training)
make plot MODEL_NAME=ocrd

Which should then look like this:

![ocrd.plot_cer.png](./ocrd.plot_cer.png)

## License

Binary file added: ocrd.plot_cer.png