From 9122e6249e3ba00bf16acb63f1b06dec66c55c67 Mon Sep 17 00:00:00 2001
From: "James R. Barlow"
Date: Sat, 24 Nov 2018 00:50:22 -0800
Subject: [PATCH] Autoreformat code

This increases the deviation from the bash scripts, so it is done separately.
---
 src/training/language_specific.py | 843 +++++++++++++++++-------------
 src/training/tesstrain.py         |  29 +-
 src/training/tesstrain_utils.py   | 340 +++++++-----
 3 files changed, 703 insertions(+), 509 deletions(-)

diff --git a/src/training/language_specific.py b/src/training/language_specific.py
index 76803717d2..1fe98aa34a 100644
--- a/src/training/language_specific.py
+++ b/src/training/language_specific.py
@@ -15,29 +15,32 @@
 # tesstrain.sh
 #
-#=============================================================================
+# =============================================================================
 # Language specific info
-#=============================================================================
+# =============================================================================
 
 import os
 import logging
+
 log = logging.getLogger(__name__)
 
 # Array of all valid language codes.
-VALID_LANGUAGE_CODES=("afr amh ara asm aze aze_cyrl bel ben bih bod bos bul cat "
-                      "ceb ces chi_sim chi_tra chr cym cyr_lid dan deu div dzo "
-                      "ell eng enm epo est eus fas fil fin fra frk frm gle glg "
-                      "grc guj hat heb hin hrv hun hye iast iku ind isl ita ita_old "
-                      "jav jav_java jpn kan kat kat_old kaz khm kir kor kur lao lat "
-                      "lat_lid lav lit mal mar mkd mlt msa mya nep nld nor ori "
-                      "pan pol por pus ron rus san sin slk slv snd spa spa_old "
-                      "sqi srp srp_latn swa swe syr tam tel tgk tgl tha tir tur "
-                      "uig ukr urd uzb uzb_cyrl vie yid gle_uncial ")
+VALID_LANGUAGE_CODES = (
+    "afr amh ara asm aze aze_cyrl bel ben bih bod bos bul cat "
+    "ceb ces chi_sim chi_tra chr cym cyr_lid dan deu div dzo "
+    "ell eng enm epo est eus fas fil fin fra frk frm gle glg "
+    "grc guj hat heb hin hrv hun hye iast iku ind isl ita ita_old "
+    "jav jav_java jpn kan kat kat_old kaz khm kir kor kur lao lat "
+    "lat_lid lav lit mal mar mkd mlt msa mya nep nld nor ori "
+    "pan pol por pus ron rus san sin slk slv snd spa spa_old "
+    "sqi srp srp_latn swa swe syr tam tel tgk tgl tha tir tur "
+    "uig ukr urd uzb uzb_cyrl vie yid gle_uncial "
+)
 
 # Codes for which we have webtext but no fonts:
-UNUSABLE_LANGUAGE_CODES=""
+UNUSABLE_LANGUAGE_CODES = ""
 
-FRAKTUR_FONTS=[
+FRAKTUR_FONTS = [
     "CaslonishFraxx Medium",
     "Cloister Black, Light",
     "Proclamate Light",
@@ -46,7 +49,7 @@
 ]
 
 # List of fonts to train on
-LATIN_FONTS=[
+LATIN_FONTS = [
     "Arial Bold",
     "Arial Bold Italic",
     "Arial Italic",
@@ -82,7 +85,7 @@
 ]
 
 # List of fonts for printed/neo-Latin ('lat' language code, different from Latin script)
-NEOLATIN_FONTS=[
+NEOLATIN_FONTS = [
     "GFS Bodoni",
     "GFS Bodoni Bold",
     "GFS Bodoni Italic",
@@ -114,33 +117,33 @@
     "IM FELL Great Primer PRO Italic",
 ]
 
-IRISH_UNCIAL_FONTS=[
-  "Bunchlo Arsa Dubh GC",
-  "Bunchlo Arsa GC",
-  "Bunchlo Arsa GC Bold",
-  "Bunchlo Dubh GC",
-  "Bunchlo GC",
-  "Bunchlo GC Bold",
-  "Bunchlo Nua GC Bold",
-  "Bunchló na Nod GC",
-  "Gadelica",
-  "Glanchlo Dubh GC",
-  "Glanchlo GC",
-  "Glanchlo GC Bold",
-  "Seanchló Dubh GC",
-  "Seanchló GC",
-  "Seanchló GC Bold",
-  "Seanchló na Nod GC",
-  "Seanchló Ársa Dubh GC",
-  "Seanchló Ársa GC",
-  "Seanchló Ársa GC Bold",
-  "Tromchlo Beag GC",
-  "Tromchlo Mor GC",
-  "Urchlo GC",
-  "Urchlo GC Bold",
+IRISH_UNCIAL_FONTS = [
+    "Bunchlo Arsa Dubh GC",
+    "Bunchlo Arsa GC",
+    "Bunchlo Arsa GC Bold",
+    "Bunchlo Dubh GC",
+    "Bunchlo GC",
+    "Bunchlo GC Bold",
+    "Bunchlo Nua GC Bold",
+    "Bunchló na Nod GC",
+    "Gadelica",
+    "Glanchlo Dubh GC",
+    "Glanchlo GC",
+    "Glanchlo GC Bold",
+    "Seanchló Dubh GC",
+    "Seanchló GC",
+    "Seanchló GC Bold",
+    "Seanchló na Nod GC",
+    "Seanchló Ársa Dubh GC",
+    "Seanchló Ársa GC",
+    "Seanchló Ársa GC Bold",
+    "Tromchlo Beag GC",
+    "Tromchlo Mor GC",
+    "Urchlo GC",
+    "Urchlo GC Bold",
 ]
 
-EARLY_LATIN_FONTS=[
+EARLY_LATIN_FONTS = [
     *FRAKTUR_FONTS,
     *LATIN_FONTS,
     # The Wyld font family renders early modern ligatures encoded in the private
@@ -151,7 +154,7 @@
     "GentiumAlt",
 ]
 
-VIETNAMESE_FONTS=[
+VIETNAMESE_FONTS = [
     "Arial Unicode MS Bold",
     "Arial Bold Italic",
     "Arial Italic",
@@ -186,7 +189,7 @@
     "VL PGothic",
 ]
 
-DEVANAGARI_FONTS=[
+DEVANAGARI_FONTS = [
     "FreeSans",
     "Chandas",
     "Kalimati",
@@ -208,7 +211,7 @@
     "Santipur OT Medium",
 ]
 
-KANNADA_FONTS=[
+KANNADA_FONTS = [
     "Kedage Bold",
     "Kedage Italic",
     "Kedage",
@@ -228,7 +231,7 @@
     "Tunga Bold",
 ]
 
-TELUGU_FONTS=[
+TELUGU_FONTS = [
     "Pothana2000",
     "Vemana2000",
     "Lohit Telugu",
@@ -256,7 +259,7 @@
     "Gautami",
 ]
 
-TAMIL_FONTS=[
+TAMIL_FONTS = [
     "TAMu_Kadambri",
     "TAMu_Kalyani",
     "TAMu_Maduram",
@@ -282,7 +285,7 @@
     "Lohit Tamil Classical",
 ]
 
-THAI_FONTS=[
+THAI_FONTS = [
     "FreeSerif",
     "FreeSerif Italic",
     "Garuda",
@@ -315,7 +318,7 @@
     "Tahoma",
 ]
 
-KOREAN_FONTS=[
+KOREAN_FONTS = [
     "Arial Unicode MS",
     "Arial Unicode MS Bold",
     "Baekmuk Batang Patched",
@@ -325,7 +328,7 @@
     "Baekmuk Headline",
 ]
 
-CHI_SIM_FONTS=[
+CHI_SIM_FONTS = [
     "AR PL UKai CN",
     "AR PL UMing Patched Light",
     "Arial Unicode MS",
@@ -333,7 +336,7 @@
     "WenQuanYi Zen Hei Medium",
 ]
 
-CHI_TRA_FONTS=[
+CHI_TRA_FONTS = [
     "AR PL UKai TW",
     "AR PL UMing TW MBE Light",
     "AR PL UKai Patched",
@@ -343,7 +346,7 @@
     "WenQuanYi Zen Hei Medium",
 ]
 
-JPN_FONTS=[
+JPN_FONTS = [
     "TakaoExGothic",
     "TakaoExMincho",
     "TakaoGothic",
@@ -356,7 +359,7 @@
     "Noto Sans Japanese Light",
 ]
 
-RUSSIAN_FONTS=[
+RUSSIAN_FONTS = [
     "Arial Bold",
     "Arial Bold Italic",
     "Arial Italic",
@@ -391,7 +394,7 @@
     "DejaVu Sans Ultra-Light",
 ]
 
-GREEK_FONTS=[
+GREEK_FONTS = [
     "Arial Unicode MS",
     "Arial Unicode MS Bold",
     "DejaVu Sans Mono",
@@ -424,7 +427,7 @@
     "VL PGothic",
 ]
 
-ANCIENT_GREEK_FONTS=[
+ANCIENT_GREEK_FONTS = [
     "GFS Artemisia",
     "GFS Artemisia Bold",
     "GFS Artemisia Bold Italic",
@@ -448,7 +451,7 @@
     "GFS Solomos",
 ]
 
-ARABIC_FONTS=[
+ARABIC_FONTS = [
     "Arabic Transparent Bold",
     "Arabic Transparent",
     "Arab",
@@ -483,7 +486,7 @@
     "Traditional Arabic",
 ]
 
-HEBREW_FONTS=[
+HEBREW_FONTS = [
     "Arial Bold",
     "Arial Bold Italic",
     "Arial Italic",
@@ -512,7 +515,7 @@
     "Tahoma",
 ]
 
-BENGALI_FONTS=[
+BENGALI_FONTS = [
     "Bangla Medium",
     "Lohit Bengali",
     "Mukti Narrow",
@@ -533,7 +536,7 @@
     "Mitra Mono",
 ]
 
-KYRGYZ_FONTS=[
+KYRGYZ_FONTS = [
     "Arial",
     "Arial Bold",
     "Arial Italic",
@@ -555,7 +558,7 @@
     "FreeSerif Bold Italic",
 ]
 
-PERSIAN_FONTS=[
+PERSIAN_FONTS = [
     "Amiri Bold Italic",
     "Amiri Bold",
     "Amiri Italic",
@@ -581,16 +584,15 @@
     "Yakout Linotype",
 ]
 
-AMHARIC_FONTS=[
-    "Abyssinica SIL"
-    "Droid Sans Ethiopic Bold",
+AMHARIC_FONTS = [
+    "Abyssinica SIL",
+    "Droid Sans Ethiopic Bold",
     "Droid Sans Ethiopic",
     "FreeSerif",
     "Noto Sans Ethiopic Bold",
     "Noto Sans Ethiopic",
 ]
 
-ARMENIAN_FONTS=[
+ARMENIAN_FONTS = [
     "Arial Unicode MS",
     "Arial Unicode MS Bold",
     "Ascender Uni",
@@ -601,7 +603,7 @@
     "FreeSans Oblique",
 ]
 
-BURMESE_FONTS=[
+BURMESE_FONTS = [
     "Myanmar Sans Pro",
     "Noto Sans Myanmar Bold",
     "Noto Sans Myanmar",
@@ -610,11 +612,9 @@
     "TharLon",
 ]
 
-JAVANESE_FONTS=[
-    "Prada",
-]
+JAVANESE_FONTS = ["Prada"]
 
-NORTH_AMERICAN_ABORIGINAL_FONTS=[
+NORTH_AMERICAN_ABORIGINAL_FONTS = [
     "Aboriginal Sans",
"Aboriginal Sans Bold Italic", "Aboriginal Sans Italic", @@ -625,7 +625,7 @@ "Aboriginal Serif", ] -GEORGIAN_FONTS=[ +GEORGIAN_FONTS = [ "Arial Unicode MS Bold", "Arial Unicode MS", "BPG Algeti GPL\&GNU", @@ -660,7 +660,7 @@ "FreeSerif Italic", ] -OLD_GEORGIAN_FONTS=[ +OLD_GEORGIAN_FONTS = [ "Arial Unicode MS Bold", "Arial Unicode MS", "BPG Algeti GPL\&GNU", @@ -690,7 +690,7 @@ "FreeSerif Italic", ] -KHMER_FONTS=[ +KHMER_FONTS = [ "Khmer OS", "Khmer OS System", "Khmer OS Battambang", @@ -709,7 +709,7 @@ "Noto Serif Khmer Light", ] -KURDISH_FONTS=[ +KURDISH_FONTS = [ "Amiri Bold Italic", "Amiri Bold", "Amiri Italic", @@ -735,7 +735,7 @@ "Yakout Linotype", ] -LAOTHIAN_FONTS=[ +LAOTHIAN_FONTS = [ "Phetsarath OT", "Arial Unicode MS", "Arial Unicode MS Bold", @@ -756,7 +756,7 @@ "Souliyo Unicode", ] -GUJARATI_FONTS=[ +GUJARATI_FONTS = [ "Lohit Gujarati", "Rekha Medium", "Samyak Gujarati Medium", @@ -773,7 +773,7 @@ "Shruti Bold", ] -MALAYALAM_FONTS=[ +MALAYALAM_FONTS = [ "AnjaliOldLipi", "Arial Unicode MS", "Arial Unicode MS Bold", @@ -793,7 +793,7 @@ "suruma", ] -ORIYA_FONTS=[ +ORIYA_FONTS = [ "Arial Unicode MS", "Arial Unicode MS Bold", "Ascender Uni", @@ -802,7 +802,7 @@ "Lohit Oriya", ] -PUNJABI_FONTS=[ +PUNJABI_FONTS = [ "Arial Unicode MS", "Arial Unicode MS Bold", "Ascender Uni", @@ -815,7 +815,7 @@ "FreeSerif", ] -SINHALA_FONTS=[ +SINHALA_FONTS = [ "Noto Sans Sinhala Bold", "Noto Sans Sinhala", "OCRUnicode", @@ -824,7 +824,7 @@ "FreeSerif", ] -SYRIAC_FONTS=[ +SYRIAC_FONTS = [ "East Syriac Adiabene", "East Syriac Ctesiphon", "Estrangelo Antioch", @@ -848,11 +848,9 @@ "FreeSans", ] -THAANA_FONTS=[ - "FreeSerif", -] +THAANA_FONTS = ["FreeSerif"] -TIBETAN_FONTS=[ +TIBETAN_FONTS = [ "Arial Unicode MS", "Arial Unicode MS Bold", "Ascender Uni", @@ -866,7 +864,7 @@ ] # The following fonts will be rendered vertically in phase I. -VERTICAL_FONTS=[ +VERTICAL_FONTS = [ "TakaoExGothic", "TakaoExMincho", "AR PL UKai Patched", @@ -874,7 +872,7 @@ "Baekmuk Batang Patched", ] -FLAGS_webtext_prefix=os.environ.get('FLAGS_webtext_prefix', '') +FLAGS_webtext_prefix = os.environ.get("FLAGS_webtext_prefix", "") # Set language-specific values for several global variables, including # ${TEXT_CORPUS} @@ -893,386 +891,478 @@ # TEXT_CORPUS, etc. separately. def set_lang_specific_parameters(ctx, lang): # The default text location is now given directly from the language code. - TEXT_CORPUS=f"{FLAGS_webtext_prefix}/{lang}.corpus.txt" - FILTER_ARGUMENTS=[] - WORDLIST2DAWG_ARGUMENTS="" + TEXT_CORPUS = f"{FLAGS_webtext_prefix}/{lang}.corpus.txt" + FILTER_ARGUMENTS = [] + WORDLIST2DAWG_ARGUMENTS = "" # These dawg factors represent the fraction of the corpus not covered by the # dawg, and seem like reasonable defaults, but the optimal value is likely # to be highly corpus-dependent, as well as somewhat language-dependent. # Number dawg factor is the fraction of all numeric strings that are not # covered, which is why it is higher relative to the others. - PUNC_DAWG_FACTOR=None - NUMBER_DAWG_FACTOR=0.125 - WORD_DAWG_FACTOR=0.05 - BIGRAM_DAWG_FACTOR=0.015 - TRAINING_DATA_ARGUMENTS=[] - FRAGMENTS_DISABLED="y" - RUN_SHAPE_CLUSTERING=False - AMBIGS_FILTER_DENOMINATOR="100000" - LEADING=32 - MEAN_COUNT=40 # Default for latin script. + PUNC_DAWG_FACTOR = None + NUMBER_DAWG_FACTOR = 0.125 + WORD_DAWG_FACTOR = 0.05 + BIGRAM_DAWG_FACTOR = 0.015 + TRAINING_DATA_ARGUMENTS = [] + FRAGMENTS_DISABLED = "y" + RUN_SHAPE_CLUSTERING = False + AMBIGS_FILTER_DENOMINATOR = "100000" + LEADING = 32 + MEAN_COUNT = 40 # Default for latin script. 
     # Language to mix with the language for maximum accuracy. Defaults to eng.
     # If no language is good, set to the base language.
-    MIX_LANG="eng"
-    FONTS=ctx.fonts
-    TEXT2IMAGE_EXTRA_ARGS=[]
-    EXPOSURES=[]
-
+    MIX_LANG = "eng"
+    FONTS = ctx.fonts
+    TEXT2IMAGE_EXTRA_ARGS = []
+    EXPOSURES = []
 
     # Latin languages.
-    if lang == 'enm':
-        TEXT2IMAGE_EXTRA_ARGS+=["--ligatures"]  # Add ligatures when supported
-        if not FONTS: FONTS = EARLY_LATIN_FONTS
-    elif lang == 'frm':
-        TEXT_CORPUS=f"{FLAGS_webtext_prefix}/fra.corpus.txt"
+    if lang == "enm":
+        TEXT2IMAGE_EXTRA_ARGS += ["--ligatures"]  # Add ligatures when supported
+        if not FONTS:
+            FONTS = EARLY_LATIN_FONTS
+    elif lang == "frm":
+        TEXT_CORPUS = f"{FLAGS_webtext_prefix}/fra.corpus.txt"
         # Make long-s substitutions for Middle French text
-        FILTER_ARGUMENTS+=["--make_early_language_variant=fra"]
-        TEXT2IMAGE_EXTRA_ARGS+=["--ligatures"]  # Add ligatures when supported.
-        if not FONTS: FONTS = EARLY_LATIN_FONTS
-    elif lang == 'frk':
-        TEXT_CORPUS=f"{FLAGS_webtext_prefix}/deu.corpus.txt"
-        if not FONTS: FONTS = FRAKTUR_FONTS
-    elif lang == 'ita_old':
-        TEXT_CORPUS=f"{FLAGS_webtext_prefix}/ita.corpus.txt"
+        FILTER_ARGUMENTS += ["--make_early_language_variant=fra"]
+        TEXT2IMAGE_EXTRA_ARGS += ["--ligatures"]  # Add ligatures when supported.
+        if not FONTS:
+            FONTS = EARLY_LATIN_FONTS
+    elif lang == "frk":
+        TEXT_CORPUS = f"{FLAGS_webtext_prefix}/deu.corpus.txt"
+        if not FONTS:
+            FONTS = FRAKTUR_FONTS
+    elif lang == "ita_old":
+        TEXT_CORPUS = f"{FLAGS_webtext_prefix}/ita.corpus.txt"
         # Make long-s substitutions for Early Italian text
-        FILTER_ARGUMENTS+=["--make_early_language_variant=ita"]
-        TEXT2IMAGE_EXTRA_ARGS+=["--ligatures"]  # Add ligatures when supported.
-        if not FONTS: FONTS = EARLY_LATIN_FONTS
-    elif lang == 'lat':
-        if not EXPOSURES: EXPOSURES="-3 -2 -1 0 1 2 3".split()
-        if not FONTS: FONTS = NEOLATIN_FONTS
-    elif lang == 'spa_old':
-        TEXT_CORPUS=f"{FLAGS_webtext_prefix}/spa.corpus.txt"
+        FILTER_ARGUMENTS += ["--make_early_language_variant=ita"]
+        TEXT2IMAGE_EXTRA_ARGS += ["--ligatures"]  # Add ligatures when supported.
+        if not FONTS:
+            FONTS = EARLY_LATIN_FONTS
+    elif lang == "lat":
+        if not EXPOSURES:
+            EXPOSURES = "-3 -2 -1 0 1 2 3".split()
+        if not FONTS:
+            FONTS = NEOLATIN_FONTS
+    elif lang == "spa_old":
+        TEXT_CORPUS = f"{FLAGS_webtext_prefix}/spa.corpus.txt"
         # Make long-s substitutions for Early Spanish text
-        FILTER_ARGUMENTS+=["--make_early_language_variant=spa"]
-        TEXT2IMAGE_EXTRA_ARGS+=["--ligatures"]  # Add ligatures when supported.
-        if not FONTS: FONTS = EARLY_LATIN_FONTS
-    elif lang == 'srp_latn':
-        TEXT_CORPUS=f"{FLAGS_webtext_prefix}/srp.corpus.txt"
-    elif lang == 'vie':
-        TRAINING_DATA_ARGUMENTS+=["--infrequent_ratio=10000"]
-        if not FONTS: FONTS = VIETNAMESE_FONTS
+        FILTER_ARGUMENTS += ["--make_early_language_variant=spa"]
+        TEXT2IMAGE_EXTRA_ARGS += ["--ligatures"]  # Add ligatures when supported.
+        if not FONTS:
+            FONTS = EARLY_LATIN_FONTS
+    elif lang == "srp_latn":
+        TEXT_CORPUS = f"{FLAGS_webtext_prefix}/srp.corpus.txt"
+    elif lang == "vie":
+        TRAINING_DATA_ARGUMENTS += ["--infrequent_ratio=10000"]
+        if not FONTS:
+            FONTS = VIETNAMESE_FONTS
     # Highly inflective languages get a bigger dawg size.
     # TODO(rays) Add more here!
-    elif lang == 'hun':
-        WORD_DAWG_SIZE=1000000
-    elif lang == 'pol':
-        WORD_DAWG_SIZE=1000000
+    elif lang == "hun":
+        WORD_DAWG_SIZE = 1_000_000
+    elif lang == "pol":
+        WORD_DAWG_SIZE = 1_000_000
 
     # Latin with default treatment.
-    elif lang == 'afr':
+    elif lang == "afr":
         pass
-    elif lang == 'aze':
+    elif lang == "aze":
         pass
-    elif lang == 'bos':
+    elif lang == "bos":
         pass
-    elif lang == 'cat':
+    elif lang == "cat":
         pass
-    elif lang == 'ceb':
+    elif lang == "ceb":
         pass
-    elif lang == 'ces':
-        PUNC_DAWG_FACTOR=0.004
-    elif lang == 'cym':
+    elif lang == "ces":
+        PUNC_DAWG_FACTOR = 0.004
+    elif lang == "cym":
         pass
-    elif lang == 'dan':
+    elif lang == "dan":
         pass
-    elif lang == 'deu':
-        WORD_DAWG_FACTOR=0.125
-    elif lang == 'eng':
-        WORD_DAWG_FACTOR=0.03
-    elif lang == 'epo':
+    elif lang == "deu":
+        WORD_DAWG_FACTOR = 0.125
+    elif lang == "eng":
+        WORD_DAWG_FACTOR = 0.03
+    elif lang == "epo":
         pass
-    elif lang == 'est':
+    elif lang == "est":
         pass
-    elif lang == 'eus':
+    elif lang == "eus":
         pass
-    elif lang == 'fil':
+    elif lang == "fil":
         pass
-    elif lang == 'fin':
+    elif lang == "fin":
         pass
-    elif lang == 'fra':
-        WORD_DAWG_FACTOR=0.08
-    elif lang == 'gle':
+    elif lang == "fra":
+        WORD_DAWG_FACTOR = 0.08
+    elif lang == "gle":
         pass
-    elif lang == 'gle_uncial':
-        if not FONTS: FONTS = IRISH_UNCIAL_FONTS
-    elif lang == 'glg':
+    elif lang == "gle_uncial":
+        if not FONTS:
+            FONTS = IRISH_UNCIAL_FONTS
+    elif lang == "glg":
         pass
-    elif lang == 'hat':
+    elif lang == "hat":
         pass
-    elif lang == 'hrv':
+    elif lang == "hrv":
         pass
-    elif lang == 'iast':
+    elif lang == "iast":
         pass
-    elif lang == 'ind':
+    elif lang == "ind":
         pass
-    elif lang == 'isl':
+    elif lang == "isl":
         pass
-    elif lang == 'ita':
+    elif lang == "ita":
         pass
-    elif lang == 'jav':
+    elif lang == "jav":
         pass
-    elif lang == 'lav':
+    elif lang == "lav":
         pass
-    elif lang == 'lit':
+    elif lang == "lit":
         pass
-    elif lang == 'mlt':
+    elif lang == "mlt":
         pass
-    elif lang == 'msa':
+    elif lang == "msa":
         pass
-    elif lang == 'nld':
-        WORD_DAWG_FACTOR=0.02
-    elif lang == 'nor':
+    elif lang == "nld":
+        WORD_DAWG_FACTOR = 0.02
+    elif lang == "nor":
         pass
-    elif lang == 'por':
+    elif lang == "por":
         pass
-    elif lang == 'ron':
+    elif lang == "ron":
         pass
-    elif lang == 'slk':
+    elif lang == "slk":
         pass
-    elif lang == 'slv':
+    elif lang == "slv":
         pass
-    elif lang == 'spa':
+    elif lang == "spa":
         pass
-    elif lang == 'sqi':
+    elif lang == "sqi":
         pass
-    elif lang == 'swa':
+    elif lang == "swa":
         pass
-    elif lang == 'swe':
+    elif lang == "swe":
         pass
-    elif lang == 'tgl':
+    elif lang == "tgl":
         pass
-    elif lang == 'tur':
+    elif lang == "tur":
         pass
-    elif lang == 'uzb':
+    elif lang == "uzb":
         pass
-    elif lang == 'zlm':
+    elif lang == "zlm":
         pass
     # Special code for performing language-id that is trained on
     # EFIGS+Latin+Vietnamese text with regular + fraktur fonts.
-    elif lang == 'lat_lid':
-        TEXT_CORPUS=f'{FLAGS_webtext_prefix}/lat_lid.corpus.txt'
-        TRAINING_DATA_ARGUMENTS+=["--infrequent_ratio=10000"]
-        GENERATE_WORD_BIGRAMS=0
+    elif lang == "lat_lid":
+        TEXT_CORPUS = f"{FLAGS_webtext_prefix}/lat_lid.corpus.txt"
+        TRAINING_DATA_ARGUMENTS += ["--infrequent_ratio=10000"]
+        GENERATE_WORD_BIGRAMS = 0
         # Strip unrenderable words as not all fonts will render the extended
         # latin symbols found in Vietnamese text.
-        WORD_DAWG_SIZE=1000000
-        if not FONTS: FONTS = EARLY_LATIN_FONTS
+        WORD_DAWG_SIZE = 1_000_000
+        if not FONTS:
+            FONTS = EARLY_LATIN_FONTS
 
     # Cyrillic script-based languages. It is bad to mix Latin with Cyrillic.
-    elif lang == 'rus':
-        if not FONTS: FONTS = RUSSIAN_FONTS
-        MIX_LANG="rus"
-        NUMBER_DAWG_FACTOR=0.05
-        WORD_DAWG_SIZE=1000000
-    elif lang in ('aze_cyrl','bel','bul','kaz','mkd','srp','tgk','ukr','uzb_cyrl' ):
-        MIX_LANG=f"{lang}"
-        if not FONTS: FONTS = RUSSIAN_FONTS
+    elif lang == "rus":
+        if not FONTS:
+            FONTS = RUSSIAN_FONTS
+        MIX_LANG = "rus"
+        NUMBER_DAWG_FACTOR = 0.05
+        WORD_DAWG_SIZE = 1_000_000
+    elif lang in (
+        "aze_cyrl",
+        "bel",
+        "bul",
+        "kaz",
+        "mkd",
+        "srp",
+        "tgk",
+        "ukr",
+        "uzb_cyrl",
+    ):
+        MIX_LANG = f"{lang}"
+        if not FONTS:
+            FONTS = RUSSIAN_FONTS
     # Special code for performing Cyrillic language-id that is trained on
     # Russian, Serbian, Ukrainian, Belarusian, Macedonian, Tajik and Mongolian
     # text with the list of Russian fonts.
-    elif lang == 'cyr_lid':
-        TEXT_CORPUS=f'{FLAGS_webtext_prefix}/cyr_lid.corpus.txt'
-        TRAINING_DATA_ARGUMENTS+=["--infrequent_ratio=10000"]
-        GENERATE_WORD_BIGRAMS=0
-        WORD_DAWG_SIZE=1000000
-        if not FONTS: FONTS = RUSSIAN_FONTS
+    elif lang == "cyr_lid":
+        TEXT_CORPUS = f"{FLAGS_webtext_prefix}/cyr_lid.corpus.txt"
+        TRAINING_DATA_ARGUMENTS += ["--infrequent_ratio=10000"]
+        GENERATE_WORD_BIGRAMS = 0
+        WORD_DAWG_SIZE = 1_000_000
+        if not FONTS:
+            FONTS = RUSSIAN_FONTS
 
     # South Asian scripts mostly have a lot of different graphemes, so trim
     # down the MEAN_COUNT so as not to get a huge amount of text.
-    elif lang in ('asm','ben' ):
-        MEAN_COUNT=15
-        WORD_DAWG_FACTOR=0.15
-        if not FONTS: FONTS = BENGALI_FONTS
-    elif lang in ( 'bih','hin','mar','nep','san' ):
-        MEAN_COUNT=15
-        WORD_DAWG_FACTOR=0.15
-        if not FONTS: FONTS = DEVANAGARI_FONTS
-    elif lang == 'bod':
-        MEAN_COUNT=15
-        WORD_DAWG_FACTOR=0.15
-        if not FONTS: FONTS = TIBETAN_FONTS
-    elif lang == 'dzo':
-        WORD_DAWG_FACTOR=0.01
-        if not FONTS: FONTS = TIBETAN_FONTS
-    elif lang == 'guj':
-        MEAN_COUNT=15
-        WORD_DAWG_FACTOR=0.15
-        if not FONTS: FONTS = GUJARATI_FONTS
-    elif lang == 'kan':
-        MEAN_COUNT=15
-        WORD_DAWG_FACTOR=0.15
-        TRAINING_DATA_ARGUMENTS+=["--no_newline_in_output"]
-        TEXT2IMAGE_EXTRA_ARGS+=["--char_spacing=0.5"]
-        if not FONTS: FONTS = KANNADA_FONTS
-    elif lang == 'mal':
-        MEAN_COUNT=15
-        WORD_DAWG_FACTOR=0.15
-        TRAINING_DATA_ARGUMENTS+=["--no_newline_in_output"]
-        TEXT2IMAGE_EXTRA_ARGS+=["--char_spacing=0.5"]
-        if not FONTS: FONTS = MALAYALAM_FONTS
-    elif lang == 'ori':
-        WORD_DAWG_FACTOR=0.01
-        if not FONTS: FONTS = ORIYA_FONTS
-    elif lang == 'pan':
-        MEAN_COUNT=15
-        WORD_DAWG_FACTOR=0.01
-        if not FONTS: FONTS = PUNJABI_FONTS
-    elif lang == 'sin':
-        MEAN_COUNT=15
-        WORD_DAWG_FACTOR=0.01
-        if not FONTS: FONTS = SINHALA_FONTS
-    elif lang == 'tam':
-        MEAN_COUNT=30
-        WORD_DAWG_FACTOR=0.15
-        TRAINING_DATA_ARGUMENTS+=["--no_newline_in_output"]
-        TEXT2IMAGE_EXTRA_ARGS+=["--char_spacing=0.5"]
-        if not FONTS: FONTS = TAMIL_FONTS
-    elif lang == 'tel':
-        MEAN_COUNT=15
-        WORD_DAWG_FACTOR=0.15
-        TRAINING_DATA_ARGUMENTS+=["--no_newline_in_output"]
-        TEXT2IMAGE_EXTRA_ARGS+=["--char_spacing=0.5"]
-        if not FONTS: FONTS = TELUGU_FONTS
+    elif lang in ("asm", "ben"):
+        MEAN_COUNT = 15
+        WORD_DAWG_FACTOR = 0.15
+        if not FONTS:
+            FONTS = BENGALI_FONTS
+    elif lang in ("bih", "hin", "mar", "nep", "san"):
+        MEAN_COUNT = 15
+        WORD_DAWG_FACTOR = 0.15
+        if not FONTS:
+            FONTS = DEVANAGARI_FONTS
+    elif lang == "bod":
+        MEAN_COUNT = 15
+        WORD_DAWG_FACTOR = 0.15
+        if not FONTS:
+            FONTS = TIBETAN_FONTS
+    elif lang == "dzo":
+        WORD_DAWG_FACTOR = 0.01
+        if not FONTS:
+            FONTS = TIBETAN_FONTS
+    elif lang == "guj":
+        MEAN_COUNT = 15
+        WORD_DAWG_FACTOR = 0.15
+        if not FONTS:
+            FONTS = GUJARATI_FONTS
+    elif lang == "kan":
+        MEAN_COUNT = 15
+        WORD_DAWG_FACTOR = 0.15
+        TRAINING_DATA_ARGUMENTS += ["--no_newline_in_output"]
+        TEXT2IMAGE_EXTRA_ARGS += ["--char_spacing=0.5"]
+        if not FONTS:
+            FONTS = KANNADA_FONTS
+    elif lang == "mal":
+        MEAN_COUNT = 15
+        WORD_DAWG_FACTOR = 0.15
+        TRAINING_DATA_ARGUMENTS += ["--no_newline_in_output"]
+        TEXT2IMAGE_EXTRA_ARGS += ["--char_spacing=0.5"]
+        if not FONTS:
+            FONTS = MALAYALAM_FONTS
+    elif lang == "ori":
+        WORD_DAWG_FACTOR = 0.01
+        if not FONTS:
+            FONTS = ORIYA_FONTS
+    elif lang == "pan":
+        MEAN_COUNT = 15
+        WORD_DAWG_FACTOR = 0.01
+        if not FONTS:
+            FONTS = PUNJABI_FONTS
+    elif lang == "sin":
+        MEAN_COUNT = 15
+        WORD_DAWG_FACTOR = 0.01
+        if not FONTS:
+            FONTS = SINHALA_FONTS
+    elif lang == "tam":
+        MEAN_COUNT = 30
+        WORD_DAWG_FACTOR = 0.15
+        TRAINING_DATA_ARGUMENTS += ["--no_newline_in_output"]
+        TEXT2IMAGE_EXTRA_ARGS += ["--char_spacing=0.5"]
+        if not FONTS:
+            FONTS = TAMIL_FONTS
+    elif lang == "tel":
+        MEAN_COUNT = 15
+        WORD_DAWG_FACTOR = 0.15
+        TRAINING_DATA_ARGUMENTS += ["--no_newline_in_output"]
+        TEXT2IMAGE_EXTRA_ARGS += ["--char_spacing=0.5"]
+        if not FONTS:
+            FONTS = TELUGU_FONTS
 
     # SouthEast Asian scripts.
-    elif lang == 'jav_java':
-        MEAN_COUNT=15
-        WORD_DAWG_FACTOR=0.15
-        TRAINING_DATA_ARGUMENTS+=["--infrequent_ratio=10000"]
-        if not FONTS: FONTS = JAVANESE_FONTS
-    elif lang == 'khm':
-        MEAN_COUNT=15
-        WORD_DAWG_FACTOR=0.15
-        TRAINING_DATA_ARGUMENTS+=["--infrequent_ratio=10000"]
-        if not FONTS: FONTS = KHMER_FONTS
-    elif lang == 'lao':
-        MEAN_COUNT=15
-        WORD_DAWG_FACTOR=0.15
-        TRAINING_DATA_ARGUMENTS+=["--infrequent_ratio=10000"]
-        if not FONTS: FONTS = LAOTHIAN_FONTS
-    elif lang == 'mya':
-        MEAN_COUNT=12
-        WORD_DAWG_FACTOR=0.15
-        TRAINING_DATA_ARGUMENTS+=["--infrequent_ratio=10000"]
-        if not FONTS: FONTS = BURMESE_FONTS
-    elif lang == 'tha':
-        MEAN_COUNT=30
-        WORD_DAWG_FACTOR=0.01
-        TRAINING_DATA_ARGUMENTS+=["--infrequent_ratio=10000"]
-        FILTER_ARGUMENTS+=["--segmenter_lang=tha"]
-        TRAINING_DATA_ARGUMENTS+=["--no_space_in_output", "--desired_bigrams="]
-        AMBIGS_FILTER_DENOMINATOR="1000"
-        LEADING=48
-        if not FONTS: FONTS = THAI_FONTS
+    elif lang == "jav_java":
+        MEAN_COUNT = 15
+        WORD_DAWG_FACTOR = 0.15
+        TRAINING_DATA_ARGUMENTS += ["--infrequent_ratio=10000"]
+        if not FONTS:
+            FONTS = JAVANESE_FONTS
+    elif lang == "khm":
+        MEAN_COUNT = 15
+        WORD_DAWG_FACTOR = 0.15
+        TRAINING_DATA_ARGUMENTS += ["--infrequent_ratio=10000"]
+        if not FONTS:
+            FONTS = KHMER_FONTS
+    elif lang == "lao":
+        MEAN_COUNT = 15
+        WORD_DAWG_FACTOR = 0.15
+        TRAINING_DATA_ARGUMENTS += ["--infrequent_ratio=10000"]
+        if not FONTS:
+            FONTS = LAOTHIAN_FONTS
+    elif lang == "mya":
+        MEAN_COUNT = 12
+        WORD_DAWG_FACTOR = 0.15
+        TRAINING_DATA_ARGUMENTS += ["--infrequent_ratio=10000"]
+        if not FONTS:
+            FONTS = BURMESE_FONTS
+    elif lang == "tha":
+        MEAN_COUNT = 30
+        WORD_DAWG_FACTOR = 0.01
+        TRAINING_DATA_ARGUMENTS += ["--infrequent_ratio=10000"]
+        FILTER_ARGUMENTS += ["--segmenter_lang=tha"]
+        TRAINING_DATA_ARGUMENTS += ["--no_space_in_output", "--desired_bigrams="]
+        AMBIGS_FILTER_DENOMINATOR = "1000"
+        LEADING = 48
+        if not FONTS:
+            FONTS = THAI_FONTS
 
     # CJK
-    elif lang == 'chi_sim':
-        MEAN_COUNT=15
-        PUNC_DAWG_FACTOR=0.015
-        WORD_DAWG_FACTOR=0.015
-        GENERATE_WORD_BIGRAMS=0
-        TRAINING_DATA_ARGUMENTS+=["--infrequent_ratio=10000"]
-        TRAINING_DATA_ARGUMENTS+=["--no_space_in_output", "--desired_bigrams="]
-        FILTER_ARGUMENTS+=["--charset_filter=chi_sim", "--segmenter_lang=chi_sim"]
-        if not FONTS: FONTS = CHI_SIM_FONTS
-    elif lang == 'chi_tra':
-        MEAN_COUNT=15
-        WORD_DAWG_FACTOR=0.015
-        GENERATE_WORD_BIGRAMS=0
-        TRAINING_DATA_ARGUMENTS+=["--infrequent_ratio=10000"]
-        TRAINING_DATA_ARGUMENTS+=["--no_space_in_output", "--desired_bigrams="]
-        FILTER_ARGUMENTS+=["--charset_filter=chi_tr", "--segmenter_lang=chi_tra"]
-        if not FONTS: FONTS = CHI_TRA_FONTS
-    elif lang == 'jpn':
-        MEAN_COUNT=15
-        WORD_DAWG_FACTOR=0.015
-        GENERATE_WORD_BIGRAMS=0
-        TRAINING_DATA_ARGUMENTS+=["--infrequent_ratio=10000"]
-        TRAINING_DATA_ARGUMENTS+=["--no_space_in_output", "--desired_bigrams="]
-        FILTER_ARGUMENTS+=["--charset_filter=jpn", "--segmenter_lang=jpn"]
-        if not FONTS: FONTS = JPN_FONTS
-    elif lang == 'kor':
-        MEAN_COUNT=20
-        WORD_DAWG_FACTOR=0.015
-        NUMBER_DAWG_FACTOR=0.05
-        TRAINING_DATA_ARGUMENTS+=["--infrequent_ratio=10000"]
-        TRAINING_DATA_ARGUMENTS+=["--desired_bigrams="]
-        GENERATE_WORD_BIGRAMS=0
-        FILTER_ARGUMENTS+=["--charset_filter=kor","--segmenter_lang=kor"]
-        if not FONTS: FONTS = KOREAN_FONTS
+    elif lang == "chi_sim":
+        MEAN_COUNT = 15
+        PUNC_DAWG_FACTOR = 0.015
+        WORD_DAWG_FACTOR = 0.015
+        GENERATE_WORD_BIGRAMS = 0
+        TRAINING_DATA_ARGUMENTS += ["--infrequent_ratio=10000"]
+        TRAINING_DATA_ARGUMENTS += ["--no_space_in_output", "--desired_bigrams="]
+        FILTER_ARGUMENTS += ["--charset_filter=chi_sim", "--segmenter_lang=chi_sim"]
+        if not FONTS:
+            FONTS = CHI_SIM_FONTS
+    elif lang == "chi_tra":
+        MEAN_COUNT = 15
+        WORD_DAWG_FACTOR = 0.015
+        GENERATE_WORD_BIGRAMS = 0
+        TRAINING_DATA_ARGUMENTS += ["--infrequent_ratio=10000"]
+        TRAINING_DATA_ARGUMENTS += ["--no_space_in_output", "--desired_bigrams="]
+        FILTER_ARGUMENTS += ["--charset_filter=chi_tr", "--segmenter_lang=chi_tra"]
+        if not FONTS:
+            FONTS = CHI_TRA_FONTS
+    elif lang == "jpn":
+        MEAN_COUNT = 15
+        WORD_DAWG_FACTOR = 0.015
+        GENERATE_WORD_BIGRAMS = 0
+        TRAINING_DATA_ARGUMENTS += ["--infrequent_ratio=10000"]
+        TRAINING_DATA_ARGUMENTS += ["--no_space_in_output", "--desired_bigrams="]
+        FILTER_ARGUMENTS += ["--charset_filter=jpn", "--segmenter_lang=jpn"]
+        if not FONTS:
+            FONTS = JPN_FONTS
+    elif lang == "kor":
+        MEAN_COUNT = 20
+        WORD_DAWG_FACTOR = 0.015
+        NUMBER_DAWG_FACTOR = 0.05
+        TRAINING_DATA_ARGUMENTS += ["--infrequent_ratio=10000"]
+        TRAINING_DATA_ARGUMENTS += ["--desired_bigrams="]
+        GENERATE_WORD_BIGRAMS = 0
+        FILTER_ARGUMENTS += ["--charset_filter=kor", "--segmenter_lang=kor"]
+        if not FONTS:
+            FONTS = KOREAN_FONTS
 
     # Middle-Eastern scripts.
-    elif lang == 'ara':
-        if not FONTS: FONTS = ARABIC_FONTS
-    elif lang == 'div':
-        if not FONTS: FONTS = THAANA_FONTS
-    elif lang in ('fas','pus','snd','uig','urd' ):
-        if not FONTS: FONTS = PERSIAN_FONTS
-    elif lang in ('heb','yid' ):
-        NUMBER_DAWG_FACTOR=0.05
-        WORD_DAWG_FACTOR=0.08
-        if not FONTS: FONTS = HEBREW_FONTS
-    elif lang == 'syr':
-        if not FONTS: FONTS = SYRIAC_FONTS
+    elif lang == "ara":
+        if not FONTS:
+            FONTS = ARABIC_FONTS
+    elif lang == "div":
+        if not FONTS:
+            FONTS = THAANA_FONTS
+    elif lang in ("fas", "pus", "snd", "uig", "urd"):
+        if not FONTS:
+            FONTS = PERSIAN_FONTS
+    elif lang in ("heb", "yid"):
+        NUMBER_DAWG_FACTOR = 0.05
+        WORD_DAWG_FACTOR = 0.08
+        if not FONTS:
+            FONTS = HEBREW_FONTS
+    elif lang == "syr":
+        if not FONTS:
+            FONTS = SYRIAC_FONTS
 
     # Other scripts.
-    elif lang in ('amh','tir'):
-        if not FONTS: FONTS = AMHARIC_FONTS
-    elif lang == 'chr':
+    elif lang in ("amh", "tir"):
+        if not FONTS:
+            FONTS = AMHARIC_FONTS
+    elif lang == "chr":
         if not FONTS:
             FONTS = [*NORTH_AMERICAN_ABORIGINAL_FONTS, "Noto Sans Cherokee"]
-    elif lang == 'ell':
-        NUMBER_DAWG_FACTOR=0.05
-        WORD_DAWG_FACTOR=0.08
-        if not FONTS: FONTS = GREEK_FONTS
-    elif lang == 'grc':
-        if not EXPOSURES: EXPOSURES="-3 -2 -1 0 1 2 3".split()
-        if not FONTS: FONTS = ANCIENT_GREEK_FONTS
-    elif lang == 'hye':
-        if not FONTS: FONTS = ARMENIAN_FONTS
-    elif lang == 'iku':
-        if not FONTS: FONTS = NORTH_AMERICAN_ABORIGINAL_FONTS
-    elif lang == 'kat':
-        if not FONTS: FONTS = GEORGIAN_FONTS
-    elif lang == 'kat_old':
-        TEXT_CORPUS=f"{FLAGS_webtext_prefix}/kat.corpus.txt"
-        if not FONTS: FONTS = OLD_GEORGIAN_FONTS
-    elif lang == 'kir':
-        if not FONTS: FONTS = KYRGYZ_FONTS
-        TRAINING_DATA_ARGUMENTS+=["--infrequent_ratio=100"]
-    elif lang == 'kur':
-        if not FONTS: FONTS = KURDISH_FONTS
+    elif lang == "ell":
+        NUMBER_DAWG_FACTOR = 0.05
+        WORD_DAWG_FACTOR = 0.08
+        if not FONTS:
+            FONTS = GREEK_FONTS
+    elif lang == "grc":
+        if not EXPOSURES:
+            EXPOSURES = "-3 -2 -1 0 1 2 3".split()
+        if not FONTS:
+            FONTS = ANCIENT_GREEK_FONTS
+    elif lang == "hye":
+        if not FONTS:
+            FONTS = ARMENIAN_FONTS
+    elif lang == "iku":
+        if not FONTS:
+            FONTS = NORTH_AMERICAN_ABORIGINAL_FONTS
+    elif lang == "kat":
+        if not FONTS:
+            FONTS = GEORGIAN_FONTS
+    elif lang == "kat_old":
+        TEXT_CORPUS = f"{FLAGS_webtext_prefix}/kat.corpus.txt"
+        if not FONTS:
+            FONTS = OLD_GEORGIAN_FONTS
+    elif lang == "kir":
+        if not FONTS:
+            FONTS = KYRGYZ_FONTS
+        TRAINING_DATA_ARGUMENTS += ["--infrequent_ratio=100"]
+    elif lang == "kur":
+        if not FONTS:
+            FONTS = KURDISH_FONTS
     else:
         raise ValueError(f"Error: {lang} is not a valid language code")
-
-    FLAGS_mean_count = int(os.environ.get('FLAGS_mean_count', -1))
+    FLAGS_mean_count = int(os.environ.get("FLAGS_mean_count", -1))
     if FLAGS_mean_count > 0:
-        TRAINING_DATA_ARGUMENTS+=[f"--mean_count={FLAGS_mean_count}"]
+        TRAINING_DATA_ARGUMENTS += [f"--mean_count={FLAGS_mean_count}"]
     elif not MEAN_COUNT:
-        TRAINING_DATA_ARGUMENTS+=[f"--mean_count={MEAN_COUNT}"]
+        TRAINING_DATA_ARGUMENTS += [f"--mean_count={MEAN_COUNT}"]
 
     # Default to Latin fonts if none have been set
-    if not FONTS: FONTS = LATIN_FONTS
+    if not FONTS:
+        FONTS = LATIN_FONTS
 
     # Default to 0 exposure if it hasn't been set
-    if not EXPOSURES: EXPOSURES=[0]
+    if not EXPOSURES:
+        EXPOSURES = [0]
     # Set right-to-left and normalization mode.
-    if lang in ('ara','div', 'fas','pus','snd','syr','uig','urd','kur_ara','heb','yid'):
-        LANG_IS_RTL=True
-        NORM_MODE=2
+    if lang in (
+        "ara",
+        "div",
+        "fas",
+        "pus",
+        "snd",
+        "syr",
+        "uig",
+        "urd",
+        "kur_ara",
+        "heb",
+        "yid",
+    ):
+        LANG_IS_RTL = True
+        NORM_MODE = 2
     elif lang in (
-        'asm','ben','bih','hin','mar','nep','guj','kan','mal','tam','tel','pan',
-        'dzo','sin','san','bod','ori','khm','mya','tha','lao','jav ','jav_java'
-    ):
-        LANG_IS_RTL=False
-        NORM_MODE=2
+        "asm",
+        "ben",
+        "bih",
+        "hin",
+        "mar",
+        "nep",
+        "guj",
+        "kan",
+        "mal",
+        "tam",
+        "tel",
+        "pan",
+        "dzo",
+        "sin",
+        "san",
+        "bod",
+        "ori",
+        "khm",
+        "mya",
+        "tha",
+        "lao",
+        "jav",
+        "jav_java",
+    ):
+        LANG_IS_RTL = False
+        NORM_MODE = 2
     else:
-        LANG_IS_RTL=False
-        NORM_MODE=1
+        LANG_IS_RTL = False
+        NORM_MODE = 1
 
     for var in [v for v in locals()]:
         if var.isupper():
@@ -1289,6 +1379,7 @@ def set_lang_specific_parameters(ctx, lang):
     return ctx
 
-#=============================================================================
+
+# =============================================================================
 # END of Language specific info
-#=============================================================================
+# =============================================================================
diff --git a/src/training/tesstrain.py b/src/training/tesstrain.py
index a6aa6276b9..6a0e929067 100644
--- a/src/training/tesstrain.py
+++ b/src/training/tesstrain.py
@@ -14,23 +14,30 @@
 # Tesseract. For a detailed description of the phases, see
 # https://github.com/tesseract-ocr/tesseract/wiki/TrainingTesseract
 #
-import sys,os,subprocess, logging
+import sys, os, subprocess, logging
 
 sys.path.insert(0, os.path.dirname(__file__))
-from tesstrain_utils import parse_flags, initialize_fontconfig, phase_I_generate_image, \
-    phase_UP_generate_unicharset, phase_E_extract_features, make_lstmdata, cleanup
+from tesstrain_utils import (
+    parse_flags,
+    initialize_fontconfig,
+    phase_I_generate_image,
+    phase_UP_generate_unicharset,
+    phase_E_extract_features,
+    make_lstmdata,
+    cleanup,
+)
 import language_specific
 
 log = logging.getLogger()
 
+
 def setup_logging(logfile):
     log.setLevel(logging.DEBUG)
     console = logging.StreamHandler()
     console.setLevel(logging.INFO)
     console_formatter = logging.Formatter(
-        '[%(asctime)s] %(levelname)s - %(message)s',
-        datefmt='%H:%M:%S'
+        "[%(asctime)s] %(levelname)s - %(message)s", datefmt="%H:%M:%S"
     )
     console.setFormatter(console_formatter)
     log.addHandler(console)
@@ -38,16 +45,17 @@ def setup_logging(logfile):
     logfile = logging.FileHandler(logfile)
     logfile.setLevel(logging.DEBUG)
     logfile_formatter = logging.Formatter(
-        '[%(asctime)s] - %(levelname)s - %(name)s - %(message)s'
+        "[%(asctime)s] - %(levelname)s - %(name)s - %(message)s"
     )
     logfile.setFormatter(logfile_formatter)
     log.addHandler(logfile)
 
+
 def main():
     ctx = parse_flags()
     setup_logging(ctx.log_file)
     if not ctx.linedata:
-        log.error('--linedata_only is required since only LSTM is supported')
+        log.error("--linedata_only is required since only LSTM is supported")
         sys.exit(1)
 
     log.info(f"=== Starting training for language {ctx.lang_code}")
@@ -58,14 +66,15 @@ def main():
     phase_UP_generate_unicharset(ctx)
 
     if ctx.linedata:
-        phase_E_extract_features(ctx, ['--psm', '6', 'lstm.train'], 'lstmf')
+        phase_E_extract_features(ctx, ["--psm", "6", "lstm.train"], "lstmf")
         make_lstmdata(ctx)
 
     cleanup(ctx)
     log.info("All done!")
     return 0
 
-if __name__ == '__main__':
+
+if __name__ == "__main__":
     main()
 
 
@@ -76,7 +85,7 @@ def main():
# _rc0 = subprocess.call(["phase_I_generate_image","8"],shell=True)
# _rc0 = subprocess.call(["phase_UP_generate_unicharset"],shell=True)
# if (LINEDATA ):
-    #subprocess.call(["phase_E_extract_features"," --psm 6 lstm.train ","8","lstmf"],shell=True)
+#     subprocess.call(["phase_E_extract_features"," --psm 6 lstm.train ","8","lstmf"],shell=True)
#     subprocess.call(["make__lstmdata"],shell=True)
#     subprocess.call(["tlog","\nCreated starter traineddata for language '"+str(LANG_CODE.val)+"'\n"],shell=True)
#     subprocess.call(["tlog","\nRun lstmtraining to do the LSTM training for language '"+str(LANG_CODE.val)+"'\n"],shell=True)
diff --git a/src/training/tesstrain_utils.py b/src/training/tesstrain_utils.py
index d7acff8bfc..8c006e6837 100644
--- a/src/training/tesstrain_utils.py
+++ b/src/training/tesstrain_utils.py
@@ -34,24 +34,27 @@
 
 log = logging.getLogger(__name__)
 
+
 class TrainingArgs(argparse.Namespace):
     def __init__(self):
         self.uname = os.uname().sysname.lower()
-        self.lang_code="eng"
-        self.timestamp=str(date.today())
-
-        self._font_config_cache = TemporaryDirectory(prefix='font_tmp')
-        self.font_config_cache =self._font_config_cache.name
-        self.fonts_dir="/Library/Fonts/" if 'darwin' in self.uname else "/usr/share/fonts/"
-
-        self.max_pages=0
-        self.save_box_tiff=False
-        self.output_dir="/tmp/tesstrain/tessdata"
-        self.overwrite=False
-        self.linedata=False
-        self.run_shape_clustering=False
-        self.extract_font_properties=True
-        self._workspace_dir=TemporaryDirectory(prefix='tesstrain')
+        self.lang_code = "eng"
+        self.timestamp = str(date.today())
+
+        self._font_config_cache = TemporaryDirectory(prefix="font_tmp")
+        self.font_config_cache = self._font_config_cache.name
+        self.fonts_dir = (
+            "/Library/Fonts/" if "darwin" in self.uname else "/usr/share/fonts/"
+        )
+
+        self.max_pages = 0
+        self.save_box_tiff = False
+        self.output_dir = "/tmp/tesstrain/tessdata"
+        self.overwrite = False
+        self.linedata = False
+        self.run_shape_clustering = False
+        self.extract_font_properties = True
+        self._workspace_dir = TemporaryDirectory(prefix="tesstrain")
         self.workspace_dir = self._workspace_dir.name
 
@@ -59,12 +62,13 @@ def err_exit(msg):
     log.critical(msg)
     sys.exit(1)
 
+
 # Helper function to run a command and append its output to a log. Aborts early
 # if the program file is not found.
 # Usage: run_command CMD ARG1 ARG2...
 def run_command(cmd, *args, env=None):
-    for d in ('', 'api/', 'training/'):
-        testcmd = which(f'{d}{cmd}')
+    for d in ("", "api/", "training/"):
+        testcmd = which(f"{d}{cmd}")
         if which(testcmd):
             cmd = testcmd
             break
@@ -75,13 +79,15 @@ def run_command(cmd, *args, env=None):
     for arg in args:
         log.debug(arg)
 
-    proc = subprocess.run([cmd, *args], stdout=subprocess.PIPE, stderr=subprocess.STDOUT, env=env)
+    proc = subprocess.run(
+        [cmd, *args], stdout=subprocess.PIPE, stderr=subprocess.STDOUT, env=env
+    )
     proclog = logging.getLogger(cmd)
     if proc.returncode == 0:
-        proclog.debug(proc.stdout.decode('utf-8', errors='replace'))
+        proclog.debug(proc.stdout.decode("utf-8", errors="replace"))
     else:
         try:
-            proclog.error(proc.stdout.decode('utf-8', errors='replace'))
+            proclog.error(proc.stdout.decode("utf-8", errors="replace"))
         except Exception:
             pass
         err_exit(f"Program {cmd} failed with return code {proc.returncode}. Abort.")
@@ -106,42 +112,91 @@ def check_file_readable(*filenames):
     return True
 
-
 parser = argparse.ArgumentParser(
     epilog="""
     The font names specified in --fontlist need to be recognizable by Pango using
    fontconfig.
    An easy way to list the canonical names of all fonts available on your
    system is to run text2image with --list_available_fonts and the appropriate
    --fonts_dir path.
-    """,
+    """
 )
-parser.add_argument('--fontlist', dest='fonts', nargs='+', type=str, help='A list of fontnames to train on.')
-parser.add_argument('--fonts_dir', help='Path to font files.')
-parser.add_argument('--lang', metavar='LANG_CODE', dest='lang_code', help='ISO 639 code.')
-parser.add_argument('--langdata_dir', metavar='DATADIR', help='Path to tesseract/training/langdata directory.')
-parser.add_argument('--maxpages', type=int, dest='max_pages')
-parser.add_argument('--output_dir', metavar='OUTPUTDIR', help='Location of output traineddata file.')
-parser.add_argument('--overwrite', action='store_true', help='Safe to overwrite files in output_dir.')
-parser.add_argument('--save_box_tiff', action='store_true', help='Save box/tiff pairs along with lstmf files.')
-parser.add_argument('--linedata_only', dest='linedata', action='store_true', help='Only generate training data for lstmtraining.')
+parser.add_argument(
+    "--fontlist",
+    dest="fonts",
+    nargs="+",
+    type=str,
+    help="A list of fontnames to train on.",
+)
+parser.add_argument("--fonts_dir", help="Path to font files.")
+parser.add_argument(
+    "--lang", metavar="LANG_CODE", dest="lang_code", help="ISO 639 code."
+)
+parser.add_argument(
+    "--langdata_dir",
+    metavar="DATADIR",
+    help="Path to tesseract/training/langdata directory.",
+)
+parser.add_argument("--maxpages", type=int, dest="max_pages")
+parser.add_argument(
+    "--output_dir", metavar="OUTPUTDIR", help="Location of output traineddata file."
+)
+parser.add_argument(
+    "--overwrite", action="store_true", help="Safe to overwrite files in output_dir."
+)
+parser.add_argument(
+    "--save_box_tiff",
+    action="store_true",
+    help="Save box/tiff pairs along with lstmf files.",
+)
+parser.add_argument(
+    "--linedata_only",
+    dest="linedata",
+    action="store_true",
+    help="Only generate training data for lstmtraining.",
+)
 
-inputdata_group = parser.add_argument_group('inputdata', 'OPTIONAL flags for input data. If unspecified we will look for them in the langdata_dir directory.')
-inputdata_group.add_argument('--training_text', metavar='TEXTFILE',help='Text to render and use for training.')
-inputdata_group.add_argument('--wordlist', dest='wordlist_file', metavar='WORDFILE', help='Word list for the language ordered by decreasing frequency.')
+inputdata_group = parser.add_argument_group(
+    "inputdata",
+    "OPTIONAL flags for input data. If unspecified we will look for them in the langdata_dir directory.",
+)
+inputdata_group.add_argument(
+    "--training_text", metavar="TEXTFILE", help="Text to render and use for training."
+)
+inputdata_group.add_argument(
+    "--wordlist",
+    dest="wordlist_file",
+    metavar="WORDFILE",
+    help="Word list for the language ordered by decreasing frequency.",
+)
 
-parser.add_argument('--extract_font_properties', action='store_true')
-parser.add_argument('--noextract_font_properties', dest='extract_font_properties', action='store_false')
+parser.add_argument("--extract_font_properties", action="store_true")
+parser.add_argument(
+    "--noextract_font_properties", dest="extract_font_properties", action="store_false"
+)
 
-tessdata_group = parser.add_argument_group('tessdata', 'OPTIONAL flag to specify location of existing traineddata files, required during feature extraction. If unspecified will use TESSDATA_PREFIX defined in the current environment.')
-tessdata_group.add_argument('--tessdata_dir', metavar='TESSDATADIR', help='Path to tesseract/tessdata directory.')
+tessdata_group = parser.add_argument_group(
+    "tessdata",
+    "OPTIONAL flag to specify location of existing traineddata files, required during feature extraction. If unspecified will use TESSDATA_PREFIX defined in the current environment.",
+)
+tessdata_group.add_argument(
+    "--tessdata_dir",
+    metavar="TESSDATADIR",
+    help="Path to tesseract/tessdata directory.",
+)
 
-parser.add_argument('--exposures', metavar='EXPOSURES', action='append', nargs='+', help='A list of exposure levels to use (e.g. -1,0,1).')
-parser.add_argument('--workspace_dir')
+parser.add_argument(
+    "--exposures",
+    metavar="EXPOSURES",
+    action="append",
+    nargs="+",
+    help="A list of exposure levels to use (e.g. -1,0,1).",
+)
+parser.add_argument("--workspace_dir")
 
 
 # Does simple command-line parsing and initialization.
 def parse_flags(argv=None):
-    ctx =TrainingArgs()
+    ctx = TrainingArgs()
     log.debug(ctx)
     parser.parse_args(args=argv, namespace=ctx)
     log.debug(ctx)
@@ -152,10 +207,12 @@ def parse_flags(argv=None):
     if not ctx.langdata_dir:
         err_exit("Need to specify path to language files --langdata_dir")
     if not ctx.tessdata_dir:
-        tessdata_prefix=os.environ.get('TESSDATA_PREFIX', '')
+        tessdata_prefix = os.environ.get("TESSDATA_PREFIX", "")
         if not tessdata_prefix:
-            err_exit("Need to specify a --tessdata_dir or have a "
-                     "TESSDATA_PREFIX variable defined in your environment")
+            err_exit(
+                "Need to specify a --tessdata_dir or have a "
+                "TESSDATA_PREFIX variable defined in your environment"
+            )
         else:
             ctx.tessdata_dir = tessdata_prefix
 
@@ -170,22 +227,37 @@ def show_tmpdir_location(training_dir):
         # know where the log is
         if Path(training_dir).exists():
             print(f"Temporary files retained at: {training_dir}")
+
     atexit.register(show_tmpdir_location, ctx.training_dir)
 
     # Take training text and wordlist from the langdata directory if not
    # specified in the command-line.
     if not ctx.training_text:
-        ctx.training_text = Path(ctx.langdata_dir) / ctx.lang_code / f"{ctx.lang_code}.training_text"
+        ctx.training_text = (
+            Path(ctx.langdata_dir) / ctx.lang_code / f"{ctx.lang_code}.training_text"
+        )
     if not ctx.wordlist_file:
-        ctx.wordlist_file = Path(ctx.langdata_dir) / ctx.lang_code / f"{ctx.lang_code}.wordlist"
-
-    ctx.word_bigrams_file=Path(ctx.langdata_dir) / ctx.lang_code / f"{ctx.lang_code}.word.bigrams"
-    ctx.numbers_file=Path(ctx.langdata_dir) / ctx.lang_code / f"{ctx.lang_code}.numbers"
-    ctx.punc_file=Path(ctx.langdata_dir) / ctx.lang_code / f"{ctx.lang_code}.punc"
-    ctx.bigram_freqs_file=Path(ctx.training_text).with_suffix(".training_text.bigram_freqs")
-    ctx.unigram_freqs_file=Path(ctx.training_text).with_suffix( ".training_text.unigram_freqs")
-    ctx.train_ngrams_file=Path(ctx.training_text).with_suffix( ".training_text.train_ngrams")
-    ctx.generate_dawgs=1
+        ctx.wordlist_file = (
+            Path(ctx.langdata_dir) / ctx.lang_code / f"{ctx.lang_code}.wordlist"
+        )
+
+    ctx.word_bigrams_file = (
+        Path(ctx.langdata_dir) / ctx.lang_code / f"{ctx.lang_code}.word.bigrams"
+    )
+    ctx.numbers_file = (
+        Path(ctx.langdata_dir) / ctx.lang_code / f"{ctx.lang_code}.numbers"
+    )
+    ctx.punc_file = Path(ctx.langdata_dir) / ctx.lang_code / f"{ctx.lang_code}.punc"
+    ctx.bigram_freqs_file = Path(ctx.training_text).with_suffix(
+        ".training_text.bigram_freqs"
+    )
+    ctx.unigram_freqs_file = Path(ctx.training_text).with_suffix(
+        ".training_text.unigram_freqs"
+    )
+    ctx.train_ngrams_file = Path(ctx.training_text).with_suffix(
+        ".training_text.train_ngrams"
+    )
+    ctx.generate_dawgs = 1
     log.debug(ctx)
     return ctx
 
@@ -195,33 +267,39 @@ def cleanup(ctx):
     shutil.rmtree(ctx.training_dir)
     return
 
+
 # Function initializes font config with a unique font cache dir.
 def initialize_fontconfig(ctx):
-    sample_path=Path(ctx.font_config_cache)/'sample_text.txt'
-    Path(sample_path).write_text('Text\n')
+    sample_path = Path(ctx.font_config_cache) / "sample_text.txt"
+    Path(sample_path).write_text("Text\n")
     log.info(f"Testing font: {ctx.fonts[0]}")
     run_command(
-        'text2image', f'--fonts_dir={ctx.fonts_dir}',
-        f"--font={ctx.fonts[0]}", f"--outputbase={sample_path}", f"--text={sample_path}",
-        f"--fontconfig_tmpdir={ctx.font_config_cache}"
+        "text2image",
+        f"--fonts_dir={ctx.fonts_dir}",
+        f"--font={ctx.fonts[0]}",
+        f"--outputbase={sample_path}",
+        f"--text={sample_path}",
+        f"--fontconfig_tmpdir={ctx.font_config_cache}",
    )
 
 
 def make_fontname(font):
-    return font.replace(' ', '_').replace(',', '')
+    return font.replace(" ", "_").replace(",", "")
 
-def make_outbase(ctx, fontname,exposure):
-    return Path(ctx.training_dir)/f"{ctx.lang_code}.{fontname}.exp{exposure}"
+
+def make_outbase(ctx, fontname, exposure):
+    return Path(ctx.training_dir) / f"{ctx.lang_code}.{fontname}.exp{exposure}"
+
 
 # Helper function for phaseI_generate_image. Generates the image for a single
 # language/font combination in a way that can be run in parallel.
 def generate_font_image(ctx, font, exposure, char_spacing):
     log.info(f"Rendering using {font}")
-    fontname=make_fontname(font)
-    outbase=make_outbase(ctx, fontname, exposure)
+    fontname = make_fontname(font)
+    outbase = make_outbase(ctx, fontname, exposure)
 
-    common_args=[
+    common_args = [
         f"--fontconfig_tmpdir={ctx.font_config_cache}",
         f"--fonts_dir={ctx.fonts_dir}",
         f"--strip_unrenderable_words",
@@ -235,31 +313,32 @@ def generate_font_image(ctx, font, exposure, char_spacing):
     # add --writing_mode=vertical-upright to common_args if the font is
     # specified to be rendered vertically.
     if font in VERTICAL_FONTS:
-        common_args.append('--writing_mode=vertical-upright')
+        common_args.append("--writing_mode=vertical-upright")
 
     run_command(
-        'text2image',
+        "text2image",
         *common_args,
         f"--font={font}",
         f"--text={ctx.training_text}",
-        *ctx.text2image_extra_args
+        *ctx.text2image_extra_args,
     )
-    check_file_readable(str(outbase) + '.box', str(outbase) + '.tif')
+    check_file_readable(str(outbase) + ".box", str(outbase) + ".tif")
 
     if ctx.extract_font_properties and Path(ctx.train_ngrams_file).exists():
         log.info(f"Extracting font properties of {font}")
         run_command(
-            'text2image',
+            "text2image",
             *common_args,
             f"--font={font}",
             f"--ligatures=false",
            f"--text={ctx.train_ngrams_file}",
             f"--only_extract_font_properties",
-            f"--ptsize=32"
+            f"--ptsize=32",
         )
-        check_file_readable(str(outbase) + '.fontinfo')
-    return f'{font}-{exposure}'
+        check_file_readable(str(outbase) + ".fontinfo")
+    return f"{font}-{exposure}"
+
 
 # Phase I : Generate (I)mages from training text for each font.
 def phase_I_generate_image(ctx, par_factor):
@@ -269,30 +348,31 @@ def phase_I_generate_image(ctx, par_factor):
     log.info("=== Phase I: Generating training images ===")
     check_file_readable(ctx.training_text)
-    char_spacing=0.0
+    char_spacing = 0.0
 
     for exposure in ctx.exposures:
         if ctx.extract_font_properties and Path(ctx.bigram_freqs_file).exists():
             # Parse .bigram_freqs file and compose a .train_ngrams file with text
             # for tesseract to recognize during training. Take only the ngrams whose
             # combined weight accounts for 95% of all the bigrams in the language.
-            lines = Path(ctx.bigram_freqs_file).read_text(encoding='utf-8').split('\n')
+            lines = Path(ctx.bigram_freqs_file).read_text(encoding="utf-8").split("\n")
             records = (line.split(" ") for line in splittable_lines)
-            p = .99
+            p = 0.99
             ngram_frac = p * sum(int(rec[1]) for rec in records if len(rec) >= 2)
 
-            with Path(ctx.train_ngrams_file).open('w', encoding='utf-8') as f:
+            with Path(ctx.train_ngrams_file).open("w", encoding="utf-8") as f:
                 cumsum = 0
                 for bigram, count in sorted(records, key=itemgetter(1), reverse=True):
                     if cumsum > ngram_frac:
                         break
                     f.write(bigram + " ")
                     cumsum += count
 
             check_file_readable(ctx.train_ngrams_file)
 
-        with tqdm(total=len(ctx.fonts)) as pbar, \
-                concurrent.futures.ThreadPoolExecutor() as executor:
+        with tqdm(
+            total=len(ctx.fonts)
+        ) as pbar, concurrent.futures.ThreadPoolExecutor() as executor:
             futures = [
                 executor.submit(generate_font_image, ctx, font, exposure, char_spacing)
                 for font in ctx.fonts
             ]
@@ -307,36 +387,40 @@ def phase_I_generate_image(ctx, par_factor):
     # Check that each process was successful.
         for font in ctx.fonts:
-            fontname=make_fontname(font)
-            outbase=make_outbase(ctx, fontname, exposure)
-            check_file_readable(str(outbase) + '.box', str(outbase) + '.tif')
+            fontname = make_fontname(font)
+            outbase = make_outbase(ctx, fontname, exposure)
+            check_file_readable(str(outbase) + ".box", str(outbase) + ".tif")
     return
 
-
 # Phase UP : Generate (U)nicharset and (P)roperties file.
 def phase_UP_generate_unicharset(ctx):
     log.info("=== Phase UP: Generating unicharset and unichar properties files ===")
-    box_files=Path(ctx.training_dir).glob('*.box')
+    box_files = Path(ctx.training_dir).glob("*.box")
 
-    ctx.unicharset_file=Path(ctx.training_dir) / f'{ctx.lang_code}.unicharset'
+    ctx.unicharset_file = Path(ctx.training_dir) / f"{ctx.lang_code}.unicharset"
 
     run_command(
-        'unicharset_extractor',
-        '--output_unicharset', f"{ctx.unicharset_file}",
-        '--norm_mode', f"{ctx.norm_mode}",
-        *box_files
+        "unicharset_extractor",
+        "--output_unicharset",
+        f"{ctx.unicharset_file}",
+        "--norm_mode",
+        f"{ctx.norm_mode}",
+        *box_files,
     )
     check_file_readable(ctx.unicharset_file)
 
-    ctx.xheights_file=Path(ctx.training_dir) / f'{ctx.lang_code}.xheights'
+    ctx.xheights_file = Path(ctx.training_dir) / f"{ctx.lang_code}.xheights"
     run_command(
-        'set_unicharset_properties',
-        '-U', f'{ctx.unicharset_file}',
-        '-O', f'{ctx.unicharset_file}',
-        '-X', f'{ctx.xheights_file}',
-        f'--script_dir={ctx.langdata_dir}'
+        "set_unicharset_properties",
+        "-U",
+        f"{ctx.unicharset_file}",
+        "-O",
+        f"{ctx.unicharset_file}",
+        "-X",
+        f"{ctx.xheights_file}",
+        f"--script_dir={ctx.langdata_dir}",
     )
     check_file_readable(ctx.xheights_file)
 
@@ -417,33 +501,34 @@ def phase_UP_generate_unicharset(ctx):
 def phase_E_extract_features(ctx, box_config, ext):
     log.info(f"=== Phase E: Generating {ext} files ===")
 
-    img_files=list(Path(ctx.training_dir).glob('*.exp*.tif'))
+    img_files = list(Path(ctx.training_dir).glob("*.exp*.tif"))
     log.debug(img_files)
 
     # Use any available language-specific configs.
-    config=""
-    testconfig = Path(ctx.langdata_dir) / ctx.lang_code / f'{ctx.lang_code}.config'
+    config = ""
+    testconfig = Path(ctx.langdata_dir) / ctx.lang_code / f"{ctx.lang_code}.config"
     if testconfig.exists():
         config = testconfig
         log.info(f"Using {ctx.lang_code}.config")
 
     tessdata_environ = os.environ.copy()
-    tessdata_environ['TESSDATA_PREFIX'] = str(ctx.tessdata_dir)
+    tessdata_environ["TESSDATA_PREFIX"] = str(ctx.tessdata_dir)
     log.info(f"Using TESSDATA_PREFIX={tessdata_environ['TESSDATA_PREFIX']}")
 
-    with tqdm(total=len(img_files)) as pbar, \
-            concurrent.futures.ThreadPoolExecutor(max_workers=2) as executor:
+    with tqdm(total=len(img_files)) as pbar, concurrent.futures.ThreadPoolExecutor(
+        max_workers=2
+    ) as executor:
         futures = []
         for img_file in img_files:
             future = executor.submit(
                 run_command,
-                'tesseract',
+                "tesseract",
                 img_file,
-                Path(img_file).with_suffix(''),
+                Path(img_file).with_suffix(""),
                 *box_config,
                 config,
-                env=tessdata_environ
+                env=tessdata_environ,
             )
             futures.append(future)
 
@@ -456,10 +541,11 @@ def phase_E_extract_features(ctx, box_config, ext):
                 pbar.update(1)
     # Check that all the output files were produced.
     for img_file in img_files:
-        check_file_readable(Path(img_file.with_suffix('.' + ext)))
+        check_file_readable(Path(img_file.with_suffix("." + ext)))
 
     return
 
+
 # # Phase C : (C)luster feature prototypes in .tr into normproto file (cnTraining)
 # # phaseC_cluster_prototypes ${TRAINING_DIR}/${LANG_CODE}.normproto
 # phase_C_cluster_prototypes() {
@@ -539,9 +625,10 @@ def phase_E_extract_features(ctx, box_config, ext):
 # # TODO: Add support for generating ambiguities automatically.
 # }
 
+
 def make_lstmdata(ctx):
     log.info("=== Constructing LSTM training data ===")
-    lang_prefix=f"{ctx.langdata_dir}/{ctx.lang_code}/{ctx.lang_code}"
+    lang_prefix = f"{ctx.langdata_dir}/{ctx.lang_code}/{ctx.lang_code}"
     path_output = Path(ctx.output_dir)
     if not path_output.is_dir():
         log.info(f"Creating new directory {ctx.output_dir}")
@@ -555,33 +642,40 @@ def make_lstmdata(ctx):
 
     # Build the starter traineddata from the inputs.
     run_command(
-        'combine_lang_model',
-        '--input_unicharset', f"{ctx.training_dir}/{ctx.lang_code}.unicharset",
-        '--script_dir', f"{ctx.langdata_dir}",
-        '--words', f"{lang_prefix}.wordlist",
-        '--numbers', f"{lang_prefix}.numbers",
-        '--puncs', f"{lang_prefix}.punc",
-        '--output_dir', f"{ctx.output_dir}",
-        '--lang', f"{ctx.lang_code}",
-        *args
+        "combine_lang_model",
+        "--input_unicharset",
+        f"{ctx.training_dir}/{ctx.lang_code}.unicharset",
+        "--script_dir",
+        f"{ctx.langdata_dir}",
+        "--words",
+        f"{lang_prefix}.wordlist",
+        "--numbers",
+        f"{lang_prefix}.numbers",
+        "--puncs",
+        f"{lang_prefix}.punc",
+        "--output_dir",
+        f"{ctx.output_dir}",
+        "--lang",
+        f"{ctx.lang_code}",
+        *args,
     )
 
     def get_file_list():
         training_path = Path(ctx.training_dir)
         if ctx.save_box_tiff:
             log.info("=== Saving box/tiff pairs for training data ===")
-            yield from training_path.glob(f'{ctx.lang_code}*.box')
-            yield from training_path.glob(f'{ctx.lang_code}*.tif')
+            yield from training_path.glob(f"{ctx.lang_code}*.box")
+            yield from training_path.glob(f"{ctx.lang_code}*.tif")
         log.info("=== Moving lstmf files for training data ===")
-        yield from training_path.glob(f'{ctx.lang_code}.*.lstmf')
+        yield from training_path.glob(f"{ctx.lang_code}.*.lstmf")
 
     for f in get_file_list():
         log.debug(f"Moving {f} to {path_output / f.name}")
         shutil.move(str(f), path_output / f.name)
 
-    lstm_list=f"{ctx.output_dir}/{ctx.lang_code}.training_files.txt"
-    dir_listing = (str(p) for p in path_output.glob(f'{ctx.lang_code}.*.lstmf'))
-    Path(lstm_list).write_text('\n'.join(dir_listing))
+    lstm_list = f"{ctx.output_dir}/{ctx.lang_code}.training_files.txt"
+    dir_listing = (str(p) for p in path_output.glob(f"{ctx.lang_code}.*.lstmf"))
+    Path(lstm_list).write_text("\n".join(dir_listing))
 
 # make__traineddata() {
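
Note on the bigram-selection block inside phase_I_generate_image above: as ported, it references an undefined name (`splittable_lines`), consumes the `records` generator twice (so the `sorted(...)` call sees an exhausted iterator), sorts counts as strings, accumulates a string into `cumsum`, and uses p = 0.99 although the comment says 95%. Below is a minimal corrected sketch of the intended logic — keep the most frequent bigrams until their cumulative count reaches the chosen fraction. The helper name `write_train_ngrams` is hypothetical, and it assumes each line of the .bigram_freqs file is a "bigram count" pair, as the original `rec[1]` indexing implies.

    from operator import itemgetter
    from pathlib import Path

    def write_train_ngrams(bigram_freqs_file, train_ngrams_file, p=0.99):
        # Materialize (bigram, count) pairs once, with counts as integers.
        records = []
        for line in Path(bigram_freqs_file).read_text(encoding="utf-8").split("\n"):
            parts = line.split(" ")
            if len(parts) >= 2:
                records.append((parts[0], int(parts[1])))
        ngram_frac = p * sum(count for _, count in records)
        with Path(train_ngrams_file).open("w", encoding="utf-8") as f:
            cumsum = 0
            # Highest-frequency bigrams first, stop once the mass target is hit.
            for bigram, count in sorted(records, key=itemgetter(1), reverse=True):
                if cumsum > ngram_frac:
                    break
                f.write(bigram + " ")
                cumsum += count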
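For context, here is a minimal sketch of driving the ported pipeline programmatically, mirroring main() in tesstrain.py above. The flag names come from the argparse definitions in tesstrain_utils.py; the langdata/tessdata paths are placeholders, and par_factor=8 echoes the value visible in the commented-out bash remnants.

    import language_specific
    from tesstrain_utils import (
        parse_flags,
        initialize_fontconfig,
        phase_I_generate_image,
        phase_UP_generate_unicharset,
        phase_E_extract_features,
        make_lstmdata,
        cleanup,
    )

    # Parse flags exactly as the CLI would; --linedata_only is required
    # because only LSTM training is supported.
    ctx = parse_flags([
        "--lang", "eng",
        "--langdata_dir", "/path/to/langdata",   # placeholder path
        "--tessdata_dir", "/path/to/tessdata",   # placeholder path
        "--output_dir", "/tmp/tesstrain/tessdata",
        "--linedata_only",
    ])
    ctx = language_specific.set_lang_specific_parameters(ctx, ctx.lang_code)
    initialize_fontconfig(ctx)
    phase_I_generate_image(ctx, par_factor=8)
    phase_UP_generate_unicharset(ctx)
    phase_E_extract_features(ctx, ["--psm", "6", "lstm.train"], "lstmf")
    make_lstmdata(ctx)
    cleanup(ctx)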