# Step 3. Manual-code categorization of surface-forms and errors 

The corrections made by the LLM in the previous step are categorized here as surface forms, orthographic errors, or neither.

In [1]:
import json
import unicodedata
import os
import re
import pandas as pd
from string import punctuation
import difflib
# Rebind the imported name to a copy of string.punctuation extended with a space
# (str is immutable, so string.punctuation itself is not modified).
punctuation += " "

In [2]:
def normalize(c):
    """Return `c` without diacritics.

    The text is NFKD-decomposed and every code point that cannot be encoded
    as ASCII is dropped (so 'é' becomes 'e', and characters with no ASCII
    base, such as '¿', disappear).
    """
    decomposed = unicodedata.normalize('NFKD', c)
    ascii_only = decomposed.encode('ASCII', 'ignore')
    return ascii_only.decode()

def useful_chars(string):
    """Return `string` with every character outside a-z, A-Z and the À-ÿ range removed."""
    unwanted = r'[^a-zA-ZÀ-ÿ]'
    return re.sub(unwanted, '', string)
def isuseful_chars(string):
    """Match `string` against the allowed alphabet.

    Returns a match object when the whole string consists only of ASCII
    letters, Spanish accented letters, whitespace and the punctuation
    , . ! ? ¿ ¡ (the empty string matches too); returns None otherwise.
    """
    allowed_only = r'^[a-zA-ZáéíóúÁÉÍÓÚüÜñÑ\s,.!?¿¡]*$'
    return re.match(allowed_only, string)

def count_accent_changes(str1, str2):
    """Count positions where the two strings differ only by diacritics.

    Returns -1 when the strings have different lengths. Otherwise returns the
    number of aligned character pairs that are different but become equal
    after `normalize` (i.e. after stripping accents).
    """
    if len(str1) == len(str2):
        return sum(
            1
            for left, right in zip(str1, str2)
            if left != right and normalize(left) == normalize(right)
        )
    return -1


*Return to this point (re-run from the next cell onward) whenever you need to reset all applied categorizations and restart the debugging process.*

In [3]:
# --- Paths and options ---------------------------------------------------
CORRECTIONS_FILE = "./correctionsLatam.json"
SURFACE_FORMS_FILE = "../data/surfaceForms.json"
SURFACE_FORMS_FILE_NONACC = "../data/surfaceFormsNonAccents.json"
ORTHOGRAPHIC_ERRORS_FILE = "../data/orthographicErrors.json"
COLOR_PRINTING = False # for large corpus, set it to False
current_step = 0  # number of the last categorization step that was applied

# --- Inputs --------------------------------------------------------------
df = pd.read_parquet("../data/pre-corrected-latam-xix.parquet")

# Explicit UTF-8: the corrections contain accented text, and the output cells
# of this notebook also write UTF-8, so do not depend on the platform default.
with open(CORRECTIONS_FILE, 'r', encoding='utf-8') as infile:
    fixes = json.load(infile)

# --- Outputs -------------------------------------------------------------
# Make sure every output file exists, seeding missing ones with an empty
# JSON container of the right kind (object for surface forms, array for errors).
for _path, _placeholder in (
    (SURFACE_FORMS_FILE, {}),
    (SURFACE_FORMS_FILE_NONACC, {}),
    (ORTHOGRAPHIC_ERRORS_FILE, []),
):
    if not os.path.exists(_path):
        with open(_path, 'w', encoding='utf-8') as outfile:
            json.dump(_placeholder, outfile)

# In-memory accumulators; running this cell resets them to empty.
surface_forms = dict()
surface_forms_nacc = dict()
orthographic_errors = dict() # will be changed to list before saving

In [4]:
def add_to_surface(wrong, good, freq):
    """Record `wrong` as a surface form of `good`, weighted by `freq`.

    Both strings are lower-cased. When they contain the same number of words
    they are compared word by word and only the differing pairs are recorded;
    otherwise the two whole strings are recorded as a single pair.

    Every recorded pair is accumulated in `surface_forms[real][sf]`. It is
    also accumulated in `surface_forms_nacc` when `count_accent_changes`
    returns less than 1 for it (no accent-only difference, or the lengths differ).
    """
    def tally(real, sf):
        # Compute the accent count first, then update the table(s).
        accent_changes = count_accent_changes(real, sf)
        forms = surface_forms.setdefault(real, dict())
        forms[sf] = forms.get(sf, 0) + freq
        if accent_changes < 1:
            forms_nacc = surface_forms_nacc.setdefault(real, dict())
            forms_nacc[sf] = forms_nacc.get(sf, 0) + freq

    sf_text = wrong.lower()
    real_text = good.lower()
    sf_words = re.findall(r'\w+', sf_text)
    real_words = re.findall(r'\w+', real_text)

    if len(sf_words) != len(real_words):
        # Word counts differ: no word alignment is possible, keep the whole phrase.
        if sf_text != real_text:
            tally(real_text, sf_text)
        return

    for sf_word, real_word in zip(sf_words, real_words):
        if sf_word != real_word:
            tally(real_word, sf_word)

def add_to_errors(fix):
    """Register every usage of `fix` as an orthographic error.

    Each item of `fix['usages']` is unpacked as (idx, widx1, widx2, ctx). The
    triple (idx, widx1, widx2) becomes the key in `orthographic_errors`, and the
    value stores the original text ("prv"), the corrected text ("mod") and the
    context ("ctx"). Later cells of this notebook use idx as the document row
    and the other two values as start/end offsets into that document's text.
    """
    for usage in fix['usages']:
        idx, widx1, widx2, ctx = usage
        orthographic_errors[(idx, widx1, widx2)] = dict(
            prv=fix['change'][0],
            mod=fix['change'][1],
            ctx=ctx,
        )

def print_fix(wrong, good, freq, category):
    """Print one categorized fix as "[category] (freq) wrong - good".

    When COLOR_PRINTING is set, ANSI colour codes are added: the category is
    coloured 93 for "SFRM", 95 for "ERRR" and 90 for anything else, and fixed
    colours are used for the frequency, the wrong text and the good text.
    """
    if not COLOR_PRINTING:
        print(f"[{category}] ({freq}) {wrong} - {good}")
        return

    tint = "93" if category == "SFRM" else "95" if category == "ERRR" else "90"
    print(f"[\033[{tint}m{category}\033[0m] (\033[92m{freq}\033[0m) \033[91m{wrong}\033[0m - \033[94m{good}\033[0m")

def execute_prev_steps(step):
    """Make sure the step right before `step` has been applied.

    * Nothing happens when `step` is at most one ahead of `current_step`.
    * An Exception is raised when `step` is more than two ahead of it.
    * Otherwise the previous step function is looked up in globals() by the
      name "step<N>", called with a single positional False, and the status
      summary is printed.
    """
    if step <= current_step + 1:
        return
    if step - 2 > current_step:
        raise Exception(f"Execute step {step-2} first")
    if step < 2:
        return

    previous = step - 1
    print(f"Executing step {previous}...")
    globals()[f"step{previous}"](False)
    status()

def status():
    """Print the current sizes of `fixes`, `surface_forms` and `orthographic_errors`."""
    print(f"{len(fixes)} fixes to check\n{len(surface_forms)} surface forms found\n{len(orthographic_errors)} orthographic errors found")

status()

830951 fixes to check
0 surface forms found
0 orthographic errors found


In [5]:
# Peek at the "text" column of the row labelled 0
df.loc[0, "text"]

'Cualquier cosa, pues solo se hizo circular en hoja suelta: pero lo haremos conocer próximamente, AREQUIPA MARZO 5 DB 1884. Pues tenemos el propósito de no omitir : esfuerzo hasta conseguirla. Mientras tanto, como reminiscencias que de ella se conservan en la memoria, insertamos uno de sus fragmentos, que poco mas ó menos dice a "Casas sin techo, Rio sin agua, Arboles sin hojas, Muchacho malcriado ... Todo esto era el Perú a la muerte del General Castilla." Si hay alguna alteracion en esta parte, el autor por interes propio debe exhibir la Oda para rectificarla, previniendose que aun cuando parezca exagerado, este fragmento es de los menos malos. Y a proposito. ¿ Que daño pudo haberle hecho a este pedante el ilustre General Castilia, que con tanto desden miraba a los pequeños, para que intentase escarnecer y poner en rídiculo su respetada memoria? Nos esplicamos el motivo del encono que abriga el Redactor de "El Peru" para el país de su nacimiento, pero ... la saña que revela no solo l

## Manually-written steps for categorization:

**NOTES:**

- Each correction made by the LLM may be categorized as:
  * **surface form**: with the method `add_to_surface`
  * **orthographic error**: with the method `add_to_errors`
  * **none**: just `pass` (skip) the correction

- Each step has a `JUST_PRINT` variable; if it's True, the changes won't affect the variables. For debugging a particular step:
  * First, **ALWAYS run all the previous steps with the `JUST_PRINT` variable set to False**
  * Then, set the `JUST_PRINT` variable of the step you want to debug to True and run the step
  * If you run a step without having run the previous one (with `JUST_PRINT`=False), the previous step is executed automatically first.

In [6]:
# Character-level substitutions that step1 accepts as surface forms.
# Each pair is (fragment in the original text, fragment in the LLM correction),
# i.e. the same (a, b) order as the tuples returned by diff().
SF_CHANGES = [
    ('á','a'), ('a','á'),
    ('é','e'), ('e','é'),
    ('í','i'), ('i','í'),
    ('ó','o'), ('o','ó'),
    ('ú','u'), ('u','ú'),
    ('i','y'), ('y','i'),
    ('j','g'), ('g','j'),
    ('v','b'), ('b','v'),
    ('s','x'), ('x','s'),
    ('j','x'), ('x','j'),
    ('c','s'), ('s','c'),
    ('s','z'), ('z','s'),
    ('z','c'),
    ('q','c'), # quatro
    ('n','ñ'), # senor
    ('ni','ñ'), # senior
    ('k','qu'), # nikel
    ('k','c'), # kiosko
    ('ou','u'), # boulevares
    ('s','bs'), ('bs','s'), # suscriciones, obscuro
    ('c','pc'), # suscriciones
    ('s','ns'), # trasportar
    ('t','pt'), # Setiembre
    ('rt','r'), # libertar
    ('rr','r'), ('r', 'rr'), # vireinato
]
# Another common form is '...lo' -> 'lo ...' (e.g. cambiólo -> lo cambió);
# step1 handles that case separately instead of through this table.

# Substitutions that step1 categorizes as orthographic errors: a digit in the
# original replaced by a letter in the correction (presumably OCR confusions).
ERR_CHANGES = [
    ('6','ó'), ('6','o'), ('1','y'), ('0','o'), ('4','a')
]

# Originals (compared lower-cased) that step1 always records as surface forms.
SF_EXCEPTIONS = [
    "presidenta", "sr.", "q'", "ud.", "d.", "usté", "apuntaciones", "comprofesores",
    "diez y seis", "bien que", "de el", "costarrica", "hispano-america", "eleccionario",
    "medio dia", "fortísimos"
]

# Originals (compared lower-cased) that step1 always records as orthographic errors.
ERR_EXCEPTIONS = [
    
]

# Originals (compared lower-cased) that step1 always skips.
SKIP = [
    "sugestiones", "suerte", "camonel", "mas"
]

### Step 1.

In [7]:
def diff(text1, text2):
    """Summarize the edit operations that turn `text1` into `text2`.

    Returns a 5-tuple of lists built from difflib's opcodes:

    * added        - inserted fragments of `text2`
    * removed      - deleted fragments of `text1`
    * modified     - (old, new) fragment pairs for replacements
    * add_modified - for each insertion, (next char of text1, inserted
                     fragment plus the next char of text2); omitted when the
                     insertion happens at the end of `text1`
    * rmv_modified - for each deletion, (deleted fragment plus the next char of
                     text1, next char of text2); omitted when nothing follows
                     the deletion point in `text2`
    """
    matcher = difflib.SequenceMatcher(None, text1, text2)
    added, removed, modified = [], [], []
    add_modified, rmv_modified = [], []

    for tag, a_lo, a_hi, b_lo, b_hi in matcher.get_opcodes():
        if tag == 'insert':
            added.append(matcher.b[b_lo:b_hi])
            # One extra character on each side gives the insertion its context.
            anchor = matcher.a[a_lo:a_hi + 1]
            if anchor != '':
                add_modified.append((anchor, matcher.b[b_lo:b_hi + 1]))
        elif tag == 'delete':
            removed.append(matcher.a[a_lo:a_hi])
            follower = matcher.b[b_lo:b_hi + 1]
            if follower != '':
                rmv_modified.append((matcher.a[a_lo:a_hi + 1], follower))
        elif tag == 'replace':
            modified.append((matcher.a[a_lo:a_hi], matcher.b[b_lo:b_hi]))
        # 'equal' spans carry no change and are ignored

    return added, removed, modified, add_modified, rmv_modified

# One example per kind of change reported by diff()
print(diff('suscriciones', 'subscripciones'))   # insertions -> added / add_modified
print(diff('obscuro', 'oscuro'))                # deletion -> removed / rmv_modified
print(diff('ejercito', 'ejército'))             # replacement -> modified

(['b', 'p'], [], [], [('s', 'bs'), ('c', 'pc')], [])
([], ['b'], [], [], [('bs', 's')])
([], [], [('e', 'é')], [], [])


In [8]:
# Further diff() probes, including changes at the very end of the string
print(diff('trascurso', 'transcurso'))  # insertion inside the word
print(diff('do', 'donde'))  # insertion at the end: no add_modified pair
print(diff('y_', 'y'))  # deletion at the end: no rmv_modified pair
print(diff('yde', 'y'))  # multi-character deletion at the end
print(diff('republi', "repúblic"))  # replacement plus an insertion at the end
print(diff('nikel', "niquel"))  # one character replaced by two

(['n'], [], [], [('s', 'ns')], [])
(['nde'], [], [], [], [])
([], ['_'], [], [], [])
([], ['de'], [], [], [])
(['c'], [], [('u', 'ú')], [], [])
([], [], [('k', 'qu')], [], [])


In [9]:
def similarity(text1, text2):
    """Return difflib's similarity ratio between the two texts (1.0 when identical)."""
    matcher = difflib.SequenceMatcher(a=text1, b=text2)
    return matcher.ratio()

# Example: two different words that still share most of their characters
similarity('apropósito', 'apropiado')

0.7368421052631579

In [10]:
# When True, sfrm/errr/none only print the fix instead of recording it
JUST_PRINT = False
def sfrm(fix):
    """Categorize `fix` as a surface form: print it if JUST_PRINT is set, otherwise record it."""
    if not JUST_PRINT:
        add_to_surface(*fix['change'], fix['freq'])
        return
    print_fix(*fix['change'], fix['freq'], "SFRM")

def errr(fix):
    """Categorize `fix` as an orthographic error: print it if JUST_PRINT is set, otherwise record it."""
    if not JUST_PRINT:
        add_to_errors(fix)
        return
    print_fix(*fix['change'], fix['freq'], "ERRR")

def none(fix):
    """Skip `fix`: print it with the "NONE" tag if JUST_PRINT is set, otherwise do nothing."""
    if not JUST_PRINT:
        return
    print_fix(*fix['change'], fix['freq'], "NONE")

In [11]:
# Set to True to preview step 1: fixes are printed instead of being added to
# surface_forms / orthographic_errors
JUST_PRINT = False

def step1(just_print=None):
    """Categorize every entry of `fixes` as surface form, orthographic error, or skip.

    Parameters
    ----------
    just_print : bool or None, optional
        When given, temporarily overrides the global JUST_PRINT for this call
        (the previous value is restored afterwards). This lets
        `execute_prev_steps` call ``step1(False)``. With the default None the
        global JUST_PRINT is used unchanged.

    Categorization rules, in order:

    1. The lower-cased original is looked up in SF_EXCEPTIONS, ERR_EXCEPTIONS
       and SKIP.
    2. If original and correction have the same number of words, the
       character-level changes from `diff` are compared against SF_CHANGES
       (raw, then accent-stripped) and ERR_CHANGES. Fixes matching none of
       these are left uncategorized.
    3. Otherwise, a correction that moves a trailing fragment in front of the
       word (vióse, cambiólo, ...) is recorded as a surface form after
       trimming `fix['change']` in place. Remaining fixes become errors when
       they are frequent (freq >= 5) or similar enough (ratio > 0.75), and
       are skipped otherwise.

    `fixes` itself is not shortened. When JUST_PRINT is False, the global
    `current_step` is set to 1.
    """
    # `current_step` was previously misspelled here, so the real counter was
    # never advanced and execute_prev_steps() could not see that step 1 had run.
    global current_step, JUST_PRINT

    def only_known(changes, allowed, inserted, inserted_ctx, deleted, deleted_ctx):
        # True when there is at least one change, every change is in `allowed`,
        # and every insertion/deletion has a context pair (diff() omits the pair
        # for changes at the very end of the string).
        return (
            all(change in allowed for change in changes)
            and len(changes) != 0
            and len(inserted) == len(inserted_ctx)
            and len(deleted) == len(deleted_ctx)
        )

    def categorize(fix):
        wrong, good = fix['change'][0], fix['change'][1]
        wrong_l, good_l = wrong.lower(), good.lower()

        # 1. Hand-written exception lists take precedence over any heuristic.
        if wrong_l in SF_EXCEPTIONS:
            sfrm(fix)
            return
        if wrong_l in ERR_EXCEPTIONS:
            errr(fix)
            return
        if wrong_l in SKIP:
            none(fix)
            return

        added, removed, modified, add_modified, rmv_modified = diff(wrong_l, good_l)
        added_nz, removed_nz, modified_nz, add_modified_nz, rmv_modified_nz = diff(
            normalize(wrong_l), normalize(good_l)
        )

        # 2. Same number of words: decide from the character-level changes.
        if len(re.findall(r'\w+', wrong)) == len(re.findall(r'\w+', good)):
            allchanges = modified + add_modified + rmv_modified
            allchanges_nz = modified_nz + add_modified_nz + rmv_modified_nz
            if only_known(allchanges, SF_CHANGES, added, add_modified, removed, rmv_modified):
                sfrm(fix)
            elif only_known(allchanges_nz, SF_CHANGES, added_nz, add_modified_nz, removed_nz, rmv_modified_nz):
                # Known change only after stripping accents: accept it only when
                # the original contains nothing but letters and basic punctuation.
                if isuseful_chars(wrong):
                    sfrm(fix)
                else:
                    none(fix)
            elif (
                only_known(allchanges, ERR_CHANGES, added, add_modified, removed, rmv_modified)
                or only_known(allchanges_nz, ERR_CHANGES, added_nz, add_modified_nz, removed_nz, rmv_modified_nz)
            ):
                errr(fix)
            # anything else is intentionally left uncategorized
            return

        # 3. Different number of words.
        if (len(added_nz) == len(removed_nz) == 1) and (added_nz[0] == removed_nz[0] + " "):
            # The fragment removed from the word reappears, followed by a space,
            # as a separate word.
            ch0 = wrong.split()[0]
            ch1 = ' '.join(good.split()[:2])
            if normalize(ch0[:-len(added_nz[0])+1].lower()) == normalize(good.split()[1].lower()): # vióse, cambiólo, ...
                # Keep only the affected words before recording the surface form.
                fix['change'][0] = ch0
                fix['change'][1] = ch1
                sfrm(fix)
            else:
                none(fix)
        elif (fix['freq'] >= 5) or (similarity(*fix['change']) > 0.75):
            errr(fix)
        else:
            none(fix)

    saved_flag = JUST_PRINT
    if just_print is not None:
        JUST_PRINT = just_print
    try:
        execute_prev_steps(1)
        if not JUST_PRINT:
            current_step = 1
        for fix in fixes:
            categorize(fix)
    finally:
        # Never leave a temporary override behind, even if a fix raises.
        JUST_PRINT = saved_flag

# Run step 1 over all fixes, then report the resulting counts
step1()
status()

830951 fixes to check
37475 surface forms found
102689 orthographic errors found


In [12]:
def _total_frequency(entry):
    """Sum of the frequencies of all surface forms stored for one (word, forms) item."""
    return sum(entry[1].values())

# Order both tables from the most to the least frequent word
surface_forms = dict(sorted(surface_forms.items(), key=_total_frequency, reverse=True))
surface_forms_nacc = dict(sorted(surface_forms_nacc.items(), key=_total_frequency, reverse=True))

# Persist them as human-readable UTF-8 JSON
for _target, _table in (
    (SURFACE_FORMS_FILE_NONACC, surface_forms_nacc),
    (SURFACE_FORMS_FILE, surface_forms),
):
    with open(_target, 'w', encoding='utf-8') as outfile:
        json.dump(_table, outfile, indent=4, ensure_ascii=False)

In [13]:
# Sort the errors by the first two components of their key (document, start)
orthographic_errors = dict(
    sorted(orthographic_errors.items(), key=lambda item: item[0][:2])
)

# Regroup as one list per row of df; each entry also receives its offsets
errors = [[] for _ in range(len(df))]
for (doc_idx, start, end), entry in orthographic_errors.items():
    entry['idx1'] = start
    entry['idx2'] = end
    errors[doc_idx].append(entry)

with open(ORTHOGRAPHIC_ERRORS_FILE, 'w', encoding='utf-8') as outfile:
    json.dump(errors, outfile, indent=4, ensure_ascii=False)

[]

In [17]:
# Inspect the errors collected for document 2
errors[2]

[{'prv': 'oiga mosle á',
  'mod': 'oigámosle a',
  'ctx': 'exhibir la indicada muestra , oiga mosle á él en su prólogo que',
  'idx1': 1420,
  'idx2': 1432},
 {'prv': 'bellisímas traduccio. nes',
  'mod': 'bellísimas traducciones',
  'ctx': 'embargo están escritos . Las bellisímas traduccio . nes de Victor - Hugo por',
  'idx1': 2371,
  'idx2': 2396}]

In [18]:
# Text of document 2 before the collected corrections are applied
df.loc[2, "text"]

'Se meten á escribir: y no veo yo lejano el dia en que cada pueblo sacrifique un centenar de estos feroces vípedos, de la misma manera que los antiguos sacrificaban bueyes. Y en efecto, esta clase de escritores, que quieren suplir con la audacia In falta de ciencia y de conciencia, son perjudiciales, no por el daño que hacen á los pueblos, a las institucciones ó á las doctrinas, que nada tienen que temer de tan débiles enemigos, sino por los errores, por las preocupaciones que siembran entre las gentes sencillas, que reciben como articulo de fé todo lo que ven en letra de moldey. Concluiremos repitiendo con uno de nuestros eminentes estadistas, que: «Ningun hombre de bien te tenga segura su conciencia, debe inquietarse por los denuestos de la prensa, pues las calumnias que vierte entre nosotros, algo de honroso dicen á favor de las víctimas de sus furores, siendo la mejor venganza contra esas vocinglerias, mostrarse cada dia mas indiferente a ellas y mas honrado». : 001 4 -201 050G eIP

In [19]:
# Apply the collected corrections to each document, in list order.
# The offsets refer to the uncorrected text, so `shift` tracks how much
# shorter the text has become after the replacements applied so far.
for doc_idx, doc_errors in enumerate(errors):
    text = df.loc[doc_idx, "text"]
    shift = 0
    for err in doc_errors:
        original, replacement, _context = err['prv'], err['mod'], err['ctx']
        start = err['idx1'] - shift
        end = err['idx2'] - shift
        found = text[start:end]
        # Guard against misaligned offsets before touching the text
        assert original == found, f"ERROR at {doc_idx}. Expected {original} but got {found}"
        text = text[:start] + replacement + text[end:]
        shift += len(original) - len(replacement)
    df.loc[doc_idx, "text"] = text


In [20]:
# Text of document 2 after the corrections have been applied
df.loc[2, "text"]

'Se meten á escribir: y no veo yo lejano el dia en que cada pueblo sacrifique un centenar de estos feroces vípedos, de la misma manera que los antiguos sacrificaban bueyes. Y en efecto, esta clase de escritores, que quieren suplir con la audacia In falta de ciencia y de conciencia, son perjudiciales, no por el daño que hacen á los pueblos, a las institucciones ó á las doctrinas, que nada tienen que temer de tan débiles enemigos, sino por los errores, por las preocupaciones que siembran entre las gentes sencillas, que reciben como articulo de fé todo lo que ven en letra de moldey. Concluiremos repitiendo con uno de nuestros eminentes estadistas, que: «Ningun hombre de bien te tenga segura su conciencia, debe inquietarse por los denuestos de la prensa, pues las calumnias que vierte entre nosotros, algo de honroso dicen á favor de las víctimas de sus furores, siendo la mejor venganza contra esas vocinglerias, mostrarse cada dia mas indiferente a ellas y mas honrado». : 001 4 -201 050G eIP

In [21]:
# Export the corrected corpus both as tab-separated text (without the index) and as parquet
df.to_csv("../data/corrected-latam-xix.tsv", sep="\t", index=False)
df.to_parquet('../data/corrected-latam-xix.parquet')