# Step 3. Manual-code categorization of surface-forms and errors 

Among the corrections made by the LLM in the previous step

In [57]:
import json
import unicodedata
import os
import re
import pandas as pd
from string import punctuation
from difflib import SequenceMatcher
punctuation += " "

In [58]:
def normalize(c): return unicodedata.normalize('NFKD', c).encode('ASCII', 'ignore').decode()

def useful_chars(string):
    return re.sub(r'[^a-zA-ZÀ-ÿ]', '', string)

def count_accent_changes(str1, str2):
    if len(str1) != len(str2):
        return -1

    changes = 0
    for char1, char2 in zip(str1, str2):
        if char1 != char2 and normalize(char1) == normalize(char2):
            changes += 1
    return changes

def categorize_char(c):
    if c.isdigit():
        return 'd'
    elif c.isalpha():
        return 'a'
    elif c in punctuation:
        return 's'
    else:
        return 'u'

def nonaccent_changes(str1, str2):
    #str1 = useful_chars(str1)
    #str2 = useful_chars(str2)
    if len(str1) != len(str2):
        return [], []

    typechanges = [] # d (digit), a (alpha character), s (special), u (unknown). All changes are a couple types such as "dc" (digit changed to consonant)
    changes = []
    for char1, char2 in zip(str1, str2):
        nchar1 = normalize(char1)
        nchar2 = normalize(char2)
        if char1 != char2 and nchar1 != nchar2:
            typechanges.append(categorize_char(char1) + categorize_char(char2))
            changes.append(char1+char2)

    return typechanges, changes

print(count_accent_changes("holá", "hola"), nonaccent_changes("holá", "hola"))   # Output: 1 (á -> a)
print(count_accent_changes("feñhá", "fenfa"), nonaccent_changes("feñhá", "fenfa")) # Output: 2 (ñ -> n, á -> a) the rest of the changes are not due to the accent
print(count_accent_changes("olh", "ola"), nonaccent_changes("olh", "ola"))     # Output: 0 (no accent changes)

1 ([], [])
2 (['aa'], ['hf'])
0 (['aa'], ['ha'])


*Go back to this point to reset all the applied categorizations and start over the debugging process, when required.*

In [59]:
CORRECTIONS_FILE = "./correctionsLatam.json"
SURFACE_FORMS_FILE = "../data/surfaceForms.json"
SURFACE_FORMS_FILE_NONACC = "../data/surfaceFormsNonAccents.json"
ORTHOGRAPHIC_ERRORS_FILE = "../data/orthographicErrors.json"
COLOR_PRINTING = False # for large corpus, set it to False
current_step = 0

df = pd.read_parquet("../data/pre-corrected-latam-xix.parquet")

with open(CORRECTIONS_FILE, 'r') as infile:
    fixes = json.load(infile)

if not os.path.exists(SURFACE_FORMS_FILE):
    with open(SURFACE_FORMS_FILE, 'w') as outfile:
        json.dump({}, outfile)
surface_forms = dict()

if not os.path.exists(SURFACE_FORMS_FILE_NONACC):
    with open(SURFACE_FORMS_FILE_NONACC, 'w') as outfile:
        json.dump({}, outfile)
surface_forms_nacc = dict()

if not os.path.exists(ORTHOGRAPHIC_ERRORS_FILE):
    with open(ORTHOGRAPHIC_ERRORS_FILE, 'w') as outfile:
        json.dump([], outfile)
orthographic_errors = dict() # will be changed to list before saving

In [60]:
df.loc[0, "text"]

'La publicacion del Oso se harà dos veces cada semana, y constará de un pliego en cuarto ; ofreciendo à mas sus redactores, dar los gravados oportunos, siempre que lo exija el asunto de que trate. Redactado por un Num. 8. TEMA del Periodico. POLITICA MILITAR. OCTAVA SESION. Abierta la sesion á las dore y un minuto de la noche , 25 de Febrero de 1845 , con asistencia de todos los Señores Representantes, se leyó y aprobó la acta de la Asamblea anterior , ménos en lo tocante à la torre del Convento de Santo Domingo, punto que quedó para ventilarse en mejor ocasion. Enseguida se dió cuenta de una nota del Ejecutivo , referente à que urjía la necesidad de organizar un Ejército ; pues decia el Excmo. Decano: - "Un poder sin bayonetas vale tanto como un cero puesto á la izquierda."'

In [61]:
def add_to_surface(wrong, good, freq):
    sf_t, real_t = wrong.lower(), good.lower()
    sf_ws, real_ws = re.findall(r'\w+', sf_t), re.findall(r'\w+', real_t)

    if len(sf_ws) == len(real_ws):
        for sf,real in zip(sf_ws, real_ws):
            if sf != real:
                numaccentchanges = count_accent_changes(real, sf)
                surface_forms[real] = surface_forms.get(real, dict())
                surface_forms[real][sf] = surface_forms[real].get(sf, 0) + freq
                if numaccentchanges < 1:
                    surface_forms_nacc[real] = surface_forms_nacc.get(real, dict())
                    surface_forms_nacc[real][sf] = surface_forms_nacc[real].get(sf, 0) + freq

    else:
        if sf_t != real_t:
            numaccentchanges = count_accent_changes(real_t, sf_t)
            surface_forms[real_t] = surface_forms.get(real_t, dict())
            surface_forms[real_t][sf_t] = surface_forms[real_t].get(sf_t, 0) + freq
            if numaccentchanges < 1:
                surface_forms_nacc[real_t] = surface_forms_nacc.get(real_t, dict())
                surface_forms_nacc[real_t][sf_t] = surface_forms_nacc[real_t].get(sf_t, 0) + freq

def add_to_errors(fix):
    for idx,widx1,widx2,ctx in fix['usages']:
        orthographic_errors[(idx,widx1,widx2)] = {
            "prv": fix['change'][0],
            "mod": fix['change'][1],
            "ctx": ctx
        }

def print_fix(wrong, good, freq, category):
    if COLOR_PRINTING:
        if category == "SFRM": 
            category = f"[\033[93m{category}"
        elif category == "ERRR":
            category = f"[\033[95m{category}"
        else:
            category = f"[\033[90m{category}"
        print(f"{category}\033[0m] (\033[92m{freq}\033[0m) \033[91m{wrong}\033[0m - \033[94m{good}\033[0m")
    else:
        print(f"[{category}] ({freq}) {wrong} - {good}")

def execute_prev_steps(step):
    if step <= current_step + 1:
        return
    elif step - 2 > current_step:
        raise Exception(f"Execute step {step-2} first")
    else:
        if step >= 2:
            i = step-1
            print(f"Executing step {i}...")
            globals()[f"step{i}"](False)
            status()

status = lambda: print(f"{len(fixes)} fixes to check\n{len(surface_forms)} surface forms found\n{len(orthographic_errors)} orthographic errors found")
status()

114411 fixes to check
0 surface forms found
0 orthographic errors found


## Manually-written steps for categorization:

**NOTES:**

- Each correction made by the LLM may be categorized as:
  * **surface form**: with the method `add_to_surface`
  * **orthographic error**: with the method `add_to_errors`
  * **none**: just `pass` (skip) the correction

- Each step has a `JUST_PRINT` variable; if it's True, the changes won't affect the variables. For debugging a particular step:
  * First, **ALWAYS run all the previous steps with the `JUST_PRINT` variable set to False**
  * Then, set the `JUST_PRINT` variable of the step you want to debug to True and run the step
  * If you run the next step without running the previous one (with `JUST_PRINT`=False), it'll run the previous automatically.

### 1. ALL texts that are the same except for characters with an accent, such as "barbaros" -> "bárbaros"
**surface forms**

In [62]:
JUST_PRINT = False

def step1(just_print):
    global current_step
    execute_prev_steps(1)
    if just_print == False: current_step = 1

    idx_remove = []
    for i,fix in enumerate(fixes):
        numaccentchanges = count_accent_changes(fix['change'][0], fix['change'][1])
        _, changes = nonaccent_changes(fix['change'][0].lower(), fix['change'][1].lower())
        if ( True
            ) and (len(re.findall(r'\w+', fix['change'][0])) == len(re.findall(r'\w+', fix['change'][1])) # have the same length
            ) and (numaccentchanges >= 1 # has any different accent letter
            ) and (len(changes) == 0 # is the same word
        ):
            for s,c in zip(re.findall(r'\w+', fix['change'][0]), re.findall(r'\w+', fix['change'][1])):
                if (len(s) == len(c) != 0) and (s != c) and (s not in punctuation):
                    if just_print: print_fix(*fix['change'], fix['freq'], "SFRM")
                    else: add_to_surface(*fix['change'], fix['freq'])
            idx_remove.append(i)

    if just_print: print(len(idx_remove))
    else: 
        for i in reversed(idx_remove): fixes.pop(i)

step1(JUST_PRINT)
status()

99392 fixes to check
9486 surface forms found
0 orthographic errors found


### 2. SPECIFIC type of changes
**surface forms or orthographic errors**

In [63]:
JUST_PRINT = False

def step2(just_print):
    global current_step
    execute_prev_steps(2)
    if just_print == False: current_step = 2

    idx_remove = []
    sforms_specifications = ['iy', 'jg', 'sx', 'xj', 'vb', 'bv', 'sz', 'zc', 'zs', 'sx', 'yi', 'cs', 'qc', 'gj', 'xs', 'sc', 'jx']
    errors_specifications = ['6ó', '1y', '0o', '4a']
    specifications = sforms_specifications+errors_specifications

    for i,fix in enumerate(fixes):
        _, changes = nonaccent_changes(fix['change'][0].lower(), fix['change'][1].lower())
        if ( True
            ) and (len(re.findall(r'\w+', fix['change'][0])) == len(re.findall(r'\w+', fix['change'][1])) # have the same length
            ) and (any(s in changes for s in specifications) # has any change in the list such as 'soi' -> 'soy'
        ):
            for s,c in zip(re.findall(r'\w+', fix['change'][0]), re.findall(r'\w+', fix['change'][1])):
                if (len(s) == len(c) != 0) and (s != c) and (s not in punctuation):
                    tc, ac = nonaccent_changes(s.lower(),c.lower())
                    if all([k in sforms_specifications for k in ac]):
                        if just_print: print_fix(s, c, fix['freq'], "SFRM")
                        else: add_to_surface(s,c,fix['freq'])
                    elif all([k in errors_specifications for k in ac]):
                        if just_print: print_fix(s, c, fix['freq'], "ERRR")
                        else: add_to_errors(fix)
                    else:
                        if just_print: print_fix(*fix['change'], fix['freq'], "NONE")
                        else: pass
            idx_remove.append(i)

    if just_print: print(len(idx_remove))
    else: 
        for i in reversed(idx_remove): fixes.pop(i)

step2(JUST_PRINT)
status()

94847 fixes to check
11329 surface forms found
374 orthographic errors found


### 3. ALL texts where there are letter-letter changes, and have same length
**orthographic errors**

In [64]:
JUST_PRINT = False

def step3(just_print):
    global current_step
    execute_prev_steps(3)
    if just_print == False: current_step = 3

    idx_remove = []
    nonconsider = ["sugestiones", "suerte"] # neither surface-forms nor orthographic-errors 
    exceptions = ["Presidenta"]

    for i,fix in enumerate(fixes):
        typechanges, _ = nonaccent_changes(fix['change'][0].lower(), fix['change'][1].lower())
        if fix['change'][0] in nonconsider:
            if just_print: print_fix(*fix['change'], fix['freq'], "NONE")
            else: pass
        elif fix['change'][0] in exceptions:
            if just_print: print_fix(*fix['change'], fix['freq'], "SFRM")
            else: add_to_surface(*fix['change'], fix['freq'])
        elif ( True
            ) and (len(re.findall(r'\w+', fix['change'][0])) == len(re.findall(r'\w+', fix['change'][1])) # have the same length
            ) and ('aa' in typechanges # has a letter-letter change
        ):
            if just_print: print_fix(*fix['change'], fix['freq'], "ERRR")
            else: add_to_errors(fix)
            idx_remove.append(i)

    if just_print: print(len(idx_remove))
    else: 
        for i in reversed(idx_remove): fixes.pop(i)

step3(JUST_PRINT)
status()

67246 fixes to check
11329 surface forms found
39454 orthographic errors found


### 4. The remaining changes

Finally, the remaining rows may be very different. A good way to know if the corrections are mostly truthy, it's possible to compute a similarity ratio between two texts, using difflib:

In [65]:
def similarity(text1, text2):
    return SequenceMatcher(None, text1, text2).ratio()

similarity("que", "como")

0.0

### 4. ALL texts where there are non letter-letter changes, or have different length
**orthographic errors**

In [66]:
JUST_PRINT = False
MIN_SIMILARITY = .55

def step4(just_print):
    global current_step
    execute_prev_steps(4)
    if just_print == False: current_step = 4

    idx_remove = []
    nonconsider = []
    exceptions = ['trasformación', 'suscriciones', 'trasportar', 'trasmitir', 'trascurso', 'trasparente', 
                'trasporte', 'trascurrido', 'Setiembre', 'trasparenta', 'Hispano-America', 'Dirigióse', 
                'Viólo', 'Costarrica', 'COSTARRICA', 'distinguióse', 'comprofesores', 'diez y seis',
                'bien que', 'de el', 'D.', 'suscritores', "q'", 'usté', 'incorporóse', 'estraños', 'chicuelos',
                'trascurría', 'trasmita', 'Ud.', 'Sr.', 'trasportes', 'trasmisión', 'libertar', 'nikel', 'kioskos',
                'apuntaciones', 'trasformaciones', 'boulevares'] # TODO: continue...

    for i,fix in enumerate(fixes):
        if fix['change'][0] in nonconsider:
            if just_print: print_fix(*fix['change'], fix['freq'], "NONE")
            else: pass
        elif fix['change'][0] in exceptions:
            if just_print: print_fix(*fix['change'], fix['freq'], "SFRM")
            else: add_to_surface(*fix['change'], fix['freq'])
        else:
            sim = similarity(fix['change'][0].lower(), fix['change'][1].lower())
            if ( True
                and (
                       (len(re.findall(r'\w+', fix['change'][0])) > 5) 
                    or (len(re.findall(r'\w+', fix['change'][1])) > 5) 
                    or  sim < MIN_SIMILARITY 
                    )
                and (fix['freq'] <= 3)
               ):
                if just_print: print_fix(*fix['change'], fix['freq'], "NONE")
                else: pass
            else:
                if just_print: print_fix(*fix['change'], fix['freq'], "ERRR")
                else: add_to_errors(fix)
        idx_remove.append(i)

    if just_print: print(len(idx_remove))
    else: 
        for i in reversed(idx_remove): fixes.pop(i)

step4(JUST_PRINT)
status()

0 fixes to check
11361 surface forms found
96912 orthographic errors found


In [67]:
surface_forms = {k: v for k,v in sorted(surface_forms.items(), key=lambda x: sum(x[1].values()), reverse=True)}
surface_forms_nacc = {k: v for k,v in sorted(surface_forms_nacc.items(), key=lambda x: sum(x[1].values()), reverse=True)}

with open(SURFACE_FORMS_FILE_NONACC, 'w', encoding='utf-8') as outfile:
    json.dump(surface_forms_nacc, outfile, indent=4, ensure_ascii=False)

with open(SURFACE_FORMS_FILE, 'w', encoding='utf-8') as outfile:
    json.dump(surface_forms, outfile, indent=4, ensure_ascii=False)

In [68]:
orthographic_errors = dict(sorted(orthographic_errors.items(), key=lambda item: (item[0][0], item[0][1])))

errors = [[] for _ in range(len(df))]
for k,v in orthographic_errors.items():
    v['idx1'] = k[1]
    v['idx2'] = k[2]
    errors[k[0]].append(v)

with open(ORTHOGRAPHIC_ERRORS_FILE, 'w', encoding='utf-8') as outfile:
    json.dump(errors, outfile, indent=4, ensure_ascii=False)

errors[0]

[{'prv': 'à mas',
  'mod': 'además',
  'ctx': 'pliego en cuarto ; ofreciendo à mas sus redactores , dar los',
  'idx1': 101,
  'idx2': 106},
 {'prv': 'dore',
  'mod': 'dos',
  'ctx': 'Abierta la sesion á las dore y un minuto de la',
  'idx1': 298,
  'idx2': 302}]

In [69]:
df.loc[0, "text"]

'La publicacion del Oso se harà dos veces cada semana, y constará de un pliego en cuarto ; ofreciendo à mas sus redactores, dar los gravados oportunos, siempre que lo exija el asunto de que trate. Redactado por un Num. 8. TEMA del Periodico. POLITICA MILITAR. OCTAVA SESION. Abierta la sesion á las dore y un minuto de la noche , 25 de Febrero de 1845 , con asistencia de todos los Señores Representantes, se leyó y aprobó la acta de la Asamblea anterior , ménos en lo tocante à la torre del Convento de Santo Domingo, punto que quedó para ventilarse en mejor ocasion. Enseguida se dió cuenta de una nota del Ejecutivo , referente à que urjía la necesidad de organizar un Ejército ; pues decia el Excmo. Decano: - "Un poder sin bayonetas vale tanto como un cero puesto á la izquierda."'

In [70]:
for i,e in enumerate(errors):
    text = df.loc[i, "text"]
    chdif = 0
    for v in e:
        p = v['prv']
        m = v['mod']
        ctx = v['ctx']
        idx1 = v['idx1'] - chdif
        idx2 = v['idx2'] - chdif
        assert p == text[idx1:idx2], f"ERROR at {i}. Expected {p} but got {text[idx1:idx2]}"
        text = text[:idx1] + m + text[idx2:]
        chdif += len(p) - len(m)
    df.loc[i, "text"] = text


In [71]:
df.loc[0, "text"]

'La publicacion del Oso se harà dos veces cada semana, y constará de un pliego en cuarto ; ofreciendo además sus redactores, dar los gravados oportunos, siempre que lo exija el asunto de que trate. Redactado por un Num. 8. TEMA del Periodico. POLITICA MILITAR. OCTAVA SESION. Abierta la sesion á las dos y un minuto de la noche , 25 de Febrero de 1845 , con asistencia de todos los Señores Representantes, se leyó y aprobó la acta de la Asamblea anterior , ménos en lo tocante à la torre del Convento de Santo Domingo, punto que quedó para ventilarse en mejor ocasion. Enseguida se dió cuenta de una nota del Ejecutivo , referente à que urjía la necesidad de organizar un Ejército ; pues decia el Excmo. Decano: - "Un poder sin bayonetas vale tanto como un cero puesto á la izquierda."'

In [72]:
df.to_csv("../data/corrected-latam-xix.tsv", sep="\t", index=False)
df.to_parquet('../data/corrected-latam-xix.parquet')