In [1]:
"""
Re-used notebook from:
https://github.com/Shef-AIRE/llms_post-ocr_correction/blob/main/results.ipynb

"""


from datasets import Dataset
from IPython.core.getipython import get_ipython
from peft import AutoPeftModelForCausalLM
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, BitsAndBytesConfig, pipeline
import Levenshtein
import pandas as pd
import torch

In [2]:
def cer(prediction, target):
    """Compute the character error rate (CER) of ``prediction`` against ``target``.

    The CER is the Levenshtein edit distance between the two strings divided
    by the length of ``target``. It can exceed 1.0 when more edits are needed
    than the target has characters.

    Args:
        prediction: Text to evaluate.
        target: Reference text the prediction is compared against.

    Returns:
        The CER as a float. For an empty ``target`` the distance is normalised
        by 1 instead of 0, so the result is ``float(len(prediction))``
        (0.0 when both strings are empty).
    """
    if len(target) == 0:
        # The edit distance to an empty string is the number of characters to
        # delete; dividing by len(target) would raise ZeroDivisionError.
        return float(len(prediction))
    return Levenshtein.distance(prediction, target) / len(target)

def get_results(data, preds):
    """Build the results table with CER before and after correction.

    Args:
        data: Object exposing ``to_pandas()``; the returned frame must contain
            the columns 'OCR Text' and 'Ground Truth'.
        preds: Model corrections, one per row of ``data``, in row order.

    Returns:
        The frame from ``data.to_pandas()`` with four added columns:
        'Model Correction', 'old_CER' (OCR text vs. ground truth), 'new_CER'
        (model correction vs. ground truth) and 'CER_reduction' (relative
        reduction in percent). 'CER_reduction' is NaN where 'old_CER' is 0.
    """
    results = data.to_pandas()
    results['Model Correction'] = preds

    truths = results['Ground Truth']
    # Iterate over the columns directly rather than DataFrame.apply(axis=1),
    # so a zero-row frame still yields empty float columns.
    old_cer = pd.Series(
        [cer(text, truth) for text, truth in zip(results['OCR Text'], truths)],
        index=results.index,
        dtype=float,
    )
    new_cer = pd.Series(
        [cer(text, truth) for text, truth in zip(results['Model Correction'], truths)],
        index=results.index,
        dtype=float,
    )
    results['old_CER'] = old_cer
    results['new_CER'] = new_cer

    # The relative reduction is undefined when the OCR text already matches
    # the ground truth (old_CER == 0). Mask those rows to NaN so they do not
    # turn into +/-inf and distort aggregates such as the mean.
    results['CER_reduction'] = ((old_cer - new_cer) / old_cer.where(old_cer != 0)) * 100
    return results

## BART

Generate post-OCR corrections with BART and save them to the `results` folder of the project.

In [4]:
# Model directory and evaluation data for the BART run.
model_dir = 'model/***'
test = Dataset.from_pandas(pd.read_csv('***.csv'))

# Text2text generation pipeline on the CUDA device.
tokenizer = AutoTokenizer.from_pretrained(model_dir)
model = AutoModelForSeq2SeqLM.from_pretrained(model_dir)
generator = pipeline(
    'text2text-generation',
    model=model.to('cuda'),
    tokenizer=tokenizer,
    device='cuda',
    max_length=1024,
)

# Correct one sample at a time, keeping the generated text of the first
# returned candidate.
preds = [
    generator(sample['OCR Text'])[0]['generated_text']
    for sample in tqdm(test)
]

# Score the corrections and persist the table.
results = get_results(test, preds)
results.to_csv('results/***.csv', index=False)

100%|██████████| 446/446 [02:41<00:00,  2.76it/s]


## Llama 2

Generate post-OCR corrections with Llama 2 and save them to the `results` folder of the project.

In [3]:
# Load the 4-bit quantized PEFT model and its tokenizer, then generate one
# correction per test sample.
model_id = '***'
model_dir = f'***/model/{model_id}'

test = pd.read_csv('***.csv')
test = Dataset.from_pandas(test)

# 4-bit NF4 quantization with double quantization; bfloat16 compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_compute_dtype=torch.bfloat16,
)

model = AutoPeftModelForCausalLM.from_pretrained(
    model_dir,
    quantization_config=bnb_config,
    low_cpu_mem_usage=True,
    torch_dtype=torch.float16,
)
tokenizer = AutoTokenizer.from_pretrained(model_dir)

# State shared with the generation step below: `i` is the index of the next
# sample to process, `preds` collects the corrections in row order.
i = 0
preds = []

# Source of a single generation step, executed once per sample via
# IPython's run_cell. It reads test[i], appends to preds and advances i.
# NOTE: sampling is enabled (do_sample=True) and no seed is set here, so
# outputs can differ between runs.
cell = '''
prompt = f"""### Anweisung:
Korrigieren Sie die OCR-Fehler im bereitgestellten Text.

### Eingabe:
{test[i]['OCR Text']}

### Antwort:
"""

input_ids = tokenizer(prompt, max_length=1024, return_tensors='pt', truncation=True).input_ids.cuda()
with torch.inference_mode():
    outputs = model.generate(input_ids=input_ids, max_new_tokens=1024, do_sample=True, temperature=0.7, top_p=0.1, top_k=40)
# Keep only the newly generated tokens. Cutting the decoded text at
# len(prompt) is wrong whenever the prompt was truncated to max_length or
# does not decode back to exactly the same string.
new_tokens = outputs[:, input_ids.shape[1]:]
pred = tokenizer.batch_decode(new_tokens.detach().cpu().numpy(), skip_special_tokens=True)[0].strip()
preds.append(pred)
i += 1
'''

ipython = get_ipython()
for _ in tqdm(range(len(test))):
    # run_cell reports exceptions without raising them. Re-raise so a failed
    # step stops the loop instead of leaving `i` unchanged and `preds` short.
    ipython.run_cell(cell).raise_error()

results = get_results(test, preds)
results.to_csv('results/***.csv', index=False)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

  0%|          | 0/376 [00:00<?, ?it/s]Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)
100%|██████████| 376/376 [29:42<00:00,  4.74s/it]
