In [5]:
import re
import unicodedata

import Levenshtein

def normalize_text(text: str) -> str:
    """
    Normalize text so that two strings can be compared fairly.

    Steps, in order:
    - Compose the text to Unicode NFC, so that a letter written as a base
      character plus combining marks (common in Vietnamese) compares equal
      to the same letter written as a single precomposed code point.
    - Collapse every run of whitespace into a single space and drop
      leading/trailing whitespace.
    - Convert to lowercase.

    Args:
        text: The raw string to normalize.

    Returns:
        The normalized string; an empty string if ``text`` is empty or
        contains only whitespace.
    """
    composed = unicodedata.normalize('NFC', text)
    # str.split() with no argument already ignores leading/trailing
    # whitespace, so no explicit strip() is needed before re-joining.
    return ' '.join(composed.split()).lower()

def calculate_cer(ground_truth: str, ocr_result: str) -> float:
    """
    Compute the Character Error Rate (CER) of an OCR result.

    CER = (S + D + I) / N, where S, D and I are the numbers of character
    substitutions, deletions and insertions (the Levenshtein distance) and
    N is the number of characters in the normalized ground truth. Both
    inputs are passed through ``normalize_text`` before comparison.

    Args:
        ground_truth: The reference (correct) text.
        ocr_result: The text produced by the OCR system.

    Returns:
        The CER as a float. It can exceed 1.0 when the OCR output needs
        more edits than the ground truth has characters. If the normalized
        ground truth is empty, returns 0.0 when the OCR output is empty as
        well (a perfect match) and ``float('inf')`` otherwise.
    """
    gt_norm = normalize_text(ground_truth)
    ocr_norm = normalize_text(ocr_result)

    # Guard the division first: an empty reference has no characters to
    # divide by, and empty-vs-empty is a perfect match, not an infinite error.
    if not gt_norm:
        return 0.0 if not ocr_norm else float('inf')

    return Levenshtein.distance(gt_norm, ocr_norm) / len(gt_norm)

def calculate_wer(ground_truth: str, ocr_result: str) -> float:
    """
    Compute the Word Error Rate (WER) of an OCR result.

    WER = (S + D + I) / N, where S, D and I are the numbers of word
    substitutions, deletions and insertions (the word-level edit distance)
    and N is the number of words in the normalized ground truth. Both
    inputs are passed through ``normalize_text`` and then split on
    whitespace into words.

    Args:
        ground_truth: The reference (correct) text.
        ocr_result: The text produced by the OCR system.

    Returns:
        The WER as a float. It can exceed 1.0 when the OCR output needs
        more edits than the ground truth has words. If the ground truth
        has no words, returns 0.0 when the OCR output has no words either
        (a perfect match) and ``float('inf')`` otherwise.
    """
    gt_words = normalize_text(ground_truth).split()
    ocr_words = normalize_text(ocr_result).split()

    # Guard the division first: empty-vs-empty is a perfect match, while any
    # output against an empty reference has an unbounded error rate.
    if not gt_words:
        return 0.0 if not ocr_words else float('inf')

    # Edit distance by dynamic programming. Only the previous row of the
    # table is ever read, so two rows are kept instead of the full
    # (n + 1) x (m + 1) matrix.
    # prev_row[j] = distance between the first i - 1 reference words and the
    # first j OCR words; the initial row is the cost of j insertions.
    prev_row = list(range(len(ocr_words) + 1))
    for i, gt_word in enumerate(gt_words, start=1):
        # Column 0: turning i reference words into nothing costs i deletions.
        curr_row = [i]
        for j, ocr_word in enumerate(ocr_words, start=1):
            if gt_word == ocr_word:
                cost = prev_row[j - 1]
            else:
                cost = 1 + min(prev_row[j],       # deletion
                               curr_row[j - 1],   # insertion
                               prev_row[j - 1])   # substitution
            curr_row.append(cost)
        prev_row = curr_row

    return prev_row[-1] / len(gt_words)

# Usage example: the OCR output merges two words ("là một" -> "làmột"),
# and both strings carry irregular whitespace that normalization removes.
if __name__ == '__main__':
    ground_truth_text = "  Đây    là   một   ví dụ  về  văn bản    tiếng Việt.  "
    ocr_output_text = "  Đây làmột   ví dụ  về  văn bản    tiếng Việt.  "

    # Evaluate both metrics (CER first, then WER) before printing anything.
    cer_value, wer_value = (
        metric(ground_truth_text, ocr_output_text)
        for metric in (calculate_cer, calculate_wer)
    )

    print(f"CER: {cer_value:.2f}")
    print(f"WER: {wer_value:.2f}")


CER: 0.03
WER: 0.20


In [1]:
!pip install python-Levenshtein

Defaulting to user installation because normal site-packages is not writeable
Collecting python-Levenshtein
  Downloading python_levenshtein-0.27.1-py3-none-any.whl.metadata (3.7 kB)
Collecting Levenshtein==0.27.1 (from python-Levenshtein)
  Downloading levenshtein-0.27.1-cp312-cp312-win_amd64.whl.metadata (3.6 kB)
Downloading python_levenshtein-0.27.1-py3-none-any.whl (9.4 kB)
Downloading levenshtein-0.27.1-cp312-cp312-win_amd64.whl (100 kB)
Installing collected packages: Levenshtein, python-Levenshtein
Successfully installed Levenshtein-0.27.1 python-Levenshtein-0.27.1



[notice] A new release of pip is available: 24.2 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [1]:
!pip install neo4j==1.7.3 neobolt==1.7.13

Defaulting to user installation because normal site-packages is not writeable
Collecting neo4j==1.7.3
  Using cached neo4j-1.7.3.tar.gz (24 kB)
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): finished with status 'done'
Collecting neobolt==1.7.13
  Using cached neobolt-1.7.13.tar.gz (183 kB)
  Installing build dependencies: started
  Installing build dependencies: finished with status 'done'
  Getting requirements to build wheel: started
  Getting requirements to build wheel: finished with status 'done'
  Preparing metadata (pyproject.toml): started
  Preparing metadata (pyproject.toml): finished with status 'done'
Collecting neotime~=1.7.1 (from neo4j==1.7.3)
  Using cached neotime-1.7.4.tar.gz (17 kB)
  Installing build dependencies: 


[notice] A new release of pip is available: 24.2 -> 25.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip
