In [6]:
import json
import re

# Input/output file paths
# Both are hard-coded absolute paths under /mnt/c (this looks like a Windows
# drive seen from WSL — confirm), so the script only runs as-is on a machine
# with this exact directory layout.
# Input: read line by line as JSON; every record must have a "text" key.
input_path = "/mnt/c/Users/Legion/Documents/jimmy tran/Automated-invoice-processing-system/data/invoices.jsonl"
# Output: the same records with a "label" list added, one JSON object per line.
output_path = "/mnt/c/Users/Legion/Documents/jimmy tran/Automated-invoice-processing-system/data/labeled_invoices_fixed.jsonl"

# Regex patterns supporting Vietnamese + optional English + full prefix included
#
# Each entry is a (pattern, label) pair. Capture group 1 is the span that gets
# labeled; it covers the field prefix (e.g. "Mã số thuế:") as well as the value.
# The optional `(\(.*?\))?` group allows a parenthesised note between the
# Vietnamese field name and the colon — presumably the English translation,
# e.g. "(Tax code)"; confirm against the invoice data.
# Matching is case-sensitive. `\s*` also matches newlines, so a prefix and its
# value may sit on different lines. The processing loop applies these with
# re.MULTILINE, which is what makes `^`/`$` in the second entry line anchors.
label_patterns = [
    # "Đơn vị bán hàng" (selling unit) followed by a name starting with
    # "CÔNG TY" (company), up to the end of that line.
    (r"(Đơn vị bán hàng:\s*CÔNG TY[^\n]+)", "COMPANY_NAME"),
    # A whole line that starts with "CÔNG TY", without any field prefix.
    (r"^(CÔNG TY[^\n]+)$", "COMPANY_NAME"),
    # "Mã số thuế" (tax code) followed by digits only. The processing loop
    # re-labels the first/second match as SELLER_TAX_CODE / BUYER_TAX_CODE.
    (r"(Mã số thuế\s*(\(.*?\))?\s*:\s*\d+)", "TAX_CODE"),
    # "Ký hiệu" (symbol/serial): uppercase ASCII letters and digits.
    (r"(Ký hiệu\s*(\(.*?\))?\s*:\s*[A-Z0-9]+)", "SERIAL"),
    # "Số" (number): digits only.
    (r"(Số\s*(\(.*?\))?\s*:\s*\d+)", "INVOICE_NO"),
    # "Tên đơn vị" (unit name): everything up to the end of the line.
    (r"(Tên đơn vị\s*(\(.*?\))?\s*:\s*[^\n]+)", "BUYER_NAME"),
    # "Hình thức thanh toán" (payment method): exactly one of "TM/CK",
    # "Tiền mặt" (cash) or "Chuyển khoản" (bank transfer).
    (r"(Hình thức thanh toán\s*(\(.*?\))?\s*:\s*(TM/CK|Tiền mặt|Chuyển khoản))", "PAYMENT_METHOD"),
    # Amount fields accept digits and dots only (e.g. "1.000.000").
    # "Cộng tiền hàng" (goods subtotal).
    (r"(Cộng tiền hàng\s*(\(.*?\))?\s*:\s*[\d\.]+)", "TOTAL_AMOUNT"),
    # "Tiền thuế GTGT" (VAT amount).
    (r"(Tiền thuế GTGT\s*(\(.*?\))?\s*:\s*[\d\.]+)", "VAT_AMOUNT"),
    # "Tổng tiền thanh toán" (total payment).
    (r"(Tổng tiền thanh toán\s*(\(.*?\))?\s*:\s*[\d\.]+)", "TOTAL_PAYMENT"),
]

# Processing
# Reads the input JSONL record by record, finds every label pattern in the
# record's "text", and stores the resulting [start, end, label] character
# spans under the record's "label" key. Labeled records accumulate in
# output_data for the writer below.

# Compile each pattern once rather than per record. MULTILINE makes the
# `^`/`$` anchors in label_patterns match at line boundaries inside a text.
compiled_patterns = [
    (re.compile(pattern, flags=re.MULTILINE), label)
    for pattern, label in label_patterns
]

output_data = []

with open(input_path, "r", encoding="utf-8") as f:
    for line in f:
        # Blank lines are not valid JSON; skip them instead of letting
        # json.loads abort the whole run.
        if not line.strip():
            continue

        data = json.loads(line)
        text = data["text"]
        labels = []
        matched_tax_codes = []
        # Spans already accepted for each label, used to reject duplicates.
        spans_by_label = {}

        for regex, label in compiled_patterns:
            accepted = spans_by_label.setdefault(label, [])
            for match in regex.finditer(text):
                start, end = match.span(1)

                # Several patterns can share one label (COMPANY_NAME). When a
                # later pattern hits text already covered by an earlier one,
                # keep the earlier span and drop the overlapping duplicate.
                # Spans with different labels are never filtered.
                if any(start < prev_end and prev_start < end
                       for prev_start, prev_end in accepted):
                    continue
                accepted.append((start, end))

                if label == "TAX_CODE":
                    matched_tax_codes.append((start, end))
                else:
                    labels.append([start, end, label])

        # Assign TAX_CODEs explicitly: the first match in the text is treated
        # as the seller's, the second as the buyer's. zip() stops after two,
        # so any further tax codes are left unlabeled.
        tax_code_roles = ("SELLER_TAX_CODE", "BUYER_TAX_CODE")
        for (start, end), role in zip(matched_tax_codes, tax_code_roles):
            labels.append([start, end, role])

        data["label"] = labels
        output_data.append(data)

# Write labeled output: one JSON object per line, with non-ASCII characters
# written as-is rather than as \u escapes.
with open(output_path, "w", encoding="utf-8") as fout:
    serialized = (json.dumps(record, ensure_ascii=False) for record in output_data)
    fout.writelines(f"{row}\n" for row in serialized)

print(f"✅ Done! Output saved to: {output_path}")


✅ Done! Output saved to: /mnt/c/Users/Legion/Documents/jimmy tran/Automated-invoice-processing-system/data/labeled_invoices_fixed.jsonl
