In [None]:
import os
import getpass
import subprocess

# Install the system packages needed by pytesseract (tesseract-ocr) and
# pdf2image (poppler-utils) through apt on POSIX systems.
if os.name == 'posix':
    password = getpass.getpass("Enter sudo password: ")
    # Hand the password to `sudo -S` on stdin instead of interpolating it into
    # a shell command line: no shell is involved, so special characters in the
    # password cannot break or inject into the command, and the password does
    # not show up in the process list.
    subprocess.run(
        ['sudo', '-S', 'apt', 'install', 'tesseract-ocr', 'poppler-utils', '-y'],
        input=password + '\n',
        text=True,
        check=False,  # like os.system, do not raise when the install fails
    )

In [None]:
! pip install pdf2image pytesseract opencv-python python-dotenv zipfile36
! pip install paddlepaddle==3.2.0 -i https://www.paddlepaddle.org.cn/packages/stable/cpu/
! pip install paddleocr
! pip install regex

In [None]:
from pdf2image import convert_from_path
import pytesseract
import os
import requests
import zipfile
import io
import json
import csv
from paddleocr import PaddleOCR
from dotenv import load_dotenv
from utils import (ocr_layout,
                   extract_vn_letters,
                   extract_cn_letters,
                   extract_letters_index,)


## Download Tessdata for **Tesseract**

In [None]:
def download_tessdata(lang):
    """Make sure ``<cwd>/tessdata/<lang>.traineddata`` exists, downloading it if needed.

    Args:
        lang: Tesseract language code used as the file stem, e.g. ``'vie'``.

    Returns:
        Path of the local ``tessdata`` directory (inside the current working
        directory).

    Raises:
        Whatever ``requests`` raises for a failed request, including the error
        raised by ``raise_for_status()`` on a non-success HTTP status. In that
        case no ``.traineddata`` file is left behind.
    """
    local_dir = os.path.join(os.getcwd(), 'tessdata')
    os.makedirs(local_dir, exist_ok=True)

    file_path = os.path.join(local_dir, f'{lang}.traineddata')

    # Check if file exists and is likely invalid (too small, e.g. HTML page)
    if os.path.exists(file_path) and os.path.getsize(file_path) < 1024 * 1024: # < 1MB
        print(f"Removing invalid file: {file_path}")
        os.remove(file_path)

    # Download only when the file does not exist yet
    if not os.path.exists(file_path):
        # Use 'raw' URL to get the actual file content, not the HTML page
        url = f"https://github.com/tesseract-ocr/tessdata/raw/main/{lang}.traineddata"
        print(f"Downloading {url}...")
        response = requests.get(url, timeout=60)
        # Fail loudly on 404 & co. instead of saving the error body as a model.
        response.raise_for_status()

        # Write to a temporary name first so an interrupted write never leaves
        # a truncated file under the final name.
        tmp_path = file_path + '.part'
        try:
            with open(tmp_path, 'wb') as f:
                f.write(response.content)
            os.replace(tmp_path, file_path)
        finally:
            if os.path.exists(tmp_path):
                os.remove(tmp_path)
        print(f"Download tessdata for {lang} successfully")
    else:
        print(f"Tessdata for {lang} existed")

    return local_dir

In [None]:
# Both calls download into the same local 'tessdata' folder and return its path.
tessdata_path = download_tessdata('vie')
# Tesseract's Simplified Chinese model is named 'chi_sim' (underscore); there is
# no 'chi-sim.traineddata' in the tessdata repository, so that URL returns 404.
tessdata_path = download_tessdata('chi_sim')

# Tesseract option pointing it at the tessdata folder created above
custom_config = f'--tessdata-dir "{tessdata_path}"'

## Download Poppler for **pdf2image**

In [None]:
def install_poppler():
    """Download and unpack the Poppler 25.11.0 Windows build into ``<cwd>/Poppler``.

    The archive is only downloaded when the expected ``bin`` directory is not
    already present, so re-running the notebook does not fetch it again.

    Returns:
        Path of ``<cwd>/Poppler/poppler-25.11.0/Library/bin``.

    Raises:
        Whatever ``requests`` raises for a failed request, including the error
        raised by ``raise_for_status()`` on a non-success HTTP status.
    """
    local_dir = os.path.join(os.getcwd(), 'Poppler')
    bin_path = os.path.join(local_dir, 'poppler-25.11.0', 'Library', 'bin')

    # Already unpacked by a previous run: skip the download entirely.
    if os.path.isdir(bin_path):
        return bin_path

    url = "https://github.com/oschwartz10612/poppler-windows/releases/download/v25.11.0-0/Release-25.11.0-0.zip"

    response = requests.get(url, timeout=60)
    response.raise_for_status()

    os.makedirs(local_dir, exist_ok=True)

    # The archive is unpacked straight from memory; nothing is written besides
    # the extracted files.
    with zipfile.ZipFile(io.BytesIO(response.content)) as z:
        z.extractall(local_dir)

    return bin_path

In [None]:
# Only Windows ('nt') uses the locally downloaded Poppler build; on any other
# platform no explicit Poppler path is set.
poppler_path = install_poppler() if os.name == 'nt' else None

## Run OCR

In [None]:
# Load variables from a local .env file (if any) into the environment.
load_dotenv()

if os.name == 'nt':
    # On Windows the tesseract executable location comes from TESSERACT_PATH.
    tesseract_path = os.getenv('TESSERACT_PATH')
    # Only override pytesseract's command when the variable is actually set;
    # assigning None would replace the default command with an unusable value.
    if tesseract_path:
        pytesseract.pytesseract.tesseract_cmd = tesseract_path
else:
    # Elsewhere, point Tesseract at the 'tessdata' folder in the working directory.
    os.environ['TESSDATA_PREFIX'] = os.path.join(os.getcwd(), 'tessdata')

In [None]:
# Page range of the PDF that gets rasterized.
start_page, end_page = 107, 108

# Render the selected pages to images at 400 dpi.
pages = convert_from_path(
    '../pdf1.pdf',
    dpi=400,
    first_page=start_page,
    last_page=end_page,
    poppler_path=poppler_path,
)

In [None]:
# PaddleOCR instance for Chinese ('ch'), with document-orientation
# classification, document unwarping and text-line orientation all disabled.
cn_ocr = PaddleOCR(
    use_textline_orientation=False,
    use_doc_unwarping=False,
    use_doc_orientation_classify=False,
    lang='ch',
)

In [None]:
csv_file = "result_ocr.csv"

# Per-page OCR output, collected separately for each of the two text streams.
full_text = {'CN': [], 'VN': []}

# Start the CSV from scratch with just the header row.
with open(csv_file, mode="w", newline='', encoding='utf-8-sig') as file:
    writer = csv.writer(file)
    writer.writerow(['src_id', 'cn_text', 'vi_text'])

for i, page_image in enumerate(pages):
    cn_text, vn_text = ocr_layout(page_image, cn_ocr, i)

    # Append the raw (not yet cleaned) OCR result for this page to the CSV
    # right away, reopening the file for every page.
    with open(csv_file, mode='a', newline='', encoding='utf-8-sig') as file:
        writer = csv.writer(file)
        writer.writerow([i, cn_text, vn_text])

    # vn_text holds both Vietnamese and pinyin text; it is processed later.
    full_text['VN'].append(vn_text)
    full_text['CN'].append(cn_text)

In [None]:
# Join the per-page Vietnamese-side text into one newline-separated string,
# then hand it to extract_vn_letters.
vn_page_texts = full_text['VN']
merged_text_vi = "\n".join(vn_page_texts)
letters_vi = extract_vn_letters(merged_text_vi)

In [None]:
# Join the per-page Chinese-side text into one newline-separated string,
# then hand it to extract_cn_letters.
cn_page_texts = full_text['CN']
merged_text_cn = "\n".join(cn_page_texts)
letters_cn = extract_cn_letters(merged_text_cn)

In [None]:
def save_to_json(data, filename):
    """Write the ``id``/``vi``/``zh`` fields of every record in ``data`` to a JSON file.

    Args:
        data: Object exposing ``to_dict('records')`` that yields one mapping
            per row (presumably a pandas DataFrame -- confirm against the
            caller). Each mapping must contain the keys ``'id'``, ``'vi'`` and
            ``'zh'``; any other keys are ignored.
        filename: Destination path. The file is overwritten and written as
            UTF-8 with 4-space indentation and non-ASCII characters kept as-is.
    """
    output_data = [
        {"id": record['id'], "vi": record['vi'], "zh": record['zh']}
        for record in data.to_dict('records')
    ]

    with open(filename, 'w', encoding='utf-8') as out_file:
        json.dump(output_data, out_file, indent=4, ensure_ascii=False)

    print("Save file json sucessfully")

In [None]:
working_dir = os.getcwd()

# Both exports are written one level above the working directory.
json_path = os.path.join(working_dir, "../pdf1.json")
csv_path = os.path.join(working_dir, "../pdf1.csv")

extracted_letters = extract_letters_index(letters_vi, letters_cn, start_num=1, end_num=500)

save_to_json(extracted_letters, json_path)
extracted_letters.to_csv(csv_path, index=False)