#Install package

In [None]:
!pip install torch torchvision torchaudio
!pip install surya-ocr
!pip install paddlepaddle paddleocr openpyxl
!pip install shapely
!pip install easyocr
!pip install paddleocr

#Initialize image path and output path


In [None]:
!git clone https://github.com/tahoangquan2/voting-system.git

In [None]:
import os
image_folder = '/content/voting-system/input'
output_folder= '/content/voting-system/output'
# os.makedirs(output_folder, exist_ok=True)

#Collect PaddleOCR Model

In [None]:
from paddleocr import PaddleOCR, draw_ocr
import os
import pandas as pd

In [None]:
# Initialize the PaddleOCR detector
ocr = PaddleOCR(use_angle_cls=True, lang='en', use_gpu = True)  # Adjust language as needed


In [None]:
# Collect the input images. The 1-based position in this list is used as the
# page number everywhere below. os.listdir() returns entries in arbitrary
# order and nothing is sorted here, so later cells must reuse `images_files`.
images_files = [f for f in os.listdir(image_folder) if f.lower().endswith(('.jpeg', '.jpg', '.png', '.bmp'))]


# Run PaddleOCR on every image and keep one record per detected text line.
results = []

for idx, image_name in enumerate(images_files):
    image_path = os.path.join(image_folder, image_name)

    # Perform OCR detection
    ocr_result = ocr.ocr(image_path, cls=False)

    # Nothing detected on this page (empty result or a None first element).
    if not ocr_result or not ocr_result[0]:
        continue

    # Collect bounding box data and confidence
    for line in ocr_result[0]:
        box = line[0]  # Bounding box coordinates
        confidence = line[1][1]  # Confidence score
        results.append({
            "Page Number": idx + 1,
            "Bounding box": box,
            "Confidence": confidence
        })

# Convert results to a DataFrame. The columns are fixed explicitly so the frame
# still has them when no text was detected at all.
Paddledf = pd.DataFrame(results, columns=["Page Number", "Bounding box", "Confidence"])

#Collect VietOCR Model

In [None]:
cd '/content/voting-system/vietnamese-ocr'

In [None]:
pwd

In [None]:
!pip install -r /content/voting-system/vietnamese-ocr/requirement.txt

In [None]:
import os
import cv2
import matplotlib.pyplot as plt
from PIL import Image

# from PaddleOCR import PaddleOCR, draw_ocr


# Now import modules
from vietocr.vietocr.tool.predictor import Predictor
from vietocr.vietocr.tool.config import Cfg

In [None]:
# Configure of VietOCR
config = Cfg.load_config_from_name('vgg_transformer')
# config = Cfg.load_config_from_file('vietocr/config.yml')
# config['weights'] = '/Users/bmd1905/Desktop/pretrain_ocr/vi00_vi01_transformer.pth'

config['cnn']['pretrained'] = True
config['predictor']['beamsearch'] = True
config['device'] = 'cuda:0' # mps

VietOCRrecognitor = Predictor(config)

# Example Usage
# img = cv2.imread(img_path)
# img = Image.fromarray(img)
# rec_result = VietOCRrecognitor.predict(img)

In [None]:
%cd ../

#Collect Surya Model

In [None]:
from surya.detection import batch_text_detection
from surya.model.detection.model import load_model as load_det_model, load_processor as load_det_processor

from surya.recognition import batch_recognition
from surya.model.recognition.model import load_model
from surya.model.recognition.processor import load_processor


model, processor = load_det_model(), load_det_processor()
recognition_model, recognition_processor = load_model(), load_processor()
langs = ["vi"]  # Replace with your languages

In [None]:
def Surya_line_det(images_list):
    """Detect text lines with Surya on each image file in ``images_list``.

    Each name is resolved against the global ``image_folder``. The 1-based
    position of a name in ``images_list`` is recorded as its page number.

    Returns:
        list[dict]: One dict per detected box with the keys "Page Number",
        "Bounding box" (the box's ``polygon``) and "Confidence".
    """
    detections = []

    for page_idx, file_name in enumerate(images_list):
        print(f'Processing {file_name}...')
        page_image = Image.open(os.path.join(image_folder, file_name))

        # One image per call, so only the first prediction is relevant.
        page_prediction = batch_text_detection([page_image], model, processor)[0]
        detections.extend(
            {
                "Page Number": page_idx + 1,
                "Bounding box": line_box.polygon,
                "Confidence": line_box.confidence,
            }
            for line_box in page_prediction.bboxes
        )

    return detections


def Surya_ocr(warp_image):
    """Recognise the text of a single cropped line image with Surya.

    The image is sent as a batch of one together with the global ``langs``;
    the first element of the first value returned by ``batch_recognition``
    is handed back to the caller.
    """
    batch_output = batch_recognition(
        [warp_image], [langs], recognition_model, recognition_processor
    )
    texts, _ = batch_output
    return texts[0]

In [None]:
# Reuse the file list built for the PaddleOCR pass instead of calling
# os.listdir() again: page numbers are list positions, so both detectors (and
# the later cropping/drawing steps, which use `images_files`) must see the
# images in exactly the same order.
images_list = list(images_files)
bbox_list = Surya_line_det(images_list)
# Fixed columns keep the frame usable even when Surya detects nothing.
Suryabbdf = pd.DataFrame(bbox_list, columns=["Page Number", "Bounding box", "Confidence"])

#Collect EasyOCR Model

In [None]:
import easyocr
reader = easyocr.Reader(['vi'])

In [None]:
def easyocr_ocr(warp_image):
    """Recognise the text of one cropped line image with the global EasyOCR reader.

    Args:
        warp_image: Cropped line image passed straight to ``reader.recognize``.

    Returns:
        str: The text of the first recognition result, or an empty string when
        the reader returns no result (previously this raised IndexError).
    """
    ocr_result = reader.recognize(warp_image)
    if not ocr_result:
        return ''
    # Each result entry is indexed as [1] for its text.
    return ocr_result[0][1]

#Voting on text detection

In [None]:
import pandas as pd
import ast
from shapely.geometry import Polygon

def calculate_overlap_ratio(box1, box2):
    """Return how much of the smaller-overlap box is covered by the intersection.

    Both boxes are turned into shapely polygons. The result is the larger of
    intersection/area(box1) and intersection/area(box2). Invalid polygons and
    zero-area boxes contribute 0.0. Any exception is printed and reported as 0.0.
    """
    try:
        poly_a, poly_b = Polygon(box1), Polygon(box2)
        if not (poly_a.is_valid and poly_b.is_valid):
            return 0.0

        shared_area = poly_a.intersection(poly_b).area
        ratios = [
            shared_area / area if area > 0 else 0.0
            for area in (poly_a.area, poly_b.area)
        ]
        return max(ratios)

    except Exception as e:
        # Best-effort: a box that cannot be compared simply counts as "no overlap".
        print(f"Error calculating overlap: {e}")
        return 0.0

def parse_bbox(bbox_str):
    """Turn a bounding box value into plain nested Python lists/tuples.

    Args:
        bbox_str: Either the textual form of a box (e.g. ``"[[0, 0], [4, 0]]"``),
            an object exposing ``tolist()`` such as a NumPy array, or an
            already-parsed list/tuple of points.

    Returns:
        The parsed coordinates.

    Raises:
        ValueError, SyntaxError: If the textual form is not a Python literal.
    """
    if isinstance(bbox_str, str):
        return ast.literal_eval(bbox_str)
    # Arrays print without commas, so their str() form is not a valid literal.
    if hasattr(bbox_str, "tolist"):
        return bbox_str.tolist()
    # Lists/tuples keep the original round-trip behaviour (returns a fresh copy).
    return ast.literal_eval(str(bbox_str))

def compare_ocr_results(df1, df2):
    """Merge the detections of two OCR engines page by page.

    For every page present in ``df1``, each ``df1`` box is compared with every
    not-yet-matched ``df2`` box on that page:

    * overlap ratio >= 0.4 marks the ``df2`` box as matched; the ``df1`` box is
      emitted with the averaged confidence when that average exceeds 0.5;
    * a ``df1`` box without any match is kept when its confidence exceeds 0.8;
    * unmatched ``df2`` boxes are kept under the same 0.8 rule.

    Args:
        df1, df2: DataFrames with the columns 'Page Number', 'Bounding box'
            and 'Confidence'.

    Returns:
        pandas.DataFrame: Columns 'Page Number', 'Bounding box', 'Confidence'
        and 'Source' ('matched', 'ocr1' or 'ocr2'). The columns exist even
        when no box survives.

    NOTE(review): pages that appear only in ``df2`` are never visited, and a
    ``df1`` box overlapping several ``df2`` boxes is emitted once per overlap;
    both behaviours are unchanged here - confirm they are intended.
    """
    output_columns = ['Page Number', 'Bounding box', 'Confidence', 'Source']
    output_data = []

    # A frame built from an empty record list has no columns at all.
    if 'Page Number' not in df1.columns:
        return pd.DataFrame(output_data, columns=output_columns)
    df2_has_pages = 'Page Number' in df2.columns

    for page in df1['Page Number'].unique():
        page_boxes1 = df1[df1['Page Number'] == page]
        page_boxes2 = df2[df2['Page Number'] == page] if df2_has_pages else df2.iloc[0:0]
        matched_boxes2 = set()

        for _, box1 in page_boxes1.iterrows():
            box1_coords = parse_bbox(box1['Bounding box'])
            matched = False

            for idx2, box2 in page_boxes2.iterrows():
                if idx2 in matched_boxes2:
                    continue

                box2_coords = parse_bbox(box2['Bounding box'])
                overlap_ratio = calculate_overlap_ratio(box1_coords, box2_coords)

                if overlap_ratio >= 0.4:
                    avg_confidence = (box1['Confidence'] + box2['Confidence']) / 2
                    if avg_confidence > 0.5:
                        output_data.append({
                            'Page Number': page,
                            'Bounding box': box1['Bounding box'],
                            'Confidence': avg_confidence,
                            'Source': 'matched'
                        })
                    matched_boxes2.add(idx2)
                    matched = True

            if not matched and box1['Confidence'] > 0.8:
                # The column is spelled 'Bounding box'; the former 'Bounding Box'
                # lookup raised KeyError and wrote a column nobody reads.
                output_data.append({
                    'Page Number': page,
                    'Bounding box': box1['Bounding box'],
                    'Confidence': box1['Confidence'],
                    'Source': 'ocr1'
                })

        for idx2, box2 in page_boxes2.iterrows():
            if idx2 not in matched_boxes2 and box2['Confidence'] > 0.8:
                output_data.append({
                    'Page Number': page,
                    'Bounding box': box2['Bounding box'],
                    'Confidence': box2['Confidence'],
                    'Source': 'ocr2'
                })

    return pd.DataFrame(output_data, columns=output_columns)


matched_results = compare_ocr_results(Paddledf, Suryabbdf)

#Image Warp Perspective

In [None]:
import cv2
import numpy as np
import pandas as pd
import os
import ast
from itertools import groupby
from typing import List, Dict, Tuple

def order_points(pts):
    """Order four corner points as top-left, top-right, bottom-right, bottom-left.

    The top-left corner has the smallest x+y and the bottom-right the largest;
    the top-right corner has the smallest y-x and the bottom-left the largest.

    Args:
        pts: Array of four (x, y) points.

    Returns:
        numpy.ndarray: float32 array of shape (4, 2) in the order above.
    """
    coord_sum = pts.sum(axis=1)
    coord_diff = np.diff(pts, axis=1)  # y - x for every point

    top_left = pts[np.argmin(coord_sum)]
    bottom_right = pts[np.argmax(coord_sum)]
    top_right = pts[np.argmin(coord_diff)]
    bottom_left = pts[np.argmax(coord_diff)]

    return np.array([top_left, top_right, bottom_right, bottom_left], dtype="float32")

def four_point_transform(image, pts):
    """Warp the quadrilateral ``pts`` of ``image`` into an upright rectangle.

    The output size is the longer of each pair of opposite sides (truncated to
    int) plus a 4-pixel margin on every side.

    Args:
        image: Source image as accepted by ``cv2.warpPerspective``.
        pts: Four (x, y) corner points in any order.

    Returns:
        The warped, padded crop.
    """
    def _side(p, q):
        # Euclidean length of the edge between two corners.
        return np.sqrt(((p[0] - q[0]) ** 2) + ((p[1] - q[1]) ** 2))

    rect = order_points(pts)
    top_left, top_right, bottom_right, bottom_left = rect

    crop_w = max(int(_side(bottom_right, bottom_left)), int(_side(top_right, top_left)))
    crop_h = max(int(_side(top_right, bottom_right)), int(_side(top_left, bottom_left)))

    padding = 4
    canvas_w = crop_w + (2 * padding)
    canvas_h = crop_h + (2 * padding)

    # Destination corners sit `padding` pixels inside the output canvas.
    right_edge = canvas_w - padding - 1
    bottom_edge = canvas_h - padding - 1
    dst = np.array(
        [
            [padding, padding],
            [right_edge, padding],
            [right_edge, bottom_edge],
            [padding, bottom_edge],
        ],
        dtype="float32",
    )

    transform = cv2.getPerspectiveTransform(rect, dst)
    return cv2.warpPerspective(image, transform, (canvas_w, canvas_h))

def process_images_with_boxes(image_files, df):
    """Crop every bounding box of every page into its own warped image.

    Args:
        image_files: Image file names, resolved against the global
            ``image_folder``; list position + 1 is the page number.
        df: DataFrame with 'Page Number' and 'Bounding box' columns.

    Returns:
        list[list]: One inner list of warped crops per input image. An image
        that cannot be read yields an empty list; a box that fails to parse or
        warp is reported and left out of its page's list.
    """
    regions_per_image = []

    for page_num, file_name in enumerate(image_files, start=1):
        rows_on_page = df[df['Page Number'] == page_num]

        image_path = os.path.join(image_folder, file_name)
        page_image = cv2.imread(image_path)
        if page_image is None:
            print(f"Error: Could not read image: {image_path}")
            regions_per_image.append([])
            continue

        crops = []
        for _, row in rows_on_page.iterrows():
            try:
                corners = np.array(ast.literal_eval(str(row['Bounding box'])), dtype="float32")
                crops.append(four_point_transform(page_image, corners))
            except Exception as e:
                print(f"Error processing region in page {page_num}: {str(e)}")

        regions_per_image.append(crops)

    return regions_per_image

processed_regions = process_images_with_boxes(images_files, matched_results)

# Draw bounding box

In [None]:
def draw_bounding_boxes(images_files, df):
    """Draw each page's bounding boxes on its image and save a copy.

    Images are read from the global ``image_folder`` and written to the global
    ``output_folder`` (created when missing) as ``bbox_<original name>``.

    Args:
        images_files: Image file names; list position + 1 is the page number.
        df: DataFrame with 'Page Number' and 'Bounding box' columns.
    """
    os.makedirs(output_folder, exist_ok=True)

    line_color = (255, 0, 0)  # blue in OpenCV's BGR channel order
    line_thickness = 2

    for page_num, file_name in enumerate(images_files, start=1):
        rows_on_page = df[df['Page Number'] == page_num]

        canvas = cv2.imread(os.path.join(image_folder, file_name))
        if canvas is None:
            print(f"Error: Could not read image: {file_name}")
            continue

        for _, row in rows_on_page.iterrows():
            corners = np.array(ast.literal_eval(str(row['Bounding box'])), dtype="float32")
            # Connect each corner to the next one, closing the shape at the end.
            for i in range(4):
                start = corners[i]
                end = corners[(i + 1) % 4]
                cv2.line(
                    canvas,
                    (int(start[0]), int(start[1])),
                    (int(end[0]), int(end[1])),
                    line_color,
                    line_thickness,
                )

        output_path = os.path.join(output_folder, f'bbox_{os.path.basename(file_name)}')
        cv2.imwrite(output_path, canvas)
        print(f"Saved image with bounding boxes to: {output_path}")

draw_bounding_boxes(images_files, matched_results)

#Voting on text recognition

In [None]:
import csv
from collections import Counter, defaultdict
import pandas as pd
import re

import unicodedata


# Tone mark of each accented vowel, encoded as a digit:
# 1 = acute, 2 = grave, 3 = tilde, 4 = hook above, 5 = dot below.
ACCENT_TO_NUMBER = {
    "á": 1, "à": 2, "ã": 3, "ả": 4, "ạ": 5,
    "é": 1, "è": 2, "ẽ": 3, "ẻ": 4, "ẹ": 5,
    "í": 1, "ì": 2, "ĩ": 3, "ỉ": 4, "ị": 5,
    "ó": 1, "ò": 2, "õ": 3, "ỏ": 4, "ọ": 5,
    "ú": 1, "ù": 2, "ũ": 3, "ủ": 4, "ụ": 5,
    "ý": 1, "ỳ": 2, "ỹ": 3, "ỷ": 4, "ỵ": 5,
    "ấ": 1, "ầ": 2, "ẫ": 3, "ẩ": 4, "ậ": 5,
    "ế": 1, "ề": 2, "ễ": 3, "ể": 4, "ệ": 5,
    "ố": 1, "ồ": 2, "ỗ": 3, "ổ": 4, "ộ": 5,
    "ắ": 1, "ằ": 2, "ẵ": 3, "ẳ": 4, "ặ": 5,
    "ớ": 1, "ờ": 2, "ỡ": 3, "ở": 4, "ợ": 5,
    "ứ": 1, "ừ": 2, "ữ": 3, "ử": 4, "ự": 5#,
    #"đ": 9  # For 'đ', assigning a unique number
}

# Accented vowel -> the same vowel without its tone mark. Vowel-quality marks
# (â, ê, ô, ă, ơ, ư) are kept because they distinguish different letters.
ACCENT_TO_BASE = {
    "á": "a", "à": "a", "ã": "a", "ả": "a", "ạ": "a",
    "é": "e", "è": "e", "ẽ": "e", "ẻ": "e", "ẹ": "e",
    "í": "i", "ì": "i", "ĩ": "i", "ỉ": "i", "ị": "i",
    "ó": "o", "ò": "o", "õ": "o", "ỏ": "o", "ọ": "o",
    "ú": "u", "ù": "u", "ũ": "u", "ủ": "u", "ụ": "u",
    "ý": "y", "ỳ": "y", "ỹ": "y", "ỷ": "y", "ỵ": "y",
    "â": "â", "ấ": "â", "ầ": "â", "ẫ": "â", "ẩ": "â", "ậ": "â",
    "ê": "ê", "ế": "ê", "ề": "ê", "ễ": "ê", "ể": "ê", "ệ": "ê",
    # The ô family used to map to plain "o", unlike every other family, which
    # made e.g. "bố" and "bó" normalise to the same key.
    "ô": "ô", "ố": "ô", "ồ": "ô", "ỗ": "ô", "ổ": "ô", "ộ": "ô",
    "ă": "ă", "ắ": "ă", "ằ": "ă", "ẵ": "ă", "ẳ": "ă", "ặ": "ă",
    "ơ": "ơ", "ớ": "ơ", "ờ": "ơ", "ỡ": "ơ", "ở": "ơ", "ợ": "ơ",
    "ư": "ư", "ứ": "ư", "ừ": "ư", "ữ": "ư", "ử": "ư", "ự": "ư"
}



def normalize_vietnamese(word):
    """Rewrite a word as its tone-less, lower-case form plus a tone digit.

    Every character found in ``ACCENT_TO_NUMBER`` is replaced by its entry in
    ``ACCENT_TO_BASE``. The digit of the last such character is appended to
    the end of the result (nothing is appended when the word has no tone mark).

    Args:
        word (str): Word to normalise.

    Returns:
        str: For example "Việt" -> "viêt5" and "nam" -> "nam".
    """
    # Lower-case first, then compose: the lookup tables use precomposed
    # characters, so decomposed (NFD) input would otherwise never match.
    word = unicodedata.normalize('NFC', word.lower())

    pieces = []
    accent_number = ""
    for char in word:
        if char in ACCENT_TO_NUMBER:
            pieces.append(ACCENT_TO_BASE[char])  # Replace with the base character
            accent_number = str(ACCENT_TO_NUMBER[char])  # Remember the tone digit
        else:
            pieces.append(char)  # Keep characters without a tone mark as they are

    return "".join(pieces) + accent_number




import difflib

def load_words(file_path):
    """Read a UTF-8 text file and return its lines with surrounding whitespace removed."""
    words = []
    with open(file_path, 'r', encoding='utf-8') as handle:
        for raw_line in handle:
            words.append(raw_line.strip())
    return words

def load_normalize_words(file_path):
    """Read a UTF-8 word file and return each stripped line passed through ``normalize_vietnamese``."""
    normalized = []
    with open(file_path, 'r', encoding='utf-8') as handle:
        for raw_line in handle:
            normalized.append(normalize_vietnamese(raw_line.strip()))
    return normalized

def find_similar_words(word, word_list, max_results=10):
    """Collect dictionary words that are close to ``word``.

    Closeness is decided by ``words_similar`` (threshold 0.3) on the
    ``normalize_vietnamese`` forms of both words. The scan walks ``word_list``
    in order and stops once ``max_results`` matches have been gathered.

    Returns:
        list: The matching entries of ``word_list``, in list order.
    """
    matches = []
    target = normalize_vietnamese(word)

    for candidate in word_list:
        is_close, _ = words_similar(target, normalize_vietnamese(candidate), threshold= 0.3)
        if is_close:
            matches.append(candidate)

        # Checked on every iteration, exactly like the running counter it replaces.
        if len(matches) >= max_results:
            break

    return matches



def check_word_in_file(word, vietnam_word_list, normalize_vietnam_list):
    """Look ``word`` up in the normalised dictionary.

    The word is normalised with ``normalize_vietnamese`` and stripped of the
    characters ``.,:_;?/"()…`` before the lookup.

    Returns:
        tuple: ``(True, [word])`` when the normalised form is in
        ``normalize_vietnam_list``; otherwise ``(False, suggestions)`` where
        the suggestions come from ``find_similar_words``.
    """
    lookup_key = re.sub(r'[.,:_;?/"()…]', '', normalize_vietnamese(word))

    if lookup_key not in normalize_vietnam_list:
        return False, find_similar_words(word, vietnam_word_list)

    return True, [word]


def normalize_text(text):
    """Tidy whitespace, dashes, quotes and ellipses in a line of OCR text.

    Steps, in order: line breaks become spaces; ``~ = — → _`` become ``-``;
    ``« `` / `` »`` / ``< `` / `` >`` become ``"``; ``...`` becomes ``…``; a
    whitespace character after punctuation becomes a plain space; whitespace
    before punctuation is removed; whitespace runs collapse to one space; a
    leading ``-`` is followed by exactly one space; the result is stripped.
    """
    # Line breaks (with any surrounding whitespace) become a single space.
    text = re.sub(r'\s*[\n\r]+\s*', ' ', text)

    # Dash look-alikes.
    for dash_like in ("~", "=", "—", "→", "_"):
        text = text.replace(dash_like, "-")

    # Angle quotes, only when separated from the quoted text by a space.
    for quote_like in ("« ", " »", "< ", " >"):
        text = text.replace(quote_like, '"')

    # Three dots become a single ellipsis character.
    text = text.replace('...', '…')

    # Any whitespace character right after punctuation becomes a plain space.
    text = re.sub(r'([….,;:?!])\s', r'\1 ', text)

    # No whitespace before punctuation.
    text = re.sub(r'\s+([….,!?;:])', r'\1', text)

    # Collapse whitespace runs.
    text = re.sub(r'\s+', ' ', text)

    # A dash at the very start is followed by exactly one space.
    text = re.sub(r'^-\s*', '- ', text)

    return text.strip()

def normalize_word(word):
    """Reduce a word to a lower-case, mark-free form for fuzzy comparison.

    The placeholder ``'None'`` is returned untouched. Otherwise the word is
    lower-cased, its combining marks are dropped, hyphens are removed, and the
    letters ``f`` and ``j`` are replaced by ``t`` and ``i``.

    Args:
        word (str): The word to normalize.

    Returns:
        str: The normalized word.
    """
    if word == 'None':
        return word

    # Decompose so that diacritics become separate combining marks ('Mn').
    decomposed = unicodedata.normalize('NFD', word.lower())
    kept = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']

    return ''.join(kept).replace('-', '').replace('f', 't').replace('j', 'i')


def levenshtein_distance(a, b):
    """Return the Levenshtein (insert/delete/substitute) distance between ``a`` and ``b``."""
    # previous[j] is the distance between the processed prefix of `a` and b[:j].
    previous = list(range(len(b) + 1))

    for row, item_a in enumerate(a, start=1):
        current = [row]  # turning a[:row] into the empty prefix costs `row` deletions
        for col, item_b in enumerate(b, start=1):
            substitution = previous[col - 1] + (0 if item_a == item_b else 1)
            deletion = previous[col] + 1
            insertion = current[col - 1] + 1
            current.append(min(deletion, insertion, substitution))
        previous = current

    return previous[len(b)]


def words_similar(word1, word2, threshold=0.5):
    """Decide whether two words are close enough to count as the same word.

    Both words go through ``normalize_word`` and their Levenshtein distance is
    compared with ``threshold`` times the longer normalised length. The
    placeholder ``'None'`` only matches another ``'None'``.

    Args:
        word1 (str): First word.
        word2 (str): Second word.
        threshold (float): Maximum allowed distance as a fraction of the
            longer word's length.

    Returns:
        tuple: ``(similar, distance)``. For placeholder inputs the second
        element is 4 (both placeholders) or the length of the other word.
    """
    if word1 == 'None':
        if word2 == 'None':
            return True, 4
        return False, len(word2)
    if word2 == 'None':
        return False, len(word1)

    norm1 = normalize_word(word1)
    norm2 = normalize_word(word2)
    distance = levenshtein_distance(norm1, norm2)

    allowed = threshold * max(len(norm2), len(norm1))
    return distance <= allowed, distance


def MED_to_word(sen1, sen2):
    """Align two sentences word by word with a minimum-edit-distance table.

    Both sentences are stripped and split on single spaces. Two words count as
    a match when ``words_similar`` says so. Wherever one sentence has a word
    without a counterpart, the placeholder ``'None'`` is inserted in the other.

    Args:
        sen1 (str): First sentence.
        sen2 (str): Second sentence.

    Returns:
        tuple[str, str]: The two aligned sentences, space-joined. Both contain
        the same number of space-separated tokens.
    """
    # Split into words; the tables below are indexed by word position.
    sen1 = sen1.strip()
    sen2 = sen2.strip()
    seq1 = sen1.split(' ')
    seq2 = sen2.split(' ')
    # cache[i][j] = edit cost of aligning seq1[i:] with seq2[j:];
    # ops[i][j]   = operation chosen at that cell.
    cache = [[float("inf")] * (len(seq2) + 1) for _ in range(len(seq1) + 1)]
    ops = [[None] * (len(seq2) + 1) for _ in range(len(seq1) + 1)]

    # Base cases: once one sequence is exhausted, the rest of the other one is
    # pure inserts (seq1 exhausted) or pure deletes (seq2 exhausted).
    for j in range(len(seq2) + 1):
        cache[len(seq1)][j] = len(seq2) - j
        ops[len(seq1)][j] = 'insert'
    for i in range(len(seq1) + 1):
        cache[i][len(seq2)] = len(seq1) - i
        ops[i][len(seq2)] = 'delete'

    # Fill both tables from the end of the sentences towards the start.
    for i in range(len(seq1) - 1, -1, -1):
        for j in range(len(seq2) - 1, -1, -1):
            compare, dis = words_similar(seq1[i],seq2[j])
            if compare:
                cache[i][j] = cache[i + 1][j + 1]
                ops[i][j] = 'match'  # Words are similar: move diagonally at no cost
            else:
                # Consider all operations: insert, delete, substitute
                insert_cost = 1 + cache[i][j + 1]
                delete_cost = 1 + cache[i + 1][j]
                substitute_cost = 1 + cache[i + 1][j + 1]

                # Choose the cheapest operation; ties prefer insert, then delete.
                if insert_cost <= delete_cost and insert_cost <= substitute_cost:
                    cache[i][j] = insert_cost
                    ops[i][j] = 'insert'
                elif delete_cost <= insert_cost and delete_cost <= substitute_cost:
                    cache[i][j] = delete_cost
                    ops[i][j] = 'delete'
                else:
                    cache[i][j] = substitute_cost
                    ops[i][j] = 'substitute'

    # Walk the ops table from the start and emit the aligned word pairs.
    aligned_seq1, aligned_seq2 = [], []
    i, j = 0, 0
    while i < len(seq1) or j < len(seq2):
        if i < len(seq1) and j < len(seq2) and ops[i][j] == 'match':
            aligned_seq1.append(seq1[i])
            aligned_seq2.append(seq2[j])
            i += 1
            j += 1
        elif i < len(seq1) and ops[i][j] == 'delete':
            # Word only in seq1: pad seq2 with the placeholder.
            aligned_seq1.append(seq1[i])
            aligned_seq2.append('None')
            i += 1
        elif j < len(seq2) and ops[i][j] == 'insert':
            # Word only in seq2: pad seq1 with the placeholder.
            aligned_seq1.append('None')
            aligned_seq2.append(seq2[j])
            j += 1
        elif i < len(seq1) and j < len(seq2) and ops[i][j] == 'substitute':
            # Different words in the same position: keep both side by side.
            aligned_seq1.append(seq1[i])
            aligned_seq2.append(seq2[j])
            i += 1
            j += 1

    aligned_seq1 = ' '.join(aligned_seq1)
    aligned_seq2 = ' '.join(aligned_seq2)
    return aligned_seq1, aligned_seq2


def align_multiple_sequences(sequences, best_ocr = 1):
    """Align several OCR sentences so they all have the same number of words.

    Each sequence in turn is tried as the base. The remaining sequences are
    aligned against it with ``MED_to_word`` (which may also lengthen the base
    with 'None' placeholders), for at most 10 rounds per base.

    Args:
        sequences (list[str]): Sentences to align.
        best_ocr (int): Index into ``sequences`` of the sentence to fall back
            to when no alignment is found.

    Returns:
        tuple: ``(aligned, 0)`` on success, where ``aligned`` holds the
        non-base sequences in their original relative order followed by the
        base sequence; or ``(copies, 1)`` on failure, where ``copies`` is
        ``sequences[best_ocr]`` repeated ``len(sequences)`` times.
    """
    cnt = len(sequences)
    try_case = 0
    while try_case < len(sequences):
        # Work on a copy so every attempt starts from the untouched input.
        aligned_sequences = sequences[:]
        limit = 0
        base_sequence = aligned_sequences.pop(try_case)
        while limit < 10:
            # Align each remaining sequence to the base sequence; the base is
            # updated as well because the alignment may insert placeholders into it.
            updated_sequences = []
            for seq in aligned_sequences:
                aligned_seq, base_sequence = MED_to_word(seq, base_sequence)
                updated_sequences.append(aligned_seq)

            # Update the list of aligned sequences
            aligned_sequences = updated_sequences

            # Done once every sequence has as many words as the base.


            if all(len(seq. split(' ')) == len(base_sequence.split(' ')) for seq in aligned_sequences):
                aligned_sequences.append(base_sequence)
                return aligned_sequences, 0



            limit += 1

        try_case += 1


    # No base produced an alignment: return the preferred OCR result for every slot.
    res = []
    for i in range(cnt):
        res.append(sequences[best_ocr])
    return res, 1


def read_ocr_inputs(csv_file):
    """Read a UTF-8 CSV file and return columns 2-5 of every row.

    The first column (index 0) is skipped; rows with fewer columns simply
    yield a shorter list.
    """
    with open(csv_file, mode='r', encoding='utf-8') as handle:
        return [record[1:5] for record in csv.reader(handle)]


import string

def most_common_end_punctuation(strings):
    """Return the closing punctuation mark that ends the most strings.

    :param strings: List of strings
    :return: The most frequent final character among ``? ! . ; , ] } % $ …``,
             or '' when no string ends with one of them
    """
    closing_marks = {'?', '!', '.', ';' , ',', ']', '}', '%' , '$', '…'}

    endings = [text[-1] for text in strings if text and text[-1] in closing_marks]
    if not endings:
        return ''

    (mark, _count), = Counter(endings).most_common(1)
    return mark

def most_common_start_punctuation(strings):
    """Return the opening punctuation mark that starts the most strings.

    :param strings: List of strings
    :return: The most frequent first character among ``- " ( { [``,
             or '' when no string starts with one of them
    """
    opening_marks = ['-', '"', '(' , '{', '[']

    openings = [text[0] for text in strings if text and text[0] in opening_marks]
    if not openings:
        return ''

    (mark, _count), = Counter(openings).most_common(1)
    return mark

def is_integer(s):
    """
    Tell whether ``int(s)`` succeeds.
    :param s: Input string
    :return: True if the string converts to an integer, False if it raises ValueError
    """
    try:
        int(s)
    except ValueError:
        return False
    return True

def vote(list_word, vietnam_word_list, normalize_vietnam_list):
    """Choose one word from the aligned candidates of several OCR engines.

    Args:
        list_word (list[str]): Candidates for one word position; ``'None'``
            or ``''`` marks an engine that produced nothing there.
        vietnam_word_list (list[str]): Dictionary words.
        normalize_vietnam_list (list[str]): The same dictionary after
            ``normalize_vietnamese``.

    Returns:
        tuple: ``(word, score)``.
        * ``(candidate, 2004)`` when a dictionary word is produced identically
          (ignoring punctuation) by two engines;
        * ``(word, 2011)`` when a dictionary word has a similar-looking
          candidate from another engine;
        * otherwise the candidate with the largest summed weight, wrapped in
          the most common leading/trailing punctuation;
        * ``(None, 0)`` when nothing is left to vote on, or
          ``(punctuation, 1)`` when only punctuation was found.
    """
    basic_punct = r'[.,:_;?/"()…]'
    # Brackets are escaped on purpose: written as `[]` inside the class, the
    # first `]` closed it early and the pattern never matched anything.
    extended_punct = r'[.,:_;?/"(){}\[\]…]'

    vote_container = []
    weight_container = []
    begin_punctuation = most_common_start_punctuation(list_word)
    end_punctuation = most_common_end_punctuation(list_word)

    # Pass 1 (last engine first): early exit when a dictionary word is
    # produced identically by another engine.
    for i in range(len(list_word) - 1, -1, -1):
        nw = re.sub(basic_punct, '', list_word[i])
        if nw == '' or nw == 'None':
            continue
        in_list, similar_words = check_word_in_file(nw, vietnam_word_list, normalize_vietnam_list)
        if in_list:
            for j in range(len(list_word)):
                if i != j:
                    now = re.sub(basic_punct, '', list_word[j])
                    if now == nw:
                        return list_word[i], 2004

    # Pass 2: weighted vote over dictionary words and their suggestions.
    for i, word in enumerate(reversed(list_word)):
        org_word = word
        if word == 'None' or word == '':
            continue
        word = re.sub(basic_punct, '', word)
        if word == '':
            # Pure punctuation: already covered by begin/end punctuation, and
            # an empty word would divide by zero in the weighting below.
            continue
        if is_integer(word):
            vote_container.append(word)
            weight_container.append(1)
            continue
        in_list, similar_words = check_word_in_file(word, vietnam_word_list, normalize_vietnam_list)
        if in_list:
            nw = normalize_vietnamese(normalize_word(word))
            for other_word in list_word:
                # Skip the element itself and the alignment placeholder, which
                # must not be compared as if it were a real word.
                if other_word == org_word or other_word == 'None':
                    continue
                other_word = re.sub(extended_punct, '', other_word)
                if other_word == '':
                    continue
                now = normalize_vietnamese(normalize_word(other_word))
                similar, dist = words_similar(nw, now, threshold = 0.3)
                if similar:  # Early exit: a valid word with a similar candidate
                    return begin_punctuation + word + end_punctuation, 2011

            # NOTE(review): `1.25 * (i // 2)` is 0 for the first two positions,
            # so those candidates carry no weight; `1.25 ** (i // 2)` may have
            # been intended. Left unchanged - confirm with the author.
            vote_container.append(word)
            weight_container.append(1 * (1.25 * (i // 2)))
            continue
        for sim in similar_words:
            vote_container.append(sim)
            dist = levenshtein_distance(sim, word)
            weight_container.append(0.68 * (1 - dist/max(len(word), len(sim))) * (1.25 * (i // 2)))

    if not vote_container:
        if begin_punctuation == '' and end_punctuation == '':
            return None, 0  # Nothing at all to vote on
        else:
            return begin_punctuation + end_punctuation, 1

    if len(vote_container) != len(weight_container):
        raise ValueError("vote_container and weight_container must have the same length.")

    # Aggregate weights for each unique candidate
    weight_sum = defaultdict(float)
    for candidate, weight in zip(vote_container, weight_container):
        weight_sum[candidate] += weight

    # Pick the candidate with the highest total weight
    most_common_item, total_weight = max(weight_sum.items(), key=lambda item: item[1])

    return begin_punctuation + most_common_item + end_punctuation , total_weight


def read_and_vote(warp_images_list, vietnam_word_list, normalize_vietnam_list ):
    """Recognise every cropped line with three OCR engines and vote per word.

    Each crop is read by VietOCR, EasyOCR and Surya. The three sentences are
    aligned with ``align_multiple_sequences``; when alignment succeeds each
    word position is decided by ``vote``, otherwise the fallback sentence
    returned by the aligner is used unchanged.

    Args:
        warp_images_list: One list of cropped line images per page.
        vietnam_word_list: Dictionary words, forwarded to ``vote``.
        normalize_vietnam_list: Normalised dictionary, forwarded to ``vote``.

    Returns:
        list[list[str]]: For every page, one voted sentence per crop, in crop
        order (so the result lines up with the crops one to one).
    """
    image_list = []
    for warp_images in warp_images_list:
        line_list = []
        for warp_image in warp_images:

            pil_img = Image.fromarray(warp_image)
            sentence1 = VietOCRrecognitor.predict(pil_img) # VietOCR text
            sentence2 = easyocr_ocr(warp_image) # Easy OCR text
            sentence3 = Surya_ocr(pil_img)   # Surya text

            sequences = [sentence1, sentence2, sentence3]
            align_sentences, failed = align_multiple_sequences(sequences)

            # The aligner returns the integer 1 on failure; the former
            # comparison with the string '1' could never be true.
            if failed == 1:
                vote_sentence = align_sentences[0]
            else:
                sentences_splits = [align.split(' ') for align in align_sentences]
                vote_sentence_splits = []
                for i in range(len(sentences_splits[0])):
                    list_word_to_vote = [
                        sentences_splits[0][i],
                        sentences_splits[1][i],
                        sentences_splits[2][i],
                    ]
                    vote_word = vote(list_word_to_vote, vietnam_word_list, normalize_vietnam_list)

                    # vote() returns None as the word when nothing was recognised.
                    if vote_word[0]:
                        vote_sentence_splits.append(vote_word[0])

                vote_sentence = ' '.join(vote_sentence_splits)

            print(vote_sentence)
            # Appended in both branches so every crop keeps its slot.
            line_list.append(vote_sentence)
        image_list.append(line_list)
    return image_list

          # Writing the voted text and bounding box to a text file
          # Assuming warp_image contains the image ID and bounding boxes in some form
          #image_id = warp_image["image_id"]  # Replace with actual image ID retrieval
          # bounding_boxes = warp_image["bounding_boxes"]  # Replace with actual bounding box retrieval

          # label_file_path = os.path.join(output_folder, f"{image_id}.txt")
          # with open(label_file_path, 'w') as label_file:
          #     for i, bbox in enumerate(bounding_boxes):
          #         # bbox is assumed to be a tuple or list (x_min, y_min, x_max, y_max)
          #         label_line = f"{bbox} {vote_sentence}\n"
          #         label_file.write(label_line)


# Example usage
vietnamese_file_path = '/content/voting-system/voting/VN_MorphoSyllable_List.txt'
vietnam_word_list = load_words(vietnamese_file_path)
normalize_vietnam_list = load_normalize_words(vietnamese_file_path)
voted_ocr = read_and_vote(processed_regions, vietnam_word_list, normalize_vietnam_list)



#Apply Language Model

In [None]:
from transformers import pipeline
corrector = pipeline("text2text-generation", model="bmd1905/vietnamese-correction-v2", device=0)
MAX_LENGTH = 512

In [None]:
def revert_pipeline_changes(ocr_lines, predictions):
    """Undo edits of the correction model that the OCR text does not support.

    For each OCR line / prediction pair:
    * a leading capital introduced by the model is lower-cased again;
    * one trailing punctuation mark absent from the OCR line is removed (code 1);
    * extra words are trimmed: more than one extra word cuts the prediction to
      the OCR word count (code 2); exactly one extra word drops either the
      first word, when the prediction's second word resembles the OCR's first
      (code 3), or the last word (code 4).

    Args:
        ocr_lines (list[str]): Voted OCR sentences.
        predictions (list[dict]): Model outputs; each needs 'generated_text'.

    Returns:
        tuple: ``(reverted_lines, check_list)`` where ``check_list`` holds
        ``(row + 2, code)`` entries for every applied revert.
        NOTE(review): the ``+ 2`` offset presumably maps to a 1-based row
        below a header line - confirm against the consumer.
    """
    check_list = []
    reverted_lines = []
    punctuation_marks = (".", "!", "?", ":", "…", ",")

    for row, (ocr_line, prediction) in enumerate(zip(ocr_lines, predictions)):
        pred_text = prediction['generated_text']

        # Remove added capitalization if OCR doesn't have it. The prediction
        # may be empty, so it is checked before being indexed.
        if ocr_line and pred_text and pred_text[0].isupper() and ocr_line[0].islower():
            pred_text = pred_text[0].lower() + pred_text[1:]

        # Remove added punctuation if OCR doesn't have it
        if ocr_line:
            for mark in punctuation_marks:
                if pred_text.endswith(mark) and not ocr_line.endswith(mark):
                    check_list.append((row + 2, 1))
                    pred_text = pred_text[:-1]
                    break

        ocr_word = ocr_line.split()
        pred_word = pred_text.split()
        extra_words = len(pred_word) - len(ocr_word)
        if extra_words > 1:
            check_list.append((row + 2, 2))
            pred_text = " ".join(pred_word[:len(ocr_word)])
        elif extra_words == 1:
            # With an empty OCR line there is no first word to compare with
            # (this used to raise IndexError), so the single extra word is
            # treated as appended.
            inserted_at_front = bool(ocr_word) and (
                levenshtein_distance(pred_word[1], ocr_word[0]) / len(ocr_word[0]) < 0.5
            )
            if inserted_at_front:
                check_list.append((row + 2, 3))
                pred_text = " ".join(pred_word[1:])
            else:
                check_list.append((row + 2, 4))
                pred_text = " ".join(pred_word[:-1])

        reverted_lines.append(pred_text)

    return reverted_lines, check_list


def batch_predictions(voted_ocr):
    """Run the global ``corrector`` pipeline on a list of sentences, capped at ``MAX_LENGTH``."""
    return corrector(voted_ocr, max_length=MAX_LENGTH)

In [None]:
# Run the correction model page by page, then undo the edits the OCR text
# does not support. LM_results keeps one list of corrected lines per page.
LM_results = []
for image in voted_ocr:
  # `image` is the list of voted sentences of one page.
  predictions = batch_predictions(image)
  # check_list (the reverts that were applied) is not used after this loop.
  refine_ocr, check_list = revert_pipeline_changes(image, predictions)
  LM_results.append(refine_ocr)

print(LM_results)

# Final Output

In [None]:
import json

def write_results_to_file(images_files, matched_results, text_strings, output_file):
    """Write one label line per image: ``output_images/<name> <JSON list>``.

    Each JSON entry has the keys "transcription", "points" (integer corner
    coordinates) and "difficult" (always False). Boxes and texts are paired by
    position within the page.

    Args:
        images_files: Image file names; list position + 1 is the page number.
        matched_results: DataFrame with 'Page Number' and 'Bounding box'.
        text_strings: Per page, the texts corresponding to that page's boxes.
        output_file: Path of the text file to write (UTF-8, overwritten).

    A box without a corresponding text (page missing from ``text_strings`` or
    fewer texts than boxes) is written with an empty transcription instead of
    raising IndexError.
    """
    with open(output_file, 'w', encoding='utf-8') as f:
        for page_idx, image_path in enumerate(images_files):
            page_num = page_idx + 1

            # Boxes and texts for the current page
            page_data = matched_results[matched_results['Page Number'] == page_num]
            page_texts = text_strings[page_idx] if page_idx < len(text_strings) else []

            json_data = []
            for box_idx, (_, row) in enumerate(page_data.iterrows()):
                # The box may be stored as a list or as its textual form.
                bbox = np.array(ast.literal_eval(str(row['Bounding box'])), dtype="float32")
                points = [[int(x), int(y)] for x, y in bbox]

                text = page_texts[box_idx] if box_idx < len(page_texts) else ""

                json_data.append({
                    "transcription": text,
                    "points": points,
                    "difficult": False
                })

            # Image path, a space, then the JSON payload on the same line
            f.write(f"output_images/{image_path}")
            f.write(" " + json.dumps(json_data, ensure_ascii=False))
            f.write("\n")

output_file = output_folder + "/LM.txt"
write_results_to_file(images_files, matched_results, LM_results, output_file)

output_file = output_folder + "/Vote.txt"
write_results_to_file(images_files, matched_results, voted_ocr, output_file)