In [None]:
# Mount the Google Drive where the receipt files were uploaded
from google.colab import drive
drive.mount('/content/drive')

# Install the required dependencies (IPython shell magics; only valid inside a notebook cell)
!apt-get install -y poppler-utils
!pip install pdf2image
!pip install easyocr
!pip install matplotlib

#Mengimport library yang diperlukan
import os
import cv2
import easyocr
import re
import matplotlib.pyplot as plt
import numpy as np  # Import numpy
from pdf2image import convert_from_path
from skimage.metrics import structural_similarity as ssim
from difflib import SequenceMatcher

# Bilateral filtering of an image
def apply_bilateral_filtering(image):
    """Return `image` passed through cv2.bilateralFilter with arguments (9, 75, 75)."""
    diameter = 9
    sigma_color = 75
    sigma_space = 75
    return cv2.bilateralFilter(image, diameter, sigma_color, sigma_space)

# Mean (box) filtering of an image
def apply_mean_filtering(image):
    """Return `image` blurred by cv2.blur with a 5x5 kernel."""
    kernel_size = (5, 5)
    blurred = cv2.blur(image, kernel_size)
    return blurred

# Median filtering of an image
def apply_median_filtering(image):
    """Return `image` passed through cv2.medianBlur with an aperture size of 5."""
    aperture = 5
    filtered = cv2.medianBlur(image, aperture)
    return filtered

# Non-local-means denoising of a colour image
def apply_denoising_filtering(image):
    """Return `image` denoised by cv2.fastNlMeansDenoisingColored.

    The call passes no destination buffer (None) followed by the values
    10, 10, 7 and 21, exactly as positional arguments.
    """
    denoise_args = (10, 10, 7, 21)
    return cv2.fastNlMeansDenoisingColored(image, None, *denoise_args)

# Compare a filtered image against a reference image
def compare_images(imageA, imageB):
    """Return the SSIM score between two BGR images.

    Both inputs are converted to grayscale first; only the scalar score is
    returned, the full similarity map is discarded.
    """
    gray_a, gray_b = (
        cv2.cvtColor(picture, cv2.COLOR_BGR2GRAY) for picture in (imageA, imageB)
    )
    result = ssim(gray_a, gray_b, full=True)
    return result[0]

# Sharpness check based on the variance of the Laplacian
def is_image_clear(image, threshold=100):
    """Return whether the Laplacian variance of `image` exceeds `threshold`.

    The BGR image is converted to grayscale, the Laplacian is computed in
    64-bit float, and its variance is compared (strictly greater) with
    `threshold`.
    """
    grayscale = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    laplacian = cv2.Laplacian(grayscale, cv2.CV_64F)
    sharpness = laplacian.var()
    return sharpness > threshold

# Pick the preprocessing filter whose output stays closest to the original image
def choose_best_preprocessing(image):
    """Select the filtered version of `image` with the highest SSIM score.

    Returns:
        (best_image, method_name) where method_name is one of 'Bilateral',
        'Mean', 'Median' or 'Denoising'. When `is_image_clear` rejects the
        image, returns (None, "Image too blurry") instead.
    """
    # Guard clause: blurry images are not processed at all
    if not is_image_clear(image):
        print("\nGambar terlalu buram untuk diproses. Harap gunakan gambar yang lebih jelas.")
        return None, "Image too blurry"

    print("\nGambar dinyatakan jelas. Melanjutkan ke tahap preprocessing.")

    # Run every filter first, in a fixed order, then score them
    method_names = ['Bilateral', 'Mean', 'Median', 'Denoising']
    filters = (
        apply_bilateral_filtering,
        apply_mean_filtering,
        apply_median_filtering,
        apply_denoising_filtering,
    )
    candidates = [run_filter(image) for run_filter in filters]

    # SSIM of each candidate against the untouched input
    similarity = [compare_images(image, candidate) for candidate in candidates]

    # First occurrence of the highest score wins
    winner = similarity.index(max(similarity))
    return candidates[winner], method_names[winner]

# Shared EasyOCR reader (not a function), created once with the language codes 'en' and 'id'
reader = easyocr.Reader(['en', 'id'])

# Split an image into its header strip and its body strip
def crop_image_by_area(image):
    """Crop `image` into a top area and a middle area.

    Args:
        image: array-like with a `.shape` whose first two entries are
            (height, width). Both colour (H, W, C) and grayscale (H, W)
            arrays are accepted.

    Returns:
        (top_area, middle_area): the top 20% of the rows (store name) and
        the rows from 20% to 80% of the height (product list). The bottom
        20% of the image is not returned.
    """
    # Only the first two dimensions are needed; slicing the shape instead of
    # unpacking three values keeps 2-D grayscale input from raising.
    height, width = image.shape[:2]
    top_end = int(height * 0.2)
    middle_end = int(height * 0.8)

    top_area = image[0:top_end, 0:width]              # upper part (store name)
    middle_area = image[top_end:middle_end, 0:width]  # middle part (product list)
    return top_area, middle_area

def perform_ocr_on_cropped_areas(top_area, middle_area):
    """Run the module-level `reader` on both crops.

    Returns:
        (top_text, middle_text): the `reader.readtext` result for the top
        area followed by the result for the middle area.
    """
    ocr_results = [reader.readtext(region) for region in (top_area, middle_area)]
    return ocr_results[0], ocr_results[1]

# Convert a PDF file into images
def convert_pdf_to_images(pdf_path):
    """Return whatever pdf2image's `convert_from_path` produces for `pdf_path`."""
    return convert_from_path(pdf_path)

# Extract structured information from the OCR results
def extract_info_from_ocr(top_text, middle_text, full_text):
    """Build the store/products/date/total record for one receipt.

    Args:
        top_text: OCR entries for the top crop; index 1 of each entry is
            used as the recognised text.
        middle_text: OCR entries for the middle crop, same layout.
        full_text: all recognised text of the whole image as one string.

    Returns:
        dict with keys:
            "store_info": the top-area texts joined by single spaces.
            "products": newline-joined "- <name> - <price>" lines.
            "date": first date-like substring found in `full_text`, or None.
            "total_price": the number following the word "total"
                (case-insensitive) with internal whitespace removed,
                or None when no such number exists.
    """
    top_result = " ".join(item[1] for item in top_text)
    middle_result = " ".join(item[1] for item in middle_text)

    # Date formats, tried in this order; the first pattern that matches wins
    date_patterns = [
        r"\b(\d{2}[\/\-]\d{2}[\/\-]\d{4})\b",   # dd/mm/yyyy
        r"\b(\d{4}[\/\-]\d{2}[\/\-]\d{2})\b",   # yyyy/mm/dd
        r"\b([A-Za-z]+\s\d{1,2},\s\d{4})\b",    # Month Name dd, yyyy
        r"\b(\d{1,2}\s[A-Za-z]+\s\d{4})\b",     # dd Month Name yyyy
        r"\b([A-Za-z]{3,9}\s\d{1,2}\s*,\s*\d{4})\b",  # JULY 24 , 2020
        r"\b(\d{1,2}[\/\-]\d{1,2}[\/\-]\d{2})\b",     # 11/08/19
        r"\b(\d{2}-\d{2}-\d{2})\b"                    # 22-12-24
    ]

    # Total amount: digits optionally followed by any number of
    # separator+digits groups. Whitespace around the separator is tolerated
    # because OCR frequently yields "37 . 79"; repeated groups keep amounts
    # such as "1.250.000" from being cut after the first group.
    total_pattern = r"total.*?(\d+(?:\s*[,.]\s*\d+)*)"
    # Price token used to split the product text (e.g. "3:47", "4 : 13")
    price_pattern = r"(\d{1,2}\s*[:.]\s*\d{2})"

    # Date extraction
    date = None
    for pattern in date_patterns:
        date_match = re.search(pattern, full_text)
        if date_match:
            date = date_match.group(1)
            break

    # Total extraction (matching is done on the lower-cased text)
    total_match = re.search(total_pattern, full_text.lower())
    total_price = None
    if total_match:
        # Drop OCR-inserted spaces so "37 . 79" is reported as "37.79"
        total_price = re.sub(r"\s+", "", total_match.group(1))

    # Product/price extraction: re.split with one capture group yields
    # [name, price, name, price, ..., trailing]; pair them up and drop any
    # trailing text that has no price.
    products_with_prices = []
    for line in middle_result.split('\n'):
        parts = re.split(price_pattern, line)
        for product_name, price in zip(parts[0::2], parts[1::2]):
            products_with_prices.append(f"- {product_name.strip()} - {price.strip()}")

    return {
        "store_info": top_result,
        "products": "\n".join(products_with_prices),
        "date": date,
        "total_price": total_price
    }

# Run preprocessing, cropping, OCR and extraction on a single image
def _extract_receipt_data(image):
    """Return (extracted_data, preprocessing_name) for one BGR image.

    Returns None when `image` is None (e.g. cv2.imread could not decode the
    file) or when `choose_best_preprocessing` rejects it and hands back no
    image.
    """
    if image is None:
        return None

    best_image, best_method = choose_best_preprocessing(image)
    if best_image is None:
        # choose_best_preprocessing has already printed why it was rejected
        return None

    top_area, middle_area = crop_image_by_area(best_image)
    top_text, middle_text = perform_ocr_on_cropped_areas(top_area, middle_area)

    # OCR over the whole image, used for the date and the total
    full_text = " ".join(item[1] for item in reader.readtext(best_image))
    extracted_data = extract_info_from_ocr(top_text, middle_text, full_text)
    return extracted_data, best_method


# Process every supported file in the receipts folder
def process_receipts_folder(receipts_folder):
    """Extract receipt data from each image and PDF in `receipts_folder`.

    Files ending in .png/.jpg/.jpeg are read with cv2.imread; .pdf files are
    converted page by page and each page is processed as its own image.
    Other files are ignored. Images that cannot be read or that are rejected
    as too blurry are skipped instead of aborting the whole run.

    Returns:
        list of dicts with keys "file_name", "data" and "preprocessing";
        a PDF contributes one entry per successfully processed page, all
        sharing the PDF's file name.
    """
    all_extracted_data = []

    # sorted(): os.listdir gives no ordering guarantee; sort for reproducible output
    for file in sorted(os.listdir(receipts_folder)):
        file_path = os.path.join(receipts_folder, file)
        lower_name = file.lower()

        if lower_name.endswith(('.png', '.jpg', '.jpeg')):
            images = (cv2.imread(file_path),)
        elif lower_name.endswith('.pdf'):
            # Convert each PIL page to an OpenCV BGR array lazily, one at a time
            images = (
                cv2.cvtColor(np.array(page), cv2.COLOR_RGB2BGR)
                for page in convert_pdf_to_images(file_path)
            )
        else:
            continue

        for image in images:
            if image is None:
                print(f"\nFile {file} tidak dapat dibaca sebagai gambar. File dilewati.")
                continue

            result = _extract_receipt_data(image)
            if result is None:
                continue

            extracted_data, best_method = result
            all_extracted_data.append({
                "file_name": file,
                "data": extracted_data,
                "preprocessing": best_method
            })

    return all_extracted_data

# Path to the receipts folder on Google Drive
receipts_folder = '/content/drive/My Drive/receipts'

# Process every file in the receipts folder and keep the results
all_extracted_data = process_receipts_folder(receipts_folder)

# Compute OCR accuracy by comparing the OCR output against the ground truth
def calculate_accuracy(ocr_text, ground_truth_text):
    """Return the similarity of the two strings as a percentage (0-100).

    The value is `difflib.SequenceMatcher.ratio()` times 100. Two empty
    strings score 100.
    """
    # autojunk=False: the default heuristic treats frequently occurring
    # characters as junk once the second sequence reaches 200 items, which
    # badly underestimates similarity on receipt-length text.
    matcher = SequenceMatcher(None, ocr_text, ground_truth_text, autojunk=False)
    return matcher.ratio() * 100  # convert to percent

# Save all_extracted_data to a plain-text .txt file in Colab
def save_extracted_data_to_txt(all_extracted_data, filename):
    """Write one text section per extracted record to `filename`.

    Each section starts with a blank line and "File Name: ...", followed by
    the store info, date, total price, a "Products and Prices:" header and
    one line per product entry. The file is overwritten.
    """
    with open(filename, 'w') as out:
        for record in all_extracted_data:
            data = record['data']
            section = [
                f"\nFile Name: {record['file_name']}\n",
                f"Store Info: {data['store_info']}\n",
                f"Date: {data['date']}\n",
                f"Total Price: {data['total_price']}\n",
                "Products and Prices:\n",
            ]
            section.extend(f"{entry}\n" for entry in data['products'].split('\n'))
            out.writelines(section)

# Name of the output file written to the Colab filesystem
output_filename = '/content/extracted_data.txt'  # saved on the Colab VM
save_extracted_data_to_txt(all_extracted_data, output_filename)

# Load the contents of groundtruth.txt
with open('/content/drive/My Drive/receipts/groundtruth.txt', 'r') as gt_file:
    ground_truth_text = gt_file.read()

# Read extracted_data.txt back and compute the accuracy per file name
accuracies = []
with open(output_filename, 'r') as extracted_file:
    ocr_text = extracted_file.read()
    ocr_files = ocr_text.split("\nFile Name: ")  # split into one chunk per file name
    gt_files = ground_truth_text.split("\nFile Name: ")  # split into one chunk per file name

    # Turn the ground truth into a dictionary for easy lookup.
    # NOTE(review): [1:] drops everything before the first "\nFile Name: ";
    # a ground-truth file that starts directly with "File Name: " (no leading
    # newline) would lose its first entry - confirm the file's format.
    gt_dict = {}
    for gt_file in gt_files[1:]:
        gt_lines = gt_file.splitlines()
        gt_file_name = gt_lines[0].strip()
        gt_content = "\n".join(gt_lines[1:]).strip()
        gt_dict[gt_file_name] = gt_content

    # Check every file in the OCR output against the ground truth; files without a ground-truth entry score 0%
    for ocr_file in ocr_files[1:]:  # start at index 1 to skip the text before the first "File Name:"
        ocr_lines = ocr_file.splitlines()
        file_name = ocr_lines[0].strip()  # file name as written in the OCR output
        ocr_content = "\n".join(ocr_lines[1:]).strip()  # OCR content for this file

        # Is this file name present in the ground-truth dictionary?
        if file_name in gt_dict:
            accuracy = calculate_accuracy(ocr_content, gt_dict[file_name])
        else:
            accuracy = 0.0  # accuracy is 0% when the file has no ground-truth entry

        accuracies.append((file_name, accuracy))

# Download extracted_data.txt to the local disk if needed
from google.colab import files
files.download(output_filename)

# Show the results in Colab
for item in all_extracted_data:
    print(f"\nFile Name: {item['file_name']}")
    #print(f"Preprocessing Used: {item['preprocessing']}")
    print(f"Store Info: {item['data']['store_info']}")

    # One product entry per line
    products = item['data']['products'].split("\n")
    for product in products:
        print(f"{product}")

    print(f"Date: {item['data']['date']}")
    print(f"Total Price: {item['data']['total_price']}")
    print("-------------------------------------------------------\n")

# Show the accuracy result per file
for file_name, accuracy in accuracies:
    print(f"OCR Accuracy for {file_name}: {accuracy:.2f}%")

# Delete the receipts folder.
# NOTE(review): shutil.rmtree removes the whole folder tree on the mounted
# Drive, including the groundtruth.txt that was read from this same folder
# above - confirm this is intended before re-running.
import shutil
shutil.rmtree(receipts_folder)
print(f'{receipts_folder} telah dihapus.')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
poppler-utils is already the newest version (22.02.0-2ubuntu0.5).
0 upgraded, 0 newly installed, 0 to remove and 49 not upgraded.





Gambar dinyatakan jelas. Melanjutkan ke tahap preprocessing.

Gambar dinyatakan jelas. Melanjutkan ke tahap preprocessing.

Gambar dinyatakan jelas. Melanjutkan ke tahap preprocessing.


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>


File Name: Album Harry Styles.jpg
Store Info: Harry Styles"
- Undln Tdd C Un Nnnn MAY 12 , 2017 - 01:32
- PM MEET ME IN THE HALLWAY - 3 : 47
- SIGN OF THE TIMES - 5:40
- CAROLINA - 3:09
- TWo gHoSts - 3 :49
- SWEET CREATURE - 3:45
- ONLY ANGEL - 4:51
- KIWI - 2: 56
- EVER SINCE NEW YORK - 4 : 13
- WOHAN - 4 : 38
- FROM THE DINING TABLE - 3:31
- ITEM COUNT : 10 TOTAL: - 37 . 79
Date: MAY 12 , 2017
Total Price: 37
-------------------------------------------------------


File Name: Speak Now Taylor Swift (1).png
Store Info: 7 S+ Speakl)
- ORDER #0003 FOR TAYLOR SWIFT OCTOBER 25 , 2010 - 11 :53
- PM MINE (POP MIX) - 3:50
- SPARKS FLY - 4: 20
- BACK To DECEMBER - 4:53
- SPEAK Now - 4: 00
- DEAR JOHN - 6 : 43
- MEAN - 3:57
- THE STORY OF US - 4: 25
- NEVER GRow UP - 4:50
- ENCHANTED - 5:52
- BETTER THAN REVENGE - 3:37
- INNOCENT - 5:02
- HAUNTED - 4: 02
- LAST KISS - 6 : 07
- Long LIVE - 5:17
- OURS - 3:58
- IF THIS Was A HOVIE - 3:54
- SUPERMAN - 4: 36
Date: OCTOBER 25 , 2010
Total Price: