In [18]:
import os
import cv2
import easyocr
import numpy as np
import pandas as pd

In [None]:
import os
import cv2
import numpy as np
import re
import easyocr
import pandas as pd

# Function to preprocess images (noise removal, contrast adjustment, etc.)
def preprocess_image(image_path):
    # Load the image
    image = cv2.imread(image_path)
    
    # Convert to grayscale
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    
    # Apply GaussianBlur for noise reduction
    blurred = cv2.GaussianBlur(gray, (5, 5), 0)
    
    # Adjust contrast using histogram equalization
    equalized = cv2.equalizeHist(blurred)
    
    return equalized

# Function to extract structured data using regex
def extract_data_from_text(text):
    # Patterns for fields with updated Date regex to handle ':', ';', or '.' after the month
    patterns = {
        "TheaterName": r"\b(CINE CITE|METROPOLE|MAJESTIC)\b",  # Only the specified theaters
        "Salle": r"\bSalle\s*(\d{1,2})\b",  # Capture 'Salle' followed by a number
        "Room": r"\b(\d{1,2})\b",  # Room number, 1 or 2 digits
        "DayOfWeek": r"\b(lun|mar|mer|jeu|ven|sam|dim)\b",  # Day of the week in French
        "Date": r"\b(lun|mar|mer|jeu|ven|sam|dim)[\.;:]?\s*(\d{1,2})\s+([a-zA-Zé]+)[\.;:]?\s+(\d{4})\b",  # Date with various separators
        "Hour": r"\b(\d{1,2}[:.]\d{2})\b",  # Hour in format '17:30'
        "Version": r"\b(VF|VOSTF)\b",  # Version (VF or VOSTF)
    }

    parsed_data = {}
    
    # Extract data using regex
    for field, pattern in patterns.items():
        match = re.search(pattern, text, re.IGNORECASE)
        if match:
            # Store only the matched text without tuple structure
            parsed_data[field] = match.group(0)
        else:
            parsed_data[field] = ""  # If no match, assign an empty string
    
    return parsed_data

# Main function to process images, extract text, and save to CSV
def extract_and_structure_data(input_folder, output_csv="structured_ticket_data.csv"):
    reader = easyocr.Reader(['fr'])  # OCR reader for French text
    structured_data = []

    # Iterate over all files in the input folder
    for filename in os.listdir(input_folder):
        if filename.lower().endswith(('.png', '.jpg', '.jpeg')):  # Only process image files
            image_path = os.path.join(input_folder, filename)
            
            # Step 1: Preprocess the image
            preprocessed_image = preprocess_image(image_path)
            
            # Step 2: OCR processing to get text from the image
            ocr_results = reader.readtext(preprocessed_image)
            
            # Filter OCR results to ensure we only use high-confidence text (confidence > 0.3)
            full_text = " ".join([text[1] for text in ocr_results if text[2] > 0.3])  # Only use high-confidence text

            # Print the raw OCR text for each image
            print(f"Raw OCR text for {filename}:", full_text)
            
            # Step 3: Extract structured data using regex
            ticket_data = extract_data_from_text(full_text)
            ticket_data["Filename"] = filename  # Add the filename to the result

            # Print parsed data before any transformations
            print(f"Parsed data before transformation for {filename}:", ticket_data)

            # Format the date directly if extracted
            if ticket_data["Date"]:
                day_of_week, day, month, year = re.match(
                    r"(lun|mar|mer|jeu|ven|sam|dim)[\.;:]?\s*(\d{1,2})\s+([a-zA-Zé]+)[\.;:]?\s+(\d{4})", 
                    ticket_data["Date"], 
                    re.IGNORECASE
                ).groups()
                ticket_data["Date"] = f"{day_of_week}. {day} {month} {year}"
            
            structured_data.append(ticket_data)  # Store the parsed data

    # Step 4: Convert structured data to DataFrame and save to CSV
    df = pd.DataFrame(structured_data)

    # Save to CSV
    df.to_csv(output_csv, index=False)
    print(f"Structured data saved to {output_csv}")

# Example usage
input_folder = 'data'  # Folder containing images to process
extract_and_structure_data(input_folder)


Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.


Raw OCR text for 20241108_135104.jpg: LE MAJESTIC LILLE Salle 06 jeu. 29 déc. 2022 17.30 ANNIE COLEREVF UGC illimite Abonnement UGC Illimité UGC ILLIMITE 1 U17 335227186 -1 0024421 0p.6500000003217410 Ni reprie nfechange
Parsed data before transformation for 20241108_135104.jpg: {'TheaterName': 'MAJESTIC', 'Salle': 'Salle 06', 'Room': '06', 'DayOfWeek': 'jeu', 'Date': 'jeu. 29 déc. 2022', 'Hour': '17.30', 'Version': '', 'Filename': '20241108_135104.jpg'}
Raw OCR text for 20241108_135110.jpg: LE MAJESTIC LILLE Salle mer 28 déc. 2022 01 20,15 LES BANSHEES D'INI VOstF UGC illimite UI-+ ++++149302 Abonnement UGC Illimité UGC ILLIMITE MINC 2 5060) 0p.6500000003214802 echange
Parsed data before transformation for 20241108_135110.jpg: {'TheaterName': 'MAJESTIC', 'Salle': '', 'Room': '28', 'DayOfWeek': 'mer', 'Date': 'mer 28 déc. 2022', 'Hour': '', 'Version': 'VOstF', 'Filename': '20241108_135110.jpg'}
Raw OCR text for 20241108_135114.jpg: LE METROPOLE LILLE Salle 02 mar. 27 déc. 2022 14.00 LE