# OCR for extracting expiry date, manufacture date and MRP from images

In [1]:
import cv2

def preprocess_image(image_path):
    """Load an image from disk and prepare it for OCR.

    The image is converted to grayscale, denoised with non-local means and
    binarized with Otsu's threshold.

    Args:
        image_path: Path of the image file to read.

    Returns:
        The thresholded image returned by ``cv2.threshold``.

    Raises:
        ValueError: If ``cv2.imread`` could not load ``image_path``.
    """
    image = cv2.imread(image_path)
    # cv2.imread reports failure by returning None rather than raising; fail
    # here with a clear message instead of deep inside cvtColor.
    if image is None:
        raise ValueError(f"Could not read image: {image_path!r}")

    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

    # Positional arguments: dst=None, h=30, templateWindowSize=7,
    # searchWindowSize=21.
    noise_reduced = cv2.fastNlMeansDenoising(gray, None, 30, 7, 21)

    # With THRESH_OTSU the threshold is computed from the image, so the 0
    # passed here is only a placeholder.
    _, thresholded = cv2.threshold(
        noise_reduced, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU
    )
    return thresholded


In [2]:
import pytesseract

def extract_text_tesseract(image):
    """Run Tesseract OCR on ``image`` and return the recognized text."""
    return pytesseract.image_to_string(image)


In [5]:
import easyocr

# Lazily created EasyOCR reader shared by every call in this process.
_EASYOCR_READER = None


def _get_easyocr_reader():
    """Return the shared English EasyOCR reader, creating it on first use."""
    global _EASYOCR_READER
    if _EASYOCR_READER is None:
        # Constructing a Reader loads the model weights, so do it only once
        # instead of on every extraction.
        _EASYOCR_READER = easyocr.Reader(['en'])
    return _EASYOCR_READER


def extract_text_easyocr(image_path):
    """Run EasyOCR on an image and return the recognized text.

    Args:
        image_path: Image to read, passed straight to ``Reader.readtext``.

    Returns:
        The text of every detection joined by single spaces, in the order
        EasyOCR returned them.
    """
    result = _get_easyocr_reader().readtext(image_path)
    # Index 1 of each result entry is the recognized text.
    return " ".join(res[1] for res in result)


In [7]:
import re

# Numeric dates such as 01-03-2021, 1/3/21 or 01/03/2021.
_DATE_RE = re.compile(r'\b(\d{1,2}[/-]\d{1,2}[/-]\d{2,4})\b')

# Prices introduced by "Rs.", "mrp:" or "/-", plus the "50/-" suffix form.
_MRP_RE = re.compile(
    r'\b(?:Rs\.|mrp:|/-)\s*[0-9]+(?:\.[0-9]{1,2})?\b'
    r'|\b[0-9]+(?:\.[0-9]{1,2})?\s*/-',
    re.IGNORECASE,
)

# Labels that mark a date as the expiry / manufacture date. "exp" also covers
# "Exp Date"; "mfg" also covers "mfg.date" and "Mfg Date"; "nfg date" is kept
# from the original keyword list.
_EXPIRY_KEYWORD_RE = re.compile(r'exp|best before', re.IGNORECASE)
_MANUFACTURED_KEYWORD_RE = re.compile(r'mfd|mfg|nfg date', re.IGNORECASE)


def _date_for_keyword(line, keyword_re):
    """Return the date associated with the first keyword hit in ``line``.

    Returns ``None`` when the line has no keyword or no date.
    """
    keyword = keyword_re.search(line)
    if keyword is None:
        return None
    # Prefer the date written after the label so that a line holding both a
    # manufacture and an expiry date yields the right one for each; fall back
    # to the first date on the line when the date precedes the label.
    match = _DATE_RE.search(line, keyword.end()) or _DATE_RE.search(line)
    return match.group(1) if match else None


def extract_dates_and_mrp(text):
    """Extract the expiry date, manufacture date and MRP from OCR text.

    Dates are attributed line by line using the label found on the same
    line; when several lines match, the last one wins. The MRP is the first
    price-like match anywhere in the text.

    Args:
        text: OCR output. ``None`` or an empty string yields all-``None``
            results.

    Returns:
        A dict with keys ``"expiry_date"``, ``"manufactured_date"`` and
        ``"mrp"``. Each value is the matched substring or ``None``.
    """
    expiry_date = None
    manufactured_date = None
    mrp = None

    if text:
        prices = _MRP_RE.findall(text)
        if prices:
            mrp = prices[0]

        for line in text.split('\n'):
            found = _date_for_keyword(line, _EXPIRY_KEYWORD_RE)
            if found is not None:
                expiry_date = found
            found = _date_for_keyword(line, _MANUFACTURED_KEYWORD_RE)
            if found is not None:
                manufactured_date = found

    return {
        "expiry_date": expiry_date,
        "manufactured_date": manufactured_date,
        "mrp": mrp,
    }


In [9]:
def process_image(image_path, use_easyocr=False):
    """Run OCR on an image and extract dates and MRP from the text.

    Args:
        image_path: Path of the image file to process.
        use_easyocr: When true, read the original image with EasyOCR;
            otherwise preprocess the image and read it with Tesseract.

    Returns:
        The dict produced by ``extract_dates_and_mrp`` for the OCR text.
    """
    if use_easyocr:
        text = extract_text_easyocr(image_path)
    else:
        # Only the Tesseract path consumes the preprocessed image, so the
        # preprocessing is skipped when EasyOCR is used.
        preprocessed_image = preprocess_image(image_path)
        text = extract_text_tesseract(preprocessed_image)

    return extract_dates_and_mrp(text)

# Example usage: run the EasyOCR pipeline on one sample image and print the
# extracted fields.
image_path = r'C:\Users\DELL\Documents\smart_vision_project\ocr_specified\image4.jpeg'
result = process_image(image_path=image_path, use_easyocr=True)
print(result)


Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.
  net.load_state_dict(copyStateDict(torch.load(trained_model, map_location=device)))
  state_dict = torch.load(model_path, map_location=device)


{'expiry_date': '01-03-2021', 'manufactured_date': '01-03-2021', 'mrp': None}
