# OCR-NLP Pipeline: Data Ingestion & OCR

This notebook demonstrates the data ingestion and OCR components of the pipeline, showing how scanned documents are processed and converted to text.

In [None]:
import sys
import os
from pathlib import Path
import matplotlib.pyplot as plt
import numpy as np
import cv2
from PIL import Image

# Add the parent directory to the path so we can import our modules
sys.path.append('..')

from src.ocr_engine import get_ocr_engine, DocumentProcessor
from src.nlp_parser import TextPreprocessor

## 1. Setting Up the Environment

First, let's set up our environment and check if we have the necessary dependencies installed.

In [None]:
# Locate the Tesseract executable on PATH and report what was found
import shutil

tesseract_path = shutil.which('tesseract')
if not tesseract_path:
    # Nothing on PATH: show per-platform installation hints instead
    print(
        "Tesseract is not installed or not in PATH. Please install it:",
        "  - Ubuntu/Debian: sudo apt-get install tesseract-ocr",
        "  - macOS: brew install tesseract",
        "  - Windows: Download installer from https://github.com/UB-Mannheim/tesseract/wiki",
        sep="\n",
    )
else:
    print(f"Tesseract is installed at: {tesseract_path}")

In [None]:
# Working directories used by the rest of the notebook (relative to the notebook's folder)
data_dir = Path('../data')
samples_dir, processed_dir = data_dir / 'samples', data_dir / 'processed'
outputs_dir = Path('../outputs')

# Create each directory; missing parents are created and existing directories are tolerated
for dir_path in (samples_dir, processed_dir, outputs_dir):
    dir_path.mkdir(parents=True, exist_ok=True)

print(f"Sample directory: {samples_dir}")
print(f"Processed directory: {processed_dir}")
print(f"Outputs directory: {outputs_dir}")

## 2. Creating a Sample Document

For demonstration purposes, let's create a simple sample document with text that we can use for OCR testing.

In [None]:
def create_sample_image(text, filename, size=(800, 600), font_scale=1, thickness=2,
                        margin=50, line_height=40):
    """Render multi-line text onto a white canvas and save it for OCR testing.

    Args:
        text: Text to draw. Leading/trailing whitespace is stripped and the
            remainder is split on newlines, one rendered line per text line.
        filename: Destination path; converted to ``str`` for ``cv2.imwrite``.
        size: Requested ``(width, height)`` of the canvas in pixels.
        font_scale: Scale factor applied to ``cv2.FONT_HERSHEY_SIMPLEX``.
        thickness: Stroke thickness passed to ``cv2.putText``.
        margin: Left offset of every line and vertical offset of the first
            baseline, in pixels. Also used as the bottom margin.
        line_height: Vertical distance between consecutive baselines, in pixels.

    Returns:
        The rendered 3-channel ``uint8`` image. Its height is ``size[1]``
        unless the text needs more room, in which case the canvas is grown so
        that no line is drawn outside the image.

    Raises:
        IOError: If ``cv2.imwrite`` reports that the file was not written.
    """
    lines = text.strip().split('\n')

    # cv2.putText anchors text at its baseline (bottom-left corner), so the last
    # baseline plus a bottom margin must fit inside the canvas. Without this the
    # trailing lines of a long document are silently drawn off-image and lost.
    required_height = margin + (len(lines) - 1) * line_height + margin
    width, height = size[0], max(size[1], required_height)

    # White background
    img = np.full((height, width, 3), 255, dtype=np.uint8)

    font = cv2.FONT_HERSHEY_SIMPLEX
    color = (0, 0, 0)  # Black color

    for index, line in enumerate(lines):
        baseline_y = margin + index * line_height
        cv2.putText(img, line, (margin, baseline_y), font, font_scale, color, thickness)

    # cv2.imwrite signals failure through its return value, not an exception
    if not cv2.imwrite(str(filename), img):
        raise IOError(f"Failed to write image to {filename}")
    return img

# Text of the demo invoice that gets rendered into an image
invoice_text = """INVOICE #12345
Date: 2023-03-21
Vendor: Acme Corporation

Bill To:
John Smith
123 Main Street
Anytown, CA 12345
Email: john.smith@example.com

Item        Quantity    Price       Total
Widget A    5           $10.00      $50.00
Widget B    3           $15.00      $45.00
Service X   2 hours     $75.00      $150.00

Subtotal:                           $245.00
Tax (8%):                           $19.60
Total:                              $264.60

Payment Terms: Net 30
Due Date: 2023-04-20
"""

# Render the invoice to disk and keep the in-memory copy for display
sample_image_path = samples_dir / 'sample_invoice.png'
img = create_sample_image(
    invoice_text,
    sample_image_path,
    size=(800, 800),
    font_scale=0.7,
)

# The image is converted from BGR to RGB channel order before handing it to matplotlib
fig, ax = plt.subplots(figsize=(10, 12))
ax.imshow(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
ax.axis('off')
ax.set_title('Sample Invoice')
plt.show()

print(f"Created sample invoice at: {sample_image_path}")

## 3. OCR Processing with Tesseract

Now, let's use our OCR engine to extract text from the sample image.

In [None]:
# Build a Tesseract-backed document processor (lang='eng')
ocr_engine = get_ocr_engine('tesseract', lang='eng')
document_processor = DocumentProcessor(ocr_engine)

# OCR the clean sample image with preprocessing switched off
extracted_text = document_processor.process_image(sample_image_path, preprocess=False)

print(
    "Extracted Text (without preprocessing):",
    "-" * 50,
    extracted_text,
    sep="\n",
)

## 4. Image Preprocessing for Better OCR Results

Let's demonstrate how preprocessing can improve OCR results, especially for lower quality scans.

In [None]:
def add_noise_to_image(image, noise_level=20, rng=None):
    """Simulate a lower quality scan by adding Gaussian noise and a light blur.

    Args:
        image: Input image array. A 3-dimensional array is converted to
            grayscale with ``cv2.COLOR_BGR2GRAY``; any other array is copied
            and used as-is.
        noise_level: Standard deviation of the zero-mean Gaussian noise, in
            intensity levels.
        rng: Optional ``numpy.random.Generator`` used to draw the noise. When
            omitted, NumPy's global random state is used, so ``np.random.seed``
            controls reproducibility.

    Returns:
        A ``uint8`` grayscale image with noise added and a 3x3 Gaussian blur
        applied.
    """
    # Convert to grayscale if it's a color image
    if len(image.shape) == 3:
        gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    else:
        gray = image.copy()

    # Draw signed Gaussian noise (kept as floats on purpose, see below)
    sampler = np.random if rng is None else rng
    noise = sampler.normal(0, noise_level, gray.shape)

    # Add in floating point and clip before converting back to uint8. Casting
    # the signed noise straight to uint8 would wrap every negative sample to a
    # value near 255, turning roughly half of the pixels white instead of
    # applying a small symmetric perturbation.
    noisy = np.clip(np.rint(gray.astype(np.float32) + noise), 0, 255)
    noisy_img = noisy.astype(np.uint8)

    # Add some blur to simulate a low-quality scan
    blurred = cv2.GaussianBlur(noisy_img, (3, 3), 0)

    return blurred

# Seed NumPy's global random state so the simulated noise (and therefore the
# OCR output in the following cells) is the same on every Restart & Run All
np.random.seed(42)

# Create a noisy version of our sample image
img = cv2.imread(str(sample_image_path))
if img is None:
    # cv2.imread returns None instead of raising when the file cannot be read
    raise FileNotFoundError(f"Could not read sample image: {sample_image_path}")

noisy_img = add_noise_to_image(img, noise_level=15)
noisy_image_path = samples_dir / 'sample_invoice_noisy.png'
if not cv2.imwrite(str(noisy_image_path), noisy_img):
    # cv2.imwrite reports failure through its return value
    raise IOError(f"Failed to write noisy image to {noisy_image_path}")

# Display the noisy image
plt.figure(figsize=(10, 12))
plt.imshow(noisy_img, cmap='gray')
plt.axis('off')
plt.title('Noisy Sample Invoice')
plt.show()

print(f"Created noisy sample invoice at: {noisy_image_path}")

In [None]:
# Baseline: OCR the noisy image exactly as saved, with preprocessing disabled
extracted_text_noisy = document_processor.process_image(noisy_image_path, preprocess=False)

print(
    "Extracted Text from Noisy Image (without preprocessing):",
    "-" * 50,
    extracted_text_noisy,
    sep="\n",
)

In [None]:
# Same noisy image, this time with the processor's preprocessing enabled
extracted_text_noisy_preprocessed = document_processor.process_image(noisy_image_path, preprocess=True)

print(
    "Extracted Text from Noisy Image (with preprocessing):",
    "-" * 50,
    extracted_text_noisy_preprocessed,
    sep="\n",
)

## 5. Visualizing the Preprocessing Steps

Let's visualize each step of the preprocessing to understand how it improves OCR quality.

In [None]:
def visualize_preprocessing_steps(image_path):
    """Show each stage of the preprocessing pipeline in a 2x2 figure.

    The stages are: the image as loaded, its grayscale conversion, the result
    of Gaussian adaptive thresholding, and the result of a morphological
    opening applied to the thresholded image.

    Args:
        image_path: Path of the image to load; converted to ``str`` for
            ``cv2.imread``.

    Returns:
        The image produced by the final (morphological opening) step.

    Raises:
        FileNotFoundError: If the image cannot be read from ``image_path``.
    """
    # Load the image; cv2.imread returns None rather than raising on failure
    image = cv2.imread(str(image_path))
    if image is None:
        raise FileNotFoundError(f"Could not read image: {image_path}")

    # Convert to grayscale
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)

    # Apply adaptive thresholding (block size 11, constant 2)
    thresh = cv2.adaptiveThreshold(
        gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
        cv2.THRESH_BINARY, 11, 2
    )

    # Noise removal
    # NOTE(review): with a 1x1 kernel the opening is effectively a no-op, so this
    # panel will look identical to the thresholded one. Left unchanged because it
    # presumably mirrors the pipeline's own preprocessing - confirm against
    # src.ocr_engine before enlarging the kernel.
    kernel = np.ones((1, 1), np.uint8)
    opening = cv2.morphologyEx(thresh, cv2.MORPH_OPEN, kernel)

    # (title, image, colormap) for each panel, in row-major display order
    panels = [
        ('Original Image', cv2.cvtColor(image, cv2.COLOR_BGR2RGB), None),
        ('Grayscale', gray, 'gray'),
        ('Adaptive Thresholding', thresh, 'gray'),
        ('Noise Removal', opening, 'gray'),
    ]

    # Display all steps
    fig, axes = plt.subplots(2, 2, figsize=(15, 12))
    for ax, (title, panel, cmap) in zip(axes.flat, panels):
        ax.imshow(panel, cmap=cmap)
        ax.set_title(title)
        ax.axis('off')

    plt.tight_layout()
    plt.show()

    return opening

# Visualize preprocessing steps for the noisy image
preprocessed_img = visualize_preprocessing_steps(noisy_image_path)

## 6. Text Cleaning and Normalization

After OCR, we often need to clean and normalize the extracted text to correct common OCR errors.

In [None]:
# Create the text preprocessor that is reused by the following cells
text_preprocessor = TextPreprocessor()

# Clean the OCR output obtained from the preprocessed noisy image
cleaned_text = text_preprocessor.clean_ocr_text(extracted_text_noisy_preprocessed)

print(
    "Cleaned OCR Text:",
    "-" * 50,
    cleaned_text,
    sep="\n",
)

In [None]:
# Run the text pipeline with cleaning and normalization on,
# leaving stopwords and punctuation in place
processing_options = dict(
    clean=True,
    normalize=True,
    remove_stopwords=False,
    remove_punctuation=False,
)
processed_result = text_preprocessor.process_text(
    extracted_text_noisy_preprocessed, **processing_options
)

# Headline statistics taken from the result
print(f"Word count: {processed_result['word_count']}")
print(f"Number of sentences: {len(processed_result['sentences'])}")
print(f"Number of paragraphs: {len(processed_result['paragraphs'])}")

# Preview the leading sentences, numbered from 1
print("\nFirst 3 sentences:")
for number, sentence in enumerate(processed_result['sentences'][:3], start=1):
    print(f"{number}. {sentence}")

## 7. Comparing OCR Engines

Let's compare the results from different OCR engines (Tesseract vs. EasyOCR).

In [None]:
# Try to initialize EasyOCR (this may take a while the first time)
# The flag starts as False and only becomes True once both the engine and the
# document processor exist, so the comparison cell never runs against a
# half-initialized state if initialization fails partway through.
easyocr_available = False
try:
    import easyocr  # imported only to probe whether the package is installed
    print("EasyOCR is available. Initializing...")
    easy_ocr_engine = get_ocr_engine('easyocr', lang_list=['en'])
    easy_document_processor = DocumentProcessor(easy_ocr_engine)
    easyocr_available = True
except ImportError:
    print("EasyOCR is not installed. Skipping comparison.")
    print("You can install it with: pip install easyocr")

In [None]:
if easyocr_available:
    # Run EasyOCR on the same clean sample image that Tesseract processed
    easyocr_text = easy_document_processor.process_image(sample_image_path, preprocess=False)

    print(
        "EasyOCR Extracted Text:",
        "-" * 50,
        easyocr_text,
        sep="\n",
    )

    # Count whitespace-separated tokens in each engine's output
    tesseract_words, easyocr_words = (
        len(ocr_output.split()) for ocr_output in (extracted_text, easyocr_text)
    )

    print("\nComparison:")
    print(f"Tesseract word count: {tesseract_words}")
    print(f"EasyOCR word count: {easyocr_words}")

## 8. Saving Processed Results

Finally, let's save the extracted and cleaned text to the processed-data directory (`data/processed`).

In [None]:
# Save the original and processed text
# NOTE(review): the "original" file holds the OCR of the clean image, while the
# "processed" file holds the cleaned OCR of the noisy image - confirm that this
# pairing is intended.
original_text_path = processed_dir / 'sample_invoice_original.txt'
processed_text_path = processed_dir / 'sample_invoice_processed.txt'

# Write UTF-8 explicitly: OCR output can contain non-ASCII characters that the
# platform's default encoding may be unable to represent.
original_text_path.write_text(extracted_text, encoding='utf-8')
processed_text_path.write_text(cleaned_text, encoding='utf-8')

print(f"Saved original text to: {original_text_path}")
print(f"Saved processed text to: {processed_text_path}")

## 9. Summary and Next Steps

In this notebook, we've demonstrated:

1. Setting up the OCR environment
2. Creating sample documents for testing
3. Extracting text using Tesseract OCR
4. Preprocessing images to improve OCR quality
5. Cleaning and normalizing OCR output
6. Comparing different OCR engines

In the next notebook, we'll explore entity extraction and the full pipeline from PDF to structured JSON output.