In [None]:
!pip install groq opencv-python numpy



In [None]:
# NOTE(review): redundant - groq is already installed by the first cell's pip command.
!pip install groq

Collecting groq
  Downloading groq-0.37.1-py3-none-any.whl.metadata (16 kB)
Downloading groq-0.37.1-py3-none-any.whl (137 kB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m137.5/137.5 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: groq
Successfully installed groq-0.37.1


# Upload the images


In [None]:
# Colab-only helper module; `files` is reused further down for files.download().
from google.colab import files
# Opens the upload dialog; the recorded output shows each chosen file being
# saved under its own name in the working directory.
# `uploaded` presumably maps filename -> file bytes - TODO confirm; it is not read again in this notebook.
uploaded = files.upload()

Saving page_14.jpg to page_14.jpg
Saving page_30.jpg to page_30.jpg
Saving page_35.jpg to page_35.jpg


# Add your Groq API key here

In [None]:
import os
from getpass import getpass

# Use the key already present in the environment if there is one; otherwise
# prompt for it. Prompting keeps the secret out of the saved notebook and avoids
# exporting an empty string, which the placeholder check in main() would not catch.
GROQ_API_KEY = os.environ.get("GROQ_API_KEY") or getpass("Groq API key: ")
os.environ["GROQ_API_KEY"] = GROQ_API_KEY

In [None]:
"""
Complete OCR + PII Extraction Pipeline for Handwritten Medical Documents
Pipeline: Input → Pre-processing → OCR → Text Cleaning → PII Detection → Redaction
"""

import base64
import json
import os
import re
from dataclasses import asdict, dataclass, field, fields
from datetime import datetime
from typing import Dict, List, Optional, Tuple

import cv2
import numpy as np
from groq import Groq

# =============================================================================
# DATA STRUCTURES
# =============================================================================

@dataclass
class MedicalNote:
    """One note from the document, attributed to the doctor who wrote it.

    The field names match the keys of each ``medical_notes`` entry in the JSON
    structure the OCR prompt asks the model to return. The first four fields
    are required by the constructor; the two lists default to empty.
    """
    # Name of the author (the prompt's example value is "Dr. Name").
    doctor_name: str
    # Date/time of the note as text; the prompt requests "DD/MM/YY HH:MM".
    date_time: str
    note_type: str  # e.g., "Progress Note", "Treatment Advised", "Investigation"
    # What the doctor wrote.
    content: str
    # One entry per medication, including the dosage; a fresh list per instance.
    medications: List[str] = field(default_factory=list)
    # One entry per instruction; a fresh list per instance.
    instructions: List[str] = field(default_factory=list)

@dataclass
class PatientInfo:
    """Patient header details extracted from the document.

    Every field is a string defaulting to "" so an instance can be created
    from a partial (or empty) ``patient_info`` mapping. The field names match
    the ``patient_info`` keys in the JSON structure the OCR prompt requests.
    """
    name: str = ""
    age: str = ""
    sex: str = ""
    # Identifier fields are stored as transcribed text, not parsed numbers.
    ipd_no: str = ""
    uhid_no: str = ""
    bed_no: str = ""
    department: str = ""
    admission_date: str = ""

@dataclass
class ExtractionResult:
    """Everything the pipeline produced for a single document.

    Holds the raw and cleaned transcription, the structured patient header,
    the per-doctor notes, the PII grouped by category, and run metadata.
    """
    raw_text: str
    cleaned_text: str
    patient_info: PatientInfo
    medical_notes: List[MedicalNote]
    all_pii: Dict[str, List[str]]
    metadata: Dict

    def to_dict(self):
        """Return a plain-dict view of the result.

        The nested dataclasses (patient info and each note) are converted with
        ``asdict``; ``all_pii`` and ``metadata`` are passed through as the same
        objects held by this instance.
        """
        payload = dict(raw_text=self.raw_text, cleaned_text=self.cleaned_text)
        payload['patient_info'] = asdict(self.patient_info)
        payload['medical_notes'] = list(map(asdict, self.medical_notes))
        payload['all_pii'] = self.all_pii
        payload['metadata'] = self.metadata
        return payload

# =============================================================================
# PREPROCESSING MODULE
# =============================================================================

class ImagePreprocessor:
    """Image clean-up applied before OCR: deskew, denoise, contrast-enhance.

    Images are OpenCV/numpy arrays. ``denoise_image`` and ``enhance_contrast``
    call colour-only OpenCV routines, so they expect a 3-channel BGR array such
    as the one ``preprocess`` loads with ``cv2.imread``.
    """

    @staticmethod
    def _rotation_from_rect_angle(angle: float) -> float:
        """Turn a ``cv2.minAreaRect`` angle into the corrective rotation in degrees.

        OpenCV releases differ in the range they report for the rectangle
        angle: older ones use [-90, 0), newer ones (0, 90]. Both describe the
        same rectangle, so the value is folded into a correction within
        [-45, 45]. Without the fold, an upright page reported as 90 degrees
        would be rotated onto its side.
        """
        if angle > 45:
            return 90 - angle
        if angle < -45:
            return -(90 + angle)
        return -angle

    @staticmethod
    def _debug_output_path(image_path: str) -> str:
        """Return ``<stem>_preprocessed<ext>`` for ``image_path``.

        Built with ``os.path.splitext`` so that non-``.jpg`` inputs get a
        distinct path; a plain ``str.replace('.jpg', ...)`` returns the input
        path unchanged for e.g. ``.png`` and the source image would be
        overwritten. Paths without an extension get ``.jpg``.
        """
        stem, ext = os.path.splitext(image_path)
        return f"{stem}_preprocessed{ext or '.jpg'}"

    @staticmethod
    def deskew_image(image: np.ndarray) -> np.ndarray:
        """Correct image tilt/rotation.

        Estimates the skew from the minimum-area rectangle around the
        foreground pixels and rotates the image about its centre.

        Args:
            image: Colour (3-D) or single-channel (2-D) image array.

        Returns:
            The rotated image, or ``image`` itself when fewer than 10
            foreground pixels are found or the tilt is at most 0.5 degrees.
        """
        gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) if len(image.shape) == 3 else image
        # Invert so that ink becomes the bright foreground
        # (assumes dark writing on a light page).
        gray = cv2.bitwise_not(gray)

        # Otsu picks the binarisation threshold automatically.
        thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)[1]

        # Coordinates of every foreground pixel; too few means nothing to measure.
        coords = np.column_stack(np.where(thresh > 0))
        if len(coords) < 10:
            return image

        angle = ImagePreprocessor._rotation_from_rect_angle(cv2.minAreaRect(coords)[-1])

        # Only rotate if the tilt is significant; resampling costs sharpness.
        if abs(angle) <= 0.5:
            return image

        (h, w) = image.shape[:2]
        center = (w // 2, h // 2)
        M = cv2.getRotationMatrix2D(center, angle, 1.0)
        return cv2.warpAffine(image, M, (w, h),
                              flags=cv2.INTER_CUBIC,
                              borderMode=cv2.BORDER_REPLICATE)

    @staticmethod
    def denoise_image(image: np.ndarray) -> np.ndarray:
        """Remove noise with OpenCV's non-local-means filter for colour images."""
        return cv2.fastNlMeansDenoisingColored(image, None, 10, 10, 7, 21)

    @staticmethod
    def enhance_contrast(image: np.ndarray) -> np.ndarray:
        """Enhance contrast by applying CLAHE to the lightness channel.

        The image is converted BGR -> LAB, CLAHE is applied to L only (the
        a/b colour channels are left alone), and the result is converted back.
        """
        lab = cv2.cvtColor(image, cv2.COLOR_BGR2LAB)
        l, a, b = cv2.split(lab)
        clahe = cv2.createCLAHE(clipLimit=3.0, tileGridSize=(8, 8))
        l = clahe.apply(l)
        enhanced = cv2.merge([l, a, b])
        return cv2.cvtColor(enhanced, cv2.COLOR_LAB2BGR)

    def preprocess(self, image_path: str, save_debug: bool = False) -> Tuple[np.ndarray, Optional[str]]:
        """Complete preprocessing pipeline: load, deskew, denoise, enhance.

        Args:
            image_path: Path of the image to load.
            save_debug: When true, also write the processed image next to the
                input as ``<stem>_preprocessed<ext>``.

        Returns:
            ``(image, output_path)``; ``output_path`` is ``None`` unless
            ``save_debug`` is true.

        Raises:
            ValueError: If the image cannot be loaded.
        """
        print("  ‚Üí Pre-processing: Starting...")

        # Load image (cv2.imread returns None instead of raising on failure)
        img = cv2.imread(image_path)
        if img is None:
            raise ValueError(f"Could not load image: {image_path}")

        # Step 1: Deskew
        print("    ‚Ä¢ Correcting tilt...")
        img = self.deskew_image(img)

        # Step 2: Denoise
        print("    ‚Ä¢ Removing noise...")
        img = self.denoise_image(img)

        # Step 3: Enhance
        print("    ‚Ä¢ Enhancing contrast...")
        img = self.enhance_contrast(img)

        # Save preprocessed image
        output_path = None
        if save_debug:
            output_path = self._debug_output_path(image_path)
            cv2.imwrite(output_path, img)
            print(f"    ‚Ä¢ Saved preprocessed: {output_path}")

        print("  ‚úì Pre-processing: Complete")
        return img, output_path

# =============================================================================
# TEXT CLEANING MODULE
# =============================================================================

class TextCleaner:
    """Clean and normalize extracted text"""

    @staticmethod
    def clean_text(raw_text: str) -> str:
        """Normalise whitespace and strip stray symbols from OCR text.

        Args:
            raw_text: Transcription returned by the OCR step; ``None`` or an
                empty string yields "".

        Returns:
            The text on a single line with runs of whitespace collapsed,
            '|' replaced by 'I', and characters outside the allowed set
            removed. Letters and digits are never swapped for one another:
            a blanket O <-> 0 substitution corrupts drug names and dosing
            abbreviations (e.g. "OD", "DOLO").
        """
        if not raw_text:
            return ""

        # Collapse every run of whitespace (including line breaks) to one space
        text = re.sub(r'\s+', ' ', raw_text)

        # Fix common OCR errors: a vertical bar is usually a misread capital I
        text = text.replace('|', 'I')

        # Remove special characters but keep punctuation used in medical
        # notation, including %, < and > (e.g. "SpO2 98%", "BP > 140/90")
        text = re.sub(r"[^\w\s\-.,:;()/+=@%<>#&']", '', text)

        return text.strip()

# =============================================================================
# MAIN PIPELINE
# =============================================================================

class HandwrittenPIIPipeline:
    """
    Complete pipeline: preprocessing -> OCR (Groq vision model) -> text
    cleaning -> structuring -> JSON/report output -> watermark "redaction".

    The model reply is treated as untrusted: unknown keys, missing keys, nulls
    and wrongly-typed values are normalised before the dataclasses are built,
    so a slightly off-schema reply does not abort the run.
    """

    def __init__(self, api_key: str):
        """Create the Groq client and the helper stages.

        Args:
            api_key: Groq API key, passed to ``Groq(api_key=...)``.
        """
        self.client = Groq(api_key=api_key)
        self.model = "meta-llama/llama-4-scout-17b-16e-instruct"
        self.preprocessor = ImagePreprocessor()
        self.cleaner = TextCleaner()

    def encode_image(self, image: np.ndarray) -> str:
        """Encode image to a base64 JPEG string.

        Raises:
            ValueError: If OpenCV reports that JPEG encoding failed.
        """
        ok, buffer = cv2.imencode('.jpg', image)
        if not ok:
            raise ValueError("Could not encode image as JPEG")
        return base64.b64encode(buffer).decode('utf-8')

    @staticmethod
    def _extract_json_payload(response_text: Optional[str]) -> str:
        """Strip markdown fences and surrounding chatter from a model reply."""
        text = response_text or ""
        if "```json" in text:
            text = text.split("```json")[1].split("```")[0]
        elif "```" in text:
            text = text.split("```")[1]
        text = text.strip()

        # The model may still wrap the object in prose; fall back to the
        # outermost pair of braces.
        if not text.startswith("{"):
            start, end = text.find("{"), text.rfind("}")
            if start != -1 and end > start:
                text = text[start:end + 1]
        return text

    @staticmethod
    def _dataclass_kwargs(dataclass_type, payload) -> Dict:
        """Return the entries of ``payload`` that name a field of ``dataclass_type`` and are not None."""
        if not isinstance(payload, dict):
            return {}
        allowed = {f.name for f in fields(dataclass_type)}
        return {key: value for key, value in payload.items()
                if key in allowed and value is not None}

    @staticmethod
    def _normalise_pii(raw_pii) -> Dict[str, List[str]]:
        """Coerce the model's ``all_pii`` value into ``{category: [str, ...]}``."""
        if not isinstance(raw_pii, dict):
            return {}
        normalised = {}
        for category, items in raw_pii.items():
            if items is None:
                items = []
            elif not isinstance(items, (list, tuple)):
                items = [items]  # a bare scalar becomes a one-item list
            normalised[str(category)] = [str(item) for item in items if item is not None]
        return normalised

    def _build_notes(self, raw_notes) -> List[MedicalNote]:
        """Build ``MedicalNote`` objects from the model's ``medical_notes`` list, skipping non-dict entries."""
        if not isinstance(raw_notes, list):
            return []
        notes = []
        for raw_note in raw_notes:
            if not isinstance(raw_note, dict):
                continue
            # Required constructor arguments default to "" when the model omits them.
            note_kwargs = dict.fromkeys(('doctor_name', 'date_time', 'note_type', 'content'), '')
            note_kwargs.update(self._dataclass_kwargs(MedicalNote, raw_note))
            for list_field in ('medications', 'instructions'):
                value = note_kwargs.get(list_field)
                if value is not None and not isinstance(value, list):
                    note_kwargs[list_field] = [str(value)]
            notes.append(MedicalNote(**note_kwargs))
        return notes

    def extract_structured_data(self, image: np.ndarray) -> Dict:
        """Use Groq Vision to extract structured medical data.

        Args:
            image: Preprocessed page image.

        Returns:
            The parsed JSON object from the model. On any failure (API error,
            unparsable reply, non-object JSON) a fallback dict is returned
            with empty sections, the message in ``raw_text`` (prefixed
            "Error: ") and the same message under ``error``.
        """
        print("  ‚Üí OCR: Extracting text with Groq Vision...")

        base64_image = self.encode_image(image)

        prompt = """You are an expert medical document analyzer. Extract ALL information from this handwritten medical document.

Return ONLY valid JSON in this EXACT structure (no markdown, no explanation):

{
  "raw_text": "Complete verbatim transcription",
  "patient_info": {
    "name": "",
    "age": "",
    "sex": "",
    "ipd_no": "",
    "uhid_no": "",
    "bed_no": "",
    "department": "",
    "admission_date": ""
  },
  "medical_notes": [
    {
      "doctor_name": "Dr. Name",
      "date_time": "DD/MM/YY HH:MM",
      "note_type": "Progress Note/Treatment/Investigation",
      "content": "What the doctor wrote",
      "medications": ["med1 dosage", "med2 dosage"],
      "instructions": ["instruction1", "instruction2"]
    }
  ],
  "all_pii": {
    "patient_names": [],
    "doctor_names": [],
    "dates": [],
    "id_numbers": [],
    "phone_numbers": [],
    "addresses": []
  }
}

CRITICAL:
- Associate each note/medication with the specific doctor who wrote it
- Extract date/time for each note
- If multiple doctors wrote notes, create separate entries
- Extract ALL medications with dosages
- Include confidence markers for unclear text"""

        try:
            completion = self.client.chat.completions.create(
                model=self.model,
                messages=[{
                    "role": "user",
                    "content": [
                        {"type": "text", "text": prompt},
                        {
                            "type": "image_url",
                            "image_url": {"url": f"data:image/jpeg;base64,{base64_image}"}
                        }
                    ]
                }],
                temperature=0.1,
                max_tokens=3000
            )

            response_text = completion.choices[0].message.content

            result = json.loads(self._extract_json_payload(response_text))
            if not isinstance(result, dict):
                raise ValueError("model reply was not a JSON object")
            print("  ‚úì OCR: Extraction complete")
            return result

        except Exception as e:
            # Deliberate best-effort: one bad page must not stop a batch. The
            # failure is reported and returned so the caller can record it.
            print(f"  ‚úó OCR Error: {str(e)}")
            return {
                "raw_text": f"Error: {str(e)}",
                "patient_info": {},
                "medical_notes": [],
                "all_pii": {},
                "error": str(e)
            }

    def create_redacted_image(self, image: np.ndarray, pii_dict: Dict) -> np.ndarray:
        """Return a copy of ``image`` stamped with a "PII REDACTED" watermark.

        WARNING: this is a placeholder. No PII region is located or masked and
        ``pii_dict`` is not used, so the returned image still shows every name,
        ID and date on the page. It must not be shared as a de-identified
        document until text detection is added to find and cover those regions.
        """
        print("  ‚Üí Creating redacted image...")
        redacted = image.copy()

        # Stamp the watermark in red near the top-left corner.
        font = cv2.FONT_HERSHEY_SIMPLEX
        text = "PII REDACTED"
        cv2.putText(redacted, text, (50, 50), font, 1, (0, 0, 255), 2)

        print("  ! NOTE: watermark only - PII regions are NOT masked in this image")
        print("  ‚úì Redaction complete")
        return redacted

    def process_document(self, image_path: str, output_dir: str = "output") -> ExtractionResult:
        """
        Complete pipeline execution for one image.

        Args:
            image_path: Path of the page image to process.
            output_dir: Directory (created if missing) receiving the JSON,
                report, preprocessed image and watermarked image.

        Returns:
            The structured ``ExtractionResult``. If the OCR step failed,
            ``metadata['ocr_error']`` holds the error message.
        """
        print(f"\n{'='*70}")
        print(f"PROCESSING: {os.path.basename(image_path)}")
        print('='*70)

        os.makedirs(output_dir, exist_ok=True)
        filename = os.path.splitext(os.path.basename(image_path))[0]

        # STEP 1: Preprocessing. No side-car debug file is written next to the
        # input: _save_results already stores the preprocessed image in
        # output_dir, and a side-car would be picked up as a new input by any
        # later "*.jpg" scan of the working directory.
        preprocessed_img, _ = self.preprocessor.preprocess(image_path, save_debug=False)

        # STEP 2: OCR with Groq Vision
        extracted_data = self.extract_structured_data(preprocessed_img)

        # STEP 3: Text Cleaning
        print("  ‚Üí Text Cleaning: Processing...")
        raw_text = str(extracted_data.get('raw_text') or '')
        cleaned_text = self.cleaner.clean_text(raw_text)
        print("  ‚úì Text Cleaning: Complete")

        # STEP 4: Structure data (tolerating off-schema model output)
        patient_info = PatientInfo(
            **self._dataclass_kwargs(PatientInfo, extracted_data.get('patient_info'))
        )
        medical_notes = self._build_notes(extracted_data.get('medical_notes'))
        all_pii = self._normalise_pii(extracted_data.get('all_pii'))

        # STEP 5: Create result
        metadata = {
            'filename': os.path.basename(image_path),
            'processed_at': datetime.now().isoformat(),
            'total_notes': len(medical_notes),
            'total_pii_categories': len(all_pii)
        }
        if extracted_data.get('error'):
            metadata['ocr_error'] = str(extracted_data['error'])

        result = ExtractionResult(
            raw_text=raw_text,
            cleaned_text=cleaned_text,
            patient_info=patient_info,
            medical_notes=medical_notes,
            all_pii=all_pii,
            metadata=metadata
        )

        # STEP 6: Save outputs
        self._save_results(result, filename, output_dir, preprocessed_img)

        # STEP 7: Create redacted image
        redacted_img = self.create_redacted_image(preprocessed_img, all_pii)
        redacted_path = os.path.join(output_dir, f"{filename}_redacted.jpg")
        cv2.imwrite(redacted_path, redacted_img)
        print(f"  ‚úì Saved redacted image: {redacted_path}")

        print(f"\n{'='*70}")
        self._print_summary(result)
        print('='*70)

        return result

    def _save_results(self, result: ExtractionResult, filename: str,
                     output_dir: str, preprocessed_img: np.ndarray):
        """Write the JSON, the text report and the preprocessed image to ``output_dir``.

        All three files are named ``<filename>_<kind>.<ext>``.
        """

        # 1. Complete JSON
        json_path = os.path.join(output_dir, f"{filename}_structured.json")
        with open(json_path, 'w', encoding='utf-8') as f:
            json.dump(result.to_dict(), f, indent=2, ensure_ascii=False)
        print(f"  ‚úì Saved: {json_path}")

        # 2. Human-readable report
        report_path = os.path.join(output_dir, f"{filename}_report.txt")
        with open(report_path, 'w', encoding='utf-8') as f:
            f.write(self._generate_report(result))
        print(f"  ‚úì Saved: {report_path}")

        # 3. Preprocessed image
        preproc_path = os.path.join(output_dir, f"{filename}_preprocessed.jpg")
        cv2.imwrite(preproc_path, preprocessed_img)
        print(f"  ‚úì Saved: {preproc_path}")

    def _generate_report(self, result: ExtractionResult) -> str:
        """Generate the human-readable text report for one result."""
        rule = "-" * 70
        banner = "=" * 70
        lines = [banner, "MEDICAL DOCUMENT EXTRACTION REPORT", banner, ""]

        # Patient Info
        pi = result.patient_info
        lines.extend([
            "PATIENT INFORMATION",
            rule,
            f"Name: {pi.name}",
            f"Age/Sex: {pi.age} / {pi.sex}",
            f"IPD No: {pi.ipd_no}",
            f"UHID No: {pi.uhid_no}",
            f"Bed No: {pi.bed_no}",
            f"Department: {pi.department}",
            f"Admission Date: {pi.admission_date}",
            "",
        ])

        # Medical Notes by Doctor
        lines.append("MEDICAL NOTES (DOCTOR-WISE)")
        lines.append(rule)
        for i, note in enumerate(result.medical_notes, 1):
            lines.append(f"\n[Note {i}] {note.note_type}")
            lines.append(f"Doctor: {note.doctor_name}")
            lines.append(f"Date/Time: {note.date_time}")
            lines.append(f"Content: {note.content}")
            if note.medications:
                lines.append("Medications Prescribed:")
                lines.extend(f"  ‚Ä¢ {med}" for med in note.medications)
            if note.instructions:
                lines.append("Instructions:")
                lines.extend(f"  ‚Ä¢ {inst}" for inst in note.instructions)
            lines.append("")

        # PII Summary
        lines.append("PII DETECTED")
        lines.append(rule)
        for category, items in result.all_pii.items():
            if items:
                # str() guards against non-string entries (e.g. numeric IDs)
                lines.append(f"{category.upper()}: {', '.join(str(item) for item in items)}")

        return "\n".join(lines)

    def _print_summary(self, result: ExtractionResult):
        """Print a short per-document summary to the console."""
        print("\nüìã EXTRACTION SUMMARY")
        print(f"Patient: {result.patient_info.name} ({result.patient_info.age}/{result.patient_info.sex})")
        print(f"Medical Notes: {len(result.medical_notes)}")

        for i, note in enumerate(result.medical_notes, 1):
            print(f"\n  Note {i}: {note.note_type}")
            print(f"  ‚îî‚îÄ Doctor: {note.doctor_name}")
            print(f"  ‚îî‚îÄ Date: {note.date_time}")
            print(f"  ‚îî‚îÄ Medications: {len(note.medications)}")
            if note.medications:
                # Show at most three medications to keep the summary short
                for med in note.medications[:3]:
                    print(f"     ‚Ä¢ {med}")
                if len(note.medications) > 3:
                    print(f"     ... and {len(note.medications)-3} more")

        total_pii = sum(len(items) for items in result.all_pii.values())
        print(f"\n  Total PII Detected: {total_pii} items across {len(result.all_pii)} categories")

# =============================================================================
# MAIN EXECUTION
# =============================================================================

def main(image_paths: Optional[List[str]] = None):
    """Run the complete pipeline over a list of images.

    Args:
        image_paths: Image files to process. Defaults to the three
            ``sampleN.jpg`` placeholder names when omitted.

    The Groq API key is read from the ``GROQ_API_KEY`` environment variable.
    A missing, empty/blank or placeholder value prints setup instructions and
    returns without creating the pipeline; an empty string has to be rejected
    explicitly because ``os.getenv`` only uses its default when the variable
    is not set at all.
    """

    # Configuration
    placeholder = "your-api-key-here"
    api_key = (os.getenv("GROQ_API_KEY") or "").strip()

    if not api_key or api_key == placeholder:
        print("‚ö†Ô∏è  Please set GROQ_API_KEY environment variable")
        print("Get free API key from: https://console.groq.com/")
        return

    # Initialize pipeline
    pipeline = HandwrittenPIIPipeline(api_key=api_key)

    # Process documents
    if image_paths is None:
        image_paths = [
            "sample1.jpg",
            "sample2.jpg",
            "sample3.jpg"
        ]

    results = []
    for img_path in image_paths:
        if os.path.exists(img_path):
            results.append(pipeline.process_document(img_path))
        else:
            print(f"‚ö†Ô∏è  File not found: {img_path}")

    print(f"\n‚úì Pipeline complete! Processed {len(results)} documents")
    print("‚úì Check 'output' folder for results")

# Entry-point guard. The guard is true when this cell runs in the notebook
# (the recorded output below shows main() executing), so main() is called here.
if __name__ == "__main__":
    main()

‚ö†Ô∏è  File not found: sample1.jpg
‚ö†Ô∏è  File not found: sample2.jpg
‚ö†Ô∏è  File not found: sample3.jpg

‚úì Pipeline complete! Processed 0 documents
‚úì Check 'output' folder for results


In [None]:
from glob import glob
import shutil

# Get uploaded images. Files whose stem ends in "_preprocessed" or "_redacted"
# are treated as pipeline artefacts and skipped so they are not fed back in as
# new inputs (an earlier run processed "page_30_preprocessed.jpg" this way).
GENERATED_SUFFIXES = ("_preprocessed", "_redacted")
images = sorted(
    path
    for pattern in ("*.jpg", "*.jpeg", "*.png")
    for path in glob(pattern)
    if not os.path.splitext(path)[0].endswith(GENERATED_SUFFIXES)
)

# Process
pipeline = HandwrittenPIIPipeline(api_key=os.getenv("GROQ_API_KEY"))

results = []
for img in images:
    result = pipeline.process_document(img)
    results.append(result)

# Cell 6: Download results
# The 'output' folder only exists once at least one document has been
# processed, so skip archiving when there was nothing to do.
if results:
    shutil.make_archive('results', 'zip', 'output')
    files.download('results.zip')
else:
    print("No input images found - nothing to download")


PROCESSING: page_30_preprocessed.jpg
  ‚Üí Pre-processing: Starting...
    ‚Ä¢ Correcting tilt...
    ‚Ä¢ Removing noise...
    ‚Ä¢ Enhancing contrast...
    ‚Ä¢ Saved preprocessed: page_30_preprocessed_preprocessed.jpg
  ‚úì Pre-processing: Complete
  ‚Üí OCR: Extracting text with Groq Vision...
  ‚úì OCR: Extraction complete
  ‚Üí Text Cleaning: Processing...
  ‚úì Text Cleaning: Complete
  ‚úì Saved: output/page_30_preprocessed_structured.json
  ‚úì Saved: output/page_30_preprocessed_report.txt
  ‚úì Saved: output/page_30_preprocessed_preprocessed.jpg
  ‚Üí Creating redacted image...
  ‚úì Redaction complete
  ‚úì Saved redacted image: output/page_30_preprocessed_redacted.jpg


üìã EXTRACTION SUMMARY
Patient: Santosh Pradhan (36/M)
Medical Notes: 2

  Note 1: Progress Note
  ‚îî‚îÄ Doctor: Dr. Pruthia
  ‚îî‚îÄ Date: 11/9/24 3:30 PM
  ‚îî‚îÄ Medications: 4
     ‚Ä¢ Inj THIAMINE (200) 100 ml NS TID
     ‚Ä¢ Inj LOPEZ (1amp) slow IV TDS as - q sos
     ‚Ä¢ Inj PAN (40) 1 vial IV BD
 

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>