<a href="https://colab.research.google.com/github/thakursaurabh/ML-project/blob/main/Saurabh.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pytesseract
import cv2
import re
import spacy
import json

# Load spaCy's small English pipeline ("en_core_web_sm") into `nlp`.
# NOTE(review): `nlp` is not referenced by any code visible in this cell —
# confirm whether later cells use it; if not, this load only adds start-up cost.
nlp = spacy.load("en_core_web_sm")

# Function to extract text from an image using Tesseract OCR
def extract_text_from_image(image_path):
    """Run Tesseract OCR on an image file and return the recognised text.

    Args:
        image_path: Path of the image file to load with OpenCV.

    Returns:
        The string produced by ``pytesseract.image_to_string`` for the image.

    Raises:
        FileNotFoundError: If OpenCV could not load an image from
            ``image_path`` (missing file, unreadable or unsupported format).
    """
    img = cv2.imread(image_path)
    # cv2.imread reports failure by returning None rather than raising, so
    # fail here with a message naming the path instead of letting the OCR
    # call choke on a None image.
    if img is None:
        raise FileNotFoundError(f"Could not read image: {image_path!r}")
    return pytesseract.image_to_string(img)

# Function to extract relevant information from extracted text

# Labelled single-value fields as (output key, compiled pattern), listed in
# the order in which the keys are added to the result dictionary.
_INVOICE_FIELD_PATTERNS = tuple(
    (key, re.compile(pattern, re.IGNORECASE))
    for key, pattern in (
        # Allow '-' and '/' between alphanumeric runs so numbers such as
        # "INV-2023/001" are captured whole instead of being cut at "INV".
        ('invoice_number', r'Invoice\s*Number\s*:\s*(\w+(?:[-/]\w+)*)'),
        ('vendor_name', r'Vendor\s*Name\s*:\s*(.+)'),
        ('gst_number', r'GST\s*Number\s*:\s*(\w+)'),
        ('vendor_address', r'Vendor\s*Address\s*:\s*(.+)'),
        ('delivery_address', r'Delivery\s*Address\s*:\s*(.+)'),
        ('buyer_name', r'Buyer\s*Name\s*:\s*(.+)'),
        ('buyer_address', r'Buyer\s*Address\s*:\s*(.+)'),
    )
)

# One line item: name, price, quantity and tax amount, in that order.
# The name is matched lazily so that it stops at the first "Price:" label
# (several items on one OCR line stay separate), and amounts may be written
# with or without a decimal part.
_INVOICE_ITEM_PATTERN = re.compile(
    r'Item\s*Name\s*:\s*(.+?)\s*'
    r'Price\s*:\s*(\d+(?:\.\d+)?)\s*'
    r'Quantity\s*:\s*(\d+)\s*'
    r'Total\s*Tax\s*Amount\s*:\s*(\d+(?:\.\d+)?)',
    re.IGNORECASE,
)


def extract_invoice_info(text):
    """Parse labelled invoice fields out of OCR text.

    Each field is looked up by its "Label: value" form, case-insensitively.
    Only fields that are actually found appear in the result.

    Args:
        text: The raw text to search, e.g. the output of an OCR pass.

    Returns:
        A dict that may contain the string-valued keys ``invoice_number``,
        ``vendor_name``, ``gst_number``, ``vendor_address``,
        ``delivery_address``, ``buyer_name`` and ``buyer_address``, plus
        ``item_details``: a list of dicts with ``item_name`` (str),
        ``price`` (float), ``quantity`` (int) and ``total_tax_amount``
        (float). Returns an empty dict when nothing matches.
    """
    invoice_info = {}

    for key, pattern in _INVOICE_FIELD_PATTERNS:
        match = pattern.search(text)
        if not match:
            continue
        # "." also matches "\r" and trailing blanks, which OCR output often
        # leaves at line ends; strip them and ignore whitespace-only values.
        value = match.group(1).strip()
        if value:
            invoice_info[key] = value

    items = [
        {
            'item_name': name.strip(),
            'price': float(price),
            'quantity': int(quantity),
            'total_tax_amount': float(tax),
        }
        for name, price, quantity, tax in _INVOICE_ITEM_PATTERN.findall(text)
    ]
    if items:
        invoice_info['item_details'] = items

    return invoice_info

# Path of the scanned invoice to process; point this at your own image file.
invoice_image_path = 'invoice.jpg'

# OCR the image, then parse the recognised text into structured fields.
extracted_text = extract_text_from_image(invoice_image_path)
invoice_info = extract_invoice_info(extracted_text)

# Serialise the parsed fields as pretty-printed JSON and persist them to disk.
output_json = json.dumps(invoice_info, indent=4)
with open('invoice_info.json', mode='w') as json_file:
    print(output_json, end='', file=json_file)
