In [None]:
!pip install PyPDF2 pdfminer.six pytesseract opencv-python-headless numpy pandas pdf2image
!apt-get install -y poppler-utils tesseract-ocr

# Invoice data extraction from PDFs

In [None]:
import io
import PyPDF2
import pytesseract
from pdf2image import convert_from_path

def extract_text_from_pdf(file_path):
    """Return the text of every page of a text-based (non-scanned) PDF.

    The pages are read with PyPDF2 in document order and their text is
    concatenated without any separator. The combined text is printed to
    stdout before it is returned.
    """
    with open(file_path, 'rb') as pdf_file:
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        # Pages must be read while the file handle is still open.
        page_texts = [pdf_page.extract_text() for pdf_page in pdf_reader.pages]
    combined = ''.join(page_texts)
    print(combined)
    return combined
def extract_text_from_scanned_pdf(file_path):
    """Return the OCR text of a scanned PDF.

    Each page is rendered to an image with pdf2image and passed to Tesseract
    (via pytesseract). The per-page results are concatenated in page order,
    printed to stdout, and returned.
    """
    page_images = convert_from_path(file_path)
    ocr_text = ''.join(
        pytesseract.image_to_string(page_image) for page_image in page_images
    )
    print(ocr_text)
    return ocr_text

In [None]:
# extracts invoice-related data from the extracted text using regex patterns
import re

def extract_invoice_data(text, debug=False):
    """Extract GST invoice fields from raw invoice text with regular expressions.

    For every field the first occurrence of its label in ``text`` is used.

    Args:
        text: Text extracted from an invoice PDF. ``None`` is treated as ''.
        debug: When true, print every ``key: value`` pair before returning.

    Returns:
        A dict whose keys are, in order: taxable_value, sgst_amount,
        cgst_amount, igst_amount, sgst_rate, cgst_rate, igst_rate, tax_amount,
        final_amount, invoice_number, invoice_date, place_of_supply,
        place_of_origin, gstin_supplier, gstin_recipient.

        * The ``*_amount``, ``taxable_value`` and ``final_amount`` fields are
          floats (``0.0`` when not found); all other fields are strings
          (``'0'`` when not found).
        * ``tax_amount`` is always SGST + CGST + IGST.
        * ``taxable_value`` is ``final_amount - tax_amount`` when a grand total
          was found; otherwise the printed "Sub Total" is kept and
          ``final_amount`` is derived as ``taxable_value + tax_amount``.
        * State fields are returned as ``"<code>-<name>"``.
    """
    text = text or ''

    money = r'\s*₹\s*([\d,.]+)'
    rate = r'@(\d+\.?\d*)%'
    # PDF text often glues the next label straight onto a value
    # ("29-KarnatakaBill To:"), so a state name is read as capitalised words
    # separated by single spaces, stopping before a word that starts a known
    # label. Names that are not capitalised that way fall back to one \w+ word.
    next_label = (r'(?!Bill\b|Ship\b|Invoice\b|Date\b|Contact\b|GSTIN\b|'
                  r'Phone\b|Email\b|Transporta)')
    # Spaces around the hyphen are allowed ("33 - Tamil Nadu").
    state = (r'\s*(\d+)\s*-\s*([A-Z][a-z]+(?: ' + next_label
             + r'[A-Z][a-z]+)*|\w+)')

    patterns = {
        'taxable_value': r'Sub\s*Total' + money,
        'sgst_amount': r'SGST@\d+\.?\d*%' + money,
        'cgst_amount': r'CGST@\d+\.?\d*%' + money,
        'igst_amount': r'IGST@\d+\.?\d*%' + money,
        'sgst_rate': r'SGST' + rate,
        'cgst_rate': r'CGST' + rate,
        'igst_rate': r'IGST' + rate,
        # Derived below; listed here only to keep the key/column order.
        'tax_amount': None,
        # The lookbehinds stop "Total" from matching inside "Sub Total", which
        # would otherwise report the subtotal as the invoice total.
        'final_amount': r'(?<!Sub\s)(?<!Sub)Total' + money,
        'invoice_number': r'Invoice No\.:\s*(\d+)',
        'invoice_date': r'Date:\s*([\d/]+)',
        'place_of_supply': r'Place of Supply:' + state,
        'place_of_origin': r'State:' + state,
        'gstin_supplier': r'GSTIN:\s*([A-Z0-9]{15})',
        'gstin_recipient': r'GSTIN Number:\s*([A-Z0-9]{15})',
    }

    data = {}
    matched = set()
    for key, pattern in patterns.items():
        match = re.search(pattern, text) if pattern else None
        if match:
            matched.add(key)
            # State patterns capture (code, name); joining normalises them to
            # "<code>-<name>". Single-group patterns are unaffected.
            data[key] = '-'.join(match.groups()).replace(',', '')
        else:
            data[key] = '0'

    # Convert the monetary fields to float.
    for field in ('taxable_value', 'sgst_amount', 'cgst_amount',
                  'igst_amount', 'final_amount'):
        try:
            data[field] = float(data[field])
        except ValueError:
            # The amount pattern can capture junk such as a lone "." or ",".
            matched.discard(field)
            data[field] = 0.0

    # Rounded to 2 decimals (paise) to hide binary floating-point noise.
    tax_total = round(
        data['sgst_amount'] + data['cgst_amount'] + data['igst_amount'], 2)
    data['tax_amount'] = tax_total

    if 'final_amount' in matched:
        data['taxable_value'] = round(data['final_amount'] - tax_total, 2)
    elif 'taxable_value' in matched:
        # No grand total on the invoice text: rebuild it from the subtotal.
        data['final_amount'] = round(data['taxable_value'] + tax_total, 2)

    if debug:
        for key, value in data.items():
            print(f"{key}: {value}")

    return data



In [None]:
# Run the text-based extractor on the sample invoice, parse the fields out of
# the extracted text, and show the resulting dict.
pdf_path = '/content/03Jul_Invoice_615.pdf'
text = extract_text_from_pdf(pdf_path)
data = extract_invoice_data(text)
print(data)

TAX INVOICE
KPP Hometown OrganicsAddress:6/549 Puthur Maariyamman Kovil st,Bye Pass 4 Cross Road Palacode,Dharmapuri, Tamilnadu - 636808Phone:9994218657Email:organicsfromhometown@gmail.comGSTIN:33BDNPN7360K1Z3State:33 - Tamil NaduInvoice No.: 615Date:03/07/2024Place of Supply:29-KarnatakaBill To:Sri Mahalakshmi EnterprisesNO.10/41 APMC YARD 6TH MAIN ROAD YESHWANTHPURContact No.:08026705904, 08048903677GSTIN Number:29AGWPA9333Q1ZEState:29-KarnatakaTransporta on Details:Delivery Date:03/07/24Delivery Loca on:YeshwanthpurShip To:NO.10/41 APMC YARD 6TH MAIN ROAD YESHWANTHPUR#Item nameHSN/ SACQuan tyUnitPrice/ UnitGSTAmount1HTO Jaggery Powder170130Box₹ 590.00₹ 885.00 (5%)₹ 18,585.00Total30₹ 885.00₹ 18,585.00
Pay To:Bank Name : AXIS BANK, DHARMAPURIBank Account No. : 921020007484915Bank IFSC code : UTIB0000889Account holder's name : KPP Hometown Organics Invoice Amount In WordsEighteen Thousand Five Hundred Eighty Five Rupees onlyTerms And Condi ons-Price diﬀerences between invoice issued fo

In [None]:
# Save the extracted invoice data to a CSV file and download it.
import pandas as pd
from google.colab import files

# One-row table built from the extracted fields: show it, write it to CSV in
# the working directory, then trigger a browser download through Colab.
csv_name = 'extracted_invoices.csv'
df = pd.DataFrame([data])
print(df)
df.to_csv(csv_name, index=False)
files.download(csv_name)

   taxable_value  sgst_amount  cgst_amount  igst_amount sgst_rate cgst_rate  \
0        16815.0          0.0          0.0        885.0         0         0   

  igst_rate  tax_amount  final_amount invoice_number invoice_date  \
0         5       885.0       17700.0            615   03/07/2024   

    place_of_supply         place_of_origin   gstin_supplier  gstin_recipient  
0  29-KarnatakaBill  29-KarnatakaTransporta  33BDNPN7360K1Z3  29AGWPA9333Q1ZE  


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>