In [1]:
# Install necessary libraries
!pip install pytesseract
!apt-get install -y libtesseract-dev
!pip install Pillow
!apt-get update
!apt-get install -y tesseract-ocr
!pip install pandas
!pip install sweetviz

# Import libraries
import pytesseract
from PIL import Image
import os
import csv
import sweetviz as sv
import pandas as pd
from google.colab import drive
import pandas as pd
import re

# Mount Google Drive
drive.mount('/content/drive')


Collecting pytesseract
  Downloading pytesseract-0.3.13-py3-none-any.whl.metadata (11 kB)
Downloading pytesseract-0.3.13-py3-none-any.whl (14 kB)
Installing collected packages: pytesseract
Successfully installed pytesseract-0.3.13
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following additional packages will be installed:
  libarchive-dev libleptonica-dev
The following NEW packages will be installed:
  libarchive-dev libleptonica-dev libtesseract-dev
0 upgraded, 3 newly installed, 0 to remove and 49 not upgraded.
Need to get 3,743 kB of archives.
After this operation, 16.0 MB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy-updates/main amd64 libarchive-dev amd64 3.6.0-1ubuntu1.3 [581 kB]
Get:2 http://archive.ubuntu.com/ubuntu jammy/universe amd64 libleptonica-dev amd64 1.82.0-3build1 [1,562 kB]
Get:3 http://archive.ubuntu.com/ubuntu jammy/universe amd64 libtesseract-dev amd64 4.1.1-2.1build1 [1

In [3]:
import os
def extract_text_from_image(image_path):

    image = Image.open(image_path)
    return pytesseract.image_to_string(image)

def process_images_in_folder(folder_path):

    extracted_data = []

    for file_name in os.listdir(folder_path):
        file_path = os.path.join(folder_path, file_name)

        if file_name.lower().endswith(('.png', '.jpg', '.jpeg', '.tiff', '.bmp', '.gif')):
            print(f"Processing {file_name}...")
            text = extract_text_from_image(file_path)
            extracted_data.append({"file_name": file_name, "extracted_text": text})

    return extracted_data

def save_extracted_data_to_csv(extracted_data, output_csv_file):

    with open(output_csv_file, 'w', newline='', encoding='utf-8') as csv_file:
        writer = csv.DictWriter(csv_file, fieldnames=["file_name", "extracted_text"])
        writer.writeheader()
        writer.writerows(extracted_data)

folder_path = '/content/drive/MyDrive/BillsDataset'
output_csv_file = '/content/drive/MyDrive/extracted_text.csv'

extracted_data = process_images_in_folder(folder_path)
save_extracted_data_to_csv(extracted_data, output_csv_file)

print(f"Text extracted and saved to {output_csv_file}")


Processing IMG_20240814_132450.jpg...
Processing IMG_20240814_132242.jpg...
Processing IMG_20240814_132236.jpg...
Processing IMG_20240814_132231.jpg...
Processing IMG_20240814_132223.jpg...
Processing IMG_20240814_132217.jpg...
Processing IMG_20240814_132211.jpg...
Processing IMG_20240814_132205.jpg...
Processing IMG_20240814_132159.jpg...
Processing IMG_20240814_132151.jpg...
Processing IMG_20240814_132145.jpg...
Processing IMG_20240814_132139.jpg...
Processing IMG_20240814_132132.jpg...
Processing IMG_20240814_132126.jpg...
Processing IMG_20240814_132121.jpg...
Processing IMG_20240814_132115.jpg...
Processing IMG-20240814-WA0010.jpg...
Processing IMG-20240814-WA0011.jpg...
Processing IMG-20240814-WA0012.jpg...
Processing IMG-20240814-WA0013.jpg...
Processing IMG-20240814-WA0026.jpg...
Processing IMG-20240814-WA0025.jpg...
Processing IMG-20240814-WA0024.jpg...
Processing IMG-20240814-WA0023.jpg...
Processing IMG-20240814-WA0018(2).jpg...
Processing IMG-20240814-WA0022.jpg...
Processin

In [4]:
def correct_and_extract(text):

    # Defining corrections
    corrections = {
        'ie rrent': 'Torrent Power',
        'GIN:': 'GIN:',
        'Meter No.': 'Meter Number:',
        'T No.': 'T No.:',
        'Bill Date': 'Bill Date:',
        'Meter Serial No.': 'Meter Serial Number:',
        'Past Reading': 'Past Reading:',
        'Present Reading': 'Present Reading:',
        'Fixed Charges': 'Fixed Charges:',
        'Excess Demand Charges': 'Excess Demand Charges:',
        'Energy Charges': 'Energy Charges:',
        'Electricity Duty': 'Electricity Duty:',
        'Regulatory Surcharge 1': 'Regulatory Surcharge 1:',
        'Regulatory Surcharge 2': 'Regulatory Surcharge 2:',
        'Total Current Dues': 'Total Current Dues:',
        'Arrears': 'Arrears:',
        'Total Dues': 'Total Dues:',
        'Last Payment Date': 'Last Payment Date:',
        'Last Payment Amount': 'Last Payment Amount:',
        'Last Payment Mode': 'Last Payment Mode:',
        'Security Deposit': 'Security Deposit:',
        'Purpose': 'Purpose:'
    }

    for wrong, correct in corrections.items():
        text = text.replace(wrong, correct)

    # Extracting relevant data
    extracted_data = {}

    def extract_field(label):
        pattern = rf'{label}\s*(\d+[\d,.]*)'
        match = re.search(pattern, text)
        return match.group(1) if match else None

    extracted_data['meter_no'] = extract_field('Meter Number:')
    extracted_data['bill_no'] = extract_field('T No.:')
    extracted_data['bill_date'] = extract_field('Bill Date:')
    extracted_data['past_reading'] = extract_field('Past Reading:')
    extracted_data['present_reading'] = extract_field('Present Reading:')
    extracted_data['fixed_charges'] = extract_field('Fixed Charges:')
    extracted_data['excess_demand_charges'] = extract_field('Excess Demand Charges:')
    extracted_data['energy_charges'] = extract_field('Energy Charges:')
    extracted_data['electricity_duty'] = extract_field('Electricity Duty:')
    extracted_data['regulatory_surcharge_1'] = extract_field('Regulatory Surcharge 1:')
    extracted_data['regulatory_surcharge_2'] = extract_field('Regulatory Surcharge 2:')
    extracted_data['total_current_dues'] = extract_field('Total Current Dues:')
    extracted_data['arrears'] = extract_field('Arrears:')
    extracted_data['total_dues'] = extract_field('Total Dues:')
    extracted_data['last_payment_date'] = extract_field('Last Payment Date:')
    extracted_data['last_payment_amount'] = extract_field('Last Payment Amount:')
    extracted_data['last_payment_mode'] = extract_field('Last Payment Mode:')
    extracted_data['security_deposit'] = extract_field('Security Deposit:')
    extracted_data['purpose'] = extract_field('Purpose:')

    return extracted_data


csv_file_path = '/content/drive/MyDrive/extracted_text.csv'
output_folder = '/content/drive/MyDrive/'
os.makedirs(output_folder, exist_ok=True)


df = pd.read_csv(csv_file_path)

processed_data = []

for _, row in df.iterrows():
    text = row['extracted_text']
    corrected_data = correct_and_extract(text)
    processed_data.append(corrected_data)

processed_df = pd.DataFrame(processed_data)

processed_csv_file = os.path.join(output_folder, 'processed_text_data.csv')
processed_df.to_csv(processed_csv_file, index=False, encoding='utf-8')
print(f"Processed data saved to {processed_csv_file}")


Processed data saved to /content/drive/MyDrive/processed_text_data.csv


In [5]:
my_report = sv.analyze(processed_df)

my_report.show_html()

Collecting sweetviz
  Downloading sweetviz-2.3.1-py3-none-any.whl.metadata (24 kB)
Downloading sweetviz-2.3.1-py3-none-any.whl (15.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m15.1/15.1 MB[0m [31m80.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sweetviz
Successfully installed sweetviz-2.3.1


                                             |          | [  0%]   00:00 -> (? left)

Report SWEETVIZ_REPORT.html was generated! NOTEBOOK/COLAB USERS: the web browser MAY not pop up, regardless, the report IS saved in your notebook/colab files.
