In [9]:
from PIL import Image
import pytesseract
import re
import pandas as pd

In [5]:
# Load the new image
image_path = "image.png"
image = Image.open(image_path)

# Perform OCR to extract text from the image
extracted_text_new = pytesseract.image_to_string(image)

# Display a portion of the extracted text for verification
extracted_text_new


'Condensed Consolidated Statement of Financial Position\nAs on Quarter ended 29 Poush, 2081\n\nAssets\n\nGroup\n\nThis Quarter\nEnding\n\nImmediate Previous\nYear Ending\n\nBank\n\nThis Quarter Ending\n\nImmediate Previous\nYear Ending\n\nCash and cash equivalent\n\nDue from Nepal Rastra Bank\n\n5,822,973,123\n10,953,644,717\n\n5,074,320.773\n15,212,779,452\n\n5,822.700,300\n10,953,644,717\n\n5,074,083.242\n15,212,779,452\n\nPlacement with Bank and Financial\nInstitutions\n\n9,752,625,000\n\n5,214,300,000\n\n9,752,625,000\n\n5$,214,300,000\n\nDerivative financial instruments\n\n9,972,912\n\n9,972,912\n\nOther trading assets\n\n1,499,319.294\n\n1,274,757.944\n\n1,439,127,517\n\n1,216,149,394\n\nLoan and advances to B/FIs\nLoans and advances to customers\n\n204, 111,191,593\n\n203,612,949,861\n\n3.056,962,705\n204,111,191,593\n\n3,2 03.613\n203,617,049,861\n\nInvestment securities\n\n74,205,700,804\n\n68,889,854,65 1\n\n73,915,597, 188\n\n68,611:262.068\n\nCurrent tax assets\n\n99,883,51

In [None]:
extracted_text_new.split("\n")

In [15]:
# Cleaning the extracted text before extracting tabular data

# Replace common OCR misreads
cleaned_text = extracted_text_new.replace("|", "").replace("\n\n", "\n")

# Normalize spaces and punctuation inconsistencies
cleaned_text = re.sub(r'\s+', ' ', cleaned_text)  # Replace multiple spaces with a single space
cleaned_text = re.sub(r'(\d),(\d)', r'\1\2', cleaned_text)  # Remove misplaced commas in numbers

# Split into lines again for structured processing
cleaned_lines = cleaned_text.split("\n")

# Display a sample of cleaned lines for analysis
cleaned_lines[:50]


['Condensed Consolidated Statement of Financial Position As on Quarter ended 29 Poush, 2081 Assets Group This Quarter Ending Immediate Previous Year Ending Bank This Quarter Ending Immediate Previous Year Ending Cash and cash equivalent Due from Nepal Rastra Bank 5822973123 10953644717 5074320.773 15212779452 5822.700300 10953644717 5074083.242 15212779452 Placement with Bank and Financial Institutions 9752625000 5214300000 9752625000 5$,214300000 Derivative financial instruments 9972912 9972912 Other trading assets 1499319.294 1274757.944 1439127517 1216149394 Loan and advances to B/FIs Loans and advances to customers 204, 111191593 203612949861 3.056962705 204111191593 32 03.613 203617049861 Investment securities 74205700804 6888985465 1 73915597, 188 68611:262.068 Current tax assets 99883514 138800570 99414651 139134.770 Investment in subsidiaries 29.013000 29013000 Investment in associates Investment properties 1015708938 943089080 1015708938 Property and equipment Goodwill and Int

In [16]:
# Re-extracting structured rows manually

structured_data = []

# Identifying numeric values and mapping them to headers
for line in cleaned_lines:
    parts = line.split()  # Split by spaces
    if len(parts) >= 5:  # Ensure valid row with enough columns
        row = [parts[0]] + parts[-4:]  # First column as key, last 4 as values
        structured_data.append(row)

# Convert to DataFrame
df_financials_fixed = pd.DataFrame(structured_data, columns=["Assets", "Group (This Quarter)", "Group (Previous Year)", "Bank (This Quarter)", "Bank (Previous Year)"])

# Display the structured table
df_financials_fixed.head(20)


Unnamed: 0,Assets,Group (This Quarter),Group (Previous Year),Bank (This Quarter),Bank (Previous Year)
0,Condensed,36139947720,311684741778,320150123086,311375433657


In [12]:
structured_data

[['Total liabilities',
  '284,468,899,398',
  '275,299,839,617',
  '284,387,881,807',
  '275,235,485,937']]

In [10]:
# Convert extracted structured data into a DataFrame
df_extracted = pd.DataFrame(structured_data, columns=["Assets", "Group (This Quarter)", "Group (Previous Year)", "Bank (This Quarter)", "Bank (Previous Year)"])


In [11]:
df_extracted

Unnamed: 0,Assets,Group (This Quarter),Group (Previous Year),Bank (This Quarter),Bank (Previous Year)
0,Total liabilities,284468899398,275299839617,284387881807,275235485937


In [None]:
import cv2
import pytesseract
import numpy as np
import pandas as pd

# Load the image
image_path = "image.png"
image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)

# Apply adaptive thresholding to improve table visibility
thresh = cv2.adaptiveThreshold(image, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 11, 2)

# Detect horizontal and vertical lines to find table structure
horizontal_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (40, 1))
vertical_kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (1, 40))

# Detect lines
horizontal_lines = cv2.morphologyEx(thresh, cv2.MORPH_OPEN, horizontal_kernel)
vertical_lines = cv2.morphologyEx(thresh, cv2.MORPH_OPEN, vertical_kernel)

# Combine both line masks
table_structure = cv2.addWeighted(horizontal_lines, 0.5, vertical_lines, 0.5, 0)

# Find contours of table cells
contours, _ = cv2.findContours(table_structure, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)

# Sort contours from top to bottom and left to right
contours = sorted(contours, key=lambda x: (cv2.boundingRect(x)[1], cv2.boundingRect(x)[0]))

# Extract text from each detected cell
extracted_data = []
for contour in contours:
    x, y, w, h = cv2.boundingRect(contour)
    cell = image[y:y+h, x:x+w]  # Extract cell region
    text = pytesseract.image_to_string(cell, config='--psm 6').strip()
    extracted_data.append(text)

# Organize extracted text into a structured table
num_columns = 5  # Expected number of columns
structured_data = [extracted_data[i:i + num_columns] for i in range(0, len(extracted_data), num_columns)]

# Convert to DataFrame
df_financials = pd.DataFrame(structured_data, columns=["Assets", "Group (This Quarter)", "Group (Previous Year)", "Bank (This Quarter)", "Bank (Previous Year)"])




In [21]:
# Display the structured DataFrame
df_financials.head(20)

Unnamed: 0,Assets,Group (This Quarter),Group (Previous Year),Bank (This Quarter),Bank (Previous Year)
0,Condensed Consolidated Statement of Financial ...,,Condensed Consolidated Statement of Financial ...,(,
1,,,,,
2,on,"As on Quarter ended 29 Poush, 2081","As on Quarter ended 29 Poush, 2081",,
3,,,,,pe aes Bh es 5)
4,,,,,
5,,,,,
6,,,,,
7,,,,,
8,,,,,
9,,,,,
