In [2]:
import pytesseract
from PIL import Image
from pdf2image import convert_from_path
import pandas as pd
import glob

In [3]:
#  Manually set the full path to tesseract.exe
pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"

In [4]:
path = "./Input/set-1/Train/SP_MIS02824100914340 2.pdf"

# Convert PDF to Image
pdf_images = convert_from_path(path)
img = pdf_images[0].convert("RGB")


# Perform OCR
ocr_data = pytesseract.image_to_data(img, output_type=pytesseract.Output.DICT)
df = pd.DataFrame(ocr_data)


df.loc[df['text'] == "42480"]

Unnamed: 0,level,page_num,block_num,par_num,line_num,word_num,left,top,width,height,conf,text


In [None]:
df[100:200]

Unnamed: 0,level,page_num,block_num,par_num,line_num,word_num,left,top,width,height,conf,text
100,5,1,7,1,7,5,675,299,164,22,83,0009406423
101,5,1,7,1,7,6,860,303,78,18,90,11675
102,5,1,7,1,7,7,960,308,47,13,96,rue
103,5,1,7,1,7,8,1027,303,164,19,96,Sherbrooke
104,5,1,7,1,7,9,1211,306,45,16,96,est
...,...,...,...,...,...,...,...,...,...,...,...,...
195,3,1,22,1,0,0,62,742,333,5,-1,
196,4,1,22,1,1,0,62,742,333,5,-1,
197,5,1,22,1,1,1,62,742,333,5,95,
198,2,1,23,0,0,0,60,730,6,364,-1,


In [5]:
# Create the training set 
train_folder_path = './Input/set-1/Train'
train_files = glob.glob(f"{train_folder_path}/*.pdf")
train_ocr_collection = []
doc_numbers = ['0362418', '0362431', '0362440', '0362453']
dates = ['10/07/24']
total_litres = ['42423', '42480', '42397', '42402']

for file in train_files:
    pdf_images = convert_from_path(file)
    image = pdf_images[0].convert("RGB")

    ocr_data = pytesseract.image_to_data(image, output_type=pytesseract.Output.DICT)

    ocr_df = pd.DataFrame(ocr_data)

    # Add width and height
    ocr_df['bottom'] = ocr_df['top'] + ocr_df['height']
    ocr_df['right'] = ocr_df['left'] + ocr_df['width']

    # Create a target column
    ocr_df.loc[ocr_df['text'].isin(doc_numbers), 'label'] = 1
    ocr_df.loc[ocr_df['text'].isin(dates), 'label'] = 2
    ocr_df.loc[(ocr_df['text'].isin(total_litres)) & (ocr_df['top'].astype('int') > 1400), 'label'] = 3

    # ocr_df["label"].fillna(0, inplace=True)
    ocr_df["label"] = ocr_df["label"].fillna(0)


    # Add the dataframe to the collection
    train_ocr_collection.append(ocr_df)

# Combine all OCR DataFrames into a single DataFrame
train_df = pd.concat(train_ocr_collection, ignore_index=True)

# Save as CSV (optional)
train_df.to_csv("training_data.csv", index=False)

In [6]:
train_df.loc[train_df['label'].isin([3])]

Unnamed: 0,level,page_num,block_num,par_num,line_num,word_num,left,top,width,height,conf,text,bottom,right,label
329,5,1,20,1,2,7,1308,1523,77,19,96,42423,1542,1385,3.0
1632,5,1,23,1,2,6,1319,1521,78,19,96,42397,1540,1397,3.0
2316,5,1,26,1,32,7,1318,1522,77,18,87,42402,1540,1395,3.0


In [10]:
# Process the Test folder

test_folder_path = './Input/set-1/Test'
test_files = glob.glob(f"{test_folder_path}/*.pdf")

test_ocr_collection = []  # Store test DataFrames

for file in test_files:  # List of PDFs in the test folder
    pdf_images = convert_from_path(file)
    image = pdf_images[0].convert("RGB")

    ocr_data = pytesseract.image_to_data(image, output_type=pytesseract.Output.DICT)
    ocr_df = pd.DataFrame(ocr_data)

    # Add width and height
    ocr_df['bottom'] = ocr_df['top'] + ocr_df['height']
    ocr_df['right'] = ocr_df['left'] + ocr_df['width']

    # NO LABELS for test data
    test_ocr_collection.append(ocr_df)

# Merge into a single test DataFrame
test_df = pd.concat(test_ocr_collection, ignore_index=True)
test_df.to_csv("test_data.csv", index=False)

In [11]:
# Select only numerical features for training
feature_columns = ["left", "top", "width", "height", "bottom", "right"]

X_train = train_df[feature_columns]
X_test = test_df[feature_columns]
y_train = train_df["label"]  # Target variable (1 for document number, 0 for others)

## Predict on the Test Set

In [12]:
from sklearn.neighbors import KNeighborsClassifier

clf = KNeighborsClassifier(n_neighbors=2)
clf.fit(X_train, y_train)

test_df["predicted_label"]  = clf.predict(X_test)

test_df.loc[test_df['predicted_label'].isin([1, 2, 3])]

Unnamed: 0,level,page_num,block_num,par_num,line_num,word_num,left,top,width,height,conf,text,bottom,right,predicted_label
159,5,1,30,1,5,1,146,196,110,19,95,0114749,215,256,1.0
161,5,1,30,1,5,3,526,198,129,20,96,10/07/24,218,655,2.0
361,5,1,30,1,32,4,1308,1452,78,19,95,46065,1471,1386,3.0
719,5,1,31,1,5,1,149,192,110,19,95,0114750,211,259,1.0
721,5,1,31,1,5,3,529,194,129,21,96,10/07/24,215,658,2.0
924,5,1,31,1,32,4,1310,1450,78,20,95,46136,1470,1388,3.0
1308,5,1,29,1,5,1,148,193,111,19,95,0114751,212,259,1.0
1310,5,1,29,1,5,3,528,195,129,20,96,10/08/24,215,657,2.0
1511,5,1,29,1,31,4,1309,1452,79,18,96,46130,1470,1388,3.0
1860,5,1,30,1,5,1,150,194,109,20,96,0114752,214,259,1.0


## Single file lookup

In [None]:
path = "./Input/set-1/Test/SP_MIS02824100914340 5.pdf"

# Convert PDF to Image
pdf_images = convert_from_path(path)
img = pdf_images[0].convert("RGB")


# Perform OCR
ocr_data = pytesseract.image_to_data(img, output_type=pytesseract.Output.DICT)
final_df = pd.DataFrame(ocr_data)

# Compute Bounding Box Details
final_df['bottom'] = final_df['top'] + final_df['height']
final_df['right'] = final_df['left'] + final_df['width']

# Select relevant features for model prediction
X_test = final_df[feature_columns]

new_clf = KNeighborsClassifier(n_neighbors=2)
new_clf.fit(X_train, y_train)

# Predict Document Numbers
final_df["predicted_label"] = new_clf.predict(X_test)

# Return only text identified as a document number
doc_num = final_df.loc[final_df["predicted_label"] == 1, "text"].tolist()[0]
v_date = final_df.loc[final_df["predicted_label"] == 2, "text"].tolist()[0]



In [None]:
if final_df["predicted_label"].eq(3).any():
    total_litres = final_df.loc[final_df["predicted_label"] == 3, "text"].tolist()[0]
else:
    total_litres = None


In [None]:
print(f"Document Number: {doc_num}")
print(f"Date: {v_date}")
print(f"Total Litres: {total_litres}")

Document Number: 0362663
Date: 10/08/24
Total Litres: 42465


## Process Multiple test files

In [13]:
folder_path = './Input/set-1/Test'
files = glob.glob(f"{folder_path}/*.pdf")
output_collection = []
final_df = []

new_clf = KNeighborsClassifier(n_neighbors=2)
new_clf.fit(X_train, y_train)

for file in files:
    # Convert PDF to Image
    pdf_images = convert_from_path(file)
    img = pdf_images[0].convert("RGB")


    # Perform OCR
    ocr_data = pytesseract.image_to_data(img, output_type=pytesseract.Output.DICT)
    final_df = pd.DataFrame(ocr_data)

    # Compute Bounding Box Details
    final_df['bottom'] = final_df['top'] + final_df['height']
    final_df['right'] = final_df['left'] + final_df['width']

    # Select relevant features for model prediction
    X_test = final_df[feature_columns]

    # Predict labels
    final_df["predicted_label"] = new_clf.predict(X_test)

    # Extract information

    if final_df["predicted_label"].eq(1).any():
        doc_num = final_df.loc[final_df["predicted_label"] == 1, "text"].tolist()[0]
    else:
        doc_num = None

    if final_df["predicted_label"].eq(2).any():
        v_date = final_df.loc[final_df["predicted_label"] == 2, "text"].tolist()[0]
    else:
        v_date = None

    if final_df["predicted_label"].eq(3).any():
        total_litres = final_df.loc[final_df["predicted_label"] == 3, "text"].tolist()[0]
    else:
        total_litres = None

    # Store results
    data = {
        'File': file,
        'Document Number': doc_num,
        'Date': v_date,
        'Total Litres': total_litres
    }

    output_collection.append(data)

        

In [14]:
output_df = pd.DataFrame(output_collection)
output_df

Unnamed: 0,File,Document Number,Date,Total Litres
0,./Input/set-1/Test\SP_MIS02824100914340 19.pdf,114749,10/07/24,46065.0
1,./Input/set-1/Test\SP_MIS02824100914340 20.pdf,114750,10/07/24,46136.0
2,./Input/set-1/Test\SP_MIS02824100914340 21.pdf,114751,10/08/24,46130.0
3,./Input/set-1/Test\SP_MIS02824100914340 22.pdf,114752,10/08/24,46132.0
4,./Input/set-1/Test\SP_MIS02824100914340 23.pdf,114753,10/08/24,46128.0
5,./Input/set-1/Test\SP_MIS02824100914340 24.pdf,114754,10/08/24,46127.0
6,./Input/set-1/Test\SP_MIS02824100914340 25.pdf,114755,10/08/24,48086.0
7,./Input/set-1/Test\SP_MIS02824100914340 26.pdf,114757,10/08/24,46146.0
8,./Input/set-1/Test\SP_MIS02824100914340 27.pdf,114758,10/08/24,48099.0
9,./Input/set-1/Test\SP_MIS02824100914340 28.pdf,114760,10/08/24,46145.0


In [15]:
output_df.to_csv('results.csv')