In [40]:
import pytesseract
from PIL import Image
from pdf2image import convert_from_path
import pandas as pd
import glob

In [41]:
#  Manually set the full path to tesseract.exe
pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"

In [42]:
# Create the training set 
train_folder_path = './Input/set-1/Train'
train_files = glob.glob(f"{train_folder_path}/*.pdf")
train_ocr_collection = []
doc_numbers = ['0362418', '0362431', '0362440', '0362453']

for file in train_files:
    pdf_images = convert_from_path(file)
    image = pdf_images[0].convert("RGB")

    ocr_data = pytesseract.image_to_data(image, output_type=pytesseract.Output.DICT)

    ocr_df = pd.DataFrame(ocr_data)

    # Add width and height
    ocr_df['bottom'] = ocr_df['top'] + ocr_df['height']
    ocr_df['right'] = ocr_df['left'] + ocr_df['width']

    # Create a target column
    ocr_df.loc[ocr_df['text'] .isin(doc_numbers), 'label'] = 1
    ocr_df["label"].fillna(0, inplace=True)


    # Add the dataframe to the collection
    train_ocr_collection.append(ocr_df)

# Combine all OCR DataFrames into a single DataFrame
train_df = pd.concat(train_ocr_collection, ignore_index=True)

# Save as CSV (optional)
train_df.to_csv("training_data.csv", index=False)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  ocr_df["label"].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  ocr_df["label"].fillna(0, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behave

In [43]:
# Process the Test folder

test_folder_path = './Input/set-1/Test'
test_files = glob.glob(f"{test_folder_path}/*.pdf")

test_ocr_collection = []  # Store test DataFrames

for file in test_files:  # List of PDFs in the test folder
    pdf_images = convert_from_path(file)
    image = pdf_images[0].convert("RGB")

    ocr_data = pytesseract.image_to_data(image, output_type=pytesseract.Output.DICT)
    ocr_df = pd.DataFrame(ocr_data)

    # Add width and height
    ocr_df['bottom'] = ocr_df['top'] + ocr_df['height']
    ocr_df['right'] = ocr_df['left'] + ocr_df['width']

    # NO LABELS for test data
    test_ocr_collection.append(ocr_df)

# Merge into a single test DataFrame
test_df = pd.concat(test_ocr_collection, ignore_index=True)
test_df.to_csv("test_data.csv", index=False)

In [44]:
# Select only numerical features for training
feature_columns = ["left", "top", "width", "height", "bottom", "right"]

X_train = train_df[feature_columns]
X_test = test_df[feature_columns]
y_train = train_df["label"]  # Target variable (1 for document number, 0 for others)

## Predict on the Test Set

In [45]:
from sklearn.neighbors import KNeighborsClassifier

clf = KNeighborsClassifier(n_neighbors=2)
clf.fit(X_train, y_train)

test_df["predicted_label"]  = clf.predict(X_test)

test_df.loc[test_df['predicted_label'] == 1.0]

Unnamed: 0,level,page_num,block_num,par_num,line_num,word_num,left,top,width,height,conf,text,bottom,right,predicted_label
55,5,1,4,1,4,1,140,212,109,19,95,362663,231,249,1.0
790,5,1,28,1,5,1,144,209,111,19,96,362704,228,255,1.0
1428,5,1,35,1,5,1,144,212,109,17,96,362682,229,253,1.0
2118,5,1,29,1,4,1,148,208,108,19,95,362697,227,256,1.0


## Single file lookup

In [55]:
path = "./Input/set-1/Test/SP_MIS02824100914340 5.pdf"

# Convert PDF to Image
pdf_images = convert_from_path(path)
img = pdf_images[0].convert("RGB")


# Perform OCR
ocr_data = pytesseract.image_to_data(img, output_type=pytesseract.Output.DICT)
final_df = pd.DataFrame(ocr_data)

# Compute Bounding Box Details
final_df['bottom'] = final_df['top'] + final_df['height']
final_df['right'] = final_df['left'] + final_df['width']

# Select relevant features for model prediction
X_test = final_df[feature_columns]

new_clf = KNeighborsClassifier(n_neighbors=2)
new_clf.fit(X_train, y_train)

# Predict Document Numbers
final_df["predicted_label"] = new_clf.predict(X_test)

# Return only text identified as a document number
final_df.loc[final_df["predicted_label"] == 1.0, "text"].tolist()



['0362663']