In [1]:
import glob
import json
from pathlib import Path

import pandas as pd
import pytesseract
from pdf2image import convert_from_path
from PIL import Image

## Create a Master DataFrame with all the JSON Data

Each image file has a matching JSON file. We load every JSON file, tag the record with the ID parsed from its filename, and combine all records into a single DataFrame. These records are used to label the training data: a word in the Tesseract-extracted dataframe is marked as part of the address if it appears in the employee's address from the corresponding JSON record.

In [None]:
json_file_path = "./Input/jsons_data"
# Sorted so the row order of json_df does not depend on directory listing order
json_files = sorted(glob.glob(f"{json_file_path}/*.json"))

json_collection = []

# Load every JSON file into a dict and tag it with the ID taken from its filename
for file in json_files:
    with open(file, "r") as f:
        data = json.load(f)

    # Derive the ID from the bare filename ("W2_1000.json" -> "1000") instead of
    # string-replacing a hard-coded directory prefix, which only matched paths
    # that used a backslash separator.
    stem = Path(file).stem
    file_name = stem[len("W2_"):] if stem.startswith("W2_") else stem
    data['file'] = file_name

    json_collection.append(data)

# One row per JSON file
json_df = pd.DataFrame(json_collection)

json_df.to_csv('W2_data.csv', index=False)

In [None]:
# Preview the first rows of the combined JSON records (one row per JSON file)
json_df.head()

In [24]:
def create_address_objects(df):
    """Map each record's file ID to its employee address.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``'file'`` column and an ``"Employee's address"`` column.

    Returns
    -------
    dict
        ``{file: address}``. If a ``'file'`` value occurs more than once,
        the address from the last such row is kept.
    """
    # An empty frame maps to an empty dictionary.
    if df.empty:
        return {}
    # Pair the two columns directly instead of mutating a dict from inside df.apply.
    return dict(zip(df['file'], df["Employee's address"]))



In [25]:
# Build the file-ID -> employee-address mapping and display it for a visual check
create_address_objects(json_df)

{'1000': '31403 David Circles Suite 863, West Erinfort, WY',
 '1001': '613 Roger Crest Apt. 802, Leeton, IA',
 '1002': '2199 Little Falls, Snyderton, TX',
 '1003': '6503 John Stream, New Meredithstad, MI',
 '1004': '95060 Crystal Burg Apt. 070, Davisburgh, AR',
 '1005': '41435 Hughes Drive, New Teresaberg, KS',
 '1006': '95842 Freeman Coves Apt. 609, Robertburgh, MA',
 '1007': '341 Charles Mountains, Morganberg, CO',
 '1008': '84813 Eric Way Apt. 116, North Amberborough, AR',
 '1009': '730 Huber Island Suite 344, Collinshaven, IN',
 '1010': '9622 Jessica Estates Apt. 532, Nathantown, IA',
 '1011': '08470 Miller Stream, Petersmouth, IN',
 '1012': '68390 Blackwell Drive, South Robert, MD',
 '1013': '27374 Henderson Oval, Lake James, NJ',
 '1014': '917 April Isle, New Taylor, IL',
 '1015': '672 Brian Common Suite 123, East Lisa, UT',
 '1016': '73200 Tammy Bridge Apt. 560, Chadfort, ND',
 '1017': '83396 Cross Pines Suite 762, Johnstonville, SC',
 '1018': '8632 Mcclure Curve, Gailfort, NJ',

In [48]:
# Keep the file-ID -> employee-address mapping; it is looked up per image when labelling OCR words
addresses = create_address_objects(json_df)

In [51]:
# Training images are named like 'W2_XL_input_clean_1000.jpg'
train_folder_path = './Input/W2/Train'
# Match the extension in either case so the same files are found on
# case-sensitive filesystems; sorted for a stable processing order.
train_files = sorted(glob.glob(f"{train_folder_path}/*.[jJ][pP][gG]"))
train_ocr_collection = []

# Only words whose 'top' coordinate lies strictly inside this band are
# eligible to be labelled as address words.
ADDRESS_TOP_MIN = 300
ADDRESS_TOP_MAX = 800
TRAIN_FILE_PREFIX = "W2_XL_input_clean_"

for image_path in train_files:

    # One row per OCR element, with its text and bounding box
    ocr_data = pytesseract.image_to_data(image_path, output_type=pytesseract.Output.DICT)
    ocr_df = pd.DataFrame(ocr_data)

    # Parse the ID from the bare filename, independent of the path separator
    # and of the extension's letter case.
    stem = Path(image_path).stem
    file_name = stem[len(TRAIN_FILE_PREFIX):] if stem.startswith(TRAIN_FILE_PREFIX) else stem
    if file_name not in addresses:
        raise KeyError(f"No address record for training image {image_path!r} (id {file_name!r})")

    # Bag of address words. split() without an argument never yields '' tokens,
    # which would otherwise match every OCR row that has empty text.
    address_bow = addresses[file_name].replace(',', '').split()

    # Derive the bottom and right edges of each bounding box
    ocr_df['bottom'] = ocr_df['top'] + ocr_df['height']
    ocr_df['right'] = ocr_df['left'] + ocr_df['width']

    # Target column: 1.0 for words that appear in the address and lie in the
    # expected vertical band, 0.0 otherwise. Assigning the whole column at once
    # guarantees 'label' exists even when no row matches.
    top = ocr_df['top'].astype('int')
    is_address_word = (
        ocr_df['text'].isin(address_bow)
        & (top > ADDRESS_TOP_MIN)
        & (top < ADDRESS_TOP_MAX)
    )
    ocr_df['label'] = is_address_word.astype(float)

    # Add the dataframe to the collection
    train_ocr_collection.append(ocr_df)

# Combine all OCR DataFrames into a single DataFrame
train_df = pd.concat(train_ocr_collection, ignore_index=True)

# Save as CSV (optional)
train_df.to_csv("W2_training_data.csv", index=False)



In [52]:
# Inspect the rows labelled as address words. ocr_df is the most recently built
# per-image frame only, not the combined training frame.
ocr_df.loc[ocr_df['label'] == 1]

Unnamed: 0,level,page_num,block_num,par_num,line_num,word_num,left,top,width,height,conf,text,bottom,right,label
288,5,1,35,1,16,1,66,396,44,17,96,730,413,110,1.0
289,5,1,35,1,16,2,129,396,78,17,96,Huber,413,207,1.0
290,5,1,35,1,16,3,226,396,94,17,96,Island,413,320,1.0
291,5,1,35,1,16,4,338,396,77,17,96,Suite,413,415,1.0
292,5,1,35,1,16,5,434,396,44,17,96,344,413,478,1.0
297,5,1,35,1,17,1,65,435,190,17,89,Collinshaven,452,255,1.0
298,5,1,35,1,17,2,338,436,30,16,91,IN,452,368,1.0


In [47]:
# Join the text of the rows labelled 1, in row order, into one space-separated string
address = ' '.join(ocr_df.loc[ocr_df['label'] == 1]['text'].to_list())

In [53]:
# Process the Test folder

test_folder_path = './Input/W2/Test'
# Match the extension in either case; sorted for a stable processing order
test_files = sorted(glob.glob(f"{test_folder_path}/*.[jJ][pP][gG]"))

test_ocr_collection = []  # One OCR DataFrame per test image

for file in test_files:  # Each image in the test folder

    # OCR the current test image. The loop variable must be used here: reading
    # a leftover path from the training loop would OCR the same training image
    # once per test file.
    ocr_data = pytesseract.image_to_data(file, output_type=pytesseract.Output.DICT)
    ocr_df = pd.DataFrame(ocr_data)

    # Derive the bottom and right edges of each bounding box
    ocr_df['bottom'] = ocr_df['top'] + ocr_df['height']
    ocr_df['right'] = ocr_df['left'] + ocr_df['width']

    # NO LABELS for test data, so no address lookup is needed here
    test_ocr_collection.append(ocr_df)

# Merge into a single test DataFrame
test_df = pd.concat(test_ocr_collection, ignore_index=True)
test_df.to_csv("W2_test_data.csv", index=False)

In [54]:
# Bounding-box columns used as the model's input features
feature_columns = ["left", "top", "width", "height", "bottom", "right"]

# Feature matrices for the training and test OCR rows
X_train, X_test = train_df[feature_columns], test_df[feature_columns]

# Target: 1 for OCR rows labelled as address words, 0 for all other rows
y_train = train_df["label"]

In [55]:
from sklearn.neighbors import KNeighborsClassifier

# Fit a 2-nearest-neighbours classifier on the training bounding-box features
# (fit returns the estimator itself, so construction and fitting can be chained)
clf = KNeighborsClassifier(n_neighbors=2).fit(X_train, y_train)

# Store one predicted label per OCR row of the test set
test_df["predicted_label"] = clf.predict(X_test)

In [74]:
image_path = "./Input/W2/Test/W2_XL_input_clean_1035.jpg"


# Perform OCR on this single image
ocr_data = pytesseract.image_to_data(image_path, output_type=pytesseract.Output.DICT)
final_df = pd.DataFrame(ocr_data)


# Derive the bottom and right edges of each bounding box
final_df['bottom'] = final_df['top'] + final_df['height']
final_df['right'] = final_df['left'] + final_df['width']

# Select the model's input features.
# NOTE: this rebinds X_test to the rows of this one image only.
X_test = final_df[feature_columns]

# Fit a classifier with the same settings and training data as before
new_clf = KNeighborsClassifier(n_neighbors=2)
new_clf.fit(X_train, y_train)

# Predict which OCR rows belong to the address
final_df["predicted_label"] = new_clf.predict(X_test)

# Keep only rows predicted as address words that actually carry text; rows with
# blank text would otherwise add stray spaces to the joined address.
is_address = final_df['predicted_label'] == 1
has_text = final_df['text'].astype(str).str.strip() != ''
address = ' '.join(final_df.loc[is_address & has_text, 'text'].to_list())

print(address)


869 Roger Mountains Ryanfurt OK
