In [None]:
import glob
import json
from pathlib import Path

import joblib
import pandas as pd
import pytesseract
from pdf2image import convert_from_path
from PIL import Image
from sklearn.neighbors import KNeighborsClassifier

# Pre-processing Steps

## Create a Master DataFrame with all the JSON Data

We load every JSON file (one per image file) into a list, tag each record with the ID parsed from its filename, and combine the records into a single DataFrame. The JSON data is what we use to label the training dataset: a word in the Tesseract-extracted dataframe is marked as a positive example if it appears in the employee's address.

In [6]:
json_file_path = "./Input/jsons_data"
# Sorted so the row order of json_df (and of the CSV written below) is
# reproducible; glob itself returns files in arbitrary order.
json_files = sorted(glob.glob(f"{json_file_path}/*.json"))

# Filename prefix that precedes the document ID, i.e. files are named "W2_<id>.json"
json_name_prefix = "W2_"

json_collection = []

# Load every JSON file and tag the record with the ID parsed from its filename
for file in json_files:
    # JSON is UTF-8 by specification; do not rely on the platform default encoding
    with open(file, "r", encoding="utf-8") as f:
        data = json.load(f)

    # Derive the ID from the bare filename rather than by stripping a hard-coded,
    # Windows-style path prefix, so the mapping works with either path separator.
    file_name = Path(file).stem
    if file_name.startswith(json_name_prefix):
        file_name = file_name[len(json_name_prefix):]
    data['file'] = file_name

    # Append the record (a dict) to the collection
    json_collection.append(data)

# Convert the list of dictionaries into a DataFrame (one row per JSON file)
json_df = pd.DataFrame(json_collection)

# Persist the combined data next to the notebook
json_df.to_csv('W2_data.csv', index=False)

In [None]:
# Preview the first rows of the combined JSON dataframe
json_df.head()

### Addresses object

In [7]:
def create_address_objects(df):
    """Build a lookup from file ID to employee address.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'file' column and an "Employee's address" column.

    Returns
    -------
    dict
        Maps each row's 'file' value to that row's "Employee's address"
        value. If a 'file' value occurs more than once, the last row wins.
        An empty frame yields an empty dict.

    Raises
    ------
    KeyError
        If a non-empty frame lacks either required column.
    """
    # An empty frame (e.g. no JSON files were found) has nothing to map.
    if df.empty:
        return {}
    # Pair the two columns directly instead of running a side-effecting
    # lambda through DataFrame.apply, which builds a Series per row.
    return dict(zip(df['file'], df["Employee's address"]))


In [8]:
addresses = create_address_objects(json_df)

In [None]:

train_folder_path = './Input/W2/Train'
# Match the extension case-insensitively: the original pattern globbed "*.JPG"
# but stripped ".jpg", which only lines up on case-insensitive filesystems.
# Sorted so the row order of train_df is reproducible.
train_files = sorted(glob.glob(f"{train_folder_path}/*.[jJ][pP][gG]"))

# Filename prefix that precedes the document ID in the training images
train_image_prefix = "W2_XL_input_clean_"

# Only OCR rows whose 'top' coordinate lies strictly inside this band can be
# labelled as address words.
# NOTE(review): presumably the vertical region of the form that holds the
# employee address — confirm against the image layout.
address_top_min = 300
address_top_max = 800

train_ocr_collection = []

for image_path in train_files:

    # Perform OCR; the DICT output is turned into one dataframe row per OCR entry
    ocr_data = pytesseract.image_to_data(image_path, output_type=pytesseract.Output.DICT)
    ocr_df = pd.DataFrame(ocr_data)

    # Derive the ID from the bare filename so the lookup works with either
    # path separator and either extension case.
    file_name = Path(image_path).stem
    if file_name.startswith(train_image_prefix):
        file_name = file_name[len(train_image_prefix):]

    # Bag of words for this image's address. split() with no argument drops
    # empty tokens; split(' ') would yield '' on repeated spaces, and '' would
    # then match every empty-text OCR row.
    address_bow = addresses[file_name].replace(',', '').split()

    # Add the remaining bounding-box edges derived from position and size
    ocr_df['bottom'] = ocr_df['top'] + ocr_df['height']
    ocr_df['right'] = ocr_df['left'] + ocr_df['width']

    # Target column: 1.0 for OCR words that appear in the address and sit
    # inside the vertical band, 0.0 otherwise (float, as before).
    top = ocr_df['top'].astype('int')
    is_address_word = (
        ocr_df['text'].isin(address_bow)
        & (top > address_top_min)
        & (top < address_top_max)
    )
    ocr_df['label'] = is_address_word.astype(float)

    # Add the dataframe to the collection
    train_ocr_collection.append(ocr_df)

# Combine all OCR DataFrames into a single DataFrame
train_df = pd.concat(train_ocr_collection, ignore_index=True)



In [10]:
# Select only the numerical bounding-box features for training
feature_columns = ["left", "top", "width", "height", "bottom", "right"]

X_train = train_df[feature_columns]
y_train = train_df["label"]  # Target variable (1 for words found in the address, 0 for others)

# Process the model 

## Train the Model

In [15]:
# Fit a k-nearest-neighbours classifier on X_train / y_train.
# NOTE(review): with n_neighbors=2 and two classes a 1-1 vote split is possible — confirm an even k is intended.
knn = KNeighborsClassifier(n_neighbors=2)
knn.fit(X_train, y_train)


## Save the trained model for future ad-hoc use

In [16]:
# Save the trained model to 'ocr_knn_model.pkl' (relative to the working directory) with joblib
joblib.dump(knn, 'ocr_knn_model.pkl')

['ocr_knn_model.pkl']