# Flow

### Image Preprocessing + OCR


In [4]:
import cv2
import numpy as np
import pandas as pd
import easyocr



def preprocess_image(image_path):
    
    img = cv2.imread(image_path)
    

    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    denoised_img = cv2.fastNlMeansDenoising(gray, None, 30, 7, 21)
        
    return denoised_img
    
if __name__ == "__main__":
    
    image_path = r"image.png"  
    

    processed_image = preprocess_image(image_path)
    
    reader = easyocr.Reader(['en'])

    result = reader.readtext(processed_image)
    result_str = ""
    
    for detection in result:
        print(detection[1])
        result_str += detection[1] 
        

Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.


'Ramesh Kumar$ Aadhaar number is 1234-5678-9012, issued on January 15, 1990. He lives at
123 Main Street, Bangalore, Karnataka: His mobile number is 98765-43210, and his email is
ramesh kumar@example com_


### Prediction

In [5]:
from transformers import BertTokenizer, BertForTokenClassification
import torch


model = BertForTokenClassification.from_pretrained('./saved_model')
tokenizer = BertTokenizer.from_pretrained('./saved_model')


model.eval()

def predict(text, model, tokenizer, label_list):
    
    inputs = tokenizer(text, return_tensors="pt", is_split_into_words=False, padding=True, truncation=True)
    
    
    with torch.no_grad():
        outputs = model(**inputs)
        predictions = torch.argmax(outputs.logits, dim=2)

    
    predicted_labels = [label_list[prediction.item()] for prediction in predictions[0]]

    
    tokens = tokenizer.convert_ids_to_tokens(inputs['input_ids'][0])
    result = list(zip(tokens, predicted_labels))
    
    return result





label_list = ['O', 'B-NAME', 'I-NAME', 'B-AADHAAR', 'I-AADHAAR', 'B-DL', 'I-DL', 
              'B-PASSPORT', 'I-PASSPORT', 'B-DATE', 'I-DATE', 'B-ADDRESS', 'I-ADDRESS', 
              'B-MOBILE', 'I-MOBILE', 'B-EMAIL', 'I-EMAIL', 'B-BANK', 'I-BANK', 
              'B-CC', 'I-CC', 'B-MEDICAL', 'I-MEDICAL', 'B-LOAN', 'I-LOAN', 
              'B-PIN', 'I-PIN', 'B-OTP', 'I-OTP', 'B-FINANCIAL', 'I-FINANCIAL', 
              'B-IP', 'I-IP', 'B-LOGIN', 'I-LOGIN', 'B-COOKIES', 'I-COOKIES', 
              'B-CREDIT', 'I-CREDIT', 'B-INSURANCE', 'I-INSURANCE', 'B-GENETIC', 
              'I-GENETIC', 'B-BIOMETRIC', 'I-BIOMETRIC', 'B-CARD', 'I-CARD']


predictions = predict(result_str, model, tokenizer, label_list)


print("Token  |  Predicted Label")
print("-------------------------")
for token, label in predictions:
    print(f"{token}  |  {label}")


Token  |  Predicted Label
-------------------------
[CLS]  |  B-CARD
'  |  O
ram  |  O
##esh  |  O
kumar  |  O
$  |  O
aa  |  O
##dha  |  O
##ar  |  O
number  |  O
is  |  B-CARD
123  |  O
##4  |  O
-  |  O
56  |  O
##7  |  O
##8  |  O
-  |  O
90  |  O
##12  |  O
,  |  B-CARD
issued  |  O
on  |  O
january  |  O
15  |  O
,  |  B-CARD
1990  |  O
.  |  B-CARD
he  |  O
lives  |  O
at  |  O
##12  |  O
##3  |  O
main  |  O
street  |  O
,  |  B-CARD
bangalore  |  O
,  |  B-CARD
karnataka  |  O
:  |  O
his  |  O
mobile  |  O
number  |  O
is  |  O
98  |  O
##7  |  O
##65  |  O
-  |  O
43  |  O
##21  |  O
##0  |  O
,  |  B-CARD
and  |  O
his  |  O
email  |  O
is  |  O
##ram  |  O
##esh  |  O
kumar  |  O
@  |  O
example  |  O
com  |  O
_  |  O
[SEP]  |  B-CARD


In [7]:
df = pd.DataFrame(predictions, columns=['Token', 'Predicted Label'])
df.to_csv("ouput.csv", index=False)