# Flow

### Image Preprocessing + OCR


In [41]:
import cv2
import numpy as np
import pandas as pd
import easyocr



def preprocess_image(image_path):
    
    img = cv2.imread(image_path)
    

    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    denoised_img = cv2.fastNlMeansDenoising(gray, None, 30, 7, 21)
        
    return denoised_img
    
if __name__ == "__main__":
    
    image_path = r"image.png"  
    

    processed_image = preprocess_image(image_path)
    
    reader = easyocr.Reader(['en'])

    result = reader.readtext(processed_image)
    result_str = ""
    
    for detection in result:
        print(detection[1])
        result_str += detection[1] 
        

Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.


'Ramesh Kumar$ Aadhaar number is 1234-5678-9012, issued on January 15, 1990. He lives at
123 Main Street, Bangalore, Karnataka: His mobile number is 98765-43210, and his email is
ramesh kumar@example com_


### Prediction

In [43]:
from transformers import BertTokenizer, BertForTokenClassification
import torch


model = BertForTokenClassification.from_pretrained('./saved_model')
tokenizer = BertTokenizer.from_pretrained('./saved_model')


model.eval()

def predict(text, model, tokenizer, label_list):
    
    inputs = tokenizer(text, return_tensors="pt", is_split_into_words=False, padding=True, truncation=True)
    
    
    with torch.no_grad():
        outputs = model(**inputs)
        predictions = torch.argmax(outputs.logits, dim=2)

    
    predicted_labels = [label_list[prediction.item()] for prediction in predictions[0]]

    
    tokens = tokenizer.convert_ids_to_tokens(inputs['input_ids'][0])
    result = list(zip(tokens, predicted_labels))
    
    return result





label_list = ['O', 'B-NAME', 'I-NAME', 'B-AADHAAR', 'I-AADHAAR', 'B-DL', 'I-DL', 
              'B-PASSPORT', 'I-PASSPORT', 'B-DATE', 'I-DATE', 'B-ADDRESS', 'I-ADDRESS', 
              'B-MOBILE', 'I-MOBILE', 'B-EMAIL', 'I-EMAIL', 'B-BANK', 'I-BANK', 
              'B-CC', 'I-CC', 'B-MEDICAL', 'I-MEDICAL', 'B-LOAN', 'I-LOAN', 
              'B-PIN', 'I-PIN', 'B-OTP', 'I-OTP', 'B-FINANCIAL', 'I-FINANCIAL', 
              'B-IP', 'I-IP', 'B-LOGIN', 'I-LOGIN', 'B-COOKIES', 'I-COOKIES', 
              'B-CREDIT', 'I-CREDIT', 'B-INSURANCE', 'I-INSURANCE', 'B-GENETIC', 
              'I-GENETIC', 'B-BIOMETRIC', 'I-BIOMETRIC', 'B-CARD', 'I-CARD']


predictions = predict(result_str, model, tokenizer, label_list)


print("Token  |  Predicted Label")
print("-------------------------")
for token, label in predictions:
    print(f"{token}  |  {label}")


Token  |  Predicted Label
-------------------------
[CLS]  |  B-CARD
'  |  O
ram  |  O
##esh  |  O
kumar  |  O
$  |  O
aa  |  O
##dha  |  O
##ar  |  O
number  |  O
is  |  B-CARD
123  |  O
##4  |  O
-  |  O
56  |  O
##7  |  O
##8  |  O
-  |  O
90  |  O
##12  |  O
,  |  B-CARD
issued  |  O
on  |  O
january  |  O
15  |  O
,  |  B-CARD
1990  |  O
.  |  B-CARD
he  |  O
lives  |  O
at  |  O
##12  |  O
##3  |  O
main  |  O
street  |  O
,  |  B-CARD
bangalore  |  O
,  |  B-CARD
karnataka  |  O
:  |  O
his  |  O
mobile  |  O
number  |  O
is  |  O
98  |  O
##7  |  O
##65  |  O
-  |  O
43  |  O
##21  |  O
##0  |  O
,  |  B-CARD
and  |  O
his  |  O
email  |  O
is  |  O
##ram  |  O
##esh  |  O
kumar  |  O
@  |  O
example  |  O
com  |  O
_  |  O
[SEP]  |  B-CARD


In [44]:
df = pd.DataFrame(predictions, columns=['Token', 'Predicted Label'])
df.to_csv("ouput.csv", index=False)

### Obfestication


In [55]:
import cv2
import numpy as np
import pandas as pd
import easyocr
from fuzzywuzzy import fuzz
import dpctl
from numba import cuda, jit
from numba import float32

# DPC++ function to obfuscate sensitive areas in the image
@cuda.jit
def obfuscate_kernel(image, boxes, labels, obfuscate_labels, output_image):
    # Calculate thread indices
    i = cuda.grid(1)
    if i < boxes.shape[0]:
        # Get box coordinates and label
        box = boxes[i]
        label = labels[i]
        
        if label in obfuscate_labels:
            # Draw black rectangle on the output image
            for x in range(box[0], box[2]):
                for y in range(box[1], box[3]):
                    output_image[y, x] = 0  # Fill with black

def preprocess_image(image_path):
    img = cv2.imread(image_path)
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    denoised_img = cv2.fastNlMeansDenoising(gray, None, 30, 7, 21)
    return img, denoised_img

def obfuscate_image(original_image, result, label_mapping, obfuscate_labels):
    # Prepare data for GPU
    boxes = []
    labels = []
    
    for detection in result:
        text = detection[1]  # Detected text by OCR
        box = detection[0]  # Bounding box coordinates

        # Check if the detected text corresponds to any of the obfuscate labels
        for token, label in label_mapping.items():
            if fuzz.ratio(text, token) >= 80:  # 80% fuzzy matching
                boxes.append((int(box[0][0]), int(box[0][1]), int(box[2][0]), int(box[2][1])))  # (x1, y1, x2, y2)
                labels.append(label)
                break

    boxes = np.array(boxes, dtype=np.int32)
    labels = np.array(labels, dtype=np.object)

    # Allocate output image
    output_image = np.copy(original_image)

    # Launch the kernel
    threads_per_block = 256
    blocks_per_grid = (boxes.shape[0] + (threads_per_block - 1)) // threads_per_block
    obfuscate_kernel[blocks_per_grid, threads_per_block](output_image, boxes, labels, obfuscate_labels, output_image)

    return output_image

if __name__ == "__main__":
    df = pd.read_csv(r"D:\nvm\obscure\Code\ouput.csv", dtype=str)
    label_mapping = dict(zip(df['Token'], df['Predicted Label']))  # Create a dictionary mapping

    # Define a list of labels to obfuscate
    obfuscate_labels = [label for label in label_mapping.values() if label != 'O']

    image_path = r"image.png"
    original_image, processed_image = preprocess_image(image_path)

    reader = easyocr.Reader(['en'])
    result = reader.readtext(processed_image)

    # Print OCR results for debugging
    print("OCR Results:", result)

    obfuscated_image = obfuscate_image(original_image, result, label_mapping, obfuscate_labels)

    # Save the obfuscated image
    cv2.imwrite("obfuscated_image.png", obfuscated_image)
    print("Obfuscation completed and saved as 'obfuscated_image.png'.")


Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.


OCR Results: [([[12, 0], [848, 0], [848, 28], [12, 28]], "'Ramesh Kumar$ Aadhaar number is 1234-5678-9012, issued on January 15, 1990. He lives at", 0.6290654541461264), ([[2, 30], [830, 30], [830, 58], [2, 58]], '123 Main Street, Bangalore, Karnataka: His mobile number is 98765-43210, and his email is', 0.5916109796921261), ([[4, 62], [270, 62], [270, 86], [4, 86]], 'ramesh kumar@example com_', 0.9276296836902777)]
Match found: 'ramesh kumar@example com_' with token 'ramesh.kumar@example.com' (label: B-EMAIL)
Obfuscation completed and saved as 'obfuscated_image.png'.
