# Flow

### Image Preprocessing + OCR


In [41]:
import cv2
import numpy as np
import pandas as pd
import easyocr



def preprocess_image(image_path):
    
    img = cv2.imread(image_path)
    

    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    denoised_img = cv2.fastNlMeansDenoising(gray, None, 30, 7, 21)
        
    return denoised_img
    
if __name__ == "__main__":
    
    image_path = r"image.png"  
    

    processed_image = preprocess_image(image_path)
    
    reader = easyocr.Reader(['en'])

    result = reader.readtext(processed_image)
    result_str = ""
    
    for detection in result:
        print(detection[1])
        result_str += detection[1] 
        

Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.


'Ramesh Kumar$ Aadhaar number is 1234-5678-9012, issued on January 15, 1990. He lives at
123 Main Street, Bangalore, Karnataka: His mobile number is 98765-43210, and his email is
ramesh kumar@example com_


In [42]:
import cv2
import numpy as np
import pandas as pd
import easyocr
from fuzzywuzzy import fuzz
import re

def preprocess_image(image_path):
    img = cv2.imread(image_path)
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    denoised_img = cv2.fastNlMeansDenoising(gray, None, 30, 7, 21)
    return img, denoised_img  

def normalize_text(text):
    """Normalize text by removing spaces, special characters and converting to lowercase."""
    text = re.sub(r'\W+', '', text)  
    return text.lower()

def obfuscate_image(image, result, label_mapping):
    for detection in result:
        text = detection[1]  
        box = detection[0]  
       
        normalized_text = normalize_text(text)

        for token, label in label_mapping.items():
            normalized_token = normalize_text(token)  
            if fuzz.ratio(normalized_text, normalized_token) >= 75:  # 75% fuzzy matching
                print(f"Match found: '{text}' with token '{token}' (label: {label})") 
                
                if label == 'B-EMAIL':  # TO SET CONDITON : EMAIL
                    # Draw a rectangle over the sensitive area
                    pts = np.array(box, dtype=np.int32)
                    cv2.polylines(image, [pts], isClosed=True, color=(0, 0, 255), thickness=2)
                    # Overlay a solid rectangle to obfuscate
                    cv2.fillPoly(image, [pts], (0, 0, 0))  # Fill with black color
                break  

    return image

if __name__ == "__main__":
   
    df = pd.read_csv(r"D:\nvm\obscure\Code\ouput.csv", dtype=str)  

    label_mapping = dict(zip(df['Token'], df['Predicted Label']))  
    image_path = r"test1.png"
    original_image, processed_image = preprocess_image(image_path)

    reader = easyocr.Reader(['en'])
    result = reader.readtext(processed_image)

    # Print OCR results for debugging
    print("OCR Results:", result)

    obfuscated_image = obfuscate_image(original_image, result, label_mapping)

    cv2.imwrite("obfuscated_image.png", obfuscated_image)
    print("Obfuscation completed and saved as 'obfuscated_image.png'.")


Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.


OCR Results: [([[14, 18], [79, 18], [79, 38], [14, 38]], 'Ramesh', 0.8249102968355082), ([[85, 19], [137, 19], [137, 37], [85, 37]], 'Kumar', 0.9999859789284535), ([[140, 12], [219, 12], [219, 42], [140, 42]], 'fadhaar', 0.51098217646642), ([[225, 17], [469, 17], [469, 37], [225, 37]], 'number is 1234-5678-9012', 0.7164455642947598), ([[493, 17], [559, 17], [559, 37], [493, 37]], 'issued', 0.9988442035278075), ([[565, 18], [669, 18], [669, 39], [565, 39]], 'on   January', 0.6041175283571303), ([[674, 16], [760, 16], [760, 40], [674, 40]], '15, 1990', 0.6332941866807235), ([[785, 17], [869, 17], [869, 37], [785, 37]], 'He lives', 0.9946862936063581), ([[875, 21], [899, 21], [899, 37], [875, 37]], 'at', 0.46582163747708755), ([[13, 39], [49, 39], [49, 59], [13, 59]], '123', 0.9997733610188607), ([[55, 39], [167, 39], [167, 59], [55, 59]], 'Main Street', 0.8075895015040541), ([[191, 37], [290, 37], [290, 63], [191, 63]], 'Bangalore', 0.789764472715864), ([[313, 39], [409, 39], [409, 59], 

### Prediction

In [43]:
from transformers import BertTokenizer, BertForTokenClassification
import torch


model = BertForTokenClassification.from_pretrained('./saved_model')
tokenizer = BertTokenizer.from_pretrained('./saved_model')


model.eval()

def predict(text, model, tokenizer, label_list):
    
    inputs = tokenizer(text, return_tensors="pt", is_split_into_words=False, padding=True, truncation=True)
    
    
    with torch.no_grad():
        outputs = model(**inputs)
        predictions = torch.argmax(outputs.logits, dim=2)

    
    predicted_labels = [label_list[prediction.item()] for prediction in predictions[0]]

    
    tokens = tokenizer.convert_ids_to_tokens(inputs['input_ids'][0])
    result = list(zip(tokens, predicted_labels))
    
    return result





label_list = ['O', 'B-NAME', 'I-NAME', 'B-AADHAAR', 'I-AADHAAR', 'B-DL', 'I-DL', 
              'B-PASSPORT', 'I-PASSPORT', 'B-DATE', 'I-DATE', 'B-ADDRESS', 'I-ADDRESS', 
              'B-MOBILE', 'I-MOBILE', 'B-EMAIL', 'I-EMAIL', 'B-BANK', 'I-BANK', 
              'B-CC', 'I-CC', 'B-MEDICAL', 'I-MEDICAL', 'B-LOAN', 'I-LOAN', 
              'B-PIN', 'I-PIN', 'B-OTP', 'I-OTP', 'B-FINANCIAL', 'I-FINANCIAL', 
              'B-IP', 'I-IP', 'B-LOGIN', 'I-LOGIN', 'B-COOKIES', 'I-COOKIES', 
              'B-CREDIT', 'I-CREDIT', 'B-INSURANCE', 'I-INSURANCE', 'B-GENETIC', 
              'I-GENETIC', 'B-BIOMETRIC', 'I-BIOMETRIC', 'B-CARD', 'I-CARD']


predictions = predict(result_str, model, tokenizer, label_list)


print("Token  |  Predicted Label")
print("-------------------------")
for token, label in predictions:
    print(f"{token}  |  {label}")


Token  |  Predicted Label
-------------------------
[CLS]  |  B-CARD
'  |  O
ram  |  O
##esh  |  O
kumar  |  O
$  |  O
aa  |  O
##dha  |  O
##ar  |  O
number  |  O
is  |  B-CARD
123  |  O
##4  |  O
-  |  O
56  |  O
##7  |  O
##8  |  O
-  |  O
90  |  O
##12  |  O
,  |  B-CARD
issued  |  O
on  |  O
january  |  O
15  |  O
,  |  B-CARD
1990  |  O
.  |  B-CARD
he  |  O
lives  |  O
at  |  O
##12  |  O
##3  |  O
main  |  O
street  |  O
,  |  B-CARD
bangalore  |  O
,  |  B-CARD
karnataka  |  O
:  |  O
his  |  O
mobile  |  O
number  |  O
is  |  O
98  |  O
##7  |  O
##65  |  O
-  |  O
43  |  O
##21  |  O
##0  |  O
,  |  B-CARD
and  |  O
his  |  O
email  |  O
is  |  O
##ram  |  O
##esh  |  O
kumar  |  O
@  |  O
example  |  O
com  |  O
_  |  O
[SEP]  |  B-CARD


In [44]:
df = pd.DataFrame(predictions, columns=['Token', 'Predicted Label'])
df.to_csv("ouput.csv", index=False)

### Obfestication


In [55]:
import cv2
import numpy as np
import pandas as pd
import easyocr
from fuzzywuzzy import fuzz
import re

def preprocess_image(image_path):
    img = cv2.imread(image_path)
    gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    denoised_img = cv2.fastNlMeansDenoising(gray, None, 30, 7, 21)
    return img, denoised_img  


def obfuscate_image(image, result, label_mapping, obfuscate_labels):
    for detection in result:
        text = detection[1]  # Detected text by OCR
        box = detection[0]  # Bounding box coordinates

       

        # Check if the detected text corresponds to any of the obfuscate labels
        for token, label in label_mapping.items():
            
            if fuzz.ratio(text, token) >= 80:  # 80% fuzzy matching
                print(f"Match found: '{text}' with token '{token}' (label: {label})")  # Debugging info
                
                if label in obfuscate_labels:  # Obfuscate only if the label is in the obfuscate list
                    # Draw a rectangle over the sensitive area
                    pts = np.array(box, dtype=np.int32)
                    cv2.polylines(image, [pts], isClosed=True, color=(0, 0, 255), thickness=2)
                    # Overlay a solid rectangle to obfuscate
                    cv2.fillPoly(image, [pts], (0, 0, 0))  # Fill with black color
                break  

    return image

if __name__ == "__main__":
    df = pd.read_csv(r"D:\nvm\obscure\Code\ouput.csv", dtype=str)  

    label_mapping = dict(zip(df['Token'], df['Predicted Label']))  # Create a dictionary mapping

    # Define a list of labels to obfuscate
    obfuscate_labels = [label for label in label_mapping.values() if label != 'O']

    image_path = r"image.png"
    original_image, processed_image = preprocess_image(image_path)

    reader = easyocr.Reader(['en'])
    result = reader.readtext(processed_image)

    # Print OCR results for debugging
    print("OCR Results:", result)

    obfuscated_image = obfuscate_image(original_image, result, label_mapping, obfuscate_labels)

    cv2.imwrite("obfuscated_image.png", obfuscated_image)
    print("Obfuscation completed and saved as 'obfuscated_image.png'.")


Neither CUDA nor MPS are available - defaulting to CPU. Note: This module is much faster with a GPU.


OCR Results: [([[12, 0], [848, 0], [848, 28], [12, 28]], "'Ramesh Kumar$ Aadhaar number is 1234-5678-9012, issued on January 15, 1990. He lives at", 0.6290654541461264), ([[2, 30], [830, 30], [830, 58], [2, 58]], '123 Main Street, Bangalore, Karnataka: His mobile number is 98765-43210, and his email is', 0.5916109796921261), ([[4, 62], [270, 62], [270, 86], [4, 86]], 'ramesh kumar@example com_', 0.9276296836902777)]
Match found: 'ramesh kumar@example com_' with token 'ramesh.kumar@example.com' (label: B-EMAIL)
Obfuscation completed and saved as 'obfuscated_image.png'.
