# 1. Installation

## Use pytesseract to implement OCR

In [None]:
pip install pytesseract

## Use spaCy for NLP, named entity recognition and text classification

In [None]:
pip install spacy

## Use Faker to generate fake data

In [None]:
pip install Faker

## Use Pillow to process images

In [None]:
pip install Pillow

## Download the English language model for named entity recognition with spaCy

In [2]:
pip install en_core_web_lg

## Remember to save the font file in the directory that contains the files to be redacted
To download the font file, please refer to https://github.com/tiffanychum/Data-redaction

# 2. Setup
Raise a request on ServiceNow to install Tesseract on your HSBC laptop by searching for **Open source OCR engine 5**

# 3. Import library

In [6]:
import spacy, re, os, faker, PIL, pytesseract, en_core_web_lg, random
from PIL import Image, ImageDraw, ImageSequence, ImageFont
from pytesseract import Output
from faker import Faker

In [4]:
# spaCy English pipeline used for named entity recognition
nlp = en_core_web_lg.load()
# typeface and point size used when writing the replacement names
font = ImageFont.truetype("roman-sign.ttf", 22)
# pool of 100 fake first names, each cut to at most 5 characters
# NOTE: this rebinds `faker` from the imported module to a Faker instance
faker = Faker()
fake_names = [faker.first_name()[:5] for _ in range(100)]
# directory that holds the images to redact (change to your own path)
gpath = '/Users/makszelai/Desktop/OCR/test/'

In [7]:
class Redactor:
    """Replace person names found by OCR in a (multi-page) image with fake ones.

    The image is read from ``gpath + path`` and the result is written next to
    it as ``gpath + 'redacted_' + path``.
    """

    def __init__(self, path):
        """Remember the file name (relative to ``gpath``) of the image to redact."""
        self.path = path

    @staticmethod
    def _boxes_to_redact(data, names):
        """Return the indices of OCR boxes whose text contains part of a name.

        ``data`` is the dict returned by ``pytesseract.image_to_data`` and
        ``names`` the entity strings to hide.
        """
        # Split on any whitespace: OCR tokens never contain spaces or newlines,
        # so a fragment such as 'John\nSmith' could never match as a whole, and
        # an empty fragment (from a double space) would match every box.
        words = {word for name in names for word in name.split()}
        return [
            idx
            for idx, token in enumerate(data['text'])
            # conf may be an int, a float or a numeric string such as '96.3';
            # low-confidence boxes (and the -1 layout rows) are skipped.
            if float(data['conf'][idx]) > 60
            # Plain substring test: name fragments are text, not regex patterns.
            and any(word in token for word in words)
        ]

    def anonymizer(self):
        """OCR every page, cover each detected PERSON name and save the result.

        Prints the names detected on each page and 'Anonymized' once the
        redacted file has been written.
        """
        frames = []

        # The context manager releases the source file once all pages are copied.
        with Image.open(gpath + self.path) as img:
            for page in ImageSequence.Iterator(img):
                page = page.copy()
                d = pytesseract.image_to_data(page, output_type=Output.DICT)
                text = pytesseract.image_to_string(page, lang='eng')

                # identify named entities labelled as persons
                names = [ent.text.strip() for ent in nlp(text).ents if ent.label_ == 'PERSON']
                print(names)  # show identified names

                # paint over each matching box once and write a fake name on top
                draw = ImageDraw.Draw(page)
                for idx in self._boxes_to_redact(d, names):
                    x, y = d['left'][idx], d['top'][idx]
                    w, h = d['width'][idx], d['height'][idx]
                    draw.rectangle((x, y, x + w, y + h), outline='yellow', fill=(255, 255, 51))
                    draw.text((x - 1, y + 3), random.choice(fake_names), fill=(0, 0, 0), font=font)

                frames.append(page)

        # save as redacted_<original name>, using this instance's own file name
        frames[0].save(gpath + 'redacted_' + self.path, save_all=True, append_images=frames[1:])
        print('Anonymized')
        
if __name__ == "__main__":
    # Redact every TIFF found in gpath. Files whose name starts with
    # 'redacted_' are outputs of an earlier run and are skipped, so running
    # the cell again does not produce 'redacted_redacted_...' copies.
    paths = [
        entry
        for entry in os.listdir(gpath)
        if '.tiff' in entry and not entry.startswith('redacted_')  # type of files
    ]
    for path in paths:
        redactor = Redactor(path)
        redactor.anonymizer()

['Alessio Huynh', 'Alessio Huynh\n\nFlat', 'London GBR']
Anonymized
['Jarred Leech', 'Maria Danis']
[]
Anonymized
['Nicholas Nelson', 'Eric Branch']
[]
Anonymized
['Adam Brown Branch', 'halifax.co.uk/bankaccounts/overdrafts']
['lendingstandardsboard']
['Nina Stom', 'Nick Clasen']
[]
[]
Anonymized
['Stephanie Casey', 'Jeffery Fords Apt']
[]
Anonymized
['Ann-Marie', 'Ann-Marie Gallegos', 'York Close', 'giffgaffLondon GBR']
[]
Anonymized
