## Partie 1 : Détection des blocs de texte à partir du PDF

In [None]:
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfpage import PDFTextExtractionNotAllowed
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.pdfdevice import PDFDevice
from pdfminer.layout import LAParams
from pdfminer.converter import PDFPageAggregator
import pdfminer
import pandas as pd 
import numpy as np

In [None]:
# Open the PDF file in binary mode. The handle is kept open because the
# parser/document objects built on it are still used by the page loop below.
fp = open('<resume name here>.pdf', 'rb')

# Create a PDF parser object associated with the file object.
parser = PDFParser(fp)

# Create a PDF document object that stores the document structure.
# A password, if the file needs one, goes in as the 2nd parameter.
document = PDFDocument(parser)

# Check if the document allows text extraction. If not, abort.
if not document.is_extractable:
    raise PDFTextExtractionNotAllowed

# Create a PDF resource manager object that stores shared resources.
rsrcmgr = PDFResourceManager()

# BEGIN LAYOUT ANALYSIS
# Set parameters for analysis (library defaults).
laparams = LAParams()

# Create a PDF page aggregator object. This is the only device the
# interpreter needs; no bare PDFDevice is created beforehand.
device = PDFPageAggregator(rsrcmgr, laparams=laparams)

# Create a PDF interpreter object.
interpreter = PDFPageInterpreter(rsrcmgr, device)

# Accumulator for the extracted text boxes: one row per box, holding the
# first two bounding-box coordinates and the text.
df = pd.DataFrame(columns=['x', 'y', 'value'])
def _collect_text_rows(lt_objs):
    """Return one row dict per horizontal text box found in *lt_objs*.

    Figures are searched recursively, depth first, so boxes nested inside
    them are returned in document order together with the top-level ones.
    Each box is also printed as "x, y, text" (newlines shown as '_').
    """
    rows = []
    for obj in lt_objs:
        # Text box: report it and record its position and raw text.
        if isinstance(obj, pdfminer.layout.LTTextBoxHorizontal):
            print ("%6d, %6d, %s" % (obj.bbox[0], obj.bbox[1], obj.get_text().replace('\n', '_')))
            rows.append({'x': obj.bbox[0], 'y': obj.bbox[1], 'value': obj.get_text()})
        # Container: descend and keep whatever is found inside.
        elif isinstance(obj, pdfminer.layout.LTFigure):
            rows.extend(_collect_text_rows(obj._objs))
    return rows


def parse_obj(lt_objs,df):
    """Append the text boxes found in *lt_objs* to *df* and return the result.

    Args:
        lt_objs: iterable of pdfminer layout objects (e.g. ``layout._objs``).
        df: DataFrame with columns 'x', 'y' and 'value' to extend.

    Returns:
        A DataFrame holding the rows of *df* followed by one row per text
        box, renumbered from 0. *df* itself is returned when nothing is found.
    """
    rows = _collect_text_rows(lt_objs)
    if not rows:
        return df

    new_rows = pd.DataFrame(rows)
    if len(df) == 0:
        # Nothing to concatenate with: keep df's column layout only. This
        # also avoids concatenating an empty, all-object frame.
        columns = df.columns.union(new_rows.columns, sort=False)
        return new_rows.reindex(columns=columns)

    # One concat per call instead of one DataFrame.append per row
    # (DataFrame.append no longer exists in pandas 2).
    return pd.concat([df, new_rows], ignore_index=True)

# Loop over all pages in the document, accumulating their text boxes in df.
# The PDF file handle is closed afterwards, even if a page fails to parse;
# re-run the cell that opens the file before running this one again.
try:
    for page in PDFPage.create_pages(document):

        # Read the page into a layout object.
        interpreter.process_page(page)
        layout = device.get_result()

        # Extract the text boxes from this layout.
        df = parse_obj(layout._objs, df)
finally:
    fp.close()
    

In [None]:
# Print every extracted text box (columns x, y, value).
print(df)

In [None]:
# Tag each text box with the page column ("bloc") it belongs to, based on
# its x coordinate:
#   'L'  for 0 <= x <= 100
#   'R'  for 100 < x <= 500
#   ''   otherwise
# The coordinates are floats, so the right-hand range starts immediately
# after 100: a box at e.g. x = 100.5 must not fall between the two ranges.
df['bloc'] = np.select(
    [df['x'].between(0, 100), (df['x'] > 100) & (df['x'] <= 500)],
    ['L', 'R'],
    default='',
)


In [None]:
# Split the text boxes into the two page columns using the 'bloc' tag.
df_left = df.loc[df['bloc'].eq('L')]
df_right = df.loc[df['bloc'].eq('R')]

In [None]:
# Order each column by decreasing y.
# NOTE(review): presumably this is top-to-bottom reading order (PDF y axis
# growing upwards) - confirm against the pdfminer coordinate system.
df_left = df_left.sort_values(by="y", ascending=False)
df_right = df_right.sort_values(by="y", ascending=False)

In [None]:
# Remove the trailing newline(s) from every left-column text.
df_left['value'] = [text.rstrip('\n') for text in df_left['value']]

In [None]:
# Notebook display of the left-column text boxes.
df_left

In [None]:
# Put a single space in front of every right-column text.
df_right['value'] = [' ' + text for text in df_right['value']]

In [None]:
# Renumber the rows 0..n-1 in their current (sorted) order so that row labels
# match row positions. This yields the same frame as appending df_right to
# itself and dropping the duplicated labels, without DataFrame.append (which
# no longer exists in pandas 2).
df_right = df_right.reset_index(drop=True)
df_right

## Partie 2: Détection des données à caractère personnel + Anonymisation

In [None]:
import spacy
from spacy import displacy
import fr_core_news_lg
from spacy.matcher import Matcher #Adding custom entities
from spacy.tokens import Span #Adding custom entities

In [None]:
# Load the fr_core_news_lg pipeline and create a token matcher that shares
# its vocabulary; the custom entity patterns are registered on this matcher.
nlp = fr_core_news_lg.load()
matcher = Matcher(nlp.vocab)

def _add_entity(doc, start, end, label):
    """Tag the tokens ``doc[start:end]`` as one entity called *label*.

    Entities already set on *doc* that share a token with the new span are
    dropped first, so the custom match takes precedence. Without this,
    assigning overlapping spans to ``doc.ents`` raises a ValueError.
    """
    entity = Span(doc, start, end, label=label)
    kept = [ent for ent in doc.ents if ent.end <= start or ent.start >= end]
    doc.ents = kept + [entity]


def add_email(matcher, doc, i, matches):
    """Matcher callback: mark match number *i* of *matches* as ``EMAIL``."""
    _match_id, start, end = matches[i]
    _add_entity(doc, start, end, "EMAIL")


def add_url(matcher, doc, i, matches):
    """Matcher callback: mark match number *i* of *matches* as ``URL``."""
    _match_id, start, end = matches[i]
    _add_entity(doc, start, end, "URL")


def add_tel(matcher, doc, i, matches):
    """Matcher callback: mark match number *i* of *matches* as ``TEL``."""
    _match_id, start, end = matches[i]
    _add_entity(doc, start, end, "TEL")


def add_date(matcher, doc, i, matches):
    """Matcher callback: mark match number *i* of *matches* as ``DATE``."""
    _match_id, start, end = matches[i]
    _add_entity(doc, start, end, "DATE")

In [None]:
# Highlight colour used by displaCy for each entity label; labels that are
# not listed here get no explicit colour from this mapping.
colors = {
    "URL": "linear-gradient(90deg, #aa9cfc, #fc9ce7)",
    "EMAIL": "RGB(241, 13, 105)",
    "TEL": "RGB(95, 205, 231)",
    "LOC": "RGB(32, 240, 171)",
    "PER": "RGB(200, 232, 40)",
}

# Options dictionary handed to displacy.render().
options = dict(colors=colors)

In [None]:
# Token patterns for the custom entities.
patternMail = [{"LIKE_EMAIL": True}]
patternUrl = [{"LIKE_URL": True}]
# NOTE(review): these two patterns match a number-like token followed by ANY
# token of more than 4 (tel) or exactly 4 (date) characters, e.g. "3 mois"
# would be tagged DATE. Confirm this is the intended definition.
patternTel = [{"LIKE_NUM": True},{"LENGTH": {">": 4}}]
patternDate = [{"LIKE_NUM": True},{"LENGTH": {"==": 4}}]

# Register each pattern with its callback. The list-of-patterns form with the
# on_match keyword is accepted by spaCy >= 2.2.2 and is the only form that
# spaCy 3 supports.
matcher.add("mail", [patternMail], on_match=add_email)
matcher.add("url", [patternUrl], on_match=add_url)
matcher.add("tel", [patternTel], on_match=add_tel)
matcher.add("date", [patternDate], on_match=add_date)

In [None]:
# Analyse every right-column text and keep the annotated spaCy Doc in a new
# 'Anonym' column. The Docs are collected first and assigned in one go, so
# the result does not depend on the row labels or on chained assignment.
docs = []
for text in df_right['value']:
    doc = nlp(text)
    # Called for its side effect only: the on_match callbacks registered on
    # the matcher add the custom entities to doc.ents.
    matcher(doc)
    displacy.render(doc, style="ent", options=options)
    docs.append(doc)

df_right['Anonym'] = docs

In [None]:
# Notebook display of the right-column frame, including the 'Anonym' column.
df_right

In [None]:
# Text written in place of a token, keyed by the token's entity label.
# Labels that are not listed (e.g. DATE) are left untouched.
_REDACTION_PLACEHOLDERS = {
    'PER': '[REDACTED_PER] ',
    'LOC': '[REDACTED_LOC] ',
    'EMAIL': '[REDACTED_EMAIL] ',
    'TEL': '[REDACTED_TEL] ',
    'URL': '[REDACTED_URL] ',
}


def replace_person_names(token):
    """Return the redaction placeholder for *token*, or its original text.

    Args:
        token: a spaCy token (anything exposing ``ent_iob``, ``ent_type_``
            and ``text_with_ws``).

    Returns:
        The placeholder string when the token belongs to a PER, LOC, EMAIL,
        TEL or URL entity; otherwise the token text with its trailing
        whitespace, so that joining the results rebuilds the sentence.
    """
    placeholder = _REDACTION_PLACEHOLDERS.get(token.ent_type_)
    if placeholder is not None and token.ent_iob != 0:
        return placeholder

    # text_with_ws replaces the deprecated Token.string (gone in spaCy 3).
    return token.text_with_ws

def redact_names(nlp_doc):
    """Return the text of *nlp_doc* with its sensitive entities redacted.

    Args:
        nlp_doc: an annotated spaCy ``Doc``. Any other value (for instance a
            placeholder left in a cell that was never analysed) yields ''.

    Returns:
        The document text in which every entity handled by
        ``replace_person_names`` is replaced by its placeholder.

    Note:
        The entities of *nlp_doc* are merged into single tokens in place, so
        that a multi-word entity produces exactly one placeholder.
    """
    if not isinstance(nlp_doc, spacy.tokens.doc.Doc):
        return ''

    # Doc.retokenize() replaces the removed Span.merge(); the merges are
    # applied together when the context manager exits.
    with nlp_doc.retokenize() as retokenizer:
        for ent in nlp_doc.ents:
            retokenizer.merge(ent)

    return ''.join(map(replace_person_names, nlp_doc))
        
    

# Print the redacted text of every entry of the 'Anonym' column.
for annotated in df_right['Anonym']:
    print(redact_names(annotated))
