## Partie 1 : Détection des blocs de texte à partir du PDF

In [None]:
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfpage import PDFTextExtractionNotAllowed
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.pdfdevice import PDFDevice
from pdfminer.layout import LAParams
from pdfminer.converter import PDFPageAggregator
import pdfminer
import pandas as pd 
import numpy as np

In [None]:
# Open the PDF in binary mode. The handle is deliberately left open here:
# the parser/document built on top of it are still used by the page loop.
fp = open('<cv-name-here>.pdf', 'rb')

# Parser bound to the file object.
parser = PDFParser(fp)

# Document object storing the PDF structure
# (a password can be passed as 2nd argument).
document = PDFDocument(parser)

# Abort when the document does not allow text extraction.
if not document.is_extractable:
    raise PDFTextExtractionNotAllowed

# Resource manager storing shared resources.
rsrcmgr = PDFResourceManager()

# BEGIN LAYOUT ANALYSIS
# Default layout-analysis parameters.
laparams = LAParams()

# Page aggregator: the only device used by the interpreter below. The plain
# PDFDevice that used to be created first was overwritten before any use.
device = PDFPageAggregator(rsrcmgr, laparams=laparams)

# Interpreter that renders pages into the aggregator.
interpreter = PDFPageInterpreter(rsrcmgr, device)

# Accumulator for the extracted text boxes: left x, bottom y, raw text.
df = pd.DataFrame(columns=['x', 'y', 'value'])
def parse_obj(lt_objs, df):
    """Append the horizontal text boxes found in *lt_objs* to *df*.

    Every ``LTTextBoxHorizontal`` contributes one row holding ``bbox[0]``
    (column ``x``), ``bbox[1]`` (column ``y``) and its text (column
    ``value``); a short trace of each box is printed as it is found.
    ``LTFigure`` containers are searched recursively.

    Args:
        lt_objs: iterable of pdfminer layout objects.
        df: DataFrame to extend; it is not modified in place.

    Returns:
        *df* itself when nothing was found, otherwise a new DataFrame made
        of the rows of *df* followed by the rows that were found.
    """
    rows = []

    def _collect(objs):
        # Depth-first walk gathering rows into the shared list, so that text
        # nested inside figures is kept (the previous recursion discarded
        # the frame returned by the nested call).
        for obj in objs:
            if isinstance(obj, pdfminer.layout.LTTextBoxHorizontal):
                x, y = obj.bbox[0], obj.bbox[1]
                text = obj.get_text()
                print("%6d, %6d, %s" % (x, y, text.replace('\n', '_')))
                rows.append({'x': x, 'y': y, 'value': text})
            elif isinstance(obj, pdfminer.layout.LTFigure):
                _collect(obj._objs)

    _collect(lt_objs)
    if not rows:
        return df

    found = pd.DataFrame(rows, columns=['x', 'y', 'value'])
    if df.empty:
        # Nothing to concatenate with: keep the caller's column layout but
        # let the new rows decide the dtypes.
        extra = [col for col in found.columns if col not in df.columns]
        return found.reindex(columns=list(df.columns) + extra)
    # DataFrame.append was removed in pandas 2.0; concat is the replacement.
    return pd.concat([df, found], ignore_index=True)

# Walk through every page of the document and accumulate its text boxes.
try:
    for page in PDFPage.create_pages(document):

        # Render the page into a layout object.
        interpreter.process_page(page)
        layout = device.get_result()

        # Extract the text boxes of this layout.
        df = parse_obj(layout._objs, df)
finally:
    # The PDF is not read again after this loop: release the file handle
    # even when a page fails to parse.
    fp.close()
    

In [None]:
# Dump the extracted text boxes (columns x, y, value) for a visual check.
print(df)

In [None]:
# Tag each text box with the column it belongs to, from its x value:
#   'L' for 0 <= x <= 100, 'R' for 100 < x <= 500, '' otherwise.
# The two ranges are contiguous on purpose: x is a float, and the former
# between(101, 500) left every box with 100 < x < 101 untagged.
df['bloc'] = ''
df['bloc'] = np.where(df['x'].between(0, 100), 'L', df['bloc'])

df['bloc'] = np.where((df['x'] > 100) & (df['x'] <= 500), 'R', df['bloc'])

In [None]:
# Split the boxes per column. Independent copies are taken because both
# frames get new/rewritten columns later on; writing into a slice of `df`
# triggers SettingWithCopyWarning and may silently not stick.
df_left = df[df['bloc'] == 'L'].copy()
df_right = df[df['bloc'] == 'R'].copy()

In [None]:
# Order both columns by decreasing y
# (presumably top of the page first - confirm against the PDF coordinates).
df_left, df_right = (
    frame.sort_values("y", ascending=False) for frame in (df_left, df_right)
)

In [None]:
# Drop the trailing newline(s) of every left-column text. `assign` builds a
# new frame instead of writing into what may be a slice of `df`.
df_left = df_left.assign(value=df_left['value'].str.rstrip('\n'))

In [None]:
# Notebook display of the left-column text boxes.
df_left

In [None]:
# Prefix every right-column text with a single space. `assign` builds a new
# frame instead of writing into what may be a slice of `df`.
df_right = df_right.assign(value=' ' + df_right['value'])

In [None]:
# Renumber the rows 0..n-1 in their current (sorted) order. This is all the
# former append-to-itself / drop_duplicates round trip achieved, and
# DataFrame.append no longer exists in pandas 2.
df_right = df_right.reset_index(drop=True)
df_right

## Partie 2: Détection des données à caractère personnel + Anonymisation

In [None]:
import spacy
from spacy import displacy
import fr_core_news_lg
from prettytable import PrettyTable
from spacy.scorer import Scorer
from spacy.gold import GoldParse
from spacy.matcher import Matcher #Adding custom entities
from spacy.tokens import Span #Adding custom entities
from spacy.matcher import PhraseMatcher
import re
from flair.data import Sentence
from flair.models import SequenceTagger,  MultiTagger

In [None]:
# Load the spaCy pipeline shipped by the fr_core_news_lg package.
nlp = fr_core_news_lg.load()

## Filtre des POS de Flair

In [None]:
# Load one Flair tagger combining the 'pos-multi' and 'fr-ner' models; the
# two names are the tag types queried later through sentence.get_spans().
tagger = MultiTagger.load(['pos-multi', 'fr-ner'])

In [None]:
def add_inclusive(str1, str2):
    """Return *str1* with *str2* shown as a parenthesised ending.

    When *str1* already ends with *str2* that ending is moved inside the
    parentheses, otherwise the parenthesised *str2* is simply appended:
    ``add_inclusive("mangée", "e")`` and ``add_inclusive("mangé", "e")``
    both give ``"mangé(e)"``.
    """
    stem = str1[:-len(str2)] if str1.endswith(str2) else str1
    return stem + "(" + str2 + ")"
   
def add_inclusive_adj(str1, str2, str3):
    """Return the inclusive form ``<masculine>(<feminine ending>)`` of *str1*.

    Args:
        str1: the word, in its masculine or feminine form.
        str2: masculine ending (e.g. ``"if"``).
        str3: feminine ending (e.g. ``"ive"``).

    Returns:
        ``str1 + "(" + str3 + ")"`` when *str1* ends with *str2*; the word
        with *str3* swapped for *str2* followed by ``"(" + str3 + ")"`` when
        it ends with *str3*; *str1* unchanged when it ends with neither
        (this case used to return ``None``, which broke the string
        concatenation done by callers).
    """
    if str1.endswith(str2):
        return str1 + "(" + str3 + ")"
    if str1.endswith(str3):
        return str1[:-len(str3)] + str2 + "(" + str3 + ")"
    return str1

In [None]:
# (endings that trigger the rule, feminine ending handed to add_inclusive),
# first match wins. Because the first rule catches every word ending in "e",
# the feminine alternatives the later rules used to list ("ie", "ise", "te",
# "use", "ue") could never fire and are not repeated here.
_VERB_ENDINGS = (
    (("é", "e"), "e"),
    ("i", "ie"),
    ("is", "ise"),
    ("t", "te"),
    ("us", "use"),  # inclus(se)
    ("u", "ue"),
)


def _pos_tag(label):
    """Return the tag of a label from its string form.

    The parenthesised part is removed, then the one character left after
    the tag (presumably the label prints as ``'TAG (score)'`` - confirm
    against the installed Flair version).
    """
    return re.sub(r'\([^)]*\)', '', str(label))[:-1]


def _inclusive_verb(text):
    """Return the fragment (spacing included) emitted for a VERB token."""
    # Short verbs are copied as they are. Two-letter verbs used to match
    # neither "> 2" nor "< 2" and ended up glued to the following word.
    if len(text) <= 2:
        return text + " "
    for endings, feminine in _VERB_ENDINGS:
        if text.endswith(endings):
            return add_inclusive(text, feminine) + " "
    return text + " "


def _inclusive_adj(text):
    """Return the fragment (spacing included) emitted for an ADJ token."""
    if text.endswith(("ien", "ienne")):
        return " " + add_inclusive_adj(text, "ien", "ienne")
    if text.endswith(("if", "ive")):
        return " " + add_inclusive_adj(text, "if", "ive")
    # Also covers "ier"/"ière", which therefore needs no rule of its own.
    if text.endswith(("er", "ère")):
        return " " + add_inclusive_adj(text, "er", "ère")
    if text.endswith(("on", "onne")):
        # "bon" and "bonne" both become "bon(ne)"; the feminine form used to
        # come out as "bonon(ne)".
        return " " + add_inclusive(text, "ne")
    # meilleur(e): must be tested before the generic "eur" rule, which used
    # to shadow it. The masculine side is narrowed to "...eilleur" so that
    # adjectives such as "travailleur" keep their "(euse)" form.
    if text.endswith(("eilleur", "leure")):
        return " " + add_inclusive_adj(text, "eur", "eure")
    if text.endswith(("eur", "euse")):
        return " " + add_inclusive_adj(text, "eur", "euse")
    if text.endswith(("é", "ée")):
        return add_inclusive(text, "ée") + " "
    return text + " "


def _inclusive_noun(text):
    """Return the fragment (spacing included) emitted for a NOUN token."""
    if text.endswith(("teur", "trice")):
        return " " + add_inclusive_adj(text, "teur", "trice")
    if text.endswith(("ieur", "ieure")):
        return " " + add_inclusive_adj(text, "eur", "eure")
    if text.endswith(("peur", "peuse")):  # développeur(euse)
        return add_inclusive_adj(text, "eur", "euse") + " "
    return " " + text + " "


def merging_inclusive(sentence):
    """Rebuild the text of *sentence* with gender-inclusive word forms.

    Every span returned by ``sentence.get_spans('pos-multi')`` contributes
    one fragment per label: verbs, adjectives and nouns with a known ending
    are rewritten through ``add_inclusive`` / ``add_inclusive_adj``,
    "le"/"la" becomes "le/la", "il"/"elle" becomes "il/elle", and every
    other token is copied. The spacing put around each fragment depends on
    the rule that produced it and is kept exactly as before.

    TODO: look at the auxiliary preceding a verb; with "avoir" the verb
    should be left unchanged.

    Args:
        sentence: a tagged sentence exposing ``get_spans('pos-multi')``
            whose items have ``text`` and ``labels``.

    Returns:
        The rewritten text as a single string.
    """
    fragments = []
    for entity in sentence.get_spans('pos-multi'):
        text = entity.text
        for data in entity.labels:
            tag = _pos_tag(data)
            if tag == 'VERB':
                fragments.append(_inclusive_verb(text))
            elif tag == 'ADJ':
                fragments.append(_inclusive_adj(text))
            elif tag == 'NOUN':
                fragments.append(_inclusive_noun(text))
            elif text in ('la', 'le', 'La', 'Le'):
                fragments.append("le/la" + " ")
            elif text in ('il', 'elle', 'Il', 'Elle'):
                fragments.append("il/elle" + " ")
            elif tag in ('ADP', 'AUX'):
                fragments.append(" " + text + " ")
            elif tag == 'DET':
                fragments.append(text + " ")
            elif tag == 'CCONJ' or text in ('.', ','):
                fragments.append(" " + text + " ")
            else:
                # Any other token is copied without surrounding spaces.
                fragments.append(text)
    return "".join(fragments)

In [None]:
# Labels of the entities returned by the latest merging_entities() call,
# index-aligned with the list that call returned.
entitiesListName = []


def merging_entities(sentence):
    """Return the texts of the PER / LOC / MISC / ORG entities of *sentence*.

    Only spans from ``sentence.get_spans('fr-ner')`` whose text is longer
    than two characters are kept. As a side effect the module-level
    ``entitiesListName`` is emptied and refilled with the label of each
    returned text, in the same order. It used to grow from one call to the
    next, so reading it from index 0 returned the labels of an earlier
    sentence.

    Args:
        sentence: a tagged sentence exposing ``get_spans('fr-ner')`` whose
            items have ``text`` and ``labels``.

    Returns:
        list of entity texts, one per matching label.
    """
    entitiesListName.clear()
    entitiesList = []

    for entity in sentence.get_spans('fr-ner'):
        if len(entity.text) <= 2:
            continue
        for data in entity.labels:
            # The label text has its parenthesised part and the one
            # character left after the tag removed. 'LOC' used to be
            # compared against 'LOC ' (trailing space) and never matched.
            tag = re.sub(r'\([^)]*\)', '', str(data))[:-1]
            if tag in ('PER', 'LOC', 'MISC', 'ORG'):
                entitiesList.append(entity.text)
                entitiesListName.append(tag)

    return entitiesList

## Ajout de nouvelles entités grâce aux règles (rule-based matching) de spaCy

In [None]:
matcher = Matcher(nlp.vocab)


def _add_entity(doc, i, matches, label):
    """Record match number *i* of *matches* as an entity *label* on *doc*.

    spaCy refuses to store overlapping entities (assigning them raises
    ValueError), so entities already covering one of the matched tokens are
    dropped first: the explicit rule takes precedence over what was there.
    NOTE(review): confirm this precedence is the desired one.
    """
    match_id, start, end = matches[i]
    entity = Span(doc, start, end, label=label)
    untouched = [ent for ent in doc.ents
                 if ent.end <= start or ent.start >= end]
    doc.ents = untouched + [entity]


# Matcher callbacks. All share the on_match signature
# (matcher, doc, i, matches) and only differ by the label they assign.

def add_email(matcher, doc, i, matches):
    """Label the match as EMAIL."""
    _add_entity(doc, i, matches, "EMAIL")


def add_url(matcher, doc, i, matches):
    """Label the match as URL."""
    _add_entity(doc, i, matches, "URL")


def add_tel(matcher, doc, i, matches):
    """Label the match as TEL."""
    _add_entity(doc, i, matches, "TEL")


def add_Sdate(matcher, doc, i, matches):
    """Label the match as S-DATE."""
    _add_entity(doc, i, matches, "S-DATE")


def add_date(matcher, doc, i, matches):
    """Label the match as DATE."""
    _add_entity(doc, i, matches, "DATE")


def add_verb_fem(matcher, doc, i, matches):
    """Label the match as VERB_FEM.

    This is the first of the two former ``add_verb`` definitions, kept
    under its own name because the second one silently replaced it.
    """
    _add_entity(doc, i, matches, "VERB_FEM")


def add_adj(matcher, doc, i, matches):
    """Label the match as "adj fem"."""
    _add_entity(doc, i, matches, "adj fem")


def add_ssNum(matcher, doc, i, matches):
    """Label the match as NUM_SEC_SO."""
    _add_entity(doc, i, matches, "NUM_SEC_SO")


def add_age(matcher, doc, i, matches):
    """Label the match as AGE."""
    _add_entity(doc, i, matches, "AGE")


def add_situation_fam(matcher, doc, i, matches):
    """Label the match as SIT_FAM."""
    _add_entity(doc, i, matches, "SIT_FAM")


def add_formation(matcher, doc, i, matches):
    """Label the match as FORMATION."""
    _add_entity(doc, i, matches, "FORMATION")


def add_enfants(matcher, doc, i, matches):
    """Label the match as NBR_ENFANTS.

    The label is upper-case like the others; the former "nbr_enfants" was
    never equal to the 'NBR_ENFANTS' tested by the redaction step.
    """
    _add_entity(doc, i, matches, "NBR_ENFANTS")


def add_sexe(matcher, doc, i, matches):
    """Label the match as "sexe"."""
    _add_entity(doc, i, matches, "sexe")


def add_verb(matcher, doc, i, matches):
    """Label the match as "verb" (the definition that was effective)."""
    _add_entity(doc, i, matches, "verb")

In [None]:
# Token patterns for the spaCy Matcher. Regular expressions are raw strings
# so that "\+" and "\d" reach the regex engine as written instead of being
# (invalid) string escapes.
patternMail = [{"LIKE_EMAIL": True}]
patternUrl = [{"LIKE_URL": True}]
# Optional "+" signs followed by exactly 11 digits.
patternTel = [{"TEXT": {"REGEX": r"^\+*\d{11}$"}}]
# Optional "+" signs followed by exactly 13 digits.
patternSecS = [{"TEXT": {"REGEX": r"^\+*\d{13}$"}}]
#[{"SHAPE": "d"}, {"SHAPE": "dd"}, {"SHAPE": "dd"}, {"SHAPE": "dd"}, {"SHAPE": "ddd"}, {"SHAPE": "ddd"}]
# A lone four-digit token.
patternSDate = [{"SHAPE": "dddd", "LENGTH": 4}]
# dd<sep>mm<sep>yyyy with sep among "-", " ", "/", "." and a 19xx/20xx year.
patternDate = [{"TEXT": {"REGEX": r"^(0[1-9]|[12][0-9]|3[01])[- /.](0[1-9]|1[012])[- /.](19|20)\d\d$"}}]
patternVerbFem = [{"POS": "VERB","TEXT": {"REGEX": "ée$"}}]
patternAge = [{"ORTH": "age"}, {"ORTH": ":"}, {}]
patternAge2 = [{"ORTH": "a"},{},{"ORTH": "ans"} ]
patternFormation = [{"ORTH": "de"},{},{"ORTH": "ans"} ]
patternEnfants = [{},{"ORTH": "enfants"} ,{"ORTH": "s", "OP":"?"}]
# Matches "marié", "mariée" and the literal "marié(e)". The former
# "^marié(e)$" treated the parentheses as a regex group and therefore only
# matched "mariée".
patternFamil =[{"TEXT": {"REGEX": r"^marié(e|\(e\))?$"}}]
               # {"ORTH": "pacsé(e)", "OP":"?"},{"ORTH": "divorcé(e)", "OP":"?"},{"ORTH": "séparé(e)", "OP":"?"},{"ORTH": "célibataire", "OP":"?"},{"ORTH": "veuf", "OP":"?"} ]
# NOTE(review): both tokens are optional, so this pattern has no mandatory
# token - confirm it matches what is intended.
patternSexe = [{"ORTH": "(M)", "OP":"?"},{"ORTH": "(F)", "OP":"?"} ]

In [None]:
# Fresh matcher with one rule per kind of personal data. Rules are added
# with the add(key, [patterns], on_match=callback) form, accepted by
# spaCy >= 2.2.2 and the only one left in spaCy 3; the positional
# add(key, callback, pattern) form used before is the legacy signature.
matcher = Matcher(nlp.vocab)
matcher.add("mail", [patternMail], on_match=add_email)
matcher.add("url", [patternUrl], on_match=add_url)
matcher.add("tel", [patternTel], on_match=add_tel)
matcher.add("sec", [patternSecS], on_match=add_ssNum)
matcher.add("date", [patternDate], on_match=add_date)
matcher.add("Sdate", [patternSDate], on_match=add_Sdate)
#matcher.add("VerbFem", [patternVerbFem], on_match=add_verb)
matcher.add("Age", [patternAge], on_match=add_age)
matcher.add("Age2", [patternAge2], on_match=add_age)
matcher.add("situation", [patternFamil], on_match=add_situation_fam)
matcher.add("formation", [patternFormation], on_match=add_formation)
matcher.add("enfants", [patternEnfants], on_match=add_enfants)
matcher.add("sexe", [patternSexe], on_match=add_sexe)
#matcher.add("verb", [patternVerb], on_match=add_verb)
#matcher.add("adj", [pattern1], on_match=add_adj)

In [None]:
# Run the spaCy pipeline on every right-column text and store the resulting
# Doc next to its source row. This replaces the former chained assignment
# df_right['Anonym'][count] = doc, which depended on the index being exactly
# 0..n-1 and wrote through an intermediate Series (pandas may apply such a
# write to a temporary copy).
df_right = df_right.assign(Anonym=df_right['value'].apply(nlp))

In [None]:
# Notebook display of the right-column rows with their 'Anonym' column.
df_right

## Anonymisation

In [None]:
def anonymize_entities(token):
    """Return the text to emit for *token* in the anonymised output.

    * A token that belongs to an entity whose label is one of the redacted
      ones is replaced by ``'[REDACTED<LABEL>] '``.
    * A token labelled VERB_FEM loses its last character.
    * The word "elle" becomes "il".
    * Anything else is emitted unchanged, with its trailing whitespace.

    Labels are compared upper-cased, so an entity labelled "nbr_enfants" is
    redacted as well (it never matched the former exact comparison with
    'NBR_ENFANTS').

    Args:
        token: a spaCy token (``ent_iob``, ``ent_type_``, ``text`` and
            ``text_with_ws`` are read).

    Returns:
        str: the replacement text.
    """
    redacted_labels = (
        'PER', 'MISC', 'LOC', 'EMAIL', 'TEL', 'URL', 'S-DATE', 'DATE',
        'NUM_SEC_SO', 'AGE', 'SIT_FAM', 'FORMATION', 'NBR_ENFANTS',
    )
    # ent_iob == 0 means no entity tag was set on the token.
    if token.ent_iob != 0:
        label = token.ent_type_.upper()
        if label in redacted_labels:
            return '[REDACTED' + label + '] '
        if label == 'VERB_FEM':
            return token.text[:-1] + ' '
    if token.text == 'elle':
        return 'il '
    # text_with_ws replaces Token.string, which spaCy 3 removed.
    return token.text_with_ws

def redact_names(nlp_doc):
    """Return the text of *nlp_doc* with its personal data redacted.

    Every entity is first merged into a single token, so that a multi-word
    entity yields one placeholder, then each token is passed through
    ``anonymize_entities`` and the results are concatenated.

    Args:
        nlp_doc: a spaCy Doc; it is retokenized in place.

    Returns:
        str: the anonymised text.
    """
    # Doc.retokenize applies all merges at once when the block exits. The
    # deprecated Span.merge() (removed in spaCy 3) changed the document
    # while its entities were still being iterated.
    with nlp_doc.retokenize() as retokenizer:
        for ent in nlp_doc.ents:
            retokenizer.merge(ent)
    return ''.join(map(anonymize_entities, nlp_doc))

In [None]:
def anonymize(phrase):
    """Return *phrase* rewritten inclusively and with personal data redacted.

    Steps:
      1. tag the phrase with Flair and replace each named entity found by
         ``merging_entities`` with its label (PER, LOC, MISC, ORG);
      2. tag the result again and rewrite it with ``merging_inclusive``;
      3. parse that text with spaCy, let the rule-based matcher add its
         entities, and redact them with ``redact_names``.

    Args:
        phrase (str): the text to anonymise.

    Returns:
        str: the anonymised text.
    """
    sentence = Sentence(phrase)
    # Predict the PoS and NER tags.
    tagger.predict(sentence)

    entities = merging_entities(sentence)
    # Labels of this call are the last len(entities) items of the shared
    # list. Reading it from index 0 picked the labels of an earlier phrase
    # whenever the list still held them.
    labels = entitiesListName[len(entitiesListName) - len(entities):]
    for ent, label in zip(entities, labels):
        phrase = phrase.replace(ent, label)

    sentence2 = Sentence(phrase)
    # Predict the tags of the phrase with entities replaced.
    tagger.predict(sentence2)

    doc = nlp(merging_inclusive(sentence2))
    # Called for its side effect: the on_match callbacks add entities to doc.
    matcher(doc)

    return redact_names(doc)

In [None]:
# Show each parsed right-column text followed by its anonymised version.
for parsed in df_right['Anonym']:
    original_text = str(parsed)
    print('Before Anonymisation: ', parsed)
    print(anonymize(original_text))