## Partie 1: Text bloc detection à partir du PDF

In [97]:
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfpage import PDFTextExtractionNotAllowed
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.pdfdevice import PDFDevice
from pdfminer.layout import LAParams
from pdfminer.converter import PDFPageAggregator
import pdfminer
import pandas as pd 
import numpy as np

In [98]:
# Open a PDF file.
fp = open('<resume name here>.pdf', 'rb')

# Create a PDF parser object associated with the file object.
parser = PDFParser(fp)

# Create a PDF document object that stores the document structure.
# Password for initialization as 2nd parameter
document = PDFDocument(parser)

# Check if the document allows text extraction. If not, abort.
if not document.is_extractable:
    raise PDFTextExtractionNotAllowed

# Create a PDF resource manager object that stores shared resources.
rsrcmgr = PDFResourceManager()

# Create a PDF device object.
device = PDFDevice(rsrcmgr)

# BEGIN LAYOUT ANALYSIS
# Set parameters for analysis.
laparams = LAParams()

# Create a PDF page aggregator object.
device = PDFPageAggregator(rsrcmgr, laparams=laparams)

# Create a PDF interpreter object.
interpreter = PDFPageInterpreter(rsrcmgr, device)
df = pd.DataFrame( columns = ['x', 'y','value']) 
def parse_obj(lt_objs,df):

    # loop over the object list
    for obj in lt_objs:

        # if it's a textbox, print text and location
        if isinstance(obj, pdfminer.layout.LTTextBoxHorizontal):
            print ("%6d, %6d, %s" % (obj.bbox[0], obj.bbox[1], obj.get_text().replace('\n', '_')))
            #new_row = {'x':obj.bbox[0], 'y':obj.bbox[1], 'value':obj.get_text().replace('\n', '_')}
            new_row = {'x':obj.bbox[0], 'y':obj.bbox[1], 'value':obj.get_text()}
            df = df.append(new_row, ignore_index=True)

        # if it's a container, recurse
        elif isinstance(obj, pdfminer.layout.LTFigure):
            parse_obj(obj._objs,df)
            
    return df    

# loop over all pages in the document
for page in PDFPage.create_pages(document):

    # read the page into a layout object
    interpreter.process_page(page)
    layout = device.get_result()

    # extract text from this object
    df=parse_obj(layout._objs,df)
    

    50,    618, Compétences_
   190,    623, Formation_
   188,    755, Bedoui Amal_
   188,    714, Elève ingénieur en Data Science_
   449,    785, amal.bedoui@esprit.tn_
   463,    748, (+216) 99 995 795_
   482,    707,  Tunis, Tunisie_
   496,    672,  amal.bdo1_
   182,    597, Depuis 2014_
   249,    583, Cycle d’ingénieur à l’Ecole Supérieure Privée et de Technologie (ESPRIT) :_Option Data Science._
   182,    565, 2013-2014_
   249,    552, Baccalauréat au Lycée Pilote Bourguiba de Tunis (LPBT) :_Section sciences expérimentales._
   190,    519, Expérience professionnelle_
   168,    496,  Juin - Juil  2018  Stagiaire à TUNISIE TELECOM (Département Customer Value Management)_
   246,    466,   moyennant SAS._
   190,    430, Projets académiques_
   172,    405, Jan - Mai 2018_
   242,    377, Projet Data Science : Prédiction du diabète chez les femmes_Prédire si une femme sera atteinte du diabète._Réduire le taux d’atteinte du diabète par la détection précoce._
   173,    360,

In [99]:
print(df)

             x           y                                              value
0    50.413100  618.269500                                      Compétences\n
1   190.562500  623.986300                                        Formation\n
2   188.376000  755.792000                                      Bedoui Amal\n
3   188.376000  714.125500                  Elève ingénieur en Data Science\n
4   449.865200  785.835400                            amal.bedoui@esprit.tn\n
5   463.868200  748.039400                                (+216) 99 995 795\n
6   482.865200  707.427400                                   Tunis, Tunisie\n
7   496.505200  672.568400                                        amal.bdo1\n
8   182.906200  597.513075                                      Depuis 2014\n
9   249.503400  583.982500  Cycle d’ingénieur à l’Ecole Supérieure Privée ...
10  182.905300  565.847075                                        2013-2014\n
11  249.502900  552.315500  Baccalauréat au Lycée Pilote Bourgui

In [100]:
df['bloc']=''
df['bloc'] = np.where(df['x'].between(0,100), 'L', df['bloc'])

df['bloc'] = np.where(df['x'].between(101,500), 'R', df['bloc'])


In [101]:
df_left = df[df['bloc'] == 'L']
df_right = df[df['bloc'] == 'R']

In [102]:
df_left=df_left.sort_values("y", ascending=False)
df_right=df_right.sort_values("y", ascending=False)

In [103]:
df_left['value'] = df_left['value'].map(lambda x: x.rstrip('\n'))

In [104]:
df_left

Unnamed: 0,x,y,value,bloc
0,50.4131,618.2695,Compétences,L
30,23.3237,595.3613,Python,L
31,23.0747,569.2871,R,L
32,23.0752,544.1338,SAS,L
33,23.0747,520.2171,SQL SERVER,L
34,23.0747,494.1471,POWER BI,L
35,23.0747,468.0771,HADOOP,L
36,23.0747,445.0645,SPARK,L
37,23.0747,421.584,"SQL, PL/SQL",L
39,23.2261,396.4238,JEE,L


In [105]:
df_right['value'] = df_right['value'].map(lambda x: ' '+x)

In [106]:
df_right = df_right.append(df_right).reset_index().drop_duplicates(subset='index').drop(columns='index')
df_right

Unnamed: 0,x,y,value,bloc
0,449.8652,785.8354,amal.bedoui@esprit.tn\n,R
1,188.376,755.792,Bedoui Amal\n,R
2,463.8682,748.0394,(+216) 99 995 795\n,R
3,188.376,714.1255,Elève ingénieur en Data Science\n,R
4,482.8652,707.4274,"Tunis, Tunisie\n",R
5,496.5052,672.5684,amal.bdo1\n,R
6,190.5625,623.9863,Formation\n,R
7,182.9062,597.513075,Depuis 2014\n,R
8,249.5034,583.9825,Cycle d’ingénieur à l’Ecole Supérieure Privée...,R
9,182.9053,565.847075,2013-2014\n,R


## Partie 2: Détection des données à caractère personnel + Anonymisation

In [107]:
import spacy
from spacy import displacy
import fr_core_news_lg
from spacy.matcher import Matcher #Adding custom entities
from spacy.tokens import Span #Adding custom entities

In [108]:
nlp = fr_core_news_lg.load()
matcher = Matcher(nlp.vocab)

def add_email(matcher, doc, i, matches):
    match_id, start, end = matches[i]
    entity = Span(doc, start, end, label="EMAIL")
    doc.ents += (entity,)
    
def add_url(matcher, doc, i, matches):
    match_id, start, end = matches[i]
    entity = Span(doc, start, end, label="URL")
    doc.ents += (entity,)

def add_tel(matcher, doc, i, matches):
    match_id, start, end = matches[i]
    entity = Span(doc, start, end, label="TEL")
    doc.ents += (entity,)
    
def add_date(matcher, doc, i, matches):
    match_id, start, end = matches[i]
    entity = Span(doc, start, end, label="DATE")
    doc.ents += (entity,)

In [109]:
colors = {"URL": "linear-gradient(90deg, #aa9cfc, #fc9ce7)",
          "EMAIL": "RGB(241, 13, 105)",
          "TEL" : "RGB(95, 205, 231)",
          "LOC" : "RGB(32, 240, 171)",
          "PER" : "RGB(200, 232, 40)"
         }
options = { "colors": colors}

In [110]:
patternMail = [{"LIKE_EMAIL": True}]
patternUrl = [{"LIKE_URL": True}]
patternTel = [{"LIKE_NUM": True},{"LENGTH": {">": 4}}]
patternDate = [{"LIKE_NUM": True},{"LENGTH": {"==": 4}}]

matcher.add("mail", add_email, patternMail)
matcher.add("url", add_url, patternUrl)
matcher.add("tel", add_tel, patternTel)
matcher.add("date", add_date, patternDate)

In [111]:
count = 0
df_right['Anonym']=spacy.tokens.doc.Doc

for i in df_right['value']:
    doc = nlp(i)
    matches = matcher(doc)
    displacy.render(doc, style="ent", options=options)
    df_right['Anonym'][count]=doc
    count = count+1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  




In [112]:
df_right

Unnamed: 0,x,y,value,bloc,Anonym
0,449.8652,785.8354,amal.bedoui@esprit.tn\n,R,"( , amal.bedoui@esprit.tn, \n)"
1,188.376,755.792,Bedoui Amal\n,R,"( , Bedoui, Amal, \n)"
2,463.8682,748.0394,(+216) 99 995 795\n,R,"( , (, +216, ), 99, 995, 795, \n)"
3,188.376,714.1255,Elève ingénieur en Data Science\n,R,"( , Elève, ingénieur, en, Data, Science, \n)"
4,482.8652,707.4274,"Tunis, Tunisie\n",R,"( , Tunis, ,, Tunisie, \n)"
5,496.5052,672.5684,amal.bdo1\n,R,"( , amal.bdo1, \n)"
6,190.5625,623.9863,Formation\n,R,"( , Formation, \n)"
7,182.9062,597.513075,Depuis 2014\n,R,"( , Depuis, 2014, \n)"
8,249.5034,583.9825,Cycle d’ingénieur à l’Ecole Supérieure Privée...,R,"( , Cycle, d’, ingénieur, à, l’, Ecole, Supéri..."
9,182.9053,565.847075,2013-2014\n,R,"( , 2013, -, 2014, \n)"


In [113]:
def replace_person_names(token):
    if token.ent_iob != 0 and token.ent_type_ == 'PER':
        return '[REDACTED_PER] '
    if token.ent_iob != 0 and token.ent_type_ == 'LOC':
        return '[REDACTED_LOC] '
    if token.ent_iob != 0 and token.ent_type_ == 'EMAIL':
        return '[REDACTEDE_MAIL] '
    if token.ent_iob != 0 and token.ent_type_ == 'TEL':
        return '[REDACTED_TEL] '
    if token.ent_iob != 0 and token.ent_type_ == 'URL':
        return '[REDACTED_URL] '
   
    return token.string

def redact_names(nlp_doc):
    if  nlp_doc == spacy.tokens.doc.Doc:
        return nlp_doc.text
    else :
        for ent in nlp_doc.ents:        
            ent.merge()
        tokens = map(replace_person_names, nlp_doc)
        return ''.join(tokens)
        
    

for i in df_right['Anonym']:
    print(redact_names(i))




 [REDACTEDE_MAIL] 

 [REDACTED_PER] 

 (+216) 99 995 795

 Elève ingénieur en Data Science

  [REDACTED_LOC] , [REDACTED_LOC] 

  amal.bdo1

 Formation

 Depuis 2014

 Cycle d’ingénieur à l’Ecole Supérieure Privée et de Technologie (ESPRIT) :
Option Data Science.

 2013-2014

 Baccalauréat au Lycée Pilote Bourguiba de Tunis (LPBT) :
Section sciences expérimentales.

 Expérience professionnelle

  Juin - [REDACTED_PER]  2018  Stagiaire à TUNISIE TELECOM (Département Customer Value Management)

   moyennant SAS.

 Projets académiques

 [REDACTED_PER] - Mai 2018

 Projet Data Science : Prédiction du diabète chez les femmes
Prédire si une femme sera atteinte du diabète.
Réduire le taux d’atteinte du diabète par la détection précoce.

 [REDACTED_PER] - Mai 2018

 Mini-Projets Machine Learning

 Dégagement des règles d’association des personnes arrêtées mais innocentes.

 Régression des sinistres automobiles corporels.
Text mining et NLP sur les articles de la chirurgie esthétique.

 Sep - D