In [20]:
from pdfminer.pdfparser import PDFParser
from pdfminer.pdfdocument import PDFDocument
from pdfminer.pdfpage import PDFPage
from pdfminer.pdfpage import PDFTextExtractionNotAllowed
from pdfminer.pdfinterp import PDFResourceManager
from pdfminer.pdfinterp import PDFPageInterpreter
from pdfminer.pdfdevice import PDFDevice
from pdfminer.layout import LAParams
from pdfminer.converter import PDFPageAggregator
import pdfminer
import pandas as pd 
import numpy as np

In [21]:
# Path of the PDF resume to parse; replace the placeholder with a real file name.
PDF_PATH = '<resume name here>.pdf'

# Open the PDF in binary mode. The handle has to stay open for as long as
# pages are read from `document`, so it is not closed in this part of the cell.
fp = open(PDF_PATH, 'rb')

# The parser reads from the file object; the document object stores the
# document structure (a password could be passed as the 2nd parameter).
parser = PDFParser(fp)
document = PDFDocument(parser)

# Abort early when the document does not allow text extraction.
if not document.is_extractable:
    raise PDFTextExtractionNotAllowed(
        'Text extraction is not allowed for %r' % PDF_PATH
    )

# Resource manager that stores resources shared across pages.
rsrcmgr = PDFResourceManager()

# Layout analysis: the page aggregator is the only device the interpreter
# needs. (A plain PDFDevice used to be created here and was overwritten
# straight away without ever being used, so it has been dropped.)
laparams = LAParams()
device = PDFPageAggregator(rsrcmgr, laparams=laparams)

# The interpreter renders each page into the aggregator device.
interpreter = PDFPageInterpreter(rsrcmgr, device)

# Accumulator for the extracted text boxes: x and y are the first two bbox
# coordinates of a box, value is its text.
df = pd.DataFrame(columns=['x', 'y', 'value'])
def _collect_text_rows(lt_objs, rows):
    """Append one {'x', 'y', 'value'} dict to `rows` per horizontal text box, recursing into figures."""
    for obj in lt_objs:
        if isinstance(obj, pdfminer.layout.LTTextBoxHorizontal):
            x, y = obj.bbox[0], obj.bbox[1]
            text = obj.get_text()
            # Echo the box location and text; newlines are shown as '_' so
            # each box stays on a single output line.
            print("%6d, %6d, %s" % (x, y, text.replace('\n', '_')))
            # The stored value keeps the original newlines.
            rows.append({'x': x, 'y': y, 'value': text})
        elif isinstance(obj, pdfminer.layout.LTFigure):
            # Figures are containers: descend into their children and collect
            # into the same list so nested text boxes are not lost.
            _collect_text_rows(obj._objs, rows)


def parse_obj(lt_objs, df):
    """Extract horizontal text boxes from layout objects and add them to `df`.

    Each `LTTextBoxHorizontal` found in `lt_objs` (directly or nested inside
    an `LTFigure`) is printed and becomes one row with the columns
    'x' (bbox[0]), 'y' (bbox[1]) and 'value' (the box text).

    Parameters
    ----------
    lt_objs : iterable
        Layout objects to scan.
    df : pandas.DataFrame
        Rows collected so far. It is not modified.

    Returns
    -------
    pandas.DataFrame
        `df` followed by the newly found rows, with a fresh 0..n-1 index.
        When nothing is found, `df` itself is returned.
    """
    rows = []
    _collect_text_rows(lt_objs, rows)
    if not rows:
        return df

    # Build the new rows in one go instead of appending to the frame row by
    # row (DataFrame.append is gone in pandas 2.x and was quadratic anyway).
    new_rows = pd.DataFrame(rows)

    if df.empty:
        # Nothing to concatenate with; keep any columns the caller declared
        # on the empty frame, in the caller's order, followed by new ones.
        ordered = list(df.columns) + [c for c in new_rows.columns if c not in df.columns]
        return new_rows.reindex(columns=ordered)

    return pd.concat([df, new_rows], ignore_index=True)

# Walk every page of the document and accumulate its text boxes in df.
# The PDF file handle has to stay open while pages are being read, so it is
# closed here, once the loop has finished (or failed), instead of being leaked.
try:
    for page in PDFPage.create_pages(document):
        # Run the interpreter so the aggregator device holds this page's layout.
        interpreter.process_page(page)
        layout = device.get_result()

        # Add this page's text boxes to the rows collected so far.
        df = parse_obj(layout._objs, df)
finally:
    fp.close()
    

    50,    618, Compétences_
   190,    623, Formation_
   188,    755, Bedoui Amal_
   188,    714, Elève ingénieur en Data Science_
   449,    785, amal.bedoui@esprit.tn_
   463,    748, (+216) 99 995 795_
   482,    707,  Tunis, Tunisie_
   496,    672,  amal.bdo1_
   182,    597, Depuis 2014_
   249,    583, Cycle d’ingénieur à l’Ecole Supérieure Privée et de Technologie (ESPRIT) :_Option Data Science._
   182,    565, 2013-2014_
   249,    552, Baccalauréat au Lycée Pilote Bourguiba de Tunis (LPBT) :_Section sciences expérimentales._
   190,    519, Expérience professionnelle_
   168,    496,  Juin - Juil  2018  Stagiaire à TUNISIE TELECOM (Département Customer Value Management)_
   246,    466,   moyennant SAS._
   190,    430, Projets académiques_
   172,    405, Jan - Mai 2018_
   242,    377, Projet Data Science : Prédiction du diabète chez les femmes_Prédire si une femme sera atteinte du diabète._Réduire le taux d’atteinte du diabète par la détection précoce._
   173,    360,

In [22]:
# Plain-text dump of the collected text boxes (columns x, y, value).
print(df)

             x           y                                              value
0    50.413100  618.269500                                      Compétences\n
1   190.562500  623.986300                                        Formation\n
2   188.376000  755.792000                                      Bedoui Amal\n
3   188.376000  714.125500                  Elève ingénieur en Data Science\n
4   449.865200  785.835400                            amal.bedoui@esprit.tn\n
5   463.868200  748.039400                                (+216) 99 995 795\n
6   482.865200  707.427400                                   Tunis, Tunisie\n
7   496.505200  672.568400                                        amal.bdo1\n
8   182.906200  597.513075                                      Depuis 2014\n
9   249.503400  583.982500  Cycle d’ingénieur à l’Ecole Supérieure Privée ...
10  182.905300  565.847075                                        2013-2014\n
11  249.502900  552.315500  Baccalauréat au Lycée Pilote Bourgui

In [23]:
# Convert x to integers; the column tagging that follows compares x against
# integer bounds.
x_whole = df['x'].astype(int)
df['x'] = x_whole

In [24]:
# Tag each row by the x range it falls in (bounds are inclusive):
#   0..100   -> 'L'
#   101..500 -> 'R'
# Rows outside both ranges keep an empty tag.
in_left_range = df['x'].between(0, 100)
in_right_range = df['x'].between(101, 500)

df['bloc'] = ''
df.loc[in_left_range, 'bloc'] = 'L'
df.loc[in_right_range, 'bloc'] = 'R'


In [25]:
# Split the rows into one frame per tag.
is_left = df['bloc'].eq('L')
is_right = df['bloc'].eq('R')
df_left = df.loc[is_left]
df_right = df.loc[is_right]

In [26]:
# Order both frames by descending y.
df_left, df_right = (
    part.sort_values(by="y", ascending=False)
    for part in (df_left, df_right)
)


In [27]:
# Display the rows tagged 'L'.
df_left

Unnamed: 0,x,y,value,bloc
0,50,618.2695,Compétences\n,L
30,23,595.3613,Python\n,L
31,23,569.2871,R\n,L
32,23,544.1338,SAS\n,L
33,23,520.2171,SQL SERVER\n,L
34,23,494.1471,POWER BI\n,L
35,23,468.0771,HADOOP\n,L
36,23,445.0645,SPARK\n,L
37,23,421.584,"SQL, PL/SQL\n",L
39,23,396.4238,JEE\n,L


In [28]:
# Write the text of every 'R' row to right.txt, in the frame's current row
# order. Each value is followed by an extra newline.
with open('right.txt', 'w', encoding='utf-8') as out_file:
    out_file.writelines(text + '\n' for text in df_right['value'])

In [29]:
# Write the text of every 'L' row to left.txt, in the frame's current row
# order. Each value is followed by an extra newline.
with open('left.txt', 'w', encoding='utf-8') as out_file:
    out_file.writelines(text + '\n' for text in df_left['value'])