In [1]:
import spacy
import os
import re
import pandas as pd
import pdftotext

In [3]:
#1. Converting pdf to text

In [9]:
def convert_pdf(f):
    output_filename = os.path.basename(os.path.splitext(f)[0]) + ".txt"
    output_filepath = os.path.join("output/txt/", output_filename)
    with open(f, "rb") as f:
        pdf = pdftotext.PDF(f)
    with open(output_filepath, "w") as f:
        f.write(''.join(pdf))
    print(output_filepath + " saved successfully!")
    return open(output_filepath).read()

In [3]:
nlp = spacy.load('en_core_web_sm')

In [18]:
def parse_content(text, required_skillset):
    skillset = re.compile("|".join(required_skillset))
    phone_num = re.compile(
        "(\d{3}[-\.\s]??\d{3}[-\.\s]??\d{4}|\(\d{3}\)\s*\d{3}[-\.\s]??\d{4}|\d{3}[-\.\s]??\d{4})"
    )
    doc = nlp(text)
    name = [entity.text for entity in doc.ents if entity.label_ == "PERSON"][0]
    email = [word for word in doc if word.like_email == True][0]
    phone = str(re.findall(phone_num, text.lower()))
    skills_list = re.findall(skillset, text.lower())
    unique_skills_list = str(set(skills_list))
    print("Extraction completed successfully!!!")
    return (name, email, phone, unique_skills_list)

In [24]:
def process_content(text):
    # remove "broken" space character
    text = re.sub('\u200b', '', text)
    
    # process for all uppercase name
    uppercase_words_regex = re.compile("[A-Z]+")
    uppercase_words = [w for w in re.findall(uppercase_words_regex, text) if len(w) > 1]
    for i in range(len(uppercase_words)):
        w = uppercase_words[i]
        new_w = w[0]+w[1:].lower()
        text = re.sub(w, new_w, text)
    return text

In [28]:
required_skillset = {"ios", "swift", "xcode", "objective c", "realm", "fastlane", "android"}
result_dict = {'name': [], 'phone': [], 'email': [], 'skills': [], 'resumes': []} 
names = []
phones = []
emails = []
skills = []
resumes = []

for file in os.listdir('resumes/'):
    if file.endswith('.pdf'):
        print('Reading.....' + file)
        txt = process_content(convert_pdf(os.path.join('resumes/',file)))
        name, email, phone, skill_set = parse_content(txt, required_skillset)
        names.append(name)
        emails.append(email)
        phones.append(phone)
        skills.append(skill_set)
        resumes.append(file)

Reading.....Resume.pdf
output/txt/Resume.txt saved successfully!
Extraction completed successfully!!!
Reading.....Resume-DinhThanhAn.pdf
output/txt/Resume-DinhThanhAn.txt saved successfully!
Extraction completed successfully!!!
Reading.....ios-developer-1561975506.pdf
output/txt/ios-developer-1561975506.txt saved successfully!
Extraction completed successfully!!!


In [29]:
result_dict['name'] = names
result_dict['phone'] = phones
result_dict['email'] = emails
result_dict['skills'] = skills
result_dict['resumes'] = resumes

In [30]:
result_df = pd.DataFrame(result_dict)
result_df

Unnamed: 0,name,phone,email,skills,resumes
0,Samantha Ru,['202-555-0135'],info@resumekraft.com,"{'xcode', 'swift', 'ios'}",Resume.pdf
1,Dinh Thanh,['080-7258'],thanhan.uit@gmail.com,"{'fastlane', 'ios', 'objective c', 'xcode', 'r...",Resume-DinhThanhAn.pdf
2,Robert Smith,[],info@qwikresume.com,"{'xcode', 'swift', 'android', 'ios'}",ios-developer-1561975506.pdf


In [23]:
result_df.to_csv('output/csv/parsed_resumes.csv')