In [7]:
import pdfplumber
from pdf2image import convert_from_path
import pytesseract
import re
import fitz


def extract_text_from_pdf(pdf_path):
    text = ""
    
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            page_text = page.extract_text()
            
            # Fallback to OCR if no text is extracted
            if not page_text or len(page_text.strip()) < 20:
                image = convert_from_path(pdf_path, 500, first_page=pdf.pages.index(page) + 1, last_page=pdf.pages.index(page) + 1)[0]
                page_text = pytesseract.image_to_string(image)

            text += page_text + "\n"

    return text



def infer_bullets_from_indentation(text):
    """
    Detects bullet points based on indentation instead of explicit symbols.
    """
    lines = text.split("\n")
    cleaned_lines = []
    prev_indent = 0

    for line in lines:
        stripped_line = line.lstrip()
        indent = len(line) - len(stripped_line)  # Measure indentation

        # If the indentation is deeper than the previous line, assume it's a bullet point
        if indent > prev_indent + 2:  # Small threshold to avoid false positives
            cleaned_lines.append(f"- {stripped_line}")
        else:
            cleaned_lines.append(stripped_line)

        prev_indent = indent  # Update previous indentation for next check

    return "\n".join(cleaned_lines).strip()



def detect_dominant_header_format(text):
    lines = text.split('\n')

    # Patterns to check
    patterns = {
        'all_caps': r'^[A-Z\s]{3,}$',
        'colon_headers': r'^[A-Z][a-z]+.*:$',
        'mixed_case': r'^[A-Z][a-z]+(?: [A-Z][a-z]+)*$'
    }

    # Count occurrences of each pattern
    pattern_counts = {key: 0 for key in patterns}

    for line in lines:
        line = line.strip()
        for key, pattern in patterns.items():
            if re.match(pattern, line):
                pattern_counts[key] += 1

    # Choose the pattern with the highest count
    dominant_pattern = max(pattern_counts, key=pattern_counts.get)
    return patterns[dominant_pattern] if pattern_counts[dominant_pattern] > 0 else None


def extract_resume_sections(text):
    # Define flexible patterns for detecting section headers
    # header_pattern = r'(?:(?:^[A-Z\s]{3,}$)|(?:^[A-Z][a-z]+(?: [A-Z][a-z]+)*:))'
    header_pattern = detect_dominant_header_format(text)

    # Split text into lines for better header detection
    lines = text.split('\n')

    sections = {}
    current_section = 'Summary'  # Default section if no header detected
    sections[current_section] = ''

    for line in lines:
        line = line.strip()
        if re.match(header_pattern, line):
            current_section = line.replace(':', '').strip()
            sections[current_section] = ''
        else:
            sections[current_section] += line + ' '

    # Clean up extra spaces
    for section in sections:
        sections[section] = sections[section].strip()

    return sections


In [2]:
from sandbox import bullet_point_sug as bs

bs.improve_bullets(1)

{'- Developed a Python-based solution to automate financial audit requests, reducing work for analysts by dynamically generating and scheduling 15 personalized emails bi-weekly': 'Developed a Python-based automation tool that reduced analyst workload by generating and scheduling over 15 personalized financial audit emails bi-weekly, leading to a X% decrease in processing time',
 '- Led the project from concept to deployment, including training future maintainers and creating a Power BI dashboard for monitoring requests and manually sending custom emails ad hoc': 'Directed the project lifecycle from concept to deployment, trained X future maintainers, and designed a Power BI dashboard to monitor requests and facilitate ad hoc custom email distribution, enhancing operational efficiency by X%',
 '- Identified the need for and developed a Python GUI for generating customized anonymous test data for users of all technical expertise': 'Designed and developed a user-friendly Python GUI to gen

In [2]:
from services import resume_scraper

resume_path = 'data\\resumes\\Giemza_Jackson_Resume.pdf'
print(resume_scraper.extract_text_from_pdf(resume_path))

Jackson Giemza
Boulder, CO | (708)-340-8105| jackson.giemza@gmail.com | github.com/JacksonGiemza

Data scientist with hands-on experience in financial risk analysis, data automation, and Al-driven insights. Skilled in
Python, SQL, and machine learning, with a passion for solving complex problems through data-driven decision-making.

Education

University of Colorado, Boulder, CO | Expected Graduation Dec 2025

Bachelor of Science, Information Science | Minor, Philosophy

Relevant Courses: Data Visualization, Statistics, Python for Info Sci 1&2, Linear Algebra, R for Data Science, Logic,
Quantitative Reasoning, Physics 1&2, Calculus, Economics, UI/UX Design

Experience
Risk Technology Analyst | RJ O’Brien & Associates May 2024 - Aug. 2024, Chicago, IL
- Developed a Python-based solution to automate financial audit requests, reducing work for analysts by
dynamically generating and scheduling 15+ personalized emails bi-weekly.

- Led the project from concept to deployment, including train

In [43]:
# def extract_text_from_pdf(pdf_path):
#     text = ""
    
#     with pdfplumber.open(pdf_path) as pdf:
#         for page in pdf.pages:
#             page_text = page.extract_text()
            
#             # Fallback to OCR if no text is extracted
#             if not page_text or len(page_text.strip()) < 20:
#                 image = convert_from_path(pdf_path, 500, first_page=pdf.pages.index(page) + 1, last_page=pdf.pages.index(page) + 1)[0]
#                 page_text = pytesseract.image_to_string(image)

#             text += page_text + "\n"

#     return text

def extract_text_from_pdf(pdf_path):
    text = ""
    
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            page_text = page.extract_text(layout=True)  # Preserve layout spacing
            
            if not page_text or len(page_text.strip()) < 20:
                continue  # Skip empty pages
            
            text += page_text + "\n"
    
    return text

In [23]:
def extract_text_from_pdf(pdf_path):
    # Convert PDF pages to images
    pages = convert_from_path(pdf_path, 500)

    text = ''
    for page in pages:
        text += pytesseract.image_to_string(page)

    return text

In [45]:
RESUME_PATH

'data\\resumes\\1_8aa6f01d260b452a86faa34a524cf3f3.pdf'

In [44]:
import db_tools
import os

# Get user resume
resume_file_name = db_tools.get_resumes_by_user_id(1)[0]['filename']
RESUME_PATH = os.path.join("data", "resumes", resume_file_name)
raw_resume = extract_text_from_pdf(RESUME_PATH)


In [40]:
def clean_resume_text(text):
    # Replace unwanted characters with a space or appropriate formatting
    text = re.sub(r'[«¢]', '-', text)  # Replace unusual bullets with standard "-"
    text = re.sub(r'[D>.\s]{3,}', '\n\n', text)  # Remove errant "D>. ." patterns
    text = text.replace('--','-')
    # text = re.sub(r'\n\s*\n', '\n\n', text)  # Ensure proper paragraph spacing
    # text = re.sub(r'(\s*-\s*)+', ' - ', text)  # Ensure proper spacing around bullet points
    # text = re.sub(r'[^\S\r\n]+', ' ', text)  # Normalize excessive spaces
    
    return text.strip()

In [41]:
print(clean_resume_text(raw_resume))

Jackson Giemza
Boulder, CO | (708)-340-8105 | jackson.giemza@gmail.com | github.com/JacksonGiemza

Enthusiastic information science student with hands-on experience as a data scientist, equipped with
technical and analytical skills to derive data-driven decisions. Passionate about harnessing data to
solve complex, meaningful problems in a vibrant, innovative setting

Education
University of Colorado, Boulder, CO | Expected Graduation Dec 2025
Bachelor of Science, Information Science | Minor, Philosophy

Relevant Courses: Data Visualization, Statistics, Python for Info Sci 1&2, Linear Algebra, R for
Data Science, Logic, Quantitative Reasoning, Physics 1&2, Calculus, Economics, UI/UX Design

Experience

Risk Technology Analyst | RJ O’Brien & Associates May 2024 - Aug. 2024, Chicago, IL

- Developed a Python-based solution to automate financial audit requests, reducing work for
analysts by dynamically generating and scheduling 15+ personalized emails bi-weekly

- Led the project from conc

In [105]:
import fitz  # the PyMuPDF package

doc = fitz.open(RESUME_PATH)
page = doc[0]  # first page

# Extract font sizes from text spans
text_info = page.get_text("dict")
font_sizes = [span["size"] for block in text_info["blocks"] for line in block.get("lines", []) for span in line.get("spans", [])]
fontsize = max(set(font_sizes), key=font_sizes.count) if font_sizes else 10  # Most common font size or default 10

# Extract vector graphics (potential bullet points)
paths = page.get_drawings()
bullets = []

for path in paths:
    rect = path["rect"]  # bounding box of the vector graphic
    
    # Check if it's a small graphic (bullet-like) based on font size
    if rect.width <= fontsize and rect.height <= fontsize:
        bullets.append(path)

print(bullets)


[{'items': [('c', Point(61.631553649902344, 323.3423767089844), Point(61.631553649902344, 323.5191955566406), Point(61.597721099853516, 323.68927001953125), Point(61.53005599975586, 323.8526306152344)), ('c', Point(61.53005599975586, 323.8526306152344), Point(61.46239471435547, 324.0159912109375), Point(61.366050720214844, 324.1601867675781), Point(61.241024017333984, 324.2851867675781)), ('c', Point(61.241024017333984, 324.2851867675781), Point(61.115997314453125, 324.41021728515625), Point(60.971805572509766, 324.5065612792969), Point(60.808448791503906, 324.5742492675781)), ('c', Point(60.808448791503906, 324.5742492675781), Point(60.64509582519531, 324.64190673828125), Point(60.47500991821289, 324.67572021484375), Point(60.298194885253906, 324.67572021484375)), ('c', Point(60.298194885253906, 324.67572021484375), Point(60.12137985229492, 324.67572021484375), Point(59.9512939453125, 324.64190673828125), Point(59.787940979003906, 324.5742492675781)), ('c', Point(59.787940979003906, 3

In [110]:
import fitz  # the PyMuPDF package

doc = fitz.open(RESUME_PATH)
page = doc[0]  # first page

# Extract font sizes from text spans
text_info = page.get_text("dict")
font_sizes = [span["size"] for block in text_info["blocks"] for line in block.get("lines", []) for span in line.get("spans", [])]
fontsize = max(set(font_sizes), key=font_sizes.count) if font_sizes else 10  # Most common font size or default 10

# Extract vector graphics (potential bullet points)
paths = page.get_drawings()
bullets = []

for path in paths:
    rect = path["rect"]  # bounding box of the vector graphic
    
    # Check if it's a small graphic (bullet-like) based on font size
    if rect.width <= fontsize and rect.height <= fontsize:
        bullets.append(path)

print(bullets)

[{'items': [('c', Point(61.631553649902344, 323.3423767089844), Point(61.631553649902344, 323.5191955566406), Point(61.597721099853516, 323.68927001953125), Point(61.53005599975586, 323.8526306152344)), ('c', Point(61.53005599975586, 323.8526306152344), Point(61.46239471435547, 324.0159912109375), Point(61.366050720214844, 324.1601867675781), Point(61.241024017333984, 324.2851867675781)), ('c', Point(61.241024017333984, 324.2851867675781), Point(61.115997314453125, 324.41021728515625), Point(60.971805572509766, 324.5065612792969), Point(60.808448791503906, 324.5742492675781)), ('c', Point(60.808448791503906, 324.5742492675781), Point(60.64509582519531, 324.64190673828125), Point(60.47500991821289, 324.67572021484375), Point(60.298194885253906, 324.67572021484375)), ('c', Point(60.298194885253906, 324.67572021484375), Point(60.12137985229492, 324.67572021484375), Point(59.9512939453125, 324.64190673828125), Point(59.787940979003906, 324.5742492675781)), ('c', Point(59.787940979003906, 3

In [26]:
import re
import string
import nltk
# nltk.download('stopwords')
# nltk.download('punkt_tab')
# nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

set_of_stopwords = set(stopwords.words("english") + list(string.punctuation))
lemmatizer = WordNetLemmatizer()

def clean_text(raw_text):        

        # Convert text to lowercase and tokenize into words
        tokens = word_tokenize(raw_text.lower())
        # Remove stopwords and punctuation
        tokens = [token for token in tokens if token not in set_of_stopwords]
        # Lemmatize the remaining words
        tokens = [lemmatizer.lemmatize(token) for token in tokens]
        # Join the tokens back into a single string
        cleaned_text = " ".join(tokens)
        return cleaned_text

def clean_resume(resume):
        skills_pattern = re.compile(r'Skills\s*[:\n]', re.IGNORECASE)
        skills_match = skills_pattern.search(resume)

        if skills_match:
            skills_start = skills_match.end()
            skills_end = resume.find('\n\n', skills_start)
            skills_section = resume[skills_start:skills_end].strip()
            skills_lines = skills_section.split('\n')

            extracted_skills = []
            for line in skills_lines:
                line_skills = re.split(r'[:,-]', line)
                extracted_skills.extend([skill.strip() for skill in line_skills if skill.strip()])

            skills = list(set(extracted_skills))
        else:
            skills = []

        skills = ", ".join(skills)

        RESUME_SECTIONS = [
            "Contact Information", "Objective", "Summary", "Education", "Experience", 
            "Skills", "Projects", "Certifications", "Licenses", "Awards", "Honors", 
            "Publications", "References", "Technical Skills", "Computer Skills", 
            "Programming Languages", "Software Skills", "Soft Skills", "Language Skills", 
            "Professional Skills", "Transferable Skills", "Work Experience", 
            "Professional Experience", "Employment History", "Internship Experience", 
            "Volunteer Experience", "Leadership Experience", "Research Experience", 
            "Teaching Experience",
        ]

        experience_start = resume.find("Experience")
        if experience_start == -1:
            return ""

        experience_end = len(resume)
        for section in RESUME_SECTIONS:
            if section != "Experience":
                section_start = resume.find(section, experience_start)
                if section_start != -1:
                    experience_end = min(experience_end, section_start)

        experience_section = resume[experience_start:experience_end].strip()
        cleaned_experience = clean_text(experience_section)
        cleaned_skills = clean_text(skills)

        return cleaned_experience + cleaned_skills

In [45]:
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

set_of_stopwords = set(stopwords.words("english") + list(string.punctuation))
lemmatizer = WordNetLemmatizer()

def clean_text(raw_text):        
    tokens = word_tokenize(raw_text.lower())
    tokens = [token for token in tokens if token not in set_of_stopwords]
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    cleaned_text = " ".join(tokens)
    return cleaned_text

def parse_resume(resume):
    RESUME_SECTIONS = [
        "Contact Information", "Objective", "Summary", "Education", "Experience", 
        "Skills", "Projects", "Certifications", "Licenses", "Awards", "Honors", 
        "Publications", "References", "Technical Skills", "Computer Skills", 
        "Programming Languages", "Software Skills", "Soft Skills", "Language Skills", 
        "Professional Skills", "Transferable Skills", "Work Experience", 
        "Professional Experience", "Employment History", "Internship Experience", 
        "Volunteer Experience", "Leadership Experience", "Research Experience", 
        "Teaching Experience"
    ]
    
    section_dict = {}
    
    # Create regex pattern for section headers
    section_pattern = re.compile(r'(?P<header>' + '|'.join(RESUME_SECTIONS) + r')\s*[:\n]', re.IGNORECASE)
    
    sections = list(section_pattern.finditer(resume))
    
    for i, section in enumerate(sections):
        header = section.group("header").strip()
        start = section.end()
        end = sections[i + 1].start() if i + 1 < len(sections) else len(resume)
        section_content = resume[start:end].strip().split('\n')
        
        # Preserve bullet points
        cleaned_lines = []
        temp_line = ""
        for line in section_content:
            line = line.strip()
            if line.startswith("-") or line.startswith("•"):
                if temp_line:
                    cleaned_lines.append(clean_text(temp_line))
                temp_line = line
            else:
                temp_line += " " + line if temp_line else line
        if temp_line:
            cleaned_lines.append(clean_text(temp_line))
        
        section_dict[header] = cleaned_lines
    
    return section_dict


In [46]:
sections = parse_resume(raw_resume)

In [48]:
sections['Experience']

['risk technology analyst rj ’ brien associate may 2024 aug. 2024 chicago il « developed python-based solution automate financial audit request reducing work analyst dynamically generating scheduling 15+ personalized email bi-weekly « led project concept deployment including training future maintainer creating power bi dashboard monitoring request manually sending custom email ad hoc',
 'identified need developed python gui generating customized anonymous test data user technical expertise',
 'served ambassador new ai tool interviewing coworkers identify pain point ai adoption conducting 10+ one-on-one demo teaching leverage gen. ai data science intern cloudquant may 2023 aug. 2023 chicago il',
 'expanded data catalog 8000+ datasets seamless integration various data apis dynamic web scraper ¢ strategically utilized new openai api automate data entry cleaning process significantly enhancing efficiency reducing manual workload',
 'drove insight analysis data visualization summer intern r

In [44]:
print(raw_resume)

Jackson Giemza
Boulder, CO | (708)-340-8105 | jackson.giemza@gmail.com | github.com/JacksonGiemza

Enthusiastic information science student with hands-on experience as a data scientist, equipped with
technical and analytical skills to derive data-driven decisions. Passionate about harnessing data to
solve complex, meaningful problems in a vibrant, innovative setting.

Education
University of Colorado, Boulder, CO | Expected Graduation Dec 2025
Bachelor of Science, Information Science | Minor, Philosophy

Relevant Courses: Data Visualization, Statistics, Python for Info Sci 1&2, Linear Algebra, R for
Data Science, Logic, Quantitative Reasoning, Physics 1&2, Calculus, Economics, UI/UX Design

Experience

Risk Technology Analyst | RJ O’Brien & Associates May 2024 - Aug. 2024, Chicago, IL

« Developed a Python-based solution to automate financial audit requests, reducing work for
analysts by dynamically generating and scheduling 15+ personalized emails bi-weekly

« Led the project from con

In [1]:
raw_resume

NameError: name 'raw_resume' is not defined

In [1]:
import db_tools
import os

# Get user resume
resume_file_name = db_tools.get_resumes_by_user_id(1)[0]['filename']
RESUME_PATH = os.path.join("data", "resumes", resume_file_name)
# raw_resume = extract_text_from_pdf(RESUME_PATH)


In [6]:
from pyresparser import ResumeParser
data = ResumeParser(RESUME_PATH).get_extracted_data()

OSError: [E053] Could not read config file from c:\Users\jacks\anaconda3\Lib\site-packages\pyresparser\config.cfg

In [4]:
pip show pyresparser

Name: pyresparser
Version: 1.0.6
Summary: A simple resume parser used for extracting information from resumes
Home-page: https://github.com/OmkarPathak/pyresparser
Author: Omkar Pathak
Author-email: omkarpathak27@gmail.com
License: GPL-3.0
Location: c:\Users\jacks\anaconda3\Lib\site-packages
Requires: attrs, blis, certifi, chardet, cymem, docx2txt, idna, jsonschema, nltk, numpy, pandas, pdfminer.six, preshed, pycryptodome, pyrsistent, python-dateutil, pytz, requests, six, sortedcontainers, spacy, srsly, thinc, tqdm, urllib3, wasabi
Required-by: 
Note: you may need to restart the kernel to use updated packages.


In [5]:
pip install --upgrade spacy

Note: you may need to restart the kernel to use updated packages.
