In [36]:
'''
The task is to create a system that will match your uploaded resume to a posted job link and calculate how much your profile matches the
job description and give your percentage and also gives you suggestions on your resume to increase the match percentage. 

Steps needed:
1. Setup openAI key to access some LLM models ✅
2. Parse the job website successfully 
    - Making sure it is a job website 
    - Remove unecessary tags from the website 
    - Extract contents like requirements, qualification, responsibilities... from the website and convert it to a class
3. Parse the uploaded resume 
    - Make sure the uploaded file is actually a resume
    - Extract contents like education, work_experience, skills, certificates.... from the resume and convert it to a class
4. ==== have some logic to compare the match of resume and job description
5. ==== give suggestions to make it more fit for the job and show the updated matching percentage
'''

'\nThe task is to create a system that will match your uploaded resume to a posted job link and calculate how much your profile matches the\njob description and give your percentage and also gives you suggestions on your resume to increase the match percentage. \n\nSteps needed:\n1. Setup openAI key to access some LLM models ✅\n2. Parse the job website successfully \n    - Making sure it is a job website \n    - Remove unecessary tags from the website \n    - Extract contents like requirements, qualification, responsibilities... from the website and convert it to a class\n3. Parse the uploaded resume \n    - Make sure the uploaded file is actually a resume\n    - Extract contents like education, work_experience, skills, certificates.... from the resume and convert it to a class\n4. ==== have some logic to compare the match of resume and job description\n5. ==== give suggestions to make it more fit for the job and show the updated matching percentage\n'

In [9]:
import os
import io
import re
import requests
from dotenv import load_dotenv
from bs4 import BeautifulSoup
from IPython.display import Markdown, display
from openai import OpenAI
from PyPDF2 import PdfReader

In [3]:
#Load openAI key and check if the key is valid
load_dotenv(override=True)
api_key = os.getenv('OPENAI_API_KEY')
openai = OpenAI(api_key=api_key)

def check_openai_api_key(api_key):
    try:
        openai.models.list()
    except:
        return False
    else:
        return True

check_openai_api_key(api_key)

True

In [3]:
#Sample conversation to check if we have a working api 
message = "Hello, GPT! This is my first ever message to you! Hi!"
response = openai.chat.completions.create(model="gpt-4o-mini", messages=[{"role":"user", "content":message}])
print(response.choices[0].message.content)

Hello! Welcome! I'm glad you're here. How can I assist you today?


In [4]:
#Create a jobwebsite object that will have the necessary actions and attributes
class JobWebsite:
    def __init__(self, url):
        self.url = url
        self.raw_html = None
        self.title = None
        self.company = None
        self.location = None
        self.description = None
        self.requirements = []
        self.qualifications = []
        self.responsibilities = []

        self._parse()

    def _parse(self):
        resp = requests.get(self.url)
        self.raw_html = resp.text
        soup = BeautifulSoup(self.raw_html, 'html.parser')

        # Sanity check for job-related keywords
        # print("Soup",soup.prettify())
        text = soup.get_text(separator=' ').lower()
        print("TExtt",text)
        if not any(k in text for k in ('responsibilities', 'qualifications', 'requirements')):
            raise ValueError("URL does not look like a job posting: {}".format(self.url))

        # Adjust these selectors to suit your target site
        title_node = soup.find('h1')
        company_node = soup.find(class_='company')
        location_node = soup.find(class_='location')
        desc_node = soup.find(class_='job-description')

        def extract_list(class_name):
            section = soup.find(class_=class_name)
            if not section:
                return []
            return [li.get_text(strip=True) for li in section.find_all('li')]

        self.title = title_node.get_text(strip=True) if title_node else None
        self.company = company_node.get_text(strip=True) if company_node else None
        self.location = location_node.get_text(strip=True) if location_node else None
        self.description = desc_node.get_text(separator=' ', strip=True) if desc_node else None

        self.requirements = extract_list('requirements')
        self.qualifications = extract_list('qualifications')
        self.responsibilities = extract_list('responsibilities')
    def get_parsed_info(self):
        return {
            'url':            self.url,
            'title':          title_node.get_text(strip=True) if title_node else None,
            'company':        company_node.get_text(strip=True) if company_node else None,
            'location':       location_node.get_text(strip=True) if location_node else None,
            'description':    desc_node.get_text(separator=' ', strip=True) if desc_node else None,
            'requirements':   extract_list('requirements'),
            'qualifications': extract_list('qualifications'),
            'responsibilities': extract_list('responsibilities'),
            'raw_html':       html
        }
        

        
        

In [10]:
# sample_job = JobWebsite('https://edwarddonner.com')
sample_job = JobWebsite('https://jobs.micro1.ai/post/86d3f0f5-2f68-42ff-9a3c-59ccf43ccb2b')

Soup <!DOCTYPE html>
<html lang="en">
 <head>
  <meta charset="utf-8"/>
  <meta content="width=device-width, initial-scale=1" name="viewport"/>
  <link as="image" href="/logo.svg" rel="preload"/>
  <link data-precedence="next" href="/_next/static/css/3eb275c9b0a2c957.css" rel="stylesheet"/>
  <link data-precedence="next" href="/_next/static/css/5776914706de1d86.css" rel="stylesheet"/>
  <link as="script" fetchpriority="low" href="/_next/static/chunks/webpack-e68d94a56ae6c771.js" rel="preload"/>
  <script async="" src="/_next/static/chunks/fd9d1056-a7223fb5cd24b6ec.js">
  </script>
  <script async="" src="/_next/static/chunks/23-3e86cc9c501708b0.js">
  </script>
  <script async="" src="/_next/static/chunks/main-app-a8c7f57d619965d5.js">
  </script>
  <script async="" src="/_next/static/chunks/7a49ec60-357eae05c8a1f1a9.js">
  </script>
  <script async="" src="/_next/static/chunks/561-c20e650c30ac1ca9.js">
  </script>
  <script async="" src="/_next/static/chunks/100-f1425e84874f561e.js">


ValueError: URL does not look like a job posting: https://jobs.micro1.ai/post/86d3f0f5-2f68-42ff-9a3c-59ccf43ccb2b

In [19]:
class Resume:
    def __init__(self, url):
        """
        Initialize with a URL or local path to a PDF resume.
        Downloads (if remote), extracts all text, then parses
        out common sections into lists of lines.
        """
        self.url = url
        self.raw_text = self._load_pdf_text()
        self.education = []
        self.work_experience = []
        self.certificates = []
        self.skills = []
        self.other_sections = {}  # catch-all for any additional headings
        self._parse_sections()

    def _load_pdf_text(self):
        # Fetch PDF bytes
        if self.url.startswith(('http://', 'https://')):
            resp = requests.get(self.url)
            resp.raise_for_status()
            pdf_stream = io.BytesIO(resp.content)
        else:
            pdf_stream = open(self.url, 'rb')

        # Extract text from each page
        reader = PdfReader(pdf_stream)
        text = []
        for page in reader.pages:
            page_text = page.extract_text()
            if page_text:
                text.append(page_text)
        return "\n".join(text)

    def _parse_sections(self):
        # Define the section headers we care about (case-insensitive)
        headers = {
            "education":      r"education",
            "work_experience":r"(work experience|professional experience|experience)",
            "certificates":   r"(certificates|certifications)",
            "skills":         r"skills",
        }
        # Build a regex that matches any of these headers
        header_regex = re.compile(
            r"^(%s)\s*$" % "|".join(v for v in headers.values()),
            flags=re.IGNORECASE | re.MULTILINE
        )

        # Find all header positions
        matches = list(header_regex.finditer(self.raw_text))
        # Append a dummy match at end to capture last section
        end_of_doc = re.Match  # placeholder
        matches.append(re.match(r"^$", ""))  # zero-length at end

        for i in range(len(matches) - 1):
            header_text = matches[i].group(1).strip().lower()
            start = matches[i].end()
            end = matches[i+1].start()
            section_lines = [
                line.strip() for line in
                self.raw_text[start:end].splitlines()
                if line.strip()
            ]

            # Map header_text back to our attribute names
            for attr, pattern in headers.items():
                if re.fullmatch(pattern, header_text, flags=re.IGNORECASE):
                    setattr(self, attr, section_lines)
                    break
            else:
                # anything else goes to other_sections
                self.other_sections[header_text] = section_lines

    def as_dict(self):
        """
        Return all parsed content as a dict.
        """
        data = {
            "url": self.url,
            "education": self.education,
            "work_experience": self.work_experience,
            "certificates": self.certificates,
            "skills": self.skills,
        }
        data.update(self.other_sections)
        return data

In [20]:
# resume = Resume("https://drive.google.com/file/d/1qtSwz3WcG2t2unuggYTUx2oitxbpYJSD/view?usp=sharing")
resume = Resume("https://drive.usercontent.google.com/u/0/uc?id=1qtSwz3WcG2t2unuggYTUx2oitxbpYJSD&export=download")

In [23]:
print(resume.work_experience)

['Software Engineer', 'Africa to Silicon Valley', 'Project Name - Adot, AI-based Pregnancy Companion platform 08/2022', 'Addis Ababa,', 'Ethiopia', '•Worked on the development of a platform that help pregnant women in their journey.', "•Boosted platform accessibility with OpenAI's Whisper-1 for Speech-to-text, and designed and", 'prompt-engineered a Generative AI health assistant.', '•Tech stack - Python, GPT, NLP, GCP and Mocha', 'Project Name - SkillBridge', '•Developed an AI-powered learning platform that tailored personalized learning paths and', 'quizzes to enhance the learning experience, while also offering AI assistance to university', 'exam takers.', '•Utilized AI models like hkunlp/instructor-xl, text-embedding-ada-002, and gpt-3.5-turbo for', 'content and questions generation.', '•Integrated extra models like Wolfram Alpha for math content validation and llama and palm', 'models to check question accuracy and content quality.', '•Currently working on development of adaptive 