In [None]:
'''
The task is to create a system that matches your uploaded resume against a posted job link, calculates how well your profile matches the
job description as a percentage, and suggests changes to your resume that would increase the match percentage.

Steps needed:
1. Setup openAI key to access some LLM models ✅
2. Parse the job website successfully 
    - Making sure it is a job website 
    - Remove unnecessary tags from the website
    - Extract contents like requirements, qualification, responsibilities... from the website and convert it to a class
3. Parse the uploaded resume 
    - Make sure the uploaded file is actually a resume
    - Extract contents like education, work_experience, skills, certificates.... from the resume and convert it to a class
4. ==== have some logic to compare the match of resume and job description
5. ==== give suggestions to make it more fit for the job and show the updated matching percentage
'''

In [None]:
import os
import io
import re
import json
import requests
from dotenv import load_dotenv
from bs4 import BeautifulSoup
from IPython.display import Markdown, display
from openai import OpenAI
from PyPDF2 import PdfReader

In [None]:
# Read OPENAI_API_KEY (values from a .env file take precedence because override=True)
# and build the shared OpenAI client used by the rest of the notebook.
load_dotenv(override=True)
api_key = os.environ.get('OPENAI_API_KEY')
openai = OpenAI(api_key=api_key)

def check_openai_api_key(api_key):
    """
    Report whether `api_key` is accepted by the OpenAI API.

    A client is built from the key that was passed in (rather than relying on
    whatever global client happens to exist) and asked to list the models.

    Args:
        api_key: The key to validate. `None` or an empty string is rejected
            immediately, without making a request.

    Returns:
        True if the model-listing request succeeds, False otherwise.
    """
    if not api_key:
        return False
    try:
        OpenAI(api_key=api_key).models.list()
    except Exception:
        # Any failure (bad key, network error, ...) means the key is not usable
        # right now. `Exception` rather than a bare `except` so that
        # KeyboardInterrupt / SystemExit still propagate.
        return False
    return True

# Last expression of the cell, so its boolean result is shown as the cell output.
check_openai_api_key(api_key)

In [None]:
# Smoke test: send a single user message and print the reply to confirm the API works
message = "Hello, GPT! This is my first ever message to you! Hi!"
response = openai.chat.completions.create(
    model="gpt-4o-mini",
    messages=[{"role": "user", "content": message}],
)
print(response.choices[0].message.content)

In [None]:
# Create a JobWebsite object that holds the parsed attributes of a job posting
class JobWebsite:
    """
    A job posting downloaded from a URL and split into structured fields.

    Constructing an instance immediately fetches the page and parses it.

    Attributes:
        url: The URL the posting was fetched from.
        timeout: Seconds to wait for the HTTP request.
        raw_html: The response body as text.
        title: Text of the first <h1>, or None if there is none.
        company / location / description: Text of the first element with CSS
            class 'company' / 'location' / 'job-description', or None.
        requirements / qualifications / responsibilities: Text of each <li>
            inside the first element with the matching CSS class ([] if absent).
    """

    # At least one of these words must appear in the page text for the page
    # to be accepted as a job posting.
    JOB_KEYWORDS = ('responsibilities', 'qualifications', 'requirements')

    def __init__(self, url, timeout=30):
        """
        Args:
            url: Address of the job posting.
            timeout: Seconds to wait for the server before giving up.

        Raises:
            requests.RequestException: If the page cannot be fetched.
            ValueError: If the page does not look like a job posting.
        """
        self.url = url
        self.timeout = timeout
        self.raw_html = None
        self.title = None
        self.company = None
        self.location = None
        self.description = None
        self.requirements = []
        self.qualifications = []
        self.responsibilities = []

        self._parse()

    def _parse(self):
        """Fetch `self.url` and populate every parsed attribute."""
        # A timeout keeps the notebook from hanging on an unresponsive server;
        # raise_for_status stops an HTTP error page from being parsed as a posting.
        resp = requests.get(self.url, timeout=self.timeout)
        resp.raise_for_status()
        self.raw_html = resp.text
        soup = BeautifulSoup(self.raw_html, 'html.parser')

        # Sanity check for job-related keywords
        text = soup.get_text(separator=' ').lower()
        if not any(k in text for k in self.JOB_KEYWORDS):
            raise ValueError("URL does not look like a job posting: {}".format(self.url))

        # Adjust these selectors to suit your target site
        title_node = soup.find('h1')
        company_node = soup.find(class_='company')
        location_node = soup.find(class_='location')
        desc_node = soup.find(class_='job-description')

        def extract_list(class_name):
            """Return the text of every <li> under the first element with this class."""
            section = soup.find(class_=class_name)
            if not section:
                return []
            return [li.get_text(strip=True) for li in section.find_all('li')]

        self.title = title_node.get_text(strip=True) if title_node else None
        self.company = company_node.get_text(strip=True) if company_node else None
        self.location = location_node.get_text(strip=True) if location_node else None
        self.description = desc_node.get_text(separator=' ', strip=True) if desc_node else None

        self.requirements = extract_list('requirements')
        self.qualifications = extract_list('qualifications')
        self.responsibilities = extract_list('responsibilities')

    def get_parsed_info(self):
        """
        Return everything that was parsed as a plain dict.

        The values come from the attributes filled in by `_parse`; the list
        fields are copied so callers cannot mutate the instance by accident.
        """
        return {
            'url':              self.url,
            'title':            self.title,
            'company':          self.company,
            'location':         self.location,
            'description':      self.description,
            'requirements':     list(self.requirements),
            'qualifications':   list(self.qualifications),
            'responsibilities': list(self.responsibilities),
            'raw_html':         self.raw_html,
        }

In [None]:
# Alternative URL kept for manual testing:
# sample_job = JobWebsite('https://edwarddonner.com')
# Constructing the object fetches and parses the posting right away.
sample_job = JobWebsite('https://jobs.micro1.ai/post/86d3f0f5-2f68-42ff-9a3c-59ccf43ccb2b')

In [None]:
class Resume:
    """
    A PDF resume loaded from a URL or local path and split into sections.

    Attributes:
        url: The URL or local path the PDF was loaded from.
        timeout: Seconds to wait when downloading a remote PDF.
        raw_text: Text of all pages, joined with newlines.
        education / work_experience / certificates / skills: Non-empty,
            stripped lines found under the corresponding heading.
        other_sections: Catch-all dict for any additional headings.
    """

    def __init__(self, url, timeout=30):
        """
        Initialize with a URL or local path to a PDF resume.
        Downloads (if remote), extracts all text, then parses
        out common sections into lists of lines.

        Args:
            url: http(s) URL or local filesystem path of the PDF.
            timeout: Seconds to wait for a remote download.
        """
        self.url = url
        self.timeout = timeout
        self.raw_text = self._load_pdf_text()
        self.education = []
        self.work_experience = []
        self.certificates = []
        self.skills = []
        self.other_sections = {}  # catch-all for any additional headings
        self._parse_sections()

    def _load_pdf_text(self):
        """Return the text of every page of the PDF, joined with newlines."""
        # Fetch PDF bytes
        if self.url.startswith(('http://', 'https://')):
            resp = requests.get(self.url, timeout=self.timeout)
            resp.raise_for_status()
            pdf_bytes = resp.content
        else:
            # Read the file fully inside a `with` block so the handle is
            # always closed, even if parsing fails afterwards.
            with open(self.url, 'rb') as pdf_file:
                pdf_bytes = pdf_file.read()

        # Extract text from each page, skipping pages that yield no text
        reader = PdfReader(io.BytesIO(pdf_bytes))
        page_texts = (page.extract_text() for page in reader.pages)
        return "\n".join(page_text for page_text in page_texts if page_text)

    def _parse_sections(self):
        """
        Split `raw_text` on known section headings and store each section's lines.

        A heading is a line consisting only of one of the known titles
        (case-insensitive). Each section runs from the end of its heading to
        the start of the next heading, or to the end of the text for the last one.
        """
        # Define the section headers we care about (case-insensitive)
        headers = {
            "education":      r"education",
            "work_experience":r"(work experience|professional experience|experience)",
            "certificates":   r"(certificates|certifications)",
            "skills":         r"skills",
        }
        # Build a regex that matches any of these headers on a line of its own
        header_regex = re.compile(
            r"^(%s)\s*$" % "|".join(headers.values()),
            flags=re.IGNORECASE | re.MULTILINE
        )

        # Find all header positions
        matches = list(header_regex.finditer(self.raw_text))

        for i, match in enumerate(matches):
            header_text = match.group(1).strip().lower()
            start = match.end()
            # The final section extends to the end of the document; using a
            # sentinel match with start() == 0 here would make it empty.
            if i + 1 < len(matches):
                end = matches[i + 1].start()
            else:
                end = len(self.raw_text)
            section_lines = [
                line.strip() for line in
                self.raw_text[start:end].splitlines()
                if line.strip()
            ]

            # Map header_text back to our attribute names
            for attr, pattern in headers.items():
                if re.fullmatch(pattern, header_text, flags=re.IGNORECASE):
                    setattr(self, attr, section_lines)
                    break
            else:
                # anything else goes to other_sections
                self.other_sections[header_text] = section_lines

    def as_dict(self):
        """
        Return all parsed content as a dict.

        Keys from `other_sections` are merged in last, so they take
        precedence over a built-in key of the same name.
        """
        data = {
            "url": self.url,
            "education": self.education,
            "work_experience": self.work_experience,
            "certificates": self.certificates,
            "skills": self.skills,
        }
        data.update(self.other_sections)
        return data

In [None]:
# Earlier "view" style Google Drive link, kept for reference:
# resume = Resume("https://drive.google.com/file/d/1qtSwz3WcG2t2unuggYTUx2oitxbpYJSD/view?usp=sharing")
# Constructing the object downloads the PDF and parses its sections right away.
resume = Resume("https://drive.usercontent.google.com/u/0/uc?id=1qtSwz3WcG2t2unuggYTUx2oitxbpYJSD&export=download")

In [None]:
def summarize_resume(resume, model="gpt-4"):
    """
    Summarize a Resume instance by sending all of its parsed sections
    to OpenAI in one shot.

    Args:
        resume: Object exposing `education`, `work_experience`, `certificates`
            and `skills` (lists of lines) plus `other_sections` (a dict).
        model: Name of the chat model to call.

    Returns:
        The model's reply as a stripped string. The prompt asks for raw JSON,
        but the reply is NOT parsed or validated here; an empty string is
        returned if the model sends no text content.
    """
    # 1) Extract everything from the Resume object
    # ------------------------------------------------
    sections = {
        "education": resume.education,
        "work_experience": resume.work_experience,
        "certificates": resume.certificates,
        "skills": resume.skills,
    }
    # Include any extra headings that were captured:
    sections.update(resume.other_sections)

    # 2) Build a single prompt string that includes all section data.
    # ------------------------------------------------
    #    We label each section, then dump the Python lists/dicts as JSON
    #    inside the prompt so the model can “see” the raw content exactly.
    serialized_sections = json.dumps(sections, indent=2)
    prompt = (
        "You are a resume‐parsing assistant. You will receive a JSON object whose keys are section names\n"
        "(e.g. \"education\", \"work_experience\", \"certificates\", \"skills\", plus any other headings the parser found)\n"
        "and whose values are lists of lines (strings) extracted from that section. Some sections may be empty lists.\n\n"
        "Your job:\n"
        "1. For each of the known keys:\n"
        "   - “education”: produce a cleaned‐up list of educational entries (e.g. school, degree, dates).\n"
        "   - “work_experience”: produce a cleaned‐up list of job entries (e.g. company, title, dates, bullets).\n"
        "   - “certificates”: produce a cleaned‐up list of certification names (and issuing organizations, if available).\n"
        "   - “skills”: produce a cleaned‐up list of skills (e.g. “Python”, “Project Management”, etc.).\n"
        "2. For any additional key (in “other_sections”), keep the title as‐is and produce a cleaned list of lines under it.\n"
        "3. Return exactly one JSON object. The top‐level keys must match the input section names. Under each key,\n"
        "   supply an array of objects or strings as appropriate. For example:\n\n"
        "{\n"
        "  \"education\": [\n"
        "    {\n"
        "      \"school\": \"University of X\",\n"
        "      \"degree\": \"B.Sc. Computer Science\",\n"
        "      \"dates\": \"2015 – 2019\"\n"
        "    },\n"
        "    …\n"
        "  ],\n"
        "  \"work_experience\": [\n"
        "    {\n"
        "      \"company\": \"Acme Corp\",\n"
        "      \"title\": \"Software Engineer\",\n"
        "      \"dates\": \"Jan 2020 – Present\",\n"
        "      \"details\": [\n"
        "        \"Built REST APIs in Python that served 1M+ users\",\n"
        "        \"Led a team of 3 engineers on feature XYZ\"\n"
        "      ]\n"
        "    },\n"
        "    …\n"
        "  ],\n"
        "  \"certificates\": [\n"
        "    {\n"
        "      \"name\": \"AWS Certified Solutions Architect\",\n"
        "      \"issuer\": \"Amazon Web Services\",\n"
        "      \"date\": \"2021\"\n"
        "    },\n"
        "    …\n"
        "  ],\n"
        "  \"skills\": [\n"
        "    \"Python\",\n"
        "    \"Docker\",\n"
        "    \"Project Management\",\n"
        "    …\n"
        "  ],\n"
        "  \"<other_heading_1>\": [\n"
        "    \"…\",\n"
        "    \"…\"\n"
        "  ],\n"
        "  \"<other_heading_2>\": [\n"
        "    \"…\"\n"
        "  ]\n"
        "}\n\n"
        "Don’t include any explanations—just raw JSON. Here is the raw extracted‐sections input:\n\n"
        "```json\n"
        + serialized_sections
        + "\n```"
    )

    # 3) Call OpenAI’s chat completion endpoint (single request).
    # ------------------------------------------------
    response = openai.chat.completions.create(
        model=model,
        messages=[
            {"role": "system",  "content": "You are a helpful assistant specialized in structuring resume data."},
            {"role": "user",    "content": prompt.strip()},
        ]
    )

    # 4) Extract the assistant’s reply text and return it as-is (stripped).
    #    `content` can be None, so fall back to an empty string before stripping.
    # ------------------------------------------------
    raw_output = (response.choices[0].message.content or "").strip()
    return raw_output

In [None]:
# Sends the parsed resume sections to the model; the result is the model's reply text.
structured_resume = summarize_resume(resume)

In [None]:
# print() renders the reply's line breaks instead of showing the escaped string repr.
print(structured_resume)