<a href="https://colab.research.google.com/github/vivekswamy021/AI_Resume_Parser/blob/main/AI_RESUME_PARSER.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [9]:
!pip install gradio python-docx pdfplumber openpyxl pandas  requests groq



In [11]:
# import library
import gradio as gr
import os
import pdfplumber
import docx
import openpyxl
import json
import tempfile
from groq import Groq


# GROQ-SETUP
# Prefer the GROQ_API_KEY environment variable so the secret does not have to
# be written into the notebook; the original placeholder stays as a fallback
# so pasting a key in place of it keeps working.
client = Groq(api_key=os.environ.get("GROQ_API_KEY", "Add Groq API Key"))


#  Identify file type
def get_file_type(file_path):
    """Classify a file by its extension.

    The extension is compared case-insensitively. Returns 'pdf' for
    ``.pdf``, 'docx' for ``.docx`` and 'unknown' for anything else
    (including paths with no extension).
    """
    _, suffix = os.path.splitext(file_path)
    known_types = {'.pdf': 'pdf', '.docx': 'docx'}
    return known_types.get(suffix.lower(), 'unknown')

#  Extract content
def extract_content(file_type, file_path):
    """Return the plain text of a resume file.

    file_type -- 'pdf' or 'docx'; any other value yields None.
    file_path -- path handed to pdfplumber / python-docx.

    For PDFs, every page that yields text contributes that text followed
    by a newline. For DOCX files, paragraph texts are joined with
    newlines. Any exception raised while reading is printed and turned
    into a None return.
    """
    try:
        if file_type == 'docx':
            document = docx.Document(file_path)
            return '\n'.join(paragraph.text for paragraph in document.paragraphs)

        if file_type != 'pdf':
            return None

        chunks = []
        with pdfplumber.open(file_path) as pdf:
            for page in pdf.pages:
                page_text = page.extract_text()
                # Pages with no extractable text are skipped entirely.
                if page_text:
                    chunks.append(page_text + '\n')
        return ''.join(chunks)
    except Exception as err:
        print(f"Extraction error: {err}")
        return None


#Parsing Function with LLM
def parse_with_llm(text, return_type='json'):
    """Ask the LLM to extract structured resume fields from ``text``.

    Sends one chat-completion request through the module-level ``client``
    (model "llama3-8b-8192", temperature 0.2) and tries to decode a JSON
    object from the reply.

    text        -- raw resume text to embed in the prompt.
    return_type -- 'json' returns the decoded dict; 'markdown' returns a
                   Markdown rendering of that dict.

    Returns:
        * 'json': the parsed dict, or
          ``{"error": "Invalid JSON", "raw_output": <reply>}`` when the
          reply does not contain a decodable JSON object.
        * 'markdown': a Markdown string (``**Error:** ...`` on failure).
        * any other ``return_type``: ``{"error": "Invalid return_type"}``.

    Exceptions raised by the API client itself are not caught here.
    """
    # Reject unsupported formats up front so no API call is spent on them.
    if return_type not in ('json', 'markdown'):
        return {"error": "Invalid return_type"}

    prompt = f"""Extract the following information from the resume in structured JSON:
    - Name
    - Email
    - Phone
    - Skills
    - Education
    - Experience
    - Certifications
    - Projects
    - strength
    - Personal Details
    - Github
    - linkedin

    Resume:
    {text}
    """
    response = client.chat.completions.create(
        model="llama3-8b-8192",
        messages=[{"role": "user", "content": prompt}],
        temperature=0.2)

    # The message content may be None; treat that as an empty reply.
    content = (response.choices[0].message.content or "").strip()

    # The model often wraps the JSON in prose or code fences, so keep only
    # the span from the first '{' to the last '}' when such a span exists.
    json_start = content.find('{')
    json_end = content.rfind('}')
    if json_start != -1 and json_end > json_start:
        json_str = content[json_start:json_end + 1]
    else:
        json_str = content

    try:
        parsed = json.loads(json_str)
    except ValueError:  # json.JSONDecodeError is a ValueError subclass
        parsed = None

    # A bare list/number/string is valid JSON but not the object callers
    # expect (they use dict methods), so report it as invalid too.
    if not isinstance(parsed, dict):
        parsed = {"error": "Invalid JSON", "raw_output": content}

    if return_type == 'json':
        return parsed

    # return_type == 'markdown': render the parsed dict.
    if "error" in parsed:
        return f"**Error:** {parsed.get('raw_output','')}"
    parts = []
    for key, value in parsed.items():
        if isinstance(value, list):
            parts.append(f"**{key.title()}**:\n")
            parts.extend(f"- {item}\n" for item in value)
        else:
            parts.append(f"**{key.title()}**: {value}\n\n")
    return "".join(parts)



#def dump_to_excel(parsed_json, filename):
import openpyxl
def dump_to_excel(parsed_json, filename):
    """Write a parsed resume dict to a one-sheet Excel workbook.

    parsed_json -- dict of resume fields as returned by the LLM parser.
                   Keys are matched case-insensitively, and underscores
                   are treated like spaces, because the model's key
                   spelling varies ("Name", "name", "Personal Details",
                   "personal_details", ...).
    filename    -- destination path for the .xlsx file.

    Layout of the "Profile" sheet: name/email/phone as label/value rows,
    then one header row per section followed by its entries in column 2.
    Sections that are missing or empty are skipped with a console message.
    """
    def _norm(key):
        # Canonical form used for all key comparisons.
        return str(key).strip().lower().replace('_', ' ')

    # canonical key -> original key (first occurrence wins)
    lookup = {}
    for original_key in parsed_json:
        lookup.setdefault(_norm(original_key), original_key)

    wb = openpyxl.Workbook()
    ws = wb.active
    ws.title = "Profile"
    row = 1

    # Basic info
    for field in ['name', 'email', 'phone']:
        matched_key = lookup.get(field)
        if matched_key is None:
            continue
        value = parsed_json[matched_key]
        # openpyxl rejects list/dict cell values, so flatten them to text.
        if isinstance(value, (list, tuple)):
            value = ", ".join(str(item) for item in value)
        elif isinstance(value, dict):
            value = str(value)
        ws.cell(row=row, column=1, value=field.title())
        ws.cell(row=row, column=2, value=value)
        row += 1

    # Section title -> accepted key spellings (compared in canonical form).
    # Email is not repeated here: it is already written with the basic info.
    section_map = {
        'Skills': ['skills'],
        'Education': ['education'],
        'Experience': ['experience'],
        'Certifications': ['certifications'],
        'Projects': ['projects'],
        'Strength': ['strength', 'strengths'],
        'Personal Details': ['Personal_Details'],
        'Github': ['github'],
        'LinkedIn': ['linkedin']}

    for section_title, key_variants in section_map.items():
        matched_key = next(
            (lookup[_norm(v)] for v in key_variants if _norm(v) in lookup),
            None)

        if matched_key is not None and parsed_json[matched_key]:
            ws.cell(row=row, column=1, value=section_title)
            row += 1
            content = parsed_json[matched_key]

            if isinstance(content, list):
                for item in content:
                    ws.cell(row=row, column=2, value=str(item))
                    row += 1
            elif isinstance(content, dict):
                for k, v in content.items():
                    ws.cell(row=row, column=2, value=f"{k}: {v}")
                    row += 1
            else:
                ws.cell(row=row, column=2, value=str(content))
                row += 1
        else:
            print(f" Skipping section '{section_title}' — not found or empty.")

    wb.save(filename)


#  Main Parser Function
def ParserCV(file_type, file_path, return_type='json'):
    """Run the full pipeline: extract text, parse with the LLM, export Excel.

    file_type   -- 'pdf' or 'docx' (see get_file_type).
    file_path   -- path of the uploaded resume.
    return_type -- 'json' (parsed dict) or 'markdown' (Markdown string).

    Returns a 3-tuple ``(result, text, excel_path)``:
        result     -- parsed dict / Markdown string, or an ``{"error": ...}``
                      dict on failure.
        text       -- the extracted resume text (may be None/empty).
        excel_path -- path of the generated workbook in the system temp
                      directory, or None when nothing was written.
    """
    text = extract_content(file_type, file_path)
    if not text:
        return {"error": "Unable to extract content from the file."}, text, None

    # Validate before calling the LLM so a bad argument costs no API call.
    if return_type not in ('json', 'markdown'):
        return {"error": "Invalid return_type."}, text, None

    parsed = parse_with_llm(text, return_type='json')
    if not parsed or not isinstance(parsed, dict) or "error" in parsed:
        return parsed, text, None

    # Name Excel file after candidate. The model may spell the key "Name"
    # or "name", and the value comes from untrusted resume text, so look it
    # up case-insensitively and keep only filename-safe characters.
    raw_name = next(
        (value for key, value in parsed.items()
         if str(key).strip().lower() == 'name'),
        None)
    name = str(raw_name).strip() if raw_name else 'candidate'
    name = "".join(
        ch if ch.isalnum() or ch in "-_." else "_" for ch in name
    ) or 'candidate'
    excel_filename = os.path.join(tempfile.gettempdir(), f"{name}.xlsx")

    try:
        dump_to_excel(parsed, excel_filename)
    except (OSError, ValueError) as e:
        # A failed export should not discard a successful parse.
        print(f"Excel export error: {e}")
        excel_filename = None

    if return_type == 'json':
        return parsed, text, excel_filename
    # return_type == 'markdown'
    return parse_with_llm(text, return_type='markdown'), text, excel_filename

#  Gradio UI
def gradio_interface(file, output_format, section):
    """Parse an uploaded resume and build the values shown in the UI.

    file          -- the uploaded file from ``gr.File``: either an object
                     with a ``.name`` path attribute or a plain path
                     string; None when nothing is uploaded.
    output_format -- 'json' or 'markdown'; selects the main output.
    section       -- dropdown choice: a parsed field name, "full resume",
                     "raw output" or "download raw output".

    Returns ``(main_output, excel_path, section_result)`` where
    ``section_result`` is the text for the section box, or the path of a
    temporary .txt file when "download raw output" was requested.
    """
    # Change events also fire when the file is cleared or when another
    # control changes before any upload; there is nothing to parse then.
    if file is None:
        return "Please upload a resume (PDF or DOCX).", None, ""

    file_path = file if isinstance(file, str) else file.name
    file_type = get_file_type(file_path)
    parsed, full_text, excel_path = ParserCV(file_type, file_path, return_type='json')

    if not parsed or not isinstance(parsed, dict) or "error" in parsed:
        details = parsed if isinstance(parsed, dict) else {}
        main_output = details.get("error", "Unknown error")
        # On failure the section box shows the raw model reply, if any.
        return main_output, None, details.get("raw_output", "")

    json_output = json.dumps(parsed, indent=2)
    if output_format == "json":
        main_output = json_output
    else:
        # Only pay for the second LLM call when Markdown is requested.
        main_output = parse_with_llm(full_text, return_type='markdown')

    # Section logic
    if section == "full resume":
        return main_output, excel_path, full_text
    if section == "raw output":
        return main_output, excel_path, json_output
    if section == "download raw output":
        with tempfile.NamedTemporaryFile(delete=False, suffix=".txt") as tmp_raw:
            tmp_raw.write(json_output.encode('utf-8'))
        return main_output, excel_path, tmp_raw.name

    # Dropdown values are lowercase while the model tends to capitalise its
    # keys, so fall back to a case-insensitive match after an exact one.
    matched_key = section if section in parsed else None
    if matched_key is None:
        wanted = str(section).strip().lower()
        matched_key = next(
            (key for key in parsed if str(key).strip().lower() == wanted),
            None)

    if matched_key is None:
        section_content = f"Section '{section}' not found."
    else:
        section_val = parsed[matched_key]
        if isinstance(section_val, (list, dict)):
            section_content = json.dumps(section_val, indent=2)
        else:
            section_content = str(section_val)
    return main_output, excel_path, section_content


# Section options for dropdown
# Section options for dropdown. "raw output" and "download raw output" are
# listed because gradio_interface handles them; without these entries those
# branches can never be selected from the UI.
section_options = [
    "name", "email", "experience", "projects", "skills",
    "education", "full resume", "github", "linkedin",
    "raw output", "download raw output"]

with gr.Blocks(title="AI Resume Parser") as demo:
    gr.Markdown("## AI Resume Parser (PDF/DOCX) with Groq and Gradio")
    with gr.Row():
        file_input = gr.File(label="Upload Resume (PDF/DOCX)", file_types=[".pdf", ".docx"])
        output_format = gr.Radio(['json', 'markdown'], label="Output Format", value="json")
        section = gr.Dropdown(section_options, label="Section", value="full resume")
    main_output = gr.Textbox(label="Parsed Output (JSON/Markdown)", lines=20)
    excel_output = gr.File(label="Download Excel")
    section_output = gr.Textbox(label="Section Output", lines=10)
    section_file_output = gr.File(label="Download Section Output", visible=False)

    # Interface wrapper
    def wrapped_interface(file, output_format, section):
        """Adapt gradio_interface's 3-tuple to the four output components.

        The third result is either text for the section textbox or, for
        "download raw output", the path of a file to offer for download.
        Exactly one of the two section widgets is made visible per call.
        """
        main_text, excel_path, section_result = gradio_interface(file, output_format, section)

        # Only treat the result as a download when it really is a file: on
        # errors the same slot carries plain text (e.g. the raw model reply).
        is_download = (
            section == "download raw output"
            and isinstance(section_result, str)
            and os.path.isfile(section_result))

        if is_download:
            return (main_text, excel_path,
                    gr.update(value="", visible=False),
                    gr.update(value=section_result, visible=True))
        # Re-show the textbox in case a previous download request hid it.
        return (main_text, excel_path,
                gr.update(value=section_result, visible=True),
                gr.update(value=None, visible=False))

    ui_inputs = [file_input, output_format, section]
    ui_outputs = [main_output, excel_output, section_output, section_file_output]

    # Re-run the parser whenever any of the three inputs changes.
    for component in ui_inputs:
        component.change(wrapped_interface, inputs=ui_inputs, outputs=ui_outputs)

    # Start with every output cleared.
    demo.load(lambda: ("", None, "", None),
                inputs=None,
                outputs=ui_outputs)

demo.launch()


It looks like you are running Gradio on a hosted Jupyter notebook. For the Gradio app to work, sharing must be enabled. Automatically setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://37e1eaf2df7b77bb74.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


