In [None]:
!pip install numpy pandas matplotlib scikit-learn nltk PyMuPDF gradio




Collecting PyMuPDF
  Downloading PyMuPDF-1.24.11-cp38-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (3.4 kB)
Collecting gradio
  Downloading gradio-4.44.1-py3-none-any.whl.metadata (15 kB)
Collecting aiofiles<24.0,>=22.0 (from gradio)
  Downloading aiofiles-23.2.1-py3-none-any.whl.metadata (9.7 kB)
Collecting fastapi<1.0 (from gradio)
  Downloading fastapi-0.115.0-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.4.0-py3-none-any.whl.metadata (2.9 kB)
Collecting gradio-client==1.3.0 (from gradio)
  Downloading gradio_client-1.3.0-py3-none-any.whl.metadata (7.1 kB)
Collecting httpx>=0.24.1 (from gradio)
  Downloading httpx-0.27.2-py3-none-any.whl.metadata (7.1 kB)
Collecting orjson~=3.0 (from gradio)
  Downloading orjson-3.10.7-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (50 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.4/50.4 kB[0m [31m584.2 kB/s[0m eta [36m0:00:00[0m
Collecting py

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings
import fitz  # PyMuPDF
import re
import gradio as gr
import nltk
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.neighbors import KNeighborsClassifier

# Silence all library warnings so they do not clutter the notebook output.
warnings.filterwarnings('ignore')

# Load and clean resume dataset
# The path is kept in one named constant so it can be adjusted in a single place.
# pandas infers zip compression from the ".zip" suffix; the archive must contain
# exactly one data file for read_csv to load it.
RESUME_DATASET_PATH = '/content/archive (15).zip'  # Adjust the path as needed
resumeDataSet = pd.read_csv(RESUME_DATASET_PATH)

# Fail early with a clear message if the columns used below are absent.
_missing_columns = {'Resume', 'Category'} - set(resumeDataSet.columns)
if _missing_columns:
    raise KeyError(
        "Resume dataset %r is missing required column(s): %s"
        % (RESUME_DATASET_PATH, ', '.join(sorted(_missing_columns)))
    )

# Clean resumes function
def cleanResume(resumeText):
    """Normalize raw resume text before it is vectorized.

    Steps, applied in order:
      1. URLs (anything starting with ``http``) are replaced by a space.
      2. Stand-alone ``RT`` / ``cc`` tokens are replaced by a space.
      3. Hashtags (``#word``) are removed.
      4. Mentions (``@word``) are replaced by spaces.
      5. ASCII punctuation is replaced by spaces.
      6. Non-ASCII characters are replaced by spaces.
      7. Runs of whitespace are collapsed to a single space.

    Args:
        resumeText: The raw resume text. Non-string values (e.g. ``NaN`` from
            a missing DataFrame cell) are treated as an empty resume.

    Returns:
        The cleaned text as a ``str`` (leading/trailing single spaces are
        not stripped).
    """
    # Missing cells come through pandas as float NaN; re.sub would raise on them.
    if not isinstance(resumeText, str):
        return ''

    # Raw strings avoid invalid-escape warnings for \S, \s and \].
    resumeText = re.sub(r'http\S+\s*', ' ', resumeText)  # remove URLs
    # Word boundaries keep "cc"/"RT" inside real words intact
    # (previously "account" became "a ount").
    resumeText = re.sub(r'\b(?:RT|cc)\b', ' ', resumeText)  # remove RT and cc
    resumeText = re.sub(r'#\S+', '', resumeText)  # remove hashtags
    resumeText = re.sub(r'@\S+', '  ', resumeText)  # remove mentions
    resumeText = re.sub('[%s]' % re.escape(r"""!"#$%&'()*+,-./:;<=>?@[\]^_{|}~"""), ' ', resumeText)  # remove punctuations
    resumeText = re.sub(r'[^\x00-\x7f]', ' ', resumeText)  # remove non-ASCII characters
    resumeText = re.sub(r'\s+', ' ', resumeText)  # remove extra whitespace
    return resumeText

# Add a cleaned copy of every resume next to the raw text.
resumeDataSet['cleaned_resume'] = resumeDataSet['Resume'].apply(cleanResume)

# Replace the textual category labels with integer codes; `le` keeps the
# mapping so predictions can be turned back into category names later.
le = LabelEncoder()
resumeDataSet['Category'] = le.fit_transform(resumeDataSet['Category'])

# Features are the cleaned resume texts, targets are the encoded categories.
requiredText = resumeDataSet['cleaned_resume'].values
requiredTarget = resumeDataSet['Category'].values

# Learn the TF-IDF vocabulary on the whole corpus and vectorize it in one step.
word_vectorizer = TfidfVectorizer(sublinear_tf=True, stop_words='english')
WordFeatures = word_vectorizer.fit_transform(requiredText)

# Stratified 80/20 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    WordFeatures,
    requiredTarget,
    test_size=0.2,
    random_state=42,
    shuffle=True,
    stratify=requiredTarget,
)

# One-vs-rest wrapper around a default k-nearest-neighbours classifier.
clf = OneVsRestClassifier(KNeighborsClassifier())
clf.fit(X_train, y_train)

# Function to extract text and skills from PDF
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        pdf_path: Location of the PDF, passed straight to ``fitz.open``.

    Returns:
        One string holding the ``get_text()`` output of each page back to
        back; an empty string if the document has no pages.
    """
    with fitz.open(pdf_path) as document:
        page_texts = [page.get_text() for page in document]
    return "".join(page_texts)

def extract_skills(resume_text, skills_list=None):
    """Find known skills mentioned in a resume.

    Matching is case-insensitive and respects token boundaries, so a skill is
    only reported when it appears as a whole term: "java" is not found inside
    "javascript", "c" is not found inside "account" or "c++", and "ui" is not
    found inside "build".

    Skills written inside parentheses and separated by ``/`` (for example
    ``(Python/SQL)``) are reported first, in order of appearance; remaining
    skills follow in the order of ``skills_list``.

    Args:
        resume_text: Raw resume text. ``None`` or an empty string yields ``[]``.
        skills_list: Optional iterable of skill names to look for. Defaults to
            the built-in vocabulary below (with the former "ecplise" and
            "Scum" typos corrected to "Eclipse" and "Scrum").

    Returns:
        A list of lower-cased skill names without duplicates.
    """
    if skills_list is None:
        skills_list = [
            "Python", "Java", "Machine Learning", "SQL", "JavaScript",
            "Data Analysis", "Excel", "A/B Testing", "C", "CPP", "Leadership",
            "Communication", "Numpy", "Tableau", "Matplotlib", "Seaborn",
            "reactJs", "Angularjs", "html", "css", "Eclipse", "Bootstrap",
            "API's", "DOM", "UX", "UI", "XML", "JSON", "REST", "JSP", "Node",
            "Agile", "Scrum", "Version Control", "Spring",
            "Attention to detail", "Expanding Customer Base", "jQuery",
            "MongoDB", "AWS", "Java EE", "Git", "Azure", "Github",
        ]

    if not resume_text:
        return []

    resume_text = resume_text.lower()

    # Lower-case the vocabulary once (not once per candidate) and drop duplicates
    # while keeping the caller's order.
    known_skills = list(dict.fromkeys(skill.lower() for skill in skills_list))
    known_lookup = set(known_skills)

    extracted_skills = []
    seen = set()

    def _remember(skill):
        # Append a skill the first time it is seen, preserving discovery order.
        if skill not in seen:
            seen.add(skill)
            extracted_skills.append(skill)

    # Skills listed inside parentheses, e.g. "(python/sql)".
    for group in re.findall(r'\((.*?)\)', resume_text):
        for candidate in group.split('/'):
            candidate = candidate.strip()
            if candidate in known_lookup:
                _remember(candidate)

    # Skills mentioned anywhere in the text, matched as whole terms.
    # The look-ahead also rejects '+' and '#' so "c" does not match "c++" / "c#".
    for skill in known_skills:
        if skill in seen:
            continue
        pattern = r'(?<!\w)' + re.escape(skill) + r'(?![\w+#])'
        if re.search(pattern, resume_text):
            _remember(skill)

    return extracted_skills

# Main function to analyze resume
def analyze_resume(pdf_file, user_input_skills):
    """Extract skills from a PDF resume, predict its category and match skills.

    Args:
        pdf_file: The uploaded resume. Either an object whose ``name``
            attribute is the file path, or the path itself.
        user_input_skills: Comma-separated skills the user is looking for.
            ``None`` or an empty string means "no required skills".

    Returns:
        A 3-tuple ``(extracted_skills, predicted_category, matched_skills)``
        where the first and last items are ``", "``-joined strings and the
        category is the label decoded by ``le``.

    Raises:
        gr.Error: If no file was uploaded.
    """
    if pdf_file is None:
        raise gr.Error("Please upload a PDF resume.")

    # Accept both file-like wrappers (path in `.name`) and plain path strings.
    pdf_path = getattr(pdf_file, "name", pdf_file)
    resume_text = extract_text_from_pdf(pdf_path)
    extracted_skills = extract_skills(resume_text)

    # Predict job category. The vectorizer was fitted on cleanResume() output,
    # so the same cleaning must be applied here to keep features consistent.
    transformed_resume = word_vectorizer.transform([cleanResume(resume_text)])
    predicted_category = le.inverse_transform(clf.predict(transformed_resume))[0]

    # Prepare matched skills; blank entries (e.g. trailing commas) are ignored.
    user_skills = {
        skill.strip().lower()
        for skill in (user_input_skills or "").split(',')
        if skill.strip()
    }
    # Filter the ordered extracted list rather than intersecting sets, so the
    # output order is deterministic.
    matched_skills = [skill for skill in extracted_skills if skill in user_skills]

    return ', '.join(extracted_skills), predicted_category, ', '.join(matched_skills)

# Gradio Interface
def main_interface(pdf_file, user_input_skills):
    """Gradio callback: run the resume analysis and hand back its three results.

    Args:
        pdf_file: Value of the file-upload component.
        user_input_skills: Value of the required-skills text box.

    Returns:
        The ``(extracted skills, predicted category, matched skills)`` tuple
        produced by ``analyze_resume``, in that order.
    """
    skills_found, category, skills_matched = analyze_resume(pdf_file, user_input_skills)
    return skills_found, category, skills_matched

# Gradio setup
# Inputs: the resume upload plus a free-text list of required skills.
inputs = [
    gr.File(label="Upload PDF Resume"),
    gr.Textbox(lines=2, placeholder="Enter required skills separated by commas..."),
]

# Outputs: one labelled text box per value returned by main_interface, in order.
output_labels = ("Extracted Skills", "Predicted Domain", "Matched Skills")
outputs = [gr.Textbox(label=label) for label in output_labels]

# Build the app, then start it.
demo = gr.Interface(fn=main_interface, inputs=inputs, outputs=outputs, title="HireWise")
demo.launch()


Setting queue=True in a Colab notebook requires sharing enabled. Setting `share=True` (you can turn this off by setting `share=False` in `launch()` explicitly).

Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
Running on public URL: https://bcf583f595666f4597.gradio.live

This share link expires in 72 hours. For free permanent hosting and GPU upgrades, run `gradio deploy` from Terminal to deploy to Spaces (https://huggingface.co/spaces)


