## Installing Required Libraries

In [1]:
pip install PyMuPDF

Note: you may need to restart the kernel to use updated packages.


In [2]:
pip install python-docx

Note: you may need to restart the kernel to use updated packages.


## Importing Libraries

In [3]:
import re
import fitz
import re
import os
import fitz  # PyMuPDF for PDF handling
import os
import docx  # python-docx for Word document handling

## Functions for extracting and cleaning resume and job description content

In [4]:
def extract_text_from_pdf(pdf_path):
    """Return the concatenated text of every page of a PDF.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The text of all pages joined in page order, or None when the file
        cannot be opened or read (the error is printed rather than raised).
    """
    try:
        # The context manager closes the document even when a page fails to
        # parse; previously the handle was left open after every call.
        with fitz.open(pdf_path) as document:
            return ''.join(page.get_text() for page in document)
    except Exception as e:
        print(f"An error occurred while processing {pdf_path}: {e}")
        return None


In [5]:
def extract_text_from_docx(docx_path):
    """Return the text of a Word document, one paragraph per line.

    Args:
        docx_path: Path of the .docx file to read.

    Returns:
        The text of each entry in the document's ``paragraphs`` joined
        with newlines, or None when reading fails (the error is printed).
    """
    try:
        paragraphs = docx.Document(docx_path).paragraphs
        return '\n'.join(paragraph.text for paragraph in paragraphs)
    except Exception as e:
        print(f"An error occurred while processing {docx_path}: {e}")
        return None

In [6]:
def extract_education(resume_text):
    """Return the text that follows the first 'Education' heading.

    The section runs up to the next 'Experience' or 'Skills' marker, or to
    the end of the text when neither follows. Matching is case-sensitive.

    Returns:
        The stripped section text, or None when 'Education' does not occur.
    """
    match = re.search(
        r'Education(.*?)(?=Experience|Skills|$)',
        resume_text,
        flags=re.DOTALL,
    )
    return match.group(1).strip() if match else None

In [7]:
def extract_experience(resume_text):
    """Return the text that follows the first 'Experience' heading.

    The section ends at the next 'Education' or 'Skills' marker, or at the
    end of the text. Matching is case-sensitive.

    Note: a later cell defines another ``extract_experience``; when the
    notebook runs top to bottom that later definition replaces this one.

    Returns:
        The stripped section text, or None when 'Experience' does not occur.
    """
    section_pattern = re.compile(r'Experience(.*?)(?=Education|Skills|$)', re.DOTALL)
    found = section_pattern.search(resume_text)
    if found is None:
        return None
    return found.group(1).strip()

In [8]:
def extract_skills(resume_text):
    """Return the text that follows the first 'Skills' heading.

    The section ends at the next 'Experience' or 'Education' marker, or at
    the end of the text. Matching is case-sensitive.

    Note: a later cell defines another ``extract_skills``; when the
    notebook runs top to bottom that later definition replaces this one.

    Returns:
        The stripped section text, or None when 'Skills' does not occur.
    """
    found = re.search(r'Skills(.*?)(?=Experience|Education|$)', resume_text, re.DOTALL)
    if not found:
        return None
    body = found.group(1)
    return body.strip()

In [9]:
def extract_resume_details(resume_text):
    """Collect the individual resume fields into one dictionary.

    Each field is produced by its own extractor called with the full
    resume text, in the order name, contact info, education, experience,
    skills.

    NOTE(review): ``extract_name`` and ``extract_contact_info`` are not
    defined in the visible part of this notebook - confirm they exist
    before calling this function.

    Returns:
        A dict with the keys 'name', 'contact_info', 'education',
        'experience' and 'skills'.
    """
    return {
        'name': extract_name(resume_text),
        'contact_info': extract_contact_info(resume_text),
        'education': extract_education(resume_text),
        'experience': extract_experience(resume_text),
        'skills': extract_skills(resume_text),
    }

In [10]:
def clean_text(text):
    """Collapse every run of whitespace in ``text`` to a single space.

    Leading and trailing whitespace is removed, and spaces, tabs and
    newlines between words become exactly one space.

    Args:
        text: The string to normalise.

    Returns:
        The normalised string.
    """
    # str.split() without arguments already splits on every whitespace run
    # (newlines included) and drops empty edges, so the separate newline
    # replacement and regex pass that used to follow were no-ops.
    return ' '.join(text.split())


In [11]:
def extract_skills(resume_text):
    """Return the skills section of a resume as one cleaned line.

    The section starts at the first occurrence of the first matching skills
    header (headers are tried in list order, and the header itself is kept
    in the output). It ends at the nearest following section header; when
    no section header follows, at the next blank line; otherwise at the end
    of the text. Symbols are stripped with ``remove_unnecessary_symbols``
    and whitespace is collapsed to single spaces.

    Args:
        resume_text: Full resume text; None or an empty string yields None.

    Returns:
        The cleaned section text, or None when no skills header is found.
    """
    if not resume_text:
        return None

    # Known section headers for skills
    skills_headers = ['Skills', 'Technical Skills', 'Core Competencies', 'Expertise']
    # Headers that mark the start of the next section
    next_section_headers = ['Experience', 'EXPERIENCE', 'Education', 'EDUCATION', 'Projects', 'PROJECTS']

    for header in skills_headers:
        start_index = resume_text.find(header)
        if start_index == -1:
            continue

        search_from = start_index + len(header)
        # Use the NEAREST following header. Taking the first list entry that
        # occurs anywhere later let the section swallow whole other sections.
        header_positions = [resume_text.find(marker, search_from) for marker in next_section_headers]
        header_positions = [position for position in header_positions if position != -1]
        if header_positions:
            end_index = min(header_positions)
        else:
            # A blank line is only a last-resort terminator.
            end_index = resume_text.find('\n\n', search_from)
            if end_index == -1:
                # No terminator at all: keep everything. Slicing with -1
                # would silently drop the final character.
                end_index = len(resume_text)

        skills_section = resume_text[start_index:end_index].strip()
        # Clean up the section
        skills_section = remove_unnecessary_symbols(skills_section)
        return ' '.join(skills_section.split())

    return None


In [12]:
def extract_experience(resume_text):
    """Return the experience section of a resume as one cleaned line.

    The section starts at the first occurrence of the first matching
    experience header (headers are tried in list order, and the header
    itself is kept in the output). It ends at the nearest following section
    header; when no section header follows, at the next blank line;
    otherwise at the end of the text. Symbols are stripped with
    ``remove_unnecessary_symbols`` and whitespace is collapsed.

    Args:
        resume_text: Full resume text; None or an empty string yields None.

    Returns:
        The cleaned section text, or None when no experience header is found.
    """
    if not resume_text:
        return None

    # Known section headers for experience
    experience_headers = ['Experience', 'Work Experience', 'Professional Experience']
    # Headers that mark the start of the next section
    next_section_headers = ['Skills', 'SKILLS', 'Education', 'EDUCATION', 'Projects', 'PROJECTS']

    for header in experience_headers:
        start_index = resume_text.find(header)
        if start_index == -1:
            continue

        search_from = start_index + len(header)
        # Use the NEAREST following header. Taking the first list entry that
        # occurs anywhere later let the section swallow whole other sections.
        header_positions = [resume_text.find(marker, search_from) for marker in next_section_headers]
        header_positions = [position for position in header_positions if position != -1]
        if header_positions:
            end_index = min(header_positions)
        else:
            # A blank line is only a last-resort terminator.
            end_index = resume_text.find('\n\n', search_from)
            if end_index == -1:
                # No terminator at all: keep everything. Slicing with -1
                # would silently drop the final character.
                end_index = len(resume_text)

        experience_section = resume_text[start_index:end_index].strip()
        # Clean up the section
        experience_section = remove_unnecessary_symbols(experience_section)
        return ' '.join(experience_section.split())

    return None

In [13]:
def remove_unnecessary_symbols(text):
    """Delete bullet glyphs and punctuation characters from ``text``.

    Every character in the list below is removed wherever it occurs;
    all other characters, including whitespace, are left untouched.

    Args:
        text: The string to clean.

    Returns:
        ``text`` with the listed characters removed.
    """
    # Single characters to strip out
    unnecessary_symbols = ['•', '●', '‣', '○', '■', '-', '*', '+', '|', ':', ';', '~', '`', '!', '@', '#', '$', '%', '^', '&', '(', ')', '[', ']', '{', '}', '<', '>', '=', '_', '\\', '/', '"', "'", ',', '.']
    deletion_table = str.maketrans('', '', ''.join(unnecessary_symbols))
    return text.translate(deletion_table)

In [14]:
import spacy
import numpy as np
from scipy.spatial.distance import cosine
import fitz  # PyMuPDF for PDF handling
import os
import docx  # python-docx for Word document handling








## Function to process multiple PDFs

In [15]:
def process_multiple_pdfs(pdf_folder_path):
    """Extract resume details from every '.pdf' file in a folder.

    Args:
        pdf_folder_path: Directory whose entries are scanned (not recursive).

    Returns:
        A list with one dict per successfully read PDF: the result of
        ``extract_resume_details`` plus a 'file_name' key. Files whose text
        could not be extracted are reported and skipped.
    """
    resume_details_list = []
    for file_name in os.listdir(pdf_folder_path):
        if not file_name.endswith('.pdf'):
            continue

        pdf_path = os.path.join(pdf_folder_path, file_name)
        resume_text = extract_text_from_pdf(pdf_path)
        # extract_text_from_pdf returns None on failure; passing that on
        # would make the section extractors raise on the first bad file.
        if not resume_text:
            print(f"Failed to extract text from {file_name}.")
            continue

        resume_details = extract_resume_details(resume_text)
        resume_details['file_name'] = file_name
        resume_details_list.append(resume_details)
    return resume_details_list

## Function to extract text from a Word document

In [16]:
def extract_text_from_docx(docx_path):
    """Read a .docx file and return its paragraphs as newline-separated text.

    This repeats the ``extract_text_from_docx`` definition from an earlier
    cell; running this cell rebinds the name to this version.

    Args:
        docx_path: Path of the Word document.

    Returns:
        The paragraph texts joined by newlines, or None if anything goes
        wrong while reading (the error is printed rather than raised).
    """
    try:
        paragraph_texts = [para.text for para in docx.Document(docx_path).paragraphs]
        return '\n'.join(paragraph_texts)
    except Exception as e:
        print(f"An error occurred while processing {docx_path}: {e}")
        return None

## Function to parse all files in the directory

In [17]:
def process_multiple_files(folder_path):
    """Extract the skills and experience sections of every resume in a folder.

    Entries ending in '.pdf' or '.docx' are read with the matching text
    extractor; all other entries are ignored. A file whose extractor
    returns nothing is reported and left out of the result.

    Args:
        folder_path: Directory whose entries are scanned (not recursive).

    Returns:
        A list of dicts with the keys 'file_name', 'skills' and 'experience'.
    """
    collected = []
    for file_name in os.listdir(folder_path):
        file_path = os.path.join(folder_path, file_name)

        if file_name.endswith('.pdf'):
            resume_text = extract_text_from_pdf(file_path)
        elif file_name.endswith('.docx'):
            resume_text = extract_text_from_docx(file_path)
        else:
            continue

        if not resume_text:
            print(f"Failed to extract text from {file_name}.")
            continue

        collected.append({
            'file_name': file_name,
            'skills': extract_skills(resume_text),
            'experience': extract_experience(resume_text),
        })

    return collected

## Set the dataset folder path and extract details from each section

In [18]:

folder_path = 'archive/data/data/ACCOUNTANT'
resume_details_list = process_multiple_files(folder_path)

# For each parsed resume, print the file name followed by the skills and
# experience sections, or a notice when a section was not found.
for resume_details in resume_details_list:
    print(f"File: {resume_details['file_name']}")

    if not resume_details['skills']:
        print("No skills section found.")
    else:
        print("Skills:", resume_details['skills'], sep="\n")

    if not resume_details['experience']:
        print("No experience section found.")
    else:
        print("Experience:", resume_details['experience'], sep="\n")

    print()


File: 27980446.pdf
Skills:
Skills Microsoft Office Excel Outlook and Word SAGE 100 Ramp WMS software and Syspro ERP program
Experience:
Experience Company Name City State Accountant 042011 to 052017 Performed general accounting functions journal entries reconciliations and accruals Completed monthly assigned account analysis compared to budget and prior periods and reconciliations as well Participated in monthly quarterly and annual financial closing processes Participated in the annual budget process Researched and approved all credit memos and debit memos to be issued returns damages mispicks and price discrepancies Implemented and oversaw RGA spreadsheet for returns used by customer service accounting and upper management Initiated and tracked claim process with carriers for damages Built relationships with other departments including logistics planning customer service and sales Participated in identifying and executing the companys business process improvement efforts and assisted

## Simulating fetching the job description (JD) from the LinkedIn API

In [19]:
def fetch_job_description_from_api(job_id):
    """Return a fixed sample job description.

    Stand-in for a LinkedIn API lookup: ``job_id`` is accepted but not
    used, and the same Software Engineer description is always returned.

    Args:
        job_id: Identifier of the job posting (currently ignored).

    Returns:
        The sample job description text.
    """
    # Adjacent string literals are concatenated into one string.
    return (
        "We are looking for a Software Engineer to join our dynamic team. "
        "As a Software Engineer, you will be responsible for designing, developing, and testing software solutions. "
        "You should have strong programming skills in languages like Python, Java, or C++. "
        "Experience with web development frameworks such as Django or Flask is highly desirable. "
        "Familiarity with database management systems like MySQL or PostgreSQL is a plus. "
        "The ideal candidate will have a solid understanding of software engineering principles and a passion for delivering high-quality code. "
        "Excellent communication skills and the ability to work in a collaborative environment are essential."
    )

In [20]:
# Sample job description used by the comparison cells; adjacent string
# literals are concatenated into a single string.
job_description = (
    "We are looking for a Software Engineer to join our dynamic team. "
    "As a Software Engineer, you will be responsible for designing, developing, and testing software solutions. "
    "You should have strong programming skills in languages like Python, Java, or C++. "
    "Experience with web development frameworks such as Django or Flask is highly desirable. "
    "Familiarity with database management systems like MySQL or PostgreSQL is a plus. "
    "The ideal candidate will have a solid understanding of software engineering principles and a passion for delivering high-quality code. "
    "Excellent communication skills and the ability to work in a collaborative environment are essential."
)

## Installing SpaCy Library

In [21]:
##pip install spacy

In [22]:
##!python -m spacy download en_core_web_sm


In [23]:
import spacy

# Load the small English spaCy pipeline; the text-processing helpers in the
# following cells call this module-level `nlp` object.
# NOTE(review): spaCy's documentation says the `sm` pipelines do not ship with
# static word vectors, so `Doc.similarity` / `Doc.vector` results from this
# model may not be meaningful - confirm whether `en_core_web_md` (or larger)
# should be loaded instead.
nlp = spacy.load('en_core_web_sm')

In [24]:
def preprocess_text(text):
    """Return the lemmas of all non-stop-word tokens, joined by spaces.

    The text is tokenised with the module-level ``nlp`` pipeline; tokens
    flagged as stop words are dropped and the rest contribute their lemma.

    Args:
        text: Raw text to normalise.

    Returns:
        A single space-separated string of lemmas.
    """
    kept_lemmas = []
    for token in nlp(text):
        if token.is_stop:
            continue
        kept_lemmas.append(token.lemma_)
    return ' '.join(kept_lemmas)

In [25]:
def compute_similarity(text1, text2):
    """Return the spaCy similarity between two texts.

    Both texts are parsed with the module-level ``nlp`` pipeline and the
    first document's ``similarity`` method is called with the second.

    Args:
        text1: First text.
        text2: Second text.

    Returns:
        Whatever ``Doc.similarity`` returns for the two parsed documents.
    """
    return nlp(text1).similarity(nlp(text2))

## Comparison of JD with Resume

In [26]:
def compare_resume_with_job(resume_text, job_description):
    """Score how similar a resume is to a job description.

    Both texts go through ``preprocess_text`` (resume first) and the
    results are compared with ``compute_similarity``.

    Args:
        resume_text: Raw resume text.
        job_description: Raw job description text.

    Returns:
        The similarity score produced by ``compute_similarity``.
    """
    return compute_similarity(
        preprocess_text(resume_text),
        preprocess_text(job_description),
    )

In [27]:
def process_multiple_resumes(resumes_folder_path, job_id):
    """Score every resume in a folder against the job identified by ``job_id``.

    The job description is fetched once with
    ``fetch_job_description_from_api``. Entries ending in '.pdf' or '.docx'
    are read with the matching extractor and other entries are ignored.
    A file that yields no text is reported and left out of the result.

    Note: a later cell defines another ``process_multiple_resumes`` whose
    second parameter is the job description text itself.

    Args:
        resumes_folder_path: Directory containing the resume files.
        job_id: Identifier passed to ``fetch_job_description_from_api``.

    Returns:
        A list of dicts with the keys 'file_name' and 'similarity_score'.
    """
    job_description = fetch_job_description_from_api(job_id)

    scored = []
    for file_name in os.listdir(resumes_folder_path):
        file_path = os.path.join(resumes_folder_path, file_name)

        if file_name.endswith('.pdf'):
            resume_text = extract_text_from_pdf(file_path)
        elif file_name.endswith('.docx'):
            resume_text = extract_text_from_docx(file_path)
        else:
            continue

        if not resume_text:
            print(f"Failed to extract text from {file_name}.")
            continue

        score = compare_resume_with_job(resume_text, job_description)
        scored.append({'file_name': file_name, 'similarity_score': score})

    return scored

## Calculating the Similarity score between Resume and JD

In [28]:
resumes_folder_path = 'archive/data/data/ACCOUNTANT'
job_id = '123456'  # Replace with actual LinkedIn job ID

resume_details_list = process_multiple_resumes(resumes_folder_path, job_id)

# One "File / Similarity Score" pair per resume, followed by a blank line.
for resume_details in resume_details_list:
    print(
        f"File: {resume_details['file_name']}",
        f"Similarity Score: {resume_details['similarity_score']}",
        "",
        sep="\n",
    )

  return doc1.similarity(doc2)


File: 27980446.pdf
Similarity Score: 0.892055810541324

File: 14491649.pdf
Similarity Score: 0.8894565337013404

File: 11759079.pdf
Similarity Score: 0.8838119538405773

File: 24799301.pdf
Similarity Score: 0.7582498502347209

File: 18635654.pdf
Similarity Score: 0.8472571407665143

File: 10674770.pdf
Similarity Score: 0.8976119765098279

File: 62809577.pdf
Similarity Score: 0.8603580063951947

File: 39115899.pdf
Similarity Score: 0.8195141412072683

File: 24817041.pdf
Similarity Score: 0.783609901722758

File: 10554236.pdf
Similarity Score: 0.8774182641872194

File: 29999135.pdf
Similarity Score: 0.8594172031792917

File: 25067742.pdf
Similarity Score: 0.8085904253696373

File: 25462793.pdf
Similarity Score: 0.7956445359538921

File: 17556527.pdf
Similarity Score: 0.8839235984825368

File: 39674178.pdf
Similarity Score: 0.8624107081476561

File: 63137898.pdf
Similarity Score: 0.8624447836460158

File: 13072019.pdf
Similarity Score: 0.8473376655816904

File: 15289348.pdf
Similarity Sco

## Ranking resumes by their similarity score with the JD

In [31]:

# Example usage
# The dataset path is relative to the notebook, like the other cells, rather
# than an absolute path that only exists on one machine.
# NOTE: this call passes the job description text as the second argument, so
# the `process_multiple_resumes(folder_path, job_description)` definition must
# be the active one when this cell runs.
resumes_folder_path = 'archive/data/data/ACCOUNTANT'
resume_details_list = process_multiple_resumes(resumes_folder_path, job_description)

# Sort resumes by similarity score in descending order
ranked_resumes = sorted(resume_details_list, key=lambda x: x['similarity_score'], reverse=True)

# Output the ranked resumes
print("Ranked Resumes:")
for resume_details in ranked_resumes:
    print(f"File: {resume_details['file_name']}")
    print(f"Similarity Score: {resume_details['similarity_score']:.4f}")
    print()

Ranked Resumes:
File: 17407184.pdf
Similarity Score: 0.9195

File: 19545827.pdf
Similarity Score: 0.9132

File: 28614791.pdf
Similarity Score: 0.9121

File: 27558837.pdf
Similarity Score: 0.9120

File: 18569929.pdf
Similarity Score: 0.9111

File: 22465498.pdf
Similarity Score: 0.9105

File: 20624984.pdf
Similarity Score: 0.9103

File: 14055988.pdf
Similarity Score: 0.9097

File: 23246831.pdf
Similarity Score: 0.9093

File: 62809577.pdf
Similarity Score: 0.9090

File: 13072019.pdf
Similarity Score: 0.9088

File: 63137898.pdf
Similarity Score: 0.9086

File: 33527446.pdf
Similarity Score: 0.9086

File: 21338490.pdf
Similarity Score: 0.9082

File: 31602598.pdf
Similarity Score: 0.9080

File: 50222417.pdf
Similarity Score: 0.9079

File: 12338274.pdf
Similarity Score: 0.9076

File: 23734441.pdf
Similarity Score: 0.9074

File: 30813919.pdf
Similarity Score: 0.9073

File: 78403342.pdf
Similarity Score: 0.9073

File: 18669563.pdf
Similarity Score: 0.9073

File: 27573855.pdf
Similarity Score: 0.

In [30]:
def extract_keywords(text):
    """Return the lemmas of the alphabetic, non-stop-word tokens as one string.

    The text is lower-cased before it is parsed with the module-level
    ``nlp`` pipeline; token order and repeats are preserved.

    Args:
        text: Raw text to reduce to keywords.

    Returns:
        A space-separated string of lemmas.
    """
    lemmas = []
    for token in nlp(text.lower()):
        if token.is_alpha and not token.is_stop:
            lemmas.append(token.lemma_)
    return ' '.join(lemmas)
    
def vectorize_text(text):
    """Return the ``vector`` attribute of ``text`` parsed with ``nlp``.

    Args:
        text: Text to parse with the module-level ``nlp`` pipeline.

    Returns:
        The parsed document's ``vector``.
    """
    return nlp(text).vector

def calculate_similarity(job_description, resume_text):
    """Return the cosine similarity between a job description and a resume.

    Both texts are reduced with ``extract_keywords`` and turned into
    vectors with ``vectorize_text``; the score is one minus the cosine
    distance between the two vectors.

    Args:
        job_description: Raw job description text.
        resume_text: Raw resume text.

    Returns:
        The similarity score, or 0.0 when either vector is all zeros (for
        example when a text yields no keywords).
    """
    job_keywords = extract_keywords(job_description)
    resume_keywords = extract_keywords(resume_text)

    job_vec = vectorize_text(job_keywords)
    resume_vec = vectorize_text(resume_keywords)

    # Cosine distance is undefined for a zero vector and comes back as NaN,
    # which would corrupt any later sort by score; report "no similarity".
    if not np.any(job_vec) or not np.any(resume_vec):
        return 0.0

    return 1 - cosine(job_vec, resume_vec)

def process_multiple_resumes(folder_path, job_description):
    """Score every resume in a folder against a job description text.

    Entries ending in '.pdf' or '.docx' are read with the matching
    extractor and other entries are ignored. Each readable resume is
    scored with ``calculate_similarity``; a file that yields no text is
    reported and left out of the result.

    Args:
        folder_path: Directory containing the resume files.
        job_description: Job description text to compare against.

    Returns:
        A list of dicts with the keys 'file_name' and 'similarity_score'.
    """
    scored = []
    for file_name in os.listdir(folder_path):
        file_path = os.path.join(folder_path, file_name)

        if file_name.endswith('.pdf'):
            resume_text = extract_text_from_pdf(file_path)
        elif file_name.endswith('.docx'):
            resume_text = extract_text_from_docx(file_path)
        else:
            continue

        if not resume_text:
            print(f"Failed to extract text from {file_name}.")
            continue

        scored.append({
            'file_name': file_name,
            'similarity_score': calculate_similarity(job_description, resume_text),
        })

    return scored

In [31]:
# Spot-check the skills extractor on a single resume from the dataset.
# `resume_text` stays bound at module level and is reused by the
# missing-keywords cell further down.
pdf_path = 'archive/data/data/ACCOUNTANT/27980446.pdf'
resume_text = extract_text_from_pdf(pdf_path)
skills = extract_skills(resume_text)
print(skills)

Skills Microsoft Office Excel Outlook and Word SAGE 100 Ramp WMS software and Syspro ERP program


## Returning missing keywords

In [32]:
import spacy
from collections import Counter

def extract_keywords(text):
    """
    Return the set of lower-cased lemmas of the meaningful tokens in ``text``.

    The text is parsed with the module-level ``nlp`` pipeline; stop words
    and punctuation tokens are skipped.

    Note: this rebinds ``extract_keywords``, which an earlier cell defines
    to return a space-joined string. ``calculate_similarity`` looks the name
    up at call time, so after this cell runs it receives sets instead.
    """
    return {
        token.lemma_.lower()
        for token in nlp(text)
        if not (token.is_stop or token.is_punct)
    }

def find_missing_keywords(job_description, resume_details):
    """
    Return the job-description keywords that do not occur in the resume.

    Both arguments are passed to ``extract_keywords`` (job description
    first) and the resume keywords are subtracted from the job keywords.
    """
    wanted = extract_keywords(job_description)
    present = extract_keywords(resume_details)
    return wanted - present

# Keywords present in `job_description` but absent from `resume_text`; both
# are module-level variables assigned by earlier cells.
missing_keywords = find_missing_keywords(job_description, resume_text)

print(f"Missing Keywords: {missing_keywords}")


Missing Keywords: {'learning', 'engineer', 'machine', 'python', 'look'}
