# In[1]:
import os
import textract
import docx2txt
import pandas as pd
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


# In[2]:

# Load spaCy's small English pipeline once, at module level, so that every
# call to preprocess_text reuses it. The dependency parser and named-entity
# recognizer are disabled; preprocess_text only reads lemma_, is_stop and
# is_alpha from each token.
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])

def read_text_from_pdf_or_docx(file_path):
    """Extract the plain text of a PDF or DOCX file.

    The file type is chosen from the file extension. The comparison is
    case-insensitive, so names such as ``CV.PDF`` or ``Resume.Docx`` are
    handled like their lowercase counterparts instead of silently yielding
    an empty string.

    Args:
        file_path: Path (as a string) of the document to read.

    Returns:
        The extracted text, or an empty string when the extension is
        neither ``.pdf`` nor ``.docx``.
    """
    # Normalise once so both extension checks ignore letter case; the
    # original path is still what gets handed to the extractors.
    lowered_path = file_path.lower()
    if lowered_path.endswith(".pdf"):
        # textract.process returns bytes; decode them to str.
        return textract.process(file_path).decode('utf-8')
    if lowered_path.endswith(".docx"):
        return docx2txt.process(file_path)
    return ""

def preprocess_text(text):
    """Reduce raw text to a list of lemmas.

    The text is run through the module-level spaCy pipeline ``nlp``. Stop
    words and tokens that are not purely alphabetic are dropped; the lemma
    of every remaining token is returned in document order.

    Args:
        text: The raw text to process.

    Returns:
        A list of lemma strings.
    """
    lemmas = []
    for token in nlp(text):
        # Skip stop words and tokens that are not alphabetic.
        if token.is_stop or not token.is_alpha:
            continue
        lemmas.append(token.lemma_)
    return lemmas

def rank_resumes(job_description_path, resumes_folder):
    """Rank the resumes in a folder by similarity to a job description.

    Every ``.pdf`` and ``.docx`` file directly inside ``resumes_folder`` is
    converted to text, lemmatized with ``preprocess_text`` and compared with
    the (equally preprocessed) job description using cosine similarity over
    TF-IDF vectors.

    Args:
        job_description_path: Path of a text file holding the job description.
        resumes_folder: Directory containing the resume files.

    Returns:
        A list of ``(resume_file_name, similarity_score, rank)`` tuples sorted
        by descending similarity, with ``rank`` starting at 1. The list is
        empty when the folder contains no ``.pdf``/``.docx`` files.
    """
    # Decode explicitly: without ``encoding`` the result depends on the
    # platform's default locale. Undecodable bytes are replaced rather than
    # aborting the whole run.
    with open(job_description_path, 'r', encoding='utf-8', errors='replace') as f:
        job_description = f.read()

    # os.listdir returns entries in arbitrary order; sorting makes the order
    # of equally scored resumes reproducible (the sort below is stable).
    resume_names = sorted(
        entry for entry in os.listdir(resumes_folder)
        if entry.endswith((".pdf", ".docx"))
    )
    if not resume_names:
        # Nothing to rank: return an empty ranking instead of passing an
        # empty document list on to scikit-learn.
        return []

    job_description_tokens = preprocess_text(job_description)

    resumes = []
    for resume_file in resume_names:
        resume_path = os.path.join(resumes_folder, resume_file)
        resume_text = read_text_from_pdf_or_docx(resume_path)
        resumes.append(" ".join(preprocess_text(resume_text)))

    # The vectorizer is fitted on the job description alone, so only words
    # that occur in the job description contribute to the resume vectors.
    vectorizer = TfidfVectorizer(stop_words='english')
    job_description_tfidf = vectorizer.fit_transform([" ".join(job_description_tokens)])
    resumes_tfidf = vectorizer.transform(resumes)

    # Row 0 holds the similarity of the single job description to each resume.
    similarity_scores = cosine_similarity(job_description_tfidf, resumes_tfidf)[0]

    ranked_resumes = sorted(
        zip(resume_names, similarity_scores),
        key=lambda pair: pair[1],
        reverse=True,
    )

    return [
        (resume_name, similarity_score, rank)
        for rank, (resume_name, similarity_score) in enumerate(ranked_resumes, start=1)
    ]

def save_rankings_to_excel(ranked_resumes, output_excel):
    """Write ranking rows to an Excel file.

    Args:
        ranked_resumes: Rows of three values each, mapped in order to the
            columns ``Resume_Path``, ``Similarity_Score`` and ``Ranking``.
        output_excel: Destination handed to ``DataFrame.to_excel``.
    """
    column_names = ['Resume_Path', 'Similarity_Score', 'Ranking']
    rankings_table = pd.DataFrame(ranked_resumes, columns=column_names)
    # index=False keeps the DataFrame's row index out of the sheet.
    rankings_table.to_excel(output_excel, index=False)

# Script entry point: rank every resume in the folder against the job
# description, write the ranking to an Excel file and report where it went.
if __name__ == "__main__":
    # Hard-coded input/output locations; edit these to run elsewhere.
    job_description_path = "F:/Resume_Ranking/job_description.txt"
    resumes_folder = "F:/Resume_Ranking/Resumes"
    output_excel = "F:/Resume_Ranking/resume_rankings1.xlsx"

    ranked_resumes = rank_resumes(job_description_path, resumes_folder)
    save_rankings_to_excel(ranked_resumes, output_excel)

    print("Rankings saved to:", output_excel)


# Output: Rankings saved to: F:/Resume_Ranking/resume_rankings1.xlsx
