In [1]:
import os
import shutil
import docx2txt
import PyPDF2
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.tokenize import RegexpTokenizer
from sklearn.feature_extraction.text import TfidfVectorizer

# function to preprocess the text
def preprocess_text(text):
    """Normalise raw text into a space-separated string of stemmed tokens.

    The text is lower-cased and split into runs of word characters, English
    stop words (NLTK list) are dropped, and each remaining token is reduced
    to its Porter stem.
    """
    # Lower-case first so the stop-word comparison below is case-insensitive.
    tokens = RegexpTokenizer(r'\w+').tokenize(text.lower())
    english_stop_words = set(stopwords.words('english'))
    porter = PorterStemmer()
    return ' '.join(
        porter.stem(token)
        for token in tokens
        if token not in english_stop_words
    )

# Load the job description (.docx) and run it through preprocess_text so it
# is tokenised, stop-word filtered and stemmed before vectorisation.
jd_filename = "/home/vishal/Documents/Python/Job-Descriptions/3Pillar_Global-MLE.docx"
jd_text = preprocess_text(docx2txt.process(jd_filename))

# Read every PDF resume in the folder and preprocess its extracted text.
resumes_folder = "/home/vishal/Documents/Python/Resume"
resumes = []           # preprocessed text, one entry per PDF
resume_filenames = []  # original file names, index-aligned with `resumes`
# os.listdir() returns entries in arbitrary order; sorting makes the index
# assignment (and hence the relative order of equal-score resumes after the
# stable sort further down) reproducible between runs.
for filename in sorted(os.listdir(resumes_folder)):
    # Compare the extension case-insensitively so e.g. "CV.PDF" is not skipped.
    if not filename.lower().endswith('.pdf'):
        continue
    with open(os.path.join(resumes_folder, filename), 'rb') as f:
        resume_pdf = PyPDF2.PdfReader(f)
        # Join pages with a newline: plain concatenation glues the last word
        # of one page to the first word of the next, producing a bogus token.
        # `or ""` tolerates a page whose extract_text() yields nothing
        # (NOTE(review): defensive; confirm against the installed PyPDF2).
        resume_text = "\n".join(
            page.extract_text() or "" for page in resume_pdf.pages
        )
    resumes.append(preprocess_text(resume_text))
    resume_filenames.append(filename)

# Build one TF-IDF model over the JD and all resumes so they share a vocabulary.
vectorizer = TfidfVectorizer()

# Row 0 of the matrix is the JD; row k (k >= 1) corresponds to resumes[k - 1].
tfidf_matrix = vectorizer.fit_transform([jd_text] + resumes)

# Score every resume against the JD. TfidfVectorizer L2-normalises each row by
# default, so the plain dot product of two rows is their cosine similarity.
jd_vector = tfidf_matrix[0]
cosine_similarities = [
    jd_vector.dot(tfidf_matrix[row].T).toarray()[0][0]
    for row in range(1, len(resumes) + 1)
]

# (resume index, score) pairs, best match first; sorted() is stable, so equal
# scores keep their original index order.
ranked_resumes = sorted(
    enumerate(cosine_similarities),
    key=lambda pair: pair[1],
    reverse=True,
)

# Destination folder for the ranked copies (created if it does not exist yet).
new_folder_name = "/home/vishal/Documents/Python/Ranked-Resumes"
os.makedirs(new_folder_name, exist_ok=True)

# Copy each resume under a name made of its rank (1 = best match) and its
# similarity score rounded to two decimals; the original file name is not kept.
for rank, (resume_index, similarity) in enumerate(ranked_resumes, start=1):
    source_path = os.path.join(resumes_folder, resume_filenames[resume_index])
    new_filename = f"{rank}_Resume_{similarity:.2f}.pdf"
    target_path = os.path.join(new_folder_name, new_filename)
    # copy2 also carries over file metadata such as timestamps.
    shutil.copy2(source_path, target_path)
    print(f"{rank}. {new_filename} saved in {new_folder_name}")


1. 1_Resume_0.22.pdf saved in /home/vishal/Documents/Python/Ranked-Resumes
2. 2_Resume_0.10.pdf saved in /home/vishal/Documents/Python/Ranked-Resumes
3. 3_Resume_0.10.pdf saved in /home/vishal/Documents/Python/Ranked-Resumes
4. 4_Resume_0.03.pdf saved in /home/vishal/Documents/Python/Ranked-Resumes
5. 5_Resume_0.00.pdf saved in /home/vishal/Documents/Python/Ranked-Resumes
