In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')
from sklearn.naive_bayes import MultinomialNB
from sklearn.multiclass import OneVsRestClassifier
from sklearn import metrics
from sklearn.metrics import accuracy_score
from pandas.plotting import scatter_matrix
from sklearn.neighbors import KNeighborsClassifier
from sklearn import metrics

In [3]:
resumeDataSet = pd.read_csv(r'Training Data/UpdatedResumeDataSet.csv',encoding='utf-8')

In [4]:
import re
def cleanResume(resumeText):
    resumeText = re.sub('http\S+\s*', ' ', resumeText)
    resumeText = re.sub('RT|cc', ' ', resumeText)
    resumeText = re.sub('#\S+', '', resumeText)
    resumeText = re.sub('@\S+', '  ', resumeText)
    resumeText = re.sub('[%s]' % re.escape("""!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~"""), ' ', resumeText)
    resumeText = re.sub(r'[^\x00-\x7f]',r' ', resumeText) 
    resumeText = re.sub('\s+', ' ', resumeText)
    return resumeText
    
resumeDataSet['cleaned_resume'] = resumeDataSet.Resume.apply(lambda x: cleanResume(x))

In [5]:
import nltk
from nltk.corpus import stopwords
import string

oneSetOfStopWords = set(stopwords.words('english')+['``',"''"])
totalWords =[]
Sentences = resumeDataSet['Resume'].values
cleanedSentences = ""
for i in range(0,160):
    cleanedText = cleanResume(Sentences[i])
    cleanedSentences += cleanedText
    requiredWords = nltk.word_tokenize(cleanedText)
    for word in requiredWords:
        if word not in oneSetOfStopWords and word not in string.punctuation:
            totalWords.append(word)

In [6]:
from sklearn.preprocessing import LabelEncoder

var_mod = ['Category']
le = LabelEncoder()
for i in var_mod:
    resumeDataSet[i] = le.fit_transform(resumeDataSet[i])

In [7]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack

requiredText = resumeDataSet['cleaned_resume'].values
requiredTarget = resumeDataSet['Category'].values

word_vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    stop_words='english',
    max_features=1500)
word_vectorizer.fit(requiredText)
WordFeatures = word_vectorizer.transform(requiredText)

X_train,X_test,y_train,y_test = train_test_split(WordFeatures,requiredTarget,random_state=0, test_size=0.2)
clf = OneVsRestClassifier(KNeighborsClassifier())
clf.fit(X_train, y_train)

In [8]:
from pdf2docx import Converter
from docx import Document
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet as wn

def convert_pdf_to_docx(pdf_path, docx_path):
    cv = Converter(pdf_path)
    cv.convert(docx_path, start=0, end=None)
    cv.close()

def extract_text_from_docx(file_path):
    doc = Document(file_path)
    full_text = []
    for paragraph in doc.paragraphs:
        full_text.append(paragraph.text)
    return '\n'.join(full_text)

In [9]:
def readresumes(folder_path):
    cleaned_resumes = []
    resume_files = os.listdir(folder_path)
    for file in resume_files:
        if file.endswith('.docx'):
            file_path = os.path.join(folder_path, file)
            user_resume = extract_text_from_docx(file_path)
            cleaned_resume = cleanResume(user_resume)
            cleaned_resumes.append(cleaned_resume)
    return cleaned_resumes

In [10]:
cleaned_resumes = readresumes(folder_path="Resumes_Input")

In [11]:
prediction = clf.predict(X_test) 
print('Accuracy on training set: {:.2f}'.format(clf.score(X_train, y_train)*100))
print('Accuracy on test set: {:.2f}'.format(clf.score(X_test, y_test)*100))

Accuracy on training set: 99.35
Accuracy on test set: 98.96


In [12]:
for cleaned_resume in cleaned_resumes:
    user_text = [cleaned_resume]
    user_features = word_vectorizer.transform(user_text)

    user_prediction = clf.predict(user_features)

    predicted_category = le.inverse_transform(user_prediction)

    user_prediction_proba = clf.predict_proba(user_features)
    class_labels = le.classes_

    print("\nPredicting Category...")
    for label, proba in zip(class_labels, user_prediction_proba[0]):
        print(f"{label}: {proba:.2%}")

    print("Predicted Category for the User Input Resume: ", predicted_category[0])


Predicting Category...
Advocate: 0.00%
Arts: 0.00%
Automation Testing: 0.00%
Blockchain: 0.00%
Business Analyst: 0.00%
Civil Engineer: 0.00%
Data Science: 0.00%
Database: 0.00%
DevOps Engineer: 0.00%
DotNet Developer: 0.00%
ETL Developer: 0.00%
Electrical Engineering: 0.00%
HR: 80.00%
Hadoop: 0.00%
Health and fitness: 20.00%
Java Developer: 0.00%
Mechanical Engineer: 0.00%
Network Security Engineer: 0.00%
Operations Manager: 0.00%
PMO: 0.00%
Python Developer: 0.00%
SAP Developer: 0.00%
Sales: 0.00%
Testing: 0.00%
Web Designing: 0.00%
Predicted Category for the User Input Resume:  HR

Predicting Category...
Advocate: 0.00%
Arts: 0.00%
Automation Testing: 0.00%
Blockchain: 0.00%
Business Analyst: 0.00%
Civil Engineer: 0.00%
Data Science: 0.00%
Database: 0.00%
DevOps Engineer: 0.00%
DotNet Developer: 0.00%
ETL Developer: 0.00%
Electrical Engineering: 0.00%
HR: 0.00%
Hadoop: 0.00%
Health and fitness: 0.00%
Java Developer: 0.00%
Mechanical Engineer: 100.00%
Network Security Engineer: 0.00%

In [13]:
from transformers import BartTokenizer, BartForConditionalGeneration

def generate_resume_summary(resume_texts):
    tokenizer = BartTokenizer.from_pretrained('facebook/bart-large-cnn')
    model = BartForConditionalGeneration.from_pretrained('facebook/bart-large-cnn')
    summaries = []
    for resume_text in resume_texts:
        inputs = tokenizer(resume_text, max_length=1024, return_tensors='pt', truncation=True)
        summary_ids = model.generate(inputs['input_ids'], num_beams=4, max_length=100, early_stopping=True)
        summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True, clean_up_tokenization_spaces=False)
        summaries.append(summary)
    
    return summaries

In [14]:
def save_summary_to_csv(output_folder, summary, candidate_number):
    df= pd.DataFrame([summary])
    output_file = os.path.join(output_folder, f"{candidate_number}_summary.csv")
    df.to_csv(output_file, index=False)
    print(f"Summary saved to: {output_file}")

In [15]:
summaries = generate_resume_summary(cleaned_resumes)
output_folder = "Candidate Summary_Output"

for i, summary in enumerate(summaries):
    print(f"Summary for Resume {i + 1}:")
    save_summary_to_csv(output_folder, summary, i+1)
    # print(summary)
    print()

Summary for Resume 1:
Summary saved to: Candidate Summary_Output\1_summary.csv

Summary for Resume 2:
Summary saved to: Candidate Summary_Output\2_summary.csv

Summary for Resume 3:
Summary saved to: Candidate Summary_Output\3_summary.csv

Summary for Resume 4:
Summary saved to: Candidate Summary_Output\4_summary.csv



In [16]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet as wn

def calculate_resume_rating(resume_texts):
    ratings = []
    for resume_text in resume_texts:
        tokens = word_tokenize(resume_text.lower())
        word_lengths = [len(word) for word in tokens]
        avg_word_length = sum(word_lengths) / len(word_lengths) if word_lengths else 0

        unique_words = set(tokens)
        vocabulary_richness = len(unique_words) / len(tokens) if tokens else 0

        synsets = set()
        for word in unique_words:
            synsets.update(wn.synsets(word))
        semantic_richness = len(synsets) / len(unique_words) if unique_words else 0
        
        rating = (avg_word_length * 2 + vocabulary_richness * 4 + semantic_richness * 4) / 10
        ratings.append(rating)
    
    return ratings

resume_ratings = calculate_resume_rating(cleaned_resumes)

for i, rating in enumerate(resume_ratings):
    print(f"Rating for Resume {i + 1}: {rating:.2f}/10")

Rating for Resume 1: 3.36/10
Rating for Resume 2: 3.49/10
Rating for Resume 3: 3.41/10
Rating for Resume 4: 3.64/10


In [17]:
resume_rating_tuples = list(zip(cleaned_resumes, resume_ratings))
sorted_resumes = sorted(resume_rating_tuples, key=lambda x: x[1], reverse=True)

print("Ranking List:")
for rank, (resume_text, rating) in enumerate(sorted_resumes, start=1):

    resume_index = cleaned_resumes.index(resume_text) + 1
    print(f"Rank {rank}: Resume {resume_index}")

Ranking List:
Rank 1: Resume 4
Rank 2: Resume 2
Rank 3: Resume 3
Rank 4: Resume 1
