In [3]:
# Replace resume.pdf with the file name of your resume
resume='resume.pdf'
# Define the job post URL
url = 'https://www.linkedin.com/jobs/view/3580814554/'

In [4]:
import PyPDF2
import re
from fuzzywuzzy import fuzz
import requests
from bs4 import BeautifulSoup
import json
import nltk
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from Levenshtein import distance as levenshtein_distance
import pandas as pd
from rake_nltk import Rake
from fuzzywuzzy import process

# Extract text from PDF file
def extract_text_from_pdf(file_path):
    pdf_file = PyPDF2.PdfReader(file_path)
    text = ''
    for page in range(len(pdf_file.pages)):
        text += pdf_file.pages[page].extract_text()
    return text


def preprocess_text(text):
    text = text.lower()
    # remove HTML tags
    text = re.sub(r'<[^>]+>', ' ', text)
    # remove special characters
    text = re.sub(r'[^\w\s]', ' ', text)
    # remove digits
    text = re.sub(r'\d+', ' ', text)
    # remove extra spaces
    text = re.sub(r'\s+', ' ', text).strip()
    return text



# Extract keywords from text
def extract_keywords(text, ratio_threshold=80):
    words = text.split()
    keywords = []
    for word in words:
        if fuzz.token_set_ratio(word, text) >= ratio_threshold:
            keywords.append(word)
    return keywords

##########################################################################
resume_keywords = extract_keywords(preprocess_text(extract_text_from_pdf(resume)))
##########################################################################

# Send a request to the URL
response = requests.get(url)

soup = BeautifulSoup(response.content, 'html.parser')

# Find the script tag with the JSON-LD data
job_post_script = soup.find('script', {'type': 'application/ld+json'})

# Extract the text from the script tag and load it as JSON
job_post_json = json.loads(job_post_script.text)

# Extract the description text from the JSON
job_post_desc = job_post_json.get('description', '')

# Use BeautifulSoup to remove HTML tags from the description text
job_post = BeautifulSoup(job_post_desc, 'html.parser').get_text()

# Preprocess the job post description
job_post = preprocess_text(job_post)

# Tokenize the text into words
words = word_tokenize(job_post)

# Get the POS tags for each word
pos_tags = pos_tag(words)

# Define a list of POS tags that you want to include
included_tags = ['NN', 'NNS', 'JJ', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']

# Filter out the words that have POS tags that are not included
keywords = [word for (word, tag) in pos_tags if tag in included_tags]

stop = set(nltk.corpus.stopwords.words('english'))

# Print the keywords
job_post_keywords = list(filter(lambda x: len(x) >= 5 and x not in stop, list(dict.fromkeys(keywords))))



# Merge similar keywords
merged_keywords = {}
for keyword in job_post_keywords:
    # Use fuzzy matching to find the most similar keyword
    closest_match = process.extractOne(keyword, merged_keywords.keys(), scorer=fuzz.ratio)
    if closest_match and closest_match[1] >= 90:
        merged_keywords[closest_match[0]] += [keyword]
    else:
        merged_keywords[keyword] = [keyword]

# Use the merged keywords as the final keyword list
job_post_keywords = list(merged_keywords.keys())

# Remove similar keywords
final_keywords = []
for i, keyword1 in enumerate(job_post_keywords):
    # Check if keyword1 is too similar to any of the previous keywords
    is_similar = False
    for keyword2 in job_post_keywords[:i]:
        # Calculate the Levenshtein distance between the two keywords
        distance_score = levenshtein_distance(keyword1, keyword2)
        # If the distance is below a certain threshold, consider the keywords too similar
        if distance_score < 3:
            is_similar = True
            break
    # If keyword1 is not too similar to any previous keywords, add it to the final keyword list
    if not is_similar:
        final_keywords.append(keyword1)

job_post_keywords = final_keywords


# Define the minimum threshold for the Levenshtein ratio
levenshtein_threshold = .625
# Create a list to store the matched keywords
matched_keywords = []
for x in range(len(job_post_keywords)):
    max_ratio = 0
    for y in range(len(resume_keywords)):
        ratio = 1 - levenshtein_distance(job_post_keywords[x],resume_keywords[y]) / max(len(job_post_keywords[x]),len(resume_keywords[y]))
        if ratio >= max_ratio:
            max_ratio = ratio
            best_match = resume_keywords[y]
        else:
            continue
    if max_ratio >= levenshtein_threshold:
        matched_keywords.append(best_match)
    else:
        matched_keywords.append('None')
        

job_phrases = Rake()
job_phrases.extract_keywords_from_text(job_post)
ranked_phrases = job_phrases.get_ranked_phrases()
ranked_phrases = list(dict.fromkeys(ranked_phrases))
ranked_phrases = [phrase for phrase in ranked_phrases if len(phrase.split()) > 1]

# Create a list to store the matched keywords
matched_keyphrases= []
for x in range(len(ranked_phrases)):
    phrase_words = ranked_phrases[x].split()
    tally = 0
    for y in range(len(phrase_words)):
        for a in range(len(resume_keywords)):
            ratio = 1 - levenshtein_distance(phrase_words[y],resume_keywords[a]) / max(len(phrase_words[y]),len(resume_keywords[a]))
            if ratio >= levenshtein_threshold :
                tally += 1
                break
    if tally >= len(phrase_words)/3:
        matched_keyphrases.append(ranked_phrases[x])
    else:
        matched_keyphrases.append('None')
##########################################################################
# WEIGHT TO PHRASE
weight_phrase = .65
weight_word = 1-weight_phrase

# PERCENTAGE SIMILAR SCORE
sim_score = round((sum(x != 'None' for x in matched_keywords)/len(job_post_keywords)*weight_word) + (weight_phrase *sum(x != 'None' for x in matched_keyphrases)/len(ranked_phrases)),4)

print('Similarity score:', sim_score)

THRESHOLD = 0.4

if sim_score >= THRESHOLD:
    print("Your resume is a good match for the job post.")
else:
    print("Your resume is not a good match for the job post.")

Similarity score: 0.3971
Your resume is not a good match for the job post.


In [5]:
############################################
# RESULTS ANALYSIS #

In [6]:
keyword_df = pd.DataFrame({'Job Post Keyword': job_post_keywords,'Matched Resume Keyword':matched_keywords})

In [7]:
keyphrase_df = pd.DataFrame({'Job Post Keyphrase': ranked_phrases,'Matched Resume Keyphrase':matched_keyphrases})