In [1]:
# YOUR RESUME
# Plain-text resume of the candidate. Replace the contents of this string with
# your own resume; the analysis cell cleans it with preprocess_text() before
# extracting keywords from it.
resume = """
Summary

Experienced Data Analyst with over 6 years of experience in data analysis, reporting, and visualization. Proven track record of providing valuable insights to inform business decisions. Skilled in data modeling, SQL, Python, and Tableau.

Experience

Data Analyst - ABC Company, New York, NY (2019 - Present)

Analyze large datasets to identify trends and patterns, providing insights to inform business decisions.
Develop and maintain dashboards and reports to track key performance indicators.
Collaborate with stakeholders to identify business needs and requirements.
Data Analyst - XYZ Inc., San Francisco, CA (2017 - 2019)

Conducted ad hoc analysis to support business operations and strategic decision-making.
Built and maintained predictive models using Python and machine learning algorithms.
Collaborated with cross-functional teams to implement data-driven solutions.
Junior Data Analyst - DEF Corporation, Boston, MA (2015 - 2017)

Extracted and transformed data from various sources to support reporting and analysis.
Developed data visualizations and dashboards using Tableau.
Conducted exploratory analysis to identify trends and patterns in data.
Education

Master of Science in Data Science - University of California, Berkeley (2015)

Bachelor of Science in Mathematics - Boston University (2013)Skills

Proficient in SQL, Python, and Tableau
Experience with machine learning and predictive modeling
Strong analytical and problem-solving skills
Excellent communication and collaboration abilities.
"""

In [2]:
# JOB DESCRIPTION
# Plain-text job posting to compare the resume against, written here as one
# responsibility or requirement per line. Replace the contents of this string
# with the posting you are targeting.
job_post = """
Consult with business stakeholders on appropriate experimental design to answer important questions and ensure new features/initiatives are being tested rigorously
Partner with product and technology teams to make sure that we are capturing relevant data; collaborate with data engineering teams to get appropriate datasets built in a format that streamlines analysis and enables automation and reporting
Help us enhance our experimentation platform by constantly evaluating if we are measuring appropriate business metrics, implementing statistical best practices, and validating our approach is driving good business outcomes
Evaluate financial impact of experiments and other business changes/initiatives
Build statistical/machine learning models to understand customer behavior and what drives experimentation results
Be curious! Surface insights and trends that improve our customer experience, help us grow faster and generate more profit
3+ years of experience in related field
Strong analytical and problem-solving skills with a t rack record of leveraging data to influence business decisions and strategy
Ability to communicate complex analyses to both technical and non-technical stakeholders, including senior leadership; synthesize results with clear, data visualizations
Ability to work on cross-functional projects and manage expectations across diverse stakeholders with competing priorities
Solid grasp of statistical techniques (regression models, significance testing, power calculations, causal inference, etc); advanced training in statistics and machine learning preferred
Ability to work with large datasets and perform custom aggregations and data cleaning to meet analysis needs; high level of proficiency using SQL (required)
Proficiency with data analysis in Python or R (required)
Experience working in an e-commerce or other direct to consumer business preferred
Experience with A/B testing, search and recommender systems, and/or personalization preferred
"""

In [3]:
import re
from fuzzywuzzy import fuzz
import nltk
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from Levenshtein import distance as levenshtein_distance
import pandas as pd
from rake_nltk import Rake
from fuzzywuzzy import process

def preprocess_text(text):
    r"""Normalise free text into lowercase tokens separated by single spaces.

    Every run of punctuation, whitespace, decimal digits and underscores is
    collapsed into one space, and leading/trailing spaces are removed.

    Underscores are listed explicitly in the pattern because the regex class
    ``\w`` counts ``_`` as a word character; a pattern built only on
    ``[^\w\s]`` would leave ``data_science`` glued together as one token.

    Args:
        text: Raw input string.

    Returns:
        The cleaned, lowercased string. It is empty when ``text`` contains
        nothing but separators.
    """
    # \W matches punctuation and whitespace, \d matches digits, and "_" has to
    # be named separately because \W does not include it.
    return re.sub(r'[\W\d_]+', ' ', text.lower()).strip()

# Extract keywords from text
def extract_keywords(text, ratio_threshold=80):
    """Return the whitespace-separated words of ``text`` that pass a fuzzy filter.

    Each word is scored against the complete ``text`` with
    ``fuzz.token_set_ratio``. Words scoring at least ``ratio_threshold`` are
    returned in their original order, repeats included.

    NOTE(review): every candidate is itself a token of ``text``, so this
    filter presumably lets nearly every word through -- confirm that this is
    the intended behaviour.
    """
    return [
        token
        for token in text.split()
        if fuzz.token_set_ratio(token, text) >= ratio_threshold
    ]

##########################################################################
# Resume side: clean the text, then keep the words that pass the fuzzy filter.
resume_keywords = extract_keywords(preprocess_text(resume))

# Job-post side: the cleaned text replaces the raw posting under the same name.
job_post = preprocess_text(job_post)

# Split the cleaned posting into tokens and tag each one with its part of speech.
words = word_tokenize(job_post)
pos_tags = pos_tag(words)

# Part-of-speech tags worth keeping: nouns (NN, NNS), adjectives (JJ) and the
# verb forms (VB*).
included_tags = ['NN', 'NNS', 'JJ', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']

# Keep only the tokens whose tag is in the list above.
keywords = [token for token, token_tag in pos_tags if token_tag in included_tags]

stop = set(nltk.corpus.stopwords.words('english'))

# De-duplicate in first-seen order, then drop stopwords and anything shorter
# than five characters.
job_post_keywords = [
    candidate
    for candidate in dict.fromkeys(keywords)
    if len(candidate) >= 5 and candidate not in stop
]



# Group near-duplicate keywords under the first spelling that was seen
merged_keywords = {}
for candidate in job_post_keywords:
    # Best-scoring group name recorded so far (None while the dict is empty)
    nearest = process.extractOne(candidate, merged_keywords.keys(), scorer=fuzz.ratio)
    if not nearest or nearest[1] < 90:
        # Nothing close enough yet: the candidate starts a group of its own
        merged_keywords[candidate] = [candidate]
    else:
        group_name = nearest[0]
        merged_keywords[group_name].append(candidate)

# Only the group names carry on as the keyword list
job_post_keywords = list(merged_keywords)

# Drop every keyword whose Levenshtein distance to any keyword listed before it
# is below 3. The comparison runs against all earlier entries of the list,
# including ones that were themselves dropped.
final_keywords = [
    candidate
    for position, candidate in enumerate(job_post_keywords)
    if not any(
        levenshtein_distance(candidate, earlier) < 3
        for earlier in job_post_keywords[:position]
    )
]

job_post_keywords = final_keywords


# Smallest normalised Levenshtein similarity that still counts as a match
levenshtein_threshold = .625

# One entry per job-post keyword: the most similar resume keyword, or the
# string 'None' when even the best candidate stays below the threshold.
matched_keywords = []
for job_keyword in job_post_keywords:
    best_ratio, best_match = 0, 'None'
    for resume_keyword in resume_keywords:
        longest = max(len(job_keyword), len(resume_keyword))
        ratio = 1 - levenshtein_distance(job_keyword, resume_keyword) / longest
        # ">=" means that among equally good candidates the last one wins
        if ratio >= best_ratio:
            best_ratio, best_match = ratio, resume_keyword
    matched_keywords.append(best_match if best_ratio >= levenshtein_threshold else 'None')
        

# Extract ranked key phrases from the job post with RAKE.
# NOTE(review): job_post has already been run through preprocess_text at this
# point, so its punctuation is gone -- confirm that phrases running across
# sentence boundaries are acceptable here.
job_phrases = Rake()
job_phrases.extract_keywords_from_text(job_post)

# Keep the first occurrence of each phrase, in rank order, and discard
# single-word phrases.
ranked_phrases = [
    phrase
    for phrase in dict.fromkeys(job_phrases.get_ranked_phrases())
    if len(phrase.split()) > 1
]

# One entry per ranked phrase: the phrase itself when enough of its words
# resemble a resume keyword, otherwise the string 'None'.
matched_keyphrases = []
for phrase in ranked_phrases:
    phrase_words = phrase.split()
    # Number of words in the phrase with at least one sufficiently similar
    # resume keyword
    tally = sum(
        any(
            (1 - levenshtein_distance(word, resume_keyword) / max(len(word), len(resume_keyword)))
            >= levenshtein_threshold
            for resume_keyword in resume_keywords
        )
        for word in phrase_words
    )
    # A phrase counts as matched once a third of its words have a counterpart
    matched_keyphrases.append(phrase if tally >= len(phrase_words) / 3 else 'None')
##########################################################################
# WEIGHT TO PHRASE
# Share of the final score contributed by the phrase-level match rate; the
# word-level match rate receives the remainder, so the two weights sum to 1.
weight_phrase = .65
weight_word = 1-weight_phrase

# Entries equal to the string 'None' are the placeholders for "no match".
keyword_hits = sum(x != 'None' for x in matched_keywords)
phrase_hits = sum(x != 'None' for x in matched_keyphrases)

# A job post can yield no keywords or no multi-word phrases at all. Score an
# empty list as 0 instead of raising ZeroDivisionError.
keyword_part = keyword_hits/len(job_post_keywords)*weight_word if job_post_keywords else 0.0
phrase_part = weight_phrase*phrase_hits/len(ranked_phrases) if ranked_phrases else 0.0

# PERCENTAGE SIMILAR SCORE
# Weighted sum of the two match rates, rounded to four decimal places.
sim_score = round(keyword_part + phrase_part, 4)

print('Similarity score:', sim_score)

# Minimum score at which the resume is reported as a good match
THRESHOLD = 0.40

if sim_score >= THRESHOLD:
    print("Your resume is a good match for the job post.")
else:
    print("Your resume is not a good match for the job post.")

Similarity score: 0.5518
Your resume is a good match for the job post.


In [4]:
############################################
# RESULTS ANALYSIS #

In [5]:
# Pair each job-post keyword with the resume keyword it was matched with
# (the string 'None' marks keywords without a match).
keyword_columns = {
    'Job Post Keyword': job_post_keywords,
    'Matched Resume Keyword': matched_keywords,
}
keyword_df = pd.DataFrame(keyword_columns)

In [6]:
# Pair each ranked job-post phrase with its match result: the phrase itself
# when it was matched, otherwise the string 'None'.
keyphrase_columns = {
    'Job Post Keyphrase': ranked_phrases,
    'Matched Resume Keyphrase': matched_keyphrases,
}
keyphrase_df = pd.DataFrame(keyphrase_columns)

In [7]:
# Show every job-post keyword next to the resume keyword it was matched with
keyword_df

Unnamed: 0,Job Post Keyword,Matched Resume Keyword
0,consult,
1,business,business
2,stakeholders,stakeholders
3,appropriate,
4,experimental,experience
...,...,...
116,consumer,
117,search,
118,recommender,
119,systems,


In [8]:
# Show every ranked job-post phrase next to its match result
keyphrase_df

Unnamed: 0,Job Post Keyphrase,Matched Resume Keyphrase
0,statistical techniques regression models signi...,
1,measuring appropriate business metrics impleme...,
2,business changes initiatives build statistical...,business changes initiatives build statistical...
3,non technical stakeholders including senior le...,
4,driving good business outcomes evaluate financ...,driving good business outcomes evaluate financ...
5,customer experience help us grow faster,customer experience help us grow faster
6,manage expectations across diverse stakeholders,manage expectations across diverse stakeholders
7,proficiency using sql required proficiency,proficiency using sql required proficiency
8,meet analysis needs high level,meet analysis needs high level
9,machine learning preferred ability,machine learning preferred ability
