In [None]:
# https://sbert.net/

import re
import spacy
from sentence_transformers import SentenceTransformer
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize


def clean_text(raw_text):
    set_of_stopwords = set(stopwords.words("english") + list(string.punctuation))
    lemmatizer = WordNetLemmatizer()

    # Convert text to lowercase and tokenize into words
    tokens = word_tokenize(raw_text.lower())
    # Remove stopwords and punctuation
    tokens = [token for token in tokens if token not in set_of_stopwords]
    # Lemmatize the remaining words
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    # Join the tokens back into a single string
    cleaned_text = " ".join(tokens)
    return cleaned_text

def clean_resume(resume_content):
    skills_pattern = re.compile(r'Skills\s*[:\n]', re.IGNORECASE)
    skills_match = skills_pattern.search(resume_content)

    if skills_match:
        skills_start = skills_match.end()
        skills_end = resume_content.find('\n\n', skills_start)
        skills_section = resume_content[skills_start:skills_end].strip()
        skills_lines = skills_section.split('\n')

        extracted_skills = []
        for line in skills_lines:
            line_skills = re.split(r'[:,-]', line)
            extracted_skills.extend([skill.strip() for skill in line_skills if skill.strip()])

        skills = list(set(extracted_skills))
    else:
        skills = []

    skills = ", ".join(skills)

    RESUME_SECTIONS = [
        "Contact Information", "Objective", "Summary", "Education", "Experience", 
        "Skills", "Projects", "Certifications", "Licenses", "Awards", "Honors", 
        "Publications", "References", "Technical Skills", "Computer Skills", 
        "Programming Languages", "Software Skills", "Soft Skills", "Language Skills", 
        "Professional Skills", "Transferable Skills", "Work Experience", 
        "Professional Experience", "Employment History", "Internship Experience", 
        "Volunteer Experience", "Leadership Experience", "Research Experience", 
        "Teaching Experience",
    ]

    experience_start = resume_content.find("Experience")
    if experience_start == -1:
        return ""

    experience_end = len(resume_content)
    for section in RESUME_SECTIONS:
        if section != "Experience":
            section_start = resume_content.find(section, experience_start)
            if section_start != -1:
                experience_end = min(experience_end, section_start)

    experience_section = resume_content[experience_start:experience_end].strip()

    cleaned_experience = clean_text(experience_section)

    cleaned_skills = clean_text(skills)


    return cleaned_experience + cleaned_skills

def compute_similarity(cleaned_resume, cleaned_jd):

    model = SentenceTransformer('all-MiniLM-L6-v2')
    sentences = [cleaned_resume, cleaned_jd]
    embeddings1 = model.encode(sentences[0])
    embeddings2 = model.encode(sentences[1])
    
    similarity_score = model.similarity(embeddings1, embeddings2)

    return similarity_score

In [44]:
resume_content = """
    Jackson Giemza
    Boulder, CO | (708)-340-8105 | jackson.giemza@gmail.com | github.com/JacksonGiemza 

    Enthusiastic information science student with hands-on experience as a data scientist, equipped with
    technical and analytical skills to derive data-driven decisions. Passionate about harnessing data to
    solve complex, meaningful problems in a vibrant, innovative setting.
    Education
    
    University of Colorado, Boulder, CO | Expected Graduation Dec 2025
    Bachelor of Science, Information Science | Minor, Philosophy 
    Relevant Courses: Data Visualization, Statistics, Python for Info Sci 1&2, Linear Algebra, R for
    Data Science, Logic, Quantitative Reasoning, Physics 1&2, Calculus, Economics, UI/UX Design

    Experience
    Risk Technology Analyst | RJ O’Brien & Associates
    May 2024 - Aug. 2024, Chicago, IL
    Developed a Python-based solution to automate financial audit requests, reducing work for
    analysts by dynamically generating and scheduling 15+ personalized emails bi-weekly
    Led the project from concept to deployment, including training future maintainers and creating
    a Power BI dashboard for monitoring requests and manually sending custom emails ad hoc
    Identified the need for and developed a Python GUI for generating customized anonymous test
    data for users of all technical expertise
    Served as an ambassador for new AI tools by interviewing coworkers to identify pain points in
    AI adoption and conducting 10+ one-on-one demos teaching how to leverage gen. AI 

    Data Science Intern | CloudQuant
    May 2023 - Aug. 2023, Chicago, IL
    Expanded the data catalog by 8000+ datasets through seamless integration of various data APIs
    and dynamic web scrapers
    Strategically utilized the new OpenAI API to automate data entry and cleaning processes,
    significantly enhancing efficiency while reducing manual workload
    Drove insights through analysis and data visualization

    Summer Intern | RJ O’Brien & Associates
    May 2022 - Aug. 2022, Chicago, IL
    Engineered automated risk reports for quarterly and daily distribution.
    Designed a comprehensive financial analysis dashboard for vendor viability.
    Conducted rigorous User Acceptance Testing for the credit API. 

    Projects
    Statistical Arbitrage Trading Strategy
    Leveraged public historical market data to identify mean-reverting stock pairs based on
    cointegration analysis
    Backtesting the strategy over 3 years of data, evaluating performance with a Sharpe ratio (1.4),
    max drawdown (-12%), and win rate (60%), demonstrating consistent profitability in diverse
    market conditions.

    Skills: Python (Pandas, NumPy, Matplotlib, Altair, Spacy, Selenium, Scikit, PyTorch, TensorFlow),
    SQL (MySQL, PostgreSQL), Git, Tableau, PowerBI, NLP
"""

# jd_content = """
# About the job
# Machine Learning Engineer - Cambridge - Cutting Edge Consultancy



# A Machine Learning Engineer is required for a very exciting consultancy. As a Machine Learning Engineer, you will play a key role in making real-world impact and work with an elite teams of some of the top scientists, engineers, and designers in the company.


# Machine Learning Engineer - Ideal skillset would include:

# Top Academics in a relevant field
# Strong knowledge of TensorFlow, PyTorch, Keras and Scikit-Learn
# Ideally 1+ year professional experience as a Machine Learning Engineer
# Research minded and experimental approach to problem solving


# This is a very innovative consultancy in the heart of Cambridge working on some exciting cyber projects.


# Machine Learning Engineer - Benefits:

# Private medical insurance for you and your family
# A comprehensive relocation package
# Working within an exceptional team
# 25 Days annual holiday + bank holidays
# In office perks such as lunch and snacks provided


# Machine Learning Engineer - Cambridge - Cutting Edge Consultancy
# """

jd_content = """
About the job
About the Company: Oeson is a leading IT corporation globally recognized for its expertise in providing top-notch IT and Ed-tech services. Specializing in digital marketing, data science, data analytics, UI-UX design, web development, and app development, we are dedicated to innovation, excellence, and empowering talents worldwide.

Learn More: www.oesonlearning.com



Job Summary:



Oeson is seeking enthusiastic individuals who are looking to learn with us in the field of Data Science while working on live projects internationally. We are not just offering a flexible work environment but also offering to work with people in a global team.



Projects You Will Work On:



- Finance Fraud Detection: Develop advanced fraud detection algorithms leveraging financial data analysis.

- Recommender System: Contribute to personalized recommendation systems, enhancing user experiences across platforms.

- Sentiment Analysis: Explore sentiment analysis to extract insights from textual data, shaping user sentiment understanding.

- Chatbots: Engage in intelligent chatbot development, revolutionizing customer interactions and support.

- Image/Audio Video Classification: Push boundaries with multimedia technology by working on image and audio video classification projects.

- Text Analysis: Uncover hidden patterns in textual data through sophisticated text analysis techniques.



Roles & Responsibilities:



- Collaborate with our esteemed data science experts to collect, clean, and analyze extensive datasets, honing skills in data preprocessing and visualization.

- Contribute to the development of predictive models and algorithms, employing cutting-edge machine learning techniques to solve real-world challenges.

- Work closely with team members to design, implement, and evaluate experiments, fostering a collaborative and innovative environment.

- Stay updated with the latest industry trends and best practices in data science, applying newfound knowledge to enhance project outcomes.



Qualifications:



- Currently pursuing any degree showcasing a strong commitment to continuous learning and professional growth.

- Exceptional written and verbal communication skills, vital for effective collaboration and articulation of complex ideas.

- Demonstrated ability to work both independently and as part of a cohesive team, highlighting adaptability and strong teamwork capabilities.



Note:



This position is unpaid. After submitting your application, our team will contact you to proceed with the application details and joining process.



Location: 

Remote : United States



"""

In [45]:
cleaned_resume = clean_resume(resume_content)
cleaned_jd = clean_text(jd_content)
compute_similarity(cleaned_resume, cleaned_jd)

tensor([[0.6409]])

In [None]:
cleaned_resume

'experience risk technology analyst rj ’ brien associate may 2024 aug. 2024 chicago il developed python-based solution automate financial audit request reducing work analyst dynamically generating scheduling 15+ personalized email bi-weekly led project concept deployment including training future maintainer creating power bi dashboard monitoring request manually sending custom email ad hoc identified need developed python gui generating customized anonymous test data user technical expertise served ambassador new ai tool interviewing coworkers identify pain point ai adoption conducting 10+ one-on-one demo teaching leverage gen. ai data science intern cloudquant may 2023 aug. 2023 chicago il expanded data catalog 8000+ datasets seamless integration various data apis dynamic web scraper strategically utilized new openai api automate data entry cleaning process significantly enhancing efficiency reducing manual workload drove insight analysis data visualization summer intern rj ’ brien as

In [5]:
import re

def clean_markdown(text):
    """Removes advanced Markdown syntax while preserving readable text."""
    
    # Remove headers (e.g., ###, ##, #)
    text = re.sub(r'#{1,6}\s*', '', text)  

    # Remove bold (**text**) and italic (*text* or _text_)
    text = re.sub(r'\*\*(.*?)\*\*', r'\1', text)  # Bold
    text = re.sub(r'\*(.*?)\*', r'\1', text)      # Italic
    text = re.sub(r'_(.*?)_', r'\1', text)        # Italic with underscore
    
    # Remove inline code (`code`)
    text = re.sub(r'`(.*?)`', r'\1', text)

    # Remove links but keep the anchor text: [text](url)
    text = re.sub(r'\[([^\]]+)\]\([^\)]+\)', r'\1', text)

    # Remove horizontal rules (---, ***, ___)
    text = re.sub(r'(\n[-*_]{3,}\n)', '\n', text)

    # Remove list markers (-, *, +) and extra spaces
    text = re.sub(r'^\s*[-*+] ', '', text, flags=re.MULTILINE)

    # Remove extra spaces caused by formatting
    text = re.sub(r'\n\s*\n', '\n\n', text).strip()

    text = text.lower()

    return text

resume_cleaned = clean_markdown(resume)

print(resume_cleaned)

p j g ( )
https://github.com/jacksongiemza boulder, co

enthusiastic information science student with hands-on experience as a data scientist,
equipped with technical and analytical skills to derive data-driven decisions. passionate
about harnessing data to solve complex meaningful problems in a vibrant, innovative setting.

education
university of colorado, boulder, co | expected graduation may 2025 bachelor of science, information science | minor: philosophy
relevant courses:                               applications & programs:
data visualization, statistics, python for info sci 1&2       python (pandas, numpy, matplotlib, altair), excel,
r for data science, logic, quantitative reason.            openai api, selenium, git, r, sql, excel, tableau,
tkinter, power platforms.

experience
risk technology analyst | rj o’brien associates        may 2024 - aug. 2024, chicago, il
i was invited to return to rjo’s risk team following my internship with them in 2022. drawing on the
skills i de

In [13]:
# https://github.com/LIAAD/yake

import yake

def yake_extract_keywords(text, num_keywords=10):
    """Extracts top keywords using YAKE."""
    kw_extractor = yake.KeywordExtractor(lan="en", n=2, dedupLim=1, top=200)
    keywords = kw_extractor.extract_keywords(text)
    return [kw[0] for kw in keywords]  # Return only the keyword text


keywords = yake_extract_keywords(resume_cleaned)

print(len(keywords))
print(keywords)

200
['data-driven decisions', 'derive data-driven', 'data', 'science', 'information science', 'science student', 'enthusiastic information', 'data science', 'data scientist', 'hands-on experience', 'analytical skills', 'information', 'developed', 'boulder', 'risk', 'aug', 'chicago', 'projects', 'python', 'excel', 'scientist', 'equipped', 'decisions', 'data visualization', 'summer', 'credit', 'enthusiastic', 'student', 'hands-on', 'analytical', 'derive', 'data-driven', 'financial', 'experience', 'o’brien associates', 'science intern', 'openai api', 'innovative setting', 'skills', 'concept', 'technical', 'solve complex', 'complex meaningful', 'meaningful problems', 'api', 'financial information', 'power', 'o’brien', 'associates', 'harnessing data', 'team', 'work', 'visualization', 'openai', 'successfully', 'experience risk', 'led', 'intern', 'requests', 'emails', 'generating', 'automate', 'selenium', 'adoption', 'dashboard', 'rjo', 'reducing', 'cloudquant', 'datasets', 'tasks', 'analysis

In [6]:
# https://pypi.org/project/rake-nltk/

from rake_nltk import Rake

def nltk_rake(text):
    rake_nltk_var = Rake()

    rake_nltk_var.extract_keywords_from_text(text)
    keyword_extracted = rake_nltk_var.get_ranked_phrases()

    return keyword_extracted

nltk_keywords = nltk_rake(resume_cleaned)

print(len(nltk_keywords))
print(nltk_keywords)

165
['developed dynamic web scrapers using selenium', 'largest independent futures brokerage firm', 'expected graduation may 2025 bachelor', 'manually sending emails ad hoc', 'streamlined customer financial information retrieval', 'successfully executed 4 major projects', 'efficiently managed numerous smaller tasks', '’ brien associates may 2024', 'generating customized anonymous test data', '’ brien associates may 2022', 'co enthusiastic information science student', 'solve complex meaningful problems', 'including training future maintainers', 'engineered automated risk reports', 'automate financial audit requests', 'successfully led key projects', 'comprehensive financial analysis dashboard', 'experience risk technology analyst', 'data visualization summer intern', 'scheduling personalized emails', 'automate data entry', 'significantly enhancing efficiency', 'power bi dashboard', 'philosophy relevant courses', 'p j g', 'info sci 1', 'identify pain points', 'https :// github', 'evalua

In [4]:
import nltk
nltk.download('punkt')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\jacks\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True