In [1]:
# Load the raw Markdown resume from the working directory as UTF-8 text.
with open("resume.md", encoding="utf-8") as resume_file:
    resume = resume_file.read()


In [12]:
import re

def clean_markdown(text):
    """Strip common Markdown syntax from ``text`` and return lowercase plain text.

    Line-level constructs (headers, horizontal rules, list markers) are removed
    before inline emphasis so that a ``***`` rule or a ``* `` bullet is never
    mistaken for an emphasis delimiter.

    Args:
        text: Markdown source as a single string.

    Returns:
        The text with header markers, horizontal rules, list markers,
        bold/italic delimiters, inline-code backticks and link targets removed,
        runs of blank lines collapsed to one blank line, surrounding whitespace
        stripped, and everything lowercased.
    """
    # Headers: only a run of 1-6 '#' at the start of a line followed by
    # whitespace is a header marker. Anchoring keeps tokens such as "C#" intact.
    text = re.sub(r'^[ \t]{0,3}#{1,6}[ \t]+', '', text, flags=re.MULTILINE)

    # Horizontal rules: a line made of three or more of the same -, * or _
    # (optionally space-separated). Removed together with its line break.
    text = re.sub(
        r'^[ \t]*([-*_])(?:[ \t]*\1){2,}[ \t]*(?:\n|\Z)',
        '',
        text,
        flags=re.MULTILINE,
    )

    # List markers (-, *, +). [ \t] is used instead of \s so the match cannot
    # span a newline and swallow the blank line that precedes a list.
    text = re.sub(r'^[ \t]*[-*+][ \t]+', '', text, flags=re.MULTILINE)

    # Bold (**text**, __text__) before italic (*text*, _text_) so the double
    # delimiters are consumed first. `.+?` requires content, so a bare "**"
    # (e.g. in "2**3") is left alone.
    text = re.sub(r'\*\*(.+?)\*\*', r'\1', text)
    # Underscore delimiters must not touch a letter/digit on the outside,
    # which preserves snake_case identifiers.
    text = re.sub(r'(?<![^\W_])__(.+?)__(?![^\W_])', r'\1', text)
    text = re.sub(r'\*(.+?)\*', r'\1', text)
    text = re.sub(r'(?<![^\W_])_(.+?)_(?![^\W_])', r'\1', text)

    # Inline code: drop the backticks, keep the code text.
    text = re.sub(r'`(.*?)`', r'\1', text)

    # Links: keep the anchor text of [text](url), drop the target.
    text = re.sub(r'\[([^\]]+)\]\([^\)]+\)', r'\1', text)

    # Collapse any run of blank / whitespace-only lines into one blank line.
    text = re.sub(r'\n\s*\n', '\n\n', text).strip()

    return text.lower()

# Normalize the raw resume Markdown into lowercase plain text; this cleaned
# string is the input handed to both keyword extractors below.
resume_cleaned = clean_markdown(resume)


In [13]:
# https://github.com/LIAAD/yake

import yake

def yake_extract_keywords(text, num_keywords=10):
    """Extract up to ``num_keywords`` keywords from ``text`` using YAKE.

    Args:
        text: Plain text to analyse.
        num_keywords: Maximum number of keywords to request from YAKE.
            Previously this argument was ignored and 200 was always used.

    Returns:
        A list of keyword strings (scores dropped), in the order YAKE
        returned them.
    """
    # n=2 limits candidates to at most two-word phrases.
    # NOTE(review): dedupLim=1 presumably disables near-duplicate filtering
    # (the recorded output keeps both 'data science' and 'data scientist');
    # confirm against the YAKE documentation.
    kw_extractor = yake.KeywordExtractor(lan="en", n=2, dedupLim=1, top=num_keywords)
    keywords = kw_extractor.extract_keywords(text)
    return [kw[0] for kw in keywords]  # Return only the keyword text


# Ask for 200 keywords explicitly so this cell keeps producing the same
# amount of output now that the function honors ``num_keywords``.
keywords = yake_extract_keywords(resume_cleaned, num_keywords=200)

print(len(keywords))
print(keywords)

200
['data-driven decisions', 'derive data-driven', 'data', 'science', 'information science', 'science student', 'enthusiastic information', 'data science', 'data scientist', 'hands-on experience', 'analytical skills', 'information', 'developed', 'boulder', 'risk', 'aug', 'chicago', 'projects', 'python', 'excel', 'scientist', 'equipped', 'decisions', 'data visualization', 'summer', 'credit', 'enthusiastic', 'student', 'hands-on', 'analytical', 'derive', 'data-driven', 'financial', 'experience', 'o’brien associates', 'science intern', 'openai api', 'innovative setting', 'skills', 'concept', 'technical', 'solve complex', 'complex meaningful', 'meaningful problems', 'api', 'financial information', 'power', 'o’brien', 'associates', 'harnessing data', 'team', 'work', 'visualization', 'openai', 'successfully', 'experience risk', 'led', 'intern', 'requests', 'emails', 'generating', 'automate', 'selenium', 'adoption', 'dashboard', 'rjo', 'reducing', 'cloudquant', 'datasets', 'tasks', 'analysis

In [6]:
# https://pypi.org/project/rake-nltk/

from rake_nltk import Rake

def nltk_rake(text):
    """Run rake-nltk's RAKE extractor over ``text``.

    Args:
        text: Plain text to analyse.

    Returns:
        The list returned by ``Rake.get_ranked_phrases()`` for ``text``.
    """
    extractor = Rake()  # default Rake configuration
    extractor.extract_keywords_from_text(text)
    return extractor.get_ranked_phrases()

# Extract RAKE phrases from the cleaned resume, then show how many were found
# followed by the phrases themselves (one value per output line).
nltk_keywords = nltk_rake(resume_cleaned)

print(len(nltk_keywords), nltk_keywords, sep="\n")

165
['developed dynamic web scrapers using selenium', 'largest independent futures brokerage firm', 'expected graduation may 2025 bachelor', 'manually sending emails ad hoc', 'streamlined customer financial information retrieval', 'successfully executed 4 major projects', 'efficiently managed numerous smaller tasks', '’ brien associates may 2024', 'generating customized anonymous test data', '’ brien associates may 2022', 'co enthusiastic information science student', 'solve complex meaningful problems', 'including training future maintainers', 'engineered automated risk reports', 'automate financial audit requests', 'successfully led key projects', 'comprehensive financial analysis dashboard', 'experience risk technology analyst', 'data visualization summer intern', 'scheduling personalized emails', 'automate data entry', 'significantly enhancing efficiency', 'power bi dashboard', 'philosophy relevant courses', 'p j g', 'info sci 1', 'identify pain points', 'https :// github', 'evalua

In [4]:
# NOTE(review): this cell sits below the RAKE cell but has a lower execution
# count (In [4] vs In [6]), i.e. it was run first interactively. On a fresh
# Restart & Run All it would execute only after nltk_rake has been called;
# presumably rake-nltk needs this tokenizer data, so consider moving the cell
# above the RAKE cell -- TODO confirm.
import nltk
# Download the 'punkt' tokenizer package into the local nltk_data directory.
nltk.download('punkt')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\jacks\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True