In [None]:
import pdfplumber
from docx import Document
import spacy
import re

In [65]:
nlp = spacy.load("en_core_web_sm")



In [66]:
pdf_file_path = "../data/mock_resume.pdf"
docx_file_path = "../data/mock_job_description.docx"

In [80]:
CORE_SKILLS = [
    "python", "sql", "docker", "aws", "azure", "gcp", "machine learning",
    "nlp", "tensorflow", "pytorch", "scikit learn", "pandas", "numpy",
    "javascript", "react", "nodejs", "git", "kubernetes", "rest api",
    "docker-compose", "linux", "bash", "spark", "data visualization",
    "tableau", "power bi", "hadoop", "etl", "ci/cd", "agile", "scrum",
    "jira", "confluence", "data analysis", "data science", "deep learning",
    "computer vision", "time series", "regression", "classification", "clustering",
    "natural language processing", "big data", "data engineering", "data mining", "sql server",
    "postgresql", "mysql", "mongodb", "redis", "elasticsearch", "html", "css", "typescript",
    "vuejs", "angular", "flask", "django", "fastapi", "graphql", "restful api", "oauth",
    "jwt", "microservices", "serverless", "lambda", "cloudformation", "terraform", "devops",
    "monitoring", "logging", "prometheus", "grafana", "kafka", "rabbitmq", "celery", "airflow",
    "spark", "hive", "pig", "impala", "presto", "snowflake", "redshift", "bigquery", "databricks",
    "mlflow", "dvc", "wandb", "neptune", "jupyter", "jupyterlab", "notebooks", "visual studio code", "pycharm", "intellij", "eclipse", "netbeans", "vscode", "nltk", "spacy", "gensim", "word2vec", "bert", "transformers", "opencv", "pillow", "seaborn", "matplotlib", "plotly", "dash", "bokeh", "streamlit", "flask", "django", "fastapi", "rest api", "graphql", "oauth", "jwt", "microservices", "serverless", "lambda", "cloudformation", "terraform", "devops", "model deployment", "monitoring", "logging", "r", "c", "c++", "java", "scala", "go", "ruby", "php", "swift", "kotlin"
]

In [68]:
def extract_text_from_pdf(file_path: str) -> str:
    with pdfplumber.open(file_path) as pdf:
        text = ""
        for page in pdf.pages:
            text += page.extract_text() + "\n"
    return text

In [69]:

resume_text = extract_text_from_pdf(pdf_file_path)
print(resume_text)

Jane Doe
Email: janedoe@email.com | Phone: +1 555-123-4567 | Location: Lagos, Nigeria
LinkedIn: linkedin.com/in/janedoe | GitHub: github.com/janedoe
PROFESSIONAL SUMMARY
Data-driven professional with strong analytical skills and hands-on experience in Python, data
visualization, and machine learning. Passionate about leveraging data insights to solve business
problems and improve decision-making.
SKILLS
- Programming: Python, SQL, R
- Data Tools: Pandas, NumPy, Scikit-learn, Matplotlib, Power BI
- Machine Learning: Regression, Classification, Clustering
- NLP Tools: spaCy, NLTK, Transformers
- Web: Flask, FastAPI, HTML/CSS
- Version Control: Git, GitHub
EXPERIENCE
Data Analyst Intern — TechNova Analytics (Jan 2024 – Jul 2024)
- Cleaned and analyzed datasets using Python (Pandas, NumPy).
- Developed data visualization dashboards in Power BI.
- Collaborated with engineers to deploy a model predicting customer churn.
Freelance Web Developer (2023 – Present)
- Built responsive websites wit

In [70]:
def extract_text_from_docx(file_path: str) -> str:
    doc = Document(file_path)
    paragraph = [para.text for para in doc.paragraphs if para.text and para.text.strip()]
    text = "\n".join(paragraph).strip()
    return text

In [71]:
job_description_text = extract_text_from_docx(docx_file_path)
print(job_description_text)

Job Title: Data Analyst / Junior Machine Learning Engineer
Company: InsightWorks Technologies
Location: Remote (Lagos, Nigeria preferred)
Employment Type: Full-Time
About the Role
We’re looking for a motivated Data Analyst with a passion for machine learning and data-driven problem-solving. The ideal candidate will analyze large datasets, build predictive models, and communicate insights that drive business decisions.
Key Responsibilities
- Collect, clean, and preprocess structured and unstructured datasets.
- Build and evaluate machine learning models using Python (scikit-learn, pandas, numpy).
- Perform exploratory data analysis and visualization with Matplotlib or Power BI.
- Collaborate with engineers to integrate ML models into production using FastAPI or Flask.
- Present insights to stakeholders through dashboards and reports.
Required Skills
- Strong proficiency in Python, SQL, and data manipulation libraries (Pandas, NumPy).
- Experience with machine learning algorithms (classi

In [72]:
def preprocess_text(text: str) -> str:
    doc = nlp(text)
    tokens = []

    for tok in doc:
        if tok.is_stop or tok.is_punct or tok.is_space:
            continue
        tok_lemma = tok.lemma_.lower()

        if len(tok_lemma) <= 1 and not tok_lemma.isalpha():
            continue

        tokens.append(tok_lemma)

    return " ".join(tokens)

In [73]:
resume_text_preprocessed = preprocess_text(resume_text)
job_description_text_preprocessed = preprocess_text(job_description_text)

In [90]:
def extract_skills_from_text(text: str, skills: list) -> set:
    text = text.lower()
    matched_skills = set()

    for skill in skills:
        pattern = r'\b' + re.escape(skill.lower()) + r'\b'
        if re.search(pattern, text):
            matched_skills.add(skill)
    return matched_skills

In [91]:
resume_text_skills = extract_skills_from_text(resume_text_preprocessed, CORE_SKILLS)
job_description_text_skills = extract_skills_from_text(job_description_text_preprocessed, CORE_SKILLS)

In [75]:
def compute_match_info(resume_skills: set, job_skills: set) -> dict:
    match_info = dict()
    matched_skills = resume_skills.intersection(job_skills)
    absent_skills = job_skills.difference(resume_skills)

    score = len(matched_skills) / len(job_skills) * 100

    match_info['matched_skills'] = list(matched_skills)
    match_info['absent_skills'] = list(absent_skills)
    match_info['score'] = round(score, 2)

    return match_info

In [92]:
match_info = compute_match_info(resume_text_skills, job_description_text_skills)
match_info

{'matched_skills': ['machine learning',
  'rest api',
  'spacy',
  'sql',
  'regression',
  'pandas',
  'python',
  'matplotlib',
  'power bi',
  'classification',
  'numpy',
  'fastapi',
  'nltk',
  'flask',
  'scikit learn',
  'nlp'],
 'absent_skills': ['azure', 'model deployment', 'gcp', 'seaborn', 'aws'],
 'score': 76.19}

In [83]:
job_description_text_skills

{'aws',
 'azure',
 'classification',
 'fastapi',
 'flask',
 'gcp',
 'matplotlib',
 'nlp',
 'nltk',
 'numpy',
 'pandas',
 'python',
 'regression',
 'seaborn',
 'spacy',
 'sql'}

In [84]:
job_description_text_preprocessed.split()

['job',
 'title',
 'data',
 'analyst',
 'junior',
 'machine',
 'learning',
 'engineer',
 'company',
 'insightworks',
 'technologies',
 'location',
 'remote',
 'lagos',
 'nigeria',
 'prefer',
 'employment',
 'type',
 'time',
 'role',
 'look',
 'motivated',
 'data',
 'analyst',
 'passion',
 'machine',
 'learning',
 'data',
 'drive',
 'problem',
 'solving',
 'ideal',
 'candidate',
 'analyze',
 'large',
 'dataset',
 'build',
 'predictive',
 'model',
 'communicate',
 'insight',
 'drive',
 'business',
 'decision',
 'key',
 'responsibility',
 'collect',
 'clean',
 'preprocess',
 'structure',
 'unstructured',
 'dataset',
 'build',
 'evaluate',
 'machine',
 'learning',
 'model',
 'python',
 'scikit',
 'learn',
 'panda',
 'numpy',
 'perform',
 'exploratory',
 'datum',
 'analysis',
 'visualization',
 'matplotlib',
 'power',
 'bi',
 'collaborate',
 'engineer',
 'integrate',
 'ml',
 'model',
 'production',
 'fastapi',
 'flask',
 'present',
 'insight',
 'stakeholder',
 'dashboard',
 'report',
 'requ