# Centralized Tools

## Centralized Functions - JSON Files

In [2]:
import json
import re
from typing import Dict
from pathlib import Path
import textwrap
from IPython.display import display, Markdown
from utils.generic_utils import read_from_json_file


def clean_json_string(json_string: str):
    """
    Remove unwanted control characters from the JSON string.
    """
    return re.sub(r"[\x00-\x1F\x7F]", "", json_string)


def load_and_clean_json_file(json_file: Path | str) -> Dict:
    """
    Load a JSON file and decode all Unicode escape sequences.

    This function reads a JSON file, ensures it is a valid JSON string,
    and removes unwanted control characters before parsing it.

    Args:
        json_file (Path | str): The path to the JSON file.

    Returns:
        dict: The cleaned JSON data.

    Raises:
        FileNotFoundError: If the file does not exist.
        ValueError: If the JSON data is invalid or cannot be parsed.

    Example Usage:
    --------------
    >>> json_data = load_and_clean_json_file("config.json")
    >>> print(json_data["api_key"])
    """
    json_data = read_from_json_file(json_file)  # This could be Dict or Any

    if isinstance(json_data, dict):
        json_string = json.dumps(
            json_data, ensure_ascii=False
        )  # Convert dict to string
    else:
        json_string = str(json_data)  # Convert single value to string

    cleaned_json_string = clean_json_string(json_string)  # Ensure it's cleaned
    return json.loads(cleaned_json_string)  # Convert back to a dictionary


def format_json_readable(json_obj, indent=2, wrap_width=80) -> str:
    """
    Formats JSON data with indentation and wraps long text for better readability.

    Args:
        json_obj (dict | list): JSON object or list to format.
        indent (int): Number of spaces to use for indentation. Default is 2.
        wrap_width (int): Maximum width of text before wrapping. Default is 80.

    Returns:
        str: The formatted and wrapped JSON string.

    Example Usage:
    --------------
    >>> json_data = {"name": "Alice", "role": "Senior Data Scientist", "skills": ["Python", "ML", "NLP"]}
    >>> formatted_json = format_json_readable(json_data)
    >>> print(formatted_json)
    """
    formatted_json = json.dumps(json_obj, indent=indent, ensure_ascii=False)

    # Wrap long lines for better readability
    formatted_json = "\n".join(
        [
            textwrap.fill(line, width=wrap_width) if len(line) > wrap_width else line
            for line in formatted_json.split("\n")
        ]
    )

    return formatted_json


def display_json_pretty(json_input: dict | str | Path, wrap_width: int = 120):
    """
    Displays JSON data in a human-readable format using Markdown.

    This function takes a JSON dictionary, a JSON string, or a file path,
    formats it, and displays it in a scrollable Markdown code block.

    Args:
        json_input (dict | str | Path): JSON data (dict), JSON string, or JSON file path.
        wrap_width (int): Maximum width of text before wrapping. Default is 120.

    Raises:
        ValueError: If the JSON string is invalid or cannot be parsed.

    Example Usage:
    --------------
    Display JSON from a dictionary:
    >>> json_data = {"name": "Alice", "role": "Senior Data Scientist"}
    >>> display_json_pretty(json_data)

    Display JSON from a file:
    >>> display_json_pretty("config.json")
    """
    if isinstance(json_input, dict):
        data = json_input  # Use dictionary directly
    elif isinstance(json_input, Path) or (
        isinstance(json_input, str) and not json_input.strip().startswith(("{", "["))
    ):
        data = load_and_clean_json_file(str(json_input))  # Load JSON from file
    else:
        data = json.loads(json_input)  # Parse JSON string

    # Format JSON for readability
    formatted_json = format_json_readable(data, wrap_width=wrap_width)

    # Display with Markdown to prevent horizontal scrolling
    display(Markdown(f"```json\n{formatted_json}\n```"))

## Functions - DataFrames

In [4]:
from pathlib import Path
from typing import Tuple
import xlwings as xw
import pandas as pd

import numpy as np

from docx import Document


def load_excel_sheet(file_path: str | Path, sheet_name: str) -> Tuple:
    """Load an Excel sheet using xlwings."""
    wb = xw.Book(file_path)
    sheet = wb.sheets[sheet_name]
    return wb, sheet


def get_underlined_cells(sheet):
    """Extract values from underlined cells while keeping the first two columns as is."""
    table_range = sheet.range("A1").expand("table")

    underlined_cells = []

    for i in range(1, table_range.rows.count + 1):
        underlined_row = []

        for j in range(1, table_range.columns.count + 1):
            cell = table_range.api.Cells(i, j)
            if j <= 2:  # Keep the first 2 columns as is
                underlined_row.append(cell.Value)
            elif cell.Font.Underline != -4142:  # -4142 represents no underline
                underlined_row.append(cell.Value)
            else:
                underlined_row.append(None)  # or "empty"

        underlined_cells.append(underlined_row)

    return pd.DataFrame(underlined_cells).set_index(0)


def clean_and_remove_rows(df):
    """Remove None values and reorganize the dataframe."""
    max_length = df.apply(lambda row: row.dropna().shape[0], axis=1).max()
    new_df = pd.DataFrame(index=df.index)

    for i in range(max_length):
        new_df[i] = df.apply(lambda row: get_ith_non_nan(row, i), axis=1)
    return new_df.iloc[2:]  # Remove the first two rows


def get_ith_non_nan(row, i):
    """Retrieve the ith non-NaN value from a row."""
    non_nan = row.dropna()

    return non_nan.iloc[i] if i < len(non_nan) else np.nan


def rename_dataframe_columns(df):
    """Rename index and columns dynamically based on the number of columns."""
    df.index.name = "responsibility_key"
    column_names = ["original_responsibility"] + [
        f"edited_responsibility_{i}" for i in range(1, df.shape[1])
    ]
    df.columns = column_names[: df.shape[1]]

    return df


def json_to_docx(json_data: dict, output_file: Path | str):
    doc = Document()

    if isinstance(json_data, str):
        data = json.loads(json_data)

    else:
        data = json_data

    for main_key, sub_dict in data.items():

        for key, value in sub_dict.items():
            doc.add_paragraph(f"{key}:")
            doc.add_paragraph(value)
            doc.add_paragraph("")  # Add blank line

    if isinstance(output_file, Path):
        doc.save(str(output_file))
    else:
        doc.save(output_file)

## Centralized File/Dir Path Imports

In [9]:
from project_config import (
    JOB_POSTING_URLS_FILE,
    JOB_POSTING_URLS_FILTERED_FILE,
    JOB_POSTING_URLS_TO_EXCLUDE_FILE,
    JOB_DESCRIPTIONS_JSON_FILE,
    JOB_REQUIREMENTS_JSON_FILE,
)

# Check Files

## Check Job URLs, Descriptions/Postings, & Requirements

### Job Posting URLs

#### Run Pipeline to Filter URLs File

In [5]:
from pipelines.filter_job_posting_urls_mini_pipeline import (
    run_filtering_job_posting_urls_mini_pipe_line as filter_urls,
)

filter_urls()

2025-05-04 09:15:11,763 - pipelines.filter_job_posting_urls_mini_pipeline - INFO - Loading main job postings from /home/xzhang/dev/job_bot/input_output/input/job_posting_urls.json
2025-05-04 09:15:11,766 - utils.generic_utils - INFO - Loaded data from /home/xzhang/dev/job_bot/input_output/input/job_posting_urls.json
2025-05-04 09:15:11,768 - pipelines.filter_job_posting_urls_mini_pipeline - INFO - Loading exclusion URLs from /home/xzhang/dev/job_bot/input_output/input/job_posting_urls_to_exclude.json
2025-05-04 09:15:11,771 - utils.generic_utils - INFO - Loaded data from /home/xzhang/dev/job_bot/input_output/input/job_posting_urls_to_exclude.json
2025-05-04 09:15:11,773 - pipelines.filter_job_posting_urls_mini_pipeline - INFO - Excluding 21 URLs from main job postings.
2025-05-04 09:15:11,776 - pipelines.filter_job_posting_urls_mini_pipeline - INFO - Filtered out 21 job postings; 19 remain.
2025-05-04 09:15:11,781 - utils.generic_utils - INFO - Data successfully saved to /home/xzhang/d

#### All URLs

In [10]:
from project_config import JOB_POSTING_URLS_FILE

json_input = JOB_POSTING_URLS_FILE

data = load_and_clean_json_file(json_file=json_input)
print(len(data))
companies = sorted(
    [job_data.get("company", "Unknown Company") for job_data in data.values()]
)
print(companies)
# for key, value in data.items():


display_json_pretty(data, wrap_width=100)

2025-05-04 09:43:13,694 - utils.generic_utils - INFO - Loaded data from /home/xzhang/dev/job_bot/input_output/input/job_posting_urls.json


40
['Adobe', 'Advisor360 Degrees', 'Airtable', 'Amazon', 'Amazon', 'Amazon', 'Amazon', 'Amplitude', 'Blend', 'Boston Scientific', 'Capital One', 'DEPT', 'Deloitte', 'Deloitte', 'Deloitte', 'Deloitte', 'DigitalOcean', 'Figma', 'Flextronics', 'Glean', 'Google', 'Johnson Controls', 'Liberty Mutual', 'Liberty Mutual', 'Liberty Mutual Insurance', 'Meta', 'Microsoft', 'MongoDB', 'Oracle', 'PwC', 'PwC', 'S&P Global', 'Salesforce.com', 'Snowflake', 'TRACE3', 'Takeda', 'ThermoFisher Scientific', 'Thomson Reuters', 'Veeva', 'Zendesk']


```json
{
  "https://www.google.com/about/careers/applications/jobs/results/113657145978692294-ai-market-
intelligence-principal/?src=Online/LinkedIn/linkedin_us&utm_source=linkedin&utm_medium=jobposting&ut
m_campaign=contract&utm_medium=jobboard&utm_source=linkedin": {
    "url": "https://www.google.com/about/careers/applications/jobs/results/113657145978692294-ai-
market-intelligence-principal/?src=Online/LinkedIn/linkedin_us&utm_source=linkedin&utm_medium=jobpos
ting&utm_campaign=contract&utm_medium=jobboard&utm_source=linkedin",
    "company": "Google",
    "job_title": "AI Market Intelligence Principal"
  },
  "https://www.capitalonecareers.com/job/-/-
/234/66270465536?p_sid=ep3Sfxb&p_uid=sDBMWC5VxQ&source=rd_linkedin_job_posting_tm&ss=paid&utm_campai
gn=capone_all_jobs_24&utm_content=pj_board&utm_medium=jobad&utm_source=linkedin+slotted&dclid=CPGV3b
ef44gDFUEGTwgd4DoHPg": {
    "url": "https://www.capitalonecareers.com/job/-/-
/234/66270465536?p_sid=ep3Sfxb&p_uid=sDBMWC5VxQ&source=rd_linkedin_job_posting_tm&ss=paid&utm_campai
gn=capone_all_jobs_24&utm_content=pj_board&utm_medium=jobad&utm_source=linkedin+slotted&dclid=CPGV3b
ef44gDFUEGTwgd4DoHPg",
    "company": "Capital One",
    "job_title": "Director, AI Platforms"
  },
  "https://jobs.careers.microsoft.com/us/en/job/1771714/Head-of-Partner-Intelligence-and-
Strategy?jobsource=linkedin": {
    "url": "https://jobs.careers.microsoft.com/us/en/job/1771714/Head-of-Partner-Intelligence-and-
Strategy?jobsource=linkedin",
    "company": "Microsoft",
    "job_title": "Head of Partner Intelligence and Strategy"
  },
  "https://searchjobs.libertymutualgroup.com/careers/job/618499888480?microsite=libertymutual.com&do
main=libertymutual.com&utm_source=Job+Board&utm_campaign=LinkedIn+Jobs&extcmp=bof-paid-text-lkin-
aljb": {
    "url": "https://searchjobs.libertymutualgroup.com/careers/job/618499888480?microsite=libertymutu
al.com&domain=libertymutual.com&utm_source=Job+Board&utm_campaign=LinkedIn+Jobs&extcmp=bof-paid-
text-lkin-aljb",
    "company": "Liberty Mutual",
    "job_title": "Senior Manager I - Corporate Strategy & Research (Summer 2025)"
  },
  "https://boards.greenhouse.io/embed/job_app?token=7600823002&gh_src=ab9f35b82": {
    "url": "https://boards.greenhouse.io/embed/job_app?token=7600823002&gh_src=ab9f35b82",
    "company": "Amplitude",
    "job_title": "Marketing Strategy & Analytics Manager"
  },
  "https://www.amazon.jobs/en/jobs/2696123/research-manager-strategy-and-insights-gca-marketing?cmpi
d=SPLICX0248M&utm_source=linkedin.com&utm_campaign=cxro&utm_medium=social_media&utm_content=job_post
ing&ss=paid": {
    "url": "https://www.amazon.jobs/en/jobs/2696123/research-manager-strategy-and-insights-gca-marke
ting?cmpid=SPLICX0248M&utm_source=linkedin.com&utm_campaign=cxro&utm_medium=social_media&utm_content
=job_posting&ss=paid",
    "company": "Amazon",
    "job_title": "Research Manager, Strategy and Insights, GCA Marketing"
  },
  "https://www.amazon.jobs/en/jobs/2742527/sr-generative-ai-strategist-generative-ai-innovation-cent
er?cmpid=SPLICX0248M&utm_source=linkedin.com&utm_campaign=cxro&utm_medium=social_media&utm_content=j
ob_posting&ss=paid": {
    "url": "https://www.amazon.jobs/en/jobs/2742527/sr-generative-ai-strategist-generative-ai-
innovation-center?cmpid=SPLICX0248M&utm_source=linkedin.com&utm_campaign=cxro&utm_medium=social_medi
a&utm_content=job_posting&ss=paid",
    "company": "Amazon",
    "job_title": "Marketing Strategy & Analytics Manager"
  },
  "https://www.metacareers.com/jobs/522232286825036/?rx_campaign=Linkedin1&rx_ch=connector&rx_group=
126320&rx_job=a1KDp00000E28eGMAR&rx_medium=post&rx_r=none&rx_source=Linkedin&rx_ts=20240927T121201Z&
rx_vp=slots&utm_campaign=Job%2Bboard&utm_medium=jobs&utm_source=LIpaid&rx_viewer=e3efacca649311ef917
d17a1705b89ba0dc4e1e7a57f4231bbce94a604c83931": {
    "url": "https://www.metacareers.com/jobs/522232286825036/?rx_campaign=Linkedin1&rx_ch=connector&
rx_group=126320&rx_job=a1KDp00000E28eGMAR&rx_medium=post&rx_r=none&rx_source=Linkedin&rx_ts=20240927
T121201Z&rx_vp=slots&utm_campaign=Job%2Bboard&utm_medium=jobs&utm_source=LIpaid&rx_viewer=e3efacca64
9311ef917d17a1705b89ba0dc4e1e7a57f4231bbce94a604c83931",
    "company": "Meta",
    "job_title": "Marketing Strategy & Analytics Manager"
  },
  "https://www.amazon.jobs/en/jobs/2684745/product-manager-artificial-general-intelligence-data-serv
ices?cmpid=SPLICX0248M&ss=paid&utm_campaign=cxro&utm_content=job_posting&utm_medium=social_media&utm
_source=linkedin.com": {
    "url": "https://www.amazon.jobs/en/jobs/2684745/product-manager-artificial-general-intelligence-
data-services?cmpid=SPLICX0248M&ss=paid&utm_campaign=cxro&utm_content=job_posting&utm_medium=social_
media&utm_source=linkedin.com",
    "company": "Amazon",
    "job_title": "Product Manager, Artificial General Intelligence - Data Services"
  },
  "https://www.mongodb.com/careers/jobs/6466537": {
    "url": "https://www.mongodb.com/careers/jobs/6466537",
    "company": "MongoDB",
    "job_title": "Director, Competitive Intelligence"
  },
  "https://searchjobs.libertymutualgroup.com/careers/job/618501232921?microsite=libertymutual.com&do
main=libertymutual.com&utm_source=Job+Board&utm_campaign=LinkedIn+Jobs&extcmp=bof-paid-text-lkin-
aljb": {
    "url": "https://searchjobs.libertymutualgroup.com/careers/job/618501232921?microsite=libertymutu
al.com&domain=libertymutual.com&utm_source=Job+Board&utm_campaign=LinkedIn+Jobs&extcmp=bof-paid-
text-lkin-aljb",
    "company": "Liberty Mutual",
    "job_title": "Senior Manager II, Corporate Strategy & Research"
  },
  "https://careers.adobe.com/us/en/job/ADOBUSR151695EXTERNALENUS/Sr-Director-Applied-AI-ML-
Discovery?utm_source=linkedin&utm_medium=phenom-feeds&source=LinkedIn": {
    "url": "https://careers.adobe.com/us/en/job/ADOBUSR151695EXTERNALENUS/Sr-Director-Applied-AI-ML-
Discovery?utm_source=linkedin&utm_medium=phenom-feeds&source=LinkedIn",
    "company": "Adobe",
    "job_title": "Sr. Director, Applied AI/ML (Discovery)"
  },
  "https://flextronics.wd1.myworkdayjobs.com/en-US/Careers/job/Sr-Manager-AI-
Strategy_WD191060?source=LinkedIn_Slots": {
    "url": "https://flextronics.wd1.myworkdayjobs.com/en-US/Careers/job/Sr-Manager-AI-
Strategy_WD191060?source=LinkedIn_Slots",
    "company": "Flextronics",
    "job_title": "Sr. Manager AI Strategy"
  },
  "https://job-boards.greenhouse.io/trace3/jobs/6163213?gh_src=b81c67b41us": {
    "url": "https://job-boards.greenhouse.io/trace3/jobs/6163213?gh_src=b81c67b41us",
    "company": "TRACE3",
    "job_title": "Senior Consultant, AI Strategy (Remote)"
  },
  "https://boards.greenhouse.io/gleanwork/jobs/4425502005?source=LinkedIn": {
    "url": "https://boards.greenhouse.io/gleanwork/jobs/4425502005?source=LinkedIn",
    "company": "Glean",
    "job_title": "Head of Competitive Intelligence"
  },
  "https://job-boards.greenhouse.io/airtable/jobs/7603873002?gh_src=aef790d02us": {
    "url": "https://job-boards.greenhouse.io/airtable/jobs/7603873002?gh_src=aef790d02us",
    "company": "Airtable",
    "job_title": "Product Manager, AI"
  },
  "https://careers.veeva.com/job/365ff44c-8e0a-42b4-a117-27b409a77753/director-crossix-analytics-
services-boston-ma/?lever-source=Linkedin": {
    "url": "https://careers.veeva.com/job/365ff44c-8e0a-42b4-a117-27b409a77753/director-crossix-
analytics-services-boston-ma/?lever-source=Linkedin",
    "company": "Veeva",
    "job_title": "Director - Crossix Analytics Services"
  },
  "https://jobs.thermofisher.com/global/en/job/R-01298008/Market-Competitive-Intelligence-Manager?rx
_ch=jobpost&rx_job=R-01298008-
1&rx_medium=post&rx_paid=0&rx_r=none&rx_source=linkedin&rx_ts=20250206T184002Z&rx_vp=linkedindirecti
ndex&utm_medium=post&utm_source=recruitics_linkedindirectindex&refId=34jd24&rx_viewer=e3efacca649311
ef917d17a1705b89ba0dc4e1e7a57f4231bbce94a604c83931": {
    "url": "https://jobs.thermofisher.com/global/en/job/R-01298008/Market-Competitive-Intelligence-M
anager?rx_ch=jobpost&rx_job=R-01298008-
1&rx_medium=post&rx_paid=0&rx_r=none&rx_source=linkedin&rx_ts=20250206T184002Z&rx_vp=linkedindirecti
ndex&utm_medium=post&utm_source=recruitics_linkedindirectindex&refId=34jd24&rx_viewer=e3efacca649311
ef917d17a1705b89ba0dc4e1e7a57f4231bbce94a604c83931",
    "company": "ThermoFisher Scientific",
    "job_title": "Market & Competitive Intelligence Manager"
  },
  "https://www.digitalocean.com/careers/position/apply?gh_jid=6437995&gh_src=312a08e31us": {
    "url": "https://www.digitalocean.com/careers/position/apply?gh_jid=6437995&gh_src=312a08e31us",
    "company": "DigitalOcean",
    "job_title": "Director, Product Management (AI/ML)"
  },
  "https://job-
boards.greenhouse.io/figma/jobs/5336109004?gh_jid=5336109004&gh_src=28109e334us&source=LinkedIn": {
    "url": "https://job-
boards.greenhouse.io/figma/jobs/5336109004?gh_jid=5336109004&gh_src=28109e334us&source=LinkedIn",
    "company": "Figma",
    "job_title": "Researcher, Strategic Growth"
  },
  "https://boards.greenhouse.io/dept/jobs/6564521": {
    "url": "https://boards.greenhouse.io/dept/jobs/6564521",
    "company": "DEPT",
    "job_title": "Director of Applied AI Strategy, Media"
  },
  "https://jobs.us.pwc.com/job/-/-
/932/76064801072?utm_source=linkedin.com&utm_campaign=core_media&utm_medium=social_media&utm_content
=job_posting&ss=paid&dclid=CjgKEAiAwaG9BhCY3ayl47PW8lcSJAA_gCfjt-
rzWhQetHLIbJdJBVocWQm2BRNcBgOARxhGyR9bgvD_BwE": {
    "url": "https://jobs.us.pwc.com/job/-/-
/932/76064801072?utm_source=linkedin.com&utm_campaign=core_media&utm_medium=social_media&utm_content
=job_posting&ss=paid&dclid=CjgKEAiAwaG9BhCY3ayl47PW8lcSJAA_gCfjt-
rzWhQetHLIbJdJBVocWQm2BRNcBgOARxhGyR9bgvD_BwE",
    "company": "PwC",
    "job_title": "Strategy& Manager - Digital Value Transformation Contact Center"
  },
  "https://apply.deloitte.com/careers/InviteToApply?jobId=210031&source=LinkedIn": {
    "url": "https://apply.deloitte.com/careers/InviteToApply?jobId=210031&source=LinkedIn",
    "company": "Deloitte",
    "job_title": "Market Research Sr Manager, Boston"
  },
  "https://apply.deloitte.com/careers/InviteToApply?jobId=199586&source=LinkedIn": {
    "url": "https://apply.deloitte.com/careers/InviteToApply?jobId=199586&source=LinkedIn",
    "company": "Deloitte",
    "job_title": "AI Data Specialist, Boston"
  },
  "https://apply.deloitte.com/careers/InviteToApply?jobId=201718&source=LinkedIn": {
    "url": "https://apply.deloitte.com/careers/InviteToApply?jobId=201718&source=LinkedIn",
    "company": "Deloitte",
    "job_title": "Global Business Services (GBS) Strategy Manager, Boston"
  },
  "https://www.amazon.jobs/en/jobs/2905092/senior-manger-partner-strategy-genai-innovation-center?cm
pid=SPLICX0248M&utm_source=linkedin.com&utm_campaign=cxro&utm_medium=social_media&utm_content=job_po
sting&ss=paid": {
    "url": "https://www.amazon.jobs/en/jobs/2905092/senior-manger-partner-strategy-genai-innovation-
center?cmpid=SPLICX0248M&utm_source=linkedin.com&utm_campaign=cxro&utm_medium=social_media&utm_conte
nt=job_posting&ss=paid",
    "company": "Amazon",
    "job_title": "Senior Manager, Partner Strategy - GenAI Innovation Center"
  },
  "https://jobs.smartrecruiters.com/Blend360/744000042638791-director-ai-
strategy?trid=2d92f286-613b-4daf-9dfa-6340ffbecf73": {
    "url": "https://jobs.smartrecruiters.com/Blend360/744000042638791-director-ai-
strategy?trid=2d92f286-613b-4daf-9dfa-6340ffbecf73",
    "company": "Blend",
    "job_title": "Director, AI Strategy"
  },
  "https://careers.snowflake.com/us/en/job/SNCOUS5AF10A9C7A01464788ABD17AECBEE52EEXTERNALENUS1CC71A0
0229E4662B768527743E6164F/Director-Product-Marketing-
Analytics?utm_source=Q2P9NP2NNP&utm_medium=phenom-feeds&gh_src=ed5543a62": {
    "url": "https://careers.snowflake.com/us/en/job/SNCOUS5AF10A9C7A01464788ABD17AECBEE52EEXTERNALEN
US1CC71A00229E4662B768527743E6164F/Director-Product-Marketing-
Analytics?utm_source=Q2P9NP2NNP&utm_medium=phenom-feeds&gh_src=ed5543a62",
    "company": "Snowflake",
    "job_title": "Director, Product Marketing - Analytics"
  },
  "https://advisor360.breezy.hr/p/2e1636328c7d-senior-product-manager-ai-analytics-insights": {
    "url": "https://advisor360.breezy.hr/p/2e1636328c7d-senior-product-manager-ai-analytics-
insights",
    "company": "Advisor360 Degrees",
    "job_title": "Senior Product Manager - AI & Analytics Insights"
  },
  "https://eeho.fa.us2.oraclecloud.com/hcmUI/CandidateExperience/en/sites/jobsearch/job/276956?utm_m
edium=jobboard&utm_source=LinkedIn": {
    "url": "https://eeho.fa.us2.oraclecloud.com/hcmUI/CandidateExperience/en/sites/jobsearch/job/276
956?utm_medium=jobboard&utm_source=LinkedIn",
    "company": "Oracle",
    "job_title": "Senior AI Product Marketing Manager"
  },
  "https://searchjobs.libertymutualgroup.com/careers/job/618502472421?microsite=libertymutual.com&do
main=libertymutual.com&utm_source=Job+Board&utm_campaign=LinkedIn+Jobs&extcmp=bof-paid-text-lkin-
aljb": {
    "url": "https://searchjobs.libertymutualgroup.com/careers/job/618502472421?microsite=libertymutu
al.com&domain=libertymutual.com&utm_source=Job+Board&utm_campaign=LinkedIn+Jobs&extcmp=bof-paid-
text-lkin-aljb",
    "company": "Liberty Mutual Insurance",
    "job_title": "Senior Manager II, Corporate Strategy & Research"
  },
  "https://apply.deloitte.com/careers/InviteToApply?jobId=212481&source=LinkedIn": {
    "url": "https://apply.deloitte.com/careers/InviteToApply?jobId=212481&source=LinkedIn",
    "company": "Deloitte",
    "job_title": "Manager, Strategic Analytics - Strategy"
  },
  "https://jobs.us.pwc.com/job/-/-
/932/76741173104?utm_source=linkedin.com&utm_campaign=core_media&utm_medium=social_media&utm_content
=job_posting&ss=paid&dclid=CjgKEAjwy46_BhD8-
aeS9rzGtzsSJAAuE6pojXgWgT7LeiCns3H71Hqcb3dqchcqskpnFxz8njxwwPD_BwE": {
    "url": "https://jobs.us.pwc.com/job/-/-
/932/76741173104?utm_source=linkedin.com&utm_campaign=core_media&utm_medium=social_media&utm_content
=job_posting&ss=paid&dclid=CjgKEAjwy46_BhD8-
aeS9rzGtzsSJAAuE6pojXgWgT7LeiCns3H71Hqcb3dqchcqskpnFxz8njxwwPD_BwE",
    "company": "PwC",
    "job_title": "Data & Analytics- Senior Manager"
  },
  "https://zendesk.wd1.myworkdayjobs.com/en-US/zendesk/job/San-Francisco-California-United-States-
of-America/Competitive-Intelligence-Manager_R30346?source=LinkedIn": {
    "url": "https://zendesk.wd1.myworkdayjobs.com/en-US/zendesk/job/San-Francisco-California-United-
States-of-America/Competitive-Intelligence-Manager_R30346?source=LinkedIn",
    "company": "Zendesk",
    "job_title": "Competitive Intelligence Manager"
  },
  "https://salesforce.wd12.myworkdayjobs.com/External_Career_Site/job/California---San-
Francisco/Vice-President--Product-Research---Insights_JR279859?source=LinkedIn_Jobs": {
    "url": "https://salesforce.wd12.myworkdayjobs.com/External_Career_Site/job/California---San-
Francisco/Vice-President--Product-Research---Insights_JR279859?source=LinkedIn_Jobs",
    "company": "Salesforce.com",
    "job_title": "Vice President, Product Research & Insights"
  },
  "https://careers.thomsonreuters.com/us/en/job/THTTRUUSJREQ188456EXTERNALENUS/Director-of-AI-
Content-Innovation?utm_source=linkedin&utm_medium=phenom-feeds": {
    "url": "https://careers.thomsonreuters.com/us/en/job/THTTRUUSJREQ188456EXTERNALENUS/Director-of-
AI-Content-Innovation?utm_source=linkedin&utm_medium=phenom-feeds",
    "company": "Thomson Reuters",
    "job_title": "Director of AI Content Innovation"
  },
  "https://careers.spglobal.com/jobs/310832?lang=en-us&utm_source=linkedin": {
    "url": "https://careers.spglobal.com/jobs/310832?lang=en-us&utm_source=linkedin",
    "company": "S&P Global",
    "job_title": "Director of Data Science – RAG, NLP, LLM and GenAI (Hybrid or Virtual)"
  },
  "https://bostonscientific.eightfold.ai/careers/job/563602800464180?domain=bostonscientific.com": {
    "url":
"https://bostonscientific.eightfold.ai/careers/job/563602800464180?domain=bostonscientific.com",
    "company": "Boston Scientific",
    "job_title": "Sr. Manager Marketing Analytics Strategy"
  },
  "https://jobs.takeda.com/job/-/-/1113/79540724304": {
    "url": "https://jobs.takeda.com/job/-/-/1113/79540724304",
    "company": "Takeda",
    "job_title": "Senior Manager, Market Research, RDBU Commercial Analytics and Insights"
  },
  "https://jobs.johnsoncontrols.com/job/WD30231669": {
    "url": "https://jobs.johnsoncontrols.com/job/WD30231669",
    "company": "Johnson Controls",
    "job_title": "Manager, Market Size & Growth Model"
  }
}
```

#### Filtered URLs

In [5]:
json_input = JOB_POSTING_URLS_FILTERED_FILE

data = load_and_clean_json_file(json_file=json_input)
print(len(data))
companies = sorted(
    [job_data.get("company", "Unknown Company") for job_data in data.values()]
)
display(companies)
# for key, value in data.items():

for url, value in data.items():
    print(url)
    print(value.get("company"), value.get("job_title"))
    print()

# display_json_pretty(json_file, wrap_width=100)

2025-04-01 12:02:22,697 - utils.generic_utils - INFO - Loaded data from C:\github\job_bot\input_output\input\job_posting_urls_filtered.json


17


['Advisor360 Degrees',
 'Airtable',
 'Amazon',
 'Blend',
 'Boston Scientific',
 'Deloitte',
 'Deloitte',
 'Deloitte',
 'Glean',
 'Liberty Mutual',
 'PwC',
 'S&P Global',
 'Salesforce.com',
 'Snowflake',
 'Thomson Reuters',
 'Veeva',
 'Zendesk']

https://searchjobs.libertymutualgroup.com/careers/job/618501232921?microsite=libertymutual.com&domain=libertymutual.com&utm_source=Job+Board&utm_campaign=LinkedIn+Jobs&extcmp=bof-paid-text-lkin-aljb
Liberty Mutual Senior Manager II, Corporate Strategy & Research

https://boards.greenhouse.io/gleanwork/jobs/4425502005?source=LinkedIn
Glean Head of Competitive Intelligence

https://job-boards.greenhouse.io/airtable/jobs/7603873002?gh_src=aef790d02us
Airtable Product Manager, AI

https://careers.veeva.com/job/365ff44c-8e0a-42b4-a117-27b409a77753/director-crossix-analytics-services-boston-ma/?lever-source=Linkedin
Veeva Director - Crossix Analytics Services

https://apply.deloitte.com/careers/InviteToApply?jobId=210031&source=LinkedIn
Deloitte Market Research Sr Manager, Boston

https://apply.deloitte.com/careers/InviteToApply?jobId=199586&source=LinkedIn
Deloitte AI Data Specialist, Boston

https://apply.deloitte.com/careers/InviteToApply?jobId=201718&source=LinkedIn
Deloitte Global Busin

### Job Postings/Descriptions

#### List of All Records

In [None]:
json_input = JOB_DESCRIPTIONS_JSON_FILE

data = load_and_clean_json_file(json_file=json_input)

display_json_pretty(data)

In [47]:
json_input = JOB_DESCRIPTIONS_JSON_FILE

data = load_and_clean_json_file(json_file=json_input)

print(f"Total no. of records: {len(data)}")

for key in data.keys():
    print(key)

2025-04-01 14:11:59,597 - utils.generic_utils - INFO - Loaded data from C:\github\job_bot\input_output\preprocessing\jobpostings.json


Total no. of records: 35
https://www.google.com/about/careers/applications/jobs/results/113657145978692294-ai-market-intelligence-principal/?src=Online/LinkedIn/linkedin_us&utm_source=linkedin&utm_medium=jobposting&utm_campaign=contract&utm_medium=jobboard&utm_source=linkedin
https://www.capitalonecareers.com/job/-/-/234/66270465536?p_sid=ep3Sfxb&p_uid=sDBMWC5VxQ&source=rd_linkedin_job_posting_tm&ss=paid&utm_campaign=capone_all_jobs_24&utm_content=pj_board&utm_medium=jobad&utm_source=linkedin+slotted&dclid=CPGV3bef44gDFUEGTwgd4DoHPg
https://boards.greenhouse.io/embed/job_app?token=7600823002&gh_src=ab9f35b82
https://www.amazon.jobs/en/jobs/2696123/research-manager-strategy-and-insights-gca-marketing?cmpid=SPLICX0248M&utm_source=linkedin.com&utm_campaign=cxro&utm_medium=social_media&utm_content=job_posting&ss=paid
https://www.amazon.jobs/en/jobs/2742527/sr-generative-ai-strategist-generative-ai-innovation-center?cmpid=SPLICX0248M&utm_source=linkedin.com&utm_campaign=cxro&utm_medium=soci

In [44]:
json_input = JOB_DESCRIPTIONS_JSON_FILE

data = load_and_clean_json_file(json_file=json_input)

print(f"Total no. of records: {len(data)}")

for key, value in data.items():
    # Skip records with no data or older format
    if value is None:
        continue
    data_section = value.get("data")
    if data_section is None:
        continue
    company = data_section.get("company", "")
    job_title = data_section.get("job_title", "")
    print(key)
    print(f"{company}, {job_title}\n")

2025-04-01 13:40:55,833 - utils.generic_utils - INFO - Loaded data from C:\github\job_bot\input_output\preprocessing\jobpostings.json


Total no. of records: 36
https://careers.adobe.com/us/en/job/ADOBUSR151695EXTERNALENUS/Sr-Director-Applied-AI-ML-Discovery?utm_source=linkedin&utm_medium=phenom-feeds&source=LinkedIn
Adobe, Sr. Director, Applied AI/ML (Discovery)

https://www.digitalocean.com/careers/position/apply?gh_jid=6437995&gh_src=312a08e31us
DigitalOcean, Director, Product Management (AI/ML)

https://boards.greenhouse.io/gleanwork/jobs/4425502005?source=LinkedIn
Glean, Head of Competitive Intelligence

https://job-boards.greenhouse.io/airtable/jobs/7603873002?gh_src=aef790d02us
Airtable, Product Manager, AI

https://jobs.thermofisher.com/global/en/job/R-01298008/Market-Competitive-Intelligence-Manager?rx_ch=jobpost&rx_job=R-01298008-1&rx_medium=post&rx_paid=0&rx_r=none&rx_source=linkedin&rx_ts=20250206T184002Z&rx_vp=linkedindirectindex&utm_medium=post&utm_source=recruitics_linkedindirectindex&refId=34jd24&rx_viewer=e3efacca649311ef917d17a1705b89ba0dc4e1e7a57f4231bbce94a604c83931
Thermo Fisher Scientific, Market 

#### Check Individual Records

In [17]:
import json
from project_config import JOB_POSTING_URLS_FILTERED_FILE, JOB_DESCRIPTIONS_JSON_FILE

with open(JOB_DESCRIPTIONS_JSON_FILE) as f:
    data = json.load(f)

url = next(key for key in data.keys() if "thomson" in key)
posting = data[url]

print(f"thomson url in job posting file: {url}")
display_json_pretty(posting)


print()

thomson url in job posting file: https://careers.thomsonreuters.com/us/en/job/THTTRUUSJREQ188456EXTERNALENUS/Director-of-AI-Content-Innovation?utm_source=linkedin&utm_medium=phenom-feeds


```json
{
  "https://careers.thomsonreuters.com/us/en/job/THTTRUUSJREQ188456EXTERNALENUS/Director-of-AI-Content-
Innovation?utm_source=linkedin&utm_medium=phenom-feeds": {
    "status": "success",
    "message": "Job site data processed successfully.",
    "data": {
      "url": "https://careers.thomsonreuters.com/us/en/job/THTTRUUSJREQ188456EXTERNALENUS/Director-of-AI-Content-
Innovation?utm_source=linkedin&utm_medium=phenom-feeds",
      "job_title": "Director of AI Content Innovation",
      "company": "Thomson Reuters",
      "location": "New York City, San Francisco, Los Angeles, and/or Irvine, CA; McLean, VA; Washington, DC",
      "salary_info": "The base compensation range for the role in any of those locations is $177,380 - $329,420. For any
eligible US locations, unless otherwise noted, the base compensation range for this role is $154,000 - $286,000.",
      "posted_date": null,
      "content": {
        "About the Role": "Serve as a thought leader, representing our organization both internally and externally.
Analyze and interpret large datasets to identify opportunities for content innovation, with a focus on exploring new
ways to apply AI and machine learning to our content assets. Collaborate with cross-functional teams, including legal,
editorial, and technology teams, to assess the feasibility of AI-driven content ideas. Lead experimentation on machine
learning models to optimize content usage and engagement. Ensure all AI initiatives comply with relevant legal standards
and ethical considerations. Communicate complex AI concepts clearly and effectively to stakeholders at all levels. Stay
abreast of advancements in AI tools and technologies to continuously enhance our content strategies. Foster a culture of
continuous learning and innovation within the team, encouraging experimentation, innovation, and creativity in the
application of AI to our content assets.",
        "About You": "Advanced knowledge of programming languages such as Python, with experience applying these skills.
Strong data analysis skills, with the ability to interpret and leverage large datasets. Proficiency in machine learning
algorithms and platforms, with experience applying these technologies to content-focused challenges. In-depth knowledge
of the practice of law/Juris Doctor strongly preferred . Excellent communication skills, capable of conveying complex
ideas succinctly and effectively. Demonstrated ability to lead and inspire teams to achieve strategic objectives. A
commitment to continuous learning and staying updated with the latest advancements in AI.",
        "Preferred Qualifications": "Familiarity with editorial processes and content management systems, with
experience applying AI and machine learning to content-focused challenges. Advanced degree in law and computer science,
or a related field, with a focus on the intersection of law, technology, and content strategy.",
        "description": "We are seeking a Director of AI Content Innovation to lead our efforts in reimagining the use of
content at the intersection of law, technology, and editorial excellence. This role requires a unique blend of
programming, data analysis, machine learning, and legal expertise. The successful candidate will play a critical role in
shaping our editorial AI strategy, be a thought leader both internally and externally, driving innovation, and exploring
the art of the possible as we transform our content assets to meet the evolving needs of our customers. As a key member
of our team, you will leverage your expertise in law, technology, and content strategy to identify opportunities for
content and data innovation, optimize content usage, and ensure compliance with relevant legal standards and ethical
considerations. Your role will involve the ability to think creatively, analyze complex data, and iterate quickly on
concepts. Your ability to communicate effectively with stakeholders at all levels will be essential in driving our AI-
driven content initiatives forward.",
        "additional_info": "What's in it For You?\nJoin us to inform the way forward with the latest AI solutions and
address real-world challenges in legal, tax, compliance, and news. Backed by our commitment to continuous learning and
market-leading benefits, you'll be prepared to grow, lead, and thrive in an AI-enabled future. This
includes:\n\nIndustry-Leading Benefits:\nWe offer comprehensive benefit plans to include flexible vacation, two company-
wide Mental Health Days off, access to the Headspace app, retirement savings, tuition reimbursement, employee incentive
programs, and resources for mental, physical, and financial wellbeing.\n\nFlexibility & Work-Life Balance:\nFlex My Way
is a set of supportive workplace policies designed to help manage personal and professional responsibilities, whether
caring for family, giving back to the community, or finding time to refresh and reset. This builds upon our flexible
work arrangements, including work from anywhere for up to 8 weeks per year, and hybrid model, empowering employees to
achieve a better work-life balance.\n\nCareer Development and Growth:\nBy fostering a culture of continuous learning and
skill development, we prepare our talent to tackle tomorrow's challenges and deliver real-world solutions. Our skills-
first approach ensures you have the tools and knowledge to grow, lead, and thrive in an AI-enabled
future.\n\nCulture:\nGlobally recognized and award-winning reputation for inclusion, innovation, and customer-focus. Our
eleven business resource groups nurture our culture of belonging across the diverse backgrounds and experiences
represented across our global footprint.\n\nHybrid Work Model:\nWe've adopted a flexible hybrid working environment (2-3
days a week in the office depending on the role) for our office-based roles while delivering a seamless experience that
is digitally and physically connected.\n\nSocial Impact:\nMake an impact in your community with our Social Impact
Institute. We offer employees two paid volunteer days off annually and opportunities to get involved with pro-bono
consulting projects and Environmental, Social, and Governance (ESG) initiatives."
      }
    }
  }
}
```




In [22]:
import json
from project_config import JOB_POSTING_URLS_FILTERED_FILE, JOB_DESCRIPTIONS_JSON_FILE

with open(JOB_DESCRIPTIONS_JSON_FILE) as f:
    data = json.load(f)

url = next(key for key in data.keys() if "spglobal" in key.lower())
posting = data[url]

print(f"spglobal url in job posting file: {url}")
display_json_pretty(posting)


print()

spglobal url in job posting file: https://careers.spglobal.com/jobs/310832?lang=en-us&utm_source=linkedin


```json
{
  "https://careers.spglobal.com/jobs/310832?lang=en-us&utm_source=linkedin": {
    "status": "success",
    "message": "Job site data processed successfully.",
    "data": {
      "url": "https://careers.spglobal.com/jobs/310832?lang=en-us&utm_source=linkedin",
      "job_title": "Director of Data Science – RAG, NLP, LLM and GenAI (Hybrid or Virtual)",
      "company": "S&P Global Ratings",
      "location": "New York, New York; Virtual, Canada; Toronto, Canada; Princeton, New Jersey; Virtual, Arkansas;
Virtual, Colorado; Virtual, Florida; Virtual, Georgia; Virtual, North Carolina; Virtual, New Jersey; Virtual, New York;
Virtual, South Carolina; Virtual, Texas; Virtual, Virginia",
      "salary_info": "Anticipated base salary range: $180,000 - $225,000",
      "posted_date": "2025-02-24",
      "content": {
        "Job Description": "About The Role:\nGrade Level (for internal use): 13\nThe Role: Director of Data Science –
NLP, LLM and GenAI\nThe Team: S&P is a leader in risk management solutions leveraging automation and AI/ML. This role is
a unique opportunity for an experienced and hands-on NLP/Gen AI/ LLM AI scientist to grow into the next step in their
career journey and apply her or his domain expertise in NLP, deep learning, GenAI, and LLMs to drive business value for
multiple stakeholders while mentoring and growing a ML Data Science team. The ideal candidate must have deep design and
hands-on development expertise in ML, LLMs, model development and integrating ML solutions with business functions to
create the next generation of AI-powered capabilities.",
        "Responsibilities": "ML, Gen AI, NLP, LLM Strategy: Develop and implement ML modeling and LLM development and
fine-tuning strategies, best practices, and standards to enhance AI ML model deployment and monitoring efficiency.
Develop roadmap and strategy for NLP, LLM, Gen AI model development and lifecycle implementation\nML, Gen AI, NLP, LLM
Model Design and Development: Responsible for the design and development of custom ML, Gen AI, NLP, LLM Models for batch
and stream processing-based AI ML pipelines including data ingestion, preprocessing modules, search and retrieval,
Retrieval Augmented Generation (RAG), NLP/LLM model development and ensure the end-to-end solution meets all technical
and business requirements, and SLA specifications. Work closely with members of technology and business leads and their
teams in the design, development, and implementation of the ML model solutions\nML, NLP, LLM Model Evaluation: Work
closely with the MLOps team to create and maintain robust evaluation solutions and tools to evaluate model performance,
accuracy, consistency, reliability, during development, UAT. Identify and implement model optimizations to improve
system efficiency.",
        "What We’re Looking For": "Ph.D (preferred), Bachelor's or Master's degree in Computer Science, Mathematics or
Statistics , Computational linguistics, Engineering, or a related field. 10+ years of professional hands-on experience
leveraging large sets of structured and unstructured data to develop data-driven tactical and strategic analytics and
insights using ML and NLP.. Demonstrated 4+ years hands-on experience with Python, Hugging Face, TensorFlow, Keras,
PyTorch, Spark or similar statistical tools. Expert in python programming. 5+ years hands-on experience developing
natural language processing (NLP) models, ideally with transformer architectures. 5+ year’s experience with implementing
information search and retrieval at scale, using a range of solutions ranging from keyword search to semantic search
using embeddings.",
        "Nice to have": "Experience with contributing to Github and open source initiatives or in research projects",
        "additional_info": "S&P Global has a Securities Disclosure and Trading Policy that seeks to mitigate conflicts
of interest by monitoring and placing restrictions on personal securities holding and trading. Employment at S&P Global
is contingent upon compliance with the Policy."
      }
    }
  }
}
```




In [59]:
from project_config import JOB_DESCRIPTIONS_JSON_FILE

job_description_url = "https://zendesk.wd1.myworkdayjobs.com/en-US/zendesk/job/San-Francisco-California-United-States-of-America/Competitive-Intelligence-Manager_R30346?source=LinkedIn"
job_description_dict = read_from_json_file(JOB_DESCRIPTIONS_JSON_FILE).get(
    job_description_url, {}
)

content = job_description_dict.get("data", {})
content = content.get("content")
if not content:
    print("skipping")

2025-04-01 16:31:06,295 - utils.generic_utils - INFO - Loaded data from C:\github\job_bot\input_output\preprocessing\jobpostings.json


### Requirements

#### All Requirements Records

In [None]:
json_input = JOB_REQUIREMENTS_JSON_FILE
from utils.generic_utils import read_from_json_file

data = read_from_json_file(json_file=JOB_REQUIREMENTS_JSON_FILE)
extracted = {}

# Loop through each record in the JSON file
for key, record in data.items():
    # Check if record is a dictionary and contains a nested "data" key
    if (
        isinstance(record, dict)
        and "data" in record
        and isinstance(record["data"], dict)
    ):
        bare_minimum = record["data"].get("bare_minimum")
        if bare_minimum is not None:
            extracted[key] = bare_minimum
    else:
        # Older format record; skip processing.
        continue

display_json_pretty(extracted)

In [6]:
json_input = JOB_REQUIREMENTS_JSON_FILE
from utils.generic_utils import read_from_json_file

data = load_and_clean_json_file(json_file=json_input)
print(len(data))

for key in data.keys():
    print(key)

2025-04-03 21:50:57,856 - utils.generic_utils - INFO - Loaded data from C:\github\job_bot\input_output\preprocessing\extracted_job_requirements.json


37
https://www.google.com/about/careers/applications/jobs/results/113657145978692294-ai-market-intelligence-principal/?src=Online/LinkedIn/linkedin_us&utm_source=linkedin&utm_medium=jobposting&utm_campaign=contract&utm_medium=jobboard&utm_source=linkedin
https://www.capitalonecareers.com/job/-/-/234/66270465536?p_sid=ep3Sfxb&p_uid=sDBMWC5VxQ&source=rd_linkedin_job_posting_tm&ss=paid&utm_campaign=capone_all_jobs_24&utm_content=pj_board&utm_medium=jobad&utm_source=linkedin+slotted&dclid=CPGV3bef44gDFUEGTwgd4DoHPg
https://boards.greenhouse.io/embed/job_app?token=7600823002&gh_src=ab9f35b82
https://www.amazon.jobs/en/jobs/2696123/research-manager-strategy-and-insights-gca-marketing?cmpid=SPLICX0248M&utm_source=linkedin.com&utm_campaign=cxro&utm_medium=social_media&utm_content=job_posting&ss=paid
https://www.amazon.jobs/en/jobs/2742527/sr-generative-ai-strategist-generative-ai-innovation-center?cmpid=SPLICX0248M&utm_source=linkedin.com&utm_campaign=cxro&utm_medium=social_media&utm_content=j

#### Comparison: job_descriptions vs requirements

In [7]:
# Lists of URLs (extracted from your job descriptions and requirements)
from project_config import JOB_DESCRIPTIONS_JSON_FILE, JOB_REQUIREMENTS_JSON_FILE

job_descriptions_data = load_and_clean_json_file(json_file=JOB_DESCRIPTIONS_JSON_FILE)
set_job_desc = set(job_descriptions_data.keys())

requirements_data = load_and_clean_json_file(json_file=JOB_REQUIREMENTS_JSON_FILE)
set_requirements = set(requirements_data)

# Compute differences in both directions
diff_in_desc_not_req = set_job_desc - set_requirements
diff_in_req_not_desc = set_requirements - set_job_desc

print("Differences (job descriptions not in requirements):")
print(diff_in_desc_not_req)

print("\nDifferences (requirements not in job descriptions):")
print(diff_in_req_not_desc)

if not diff_in_desc_not_req and not diff_in_req_not_desc:
    print("\nBoth lists contain identical URLs.")

2025-04-03 21:51:06,015 - utils.generic_utils - INFO - Loaded data from C:\github\job_bot\input_output\preprocessing\jobpostings.json
2025-04-03 21:51:06,021 - utils.generic_utils - INFO - Loaded data from C:\github\job_bot\input_output\preprocessing\extracted_job_requirements.json


Differences (job descriptions not in requirements):
set()

Differences (requirements not in job descriptions):
set()

Both lists contain identical URLs.


#### Check Individual Records

In [31]:
json_input = JOB_REQUIREMENTS_JSON_FILE
from utils.generic_utils import read_from_json_file

data = load_and_clean_json_file(json_file=json_input)
print(len(data))

pwc_list = [key for key in data.keys() if "pwc" in key]
print(pwc_list)

pwc_branches = {key: value for key, value in data.items() if "pwc" in key}

print("\n\n")

for key, branch in pwc_branches.items():
    print(f"Key: {key}")
    display_json_pretty(branch)

# for pwc in pwc_list:
#     print(read_from_json_file(json_file=json_input, key=pwc))
# json_file = r"C:\github\job_bot\input_output\preprocessing\jobpostings.json"
# display_json_pretty(JOB_REQUIREMENTS_JSON_FILE, wrap_width=100)

2025-03-31 17:57:01,665 - utils.generic_utils - INFO - Loaded data from C:\github\job_bot\input_output\preprocessing\extracted_job_requirements.json


35
['https://jobs.us.pwc.com/job/-/-/932/76064801072?utm_source=linkedin.com&utm_campaign=core_media&utm_medium=social_media&utm_content=job_posting&ss=paid&dclid=CjgKEAiAwaG9BhCY3ayl47PW8lcSJAA_gCfjt-rzWhQetHLIbJdJBVocWQm2BRNcBgOARxhGyR9bgvD_BwE', 'https://jobs.us.pwc.com/job/-/-/932/76741173104?utm_source=linkedin.com&utm_campaign=core_media&utm_medium=social_media&utm_content=job_posting&ss=paid&dclid=CjgKEAjwy46_BhD8-aeS9rzGtzsSJAAuE6pojXgWgT7LeiCns3H71Hqcb3dqchcqskpnFxz8njxwwPD_BwE']



Key: https://jobs.us.pwc.com/job/-/-/932/76064801072?utm_source=linkedin.com&utm_campaign=core_media&utm_medium=social_media&utm_content=job_posting&ss=paid&dclid=CjgKEAiAwaG9BhCY3ayl47PW8lcSJAA_gCfjt-rzWhQetHLIbJdJBVocWQm2BRNcBgOARxhGyR9bgvD_BwE


```json
{
  "status": "success",
  "message": "Job site data processed successfully.",
  "data": {
    "pie_in_the_sky": [
      "Demonstrates extensive level abilities and/or a proven record of success in Digital Value Creation and Contact
Center/Service Excellence",
      "Demonstrating executive presence and ability to become a trusted advisor to the c-suite level clients"
    ],
    "down_to_earth": [
      "Minimum 5 years of experience",
      "Degree Preferred: Master Degree",
      "Demonstrating extensive level leadership, strategic and creative thinking, problem solving, individual
initiative",
      "Identifying and addressing client needs, rapidly building credibility, and maintaining and utilizing networks of
client relationships",
      "Managing teams/multiple work streams to establish successful project conclusion - i.e., delivery of quality work
on time and within budget",
      "Building productive relationships with team members and clients, both long term and day-to-day, by using a
collaborative approach with thorough listening skills and the ability to manage through influence",
      "Utilizing first principles thinking, and developing credible and pragmatic analytical approaches, frameworks and
methodologies",
      "Analyzing complex quantitative and qualitative data in an efficient manner and synthesizing the output into
meaningful and actionable insights",
      "Designing and conducting market research to understand consumer needs and purchasing behaviors",
      "Communicating effectively in an organized and knowledgeable manner in written and verbal formats to senior
audiences and being able to deliver difficult messages with persuasiveness and sensitivity",
      "Identifying and pursuing new business opportunities, and leading client/market development",
      "Attracting, retaining, assessing and developing staff/team members",
      "Demonstrating flexibility and creativity in managing work-life balance of self and team members",
      "Demonstrating Power User ability with MS Office suite of applications including Word, PowerPoint and Excel"
    ],
    "bare_minimum": [
      "Minimum Degree Required: Bachelor Degree"
    ],
    "cultural_fit": [
      "Develop new skills outside of comfort zone",
      "Act to resolve issues which prevent the team working effectively",
      "Coach others, recognise their strengths, and encourage them to take ownership of their personal development",
      "Uphold the firm's code of ethics and business conduct"
    ],
    "other": []
  }
}
```

Key: https://jobs.us.pwc.com/job/-/-/932/76741173104?utm_source=linkedin.com&utm_campaign=core_media&utm_medium=social_media&utm_content=job_posting&ss=paid&dclid=CjgKEAjwy46_BhD8-aeS9rzGtzsSJAAuE6pojXgWgT7LeiCns3H71Hqcb3dqchcqskpnFxz8njxwwPD_BwE


```json
{
  "status": "success",
  "message": "Job site data processed successfully.",
  "data": {
    "pie_in_the_sky": [
      "Leverage influence, expertise, and network to deliver quality results as a strategic advisor.",
      "Motivate and coach others to solve complex problems through sound judgment and clear communication."
    ],
    "down_to_earth": [
      "Craft impactful messages and apply systems thinking.",
      "Validate outcomes with clients and direct technology workstreams.",
      "8+ years of experience in data engineering or related field.",
      "Technical proficiency with Azure, Azure Databricks, Azure Data Factory, Azure Fabric, Spark, and Python/SQL.",
      "Certification in a cloud platform provider (AWS, Azure, GCP, Snowflake, or Databricks)."
    ],
    "bare_minimum": [
      "Bachelor's Degree in a relevant field.",
      "Familiarity with CI/CD, cloud devops, and containerization."
    ],
    "cultural_fit": [
      "Collaborative, growth-oriented mindset.",
      "Ability to mentor and develop high-performing teams."
    ],
    "other": [
      "Experience working in professional services, Big Four firms, or consulting environments."
    ]
  }
}
```

In [4]:
json_input = JOB_REQUIREMENTS_JSON_FILE
from utils.generic_utils import read_from_json_file

search_term = "takeda"
data = load_and_clean_json_file(json_file=json_input)

urls = [key for key in data.keys() if "spglobal" in key]
branches = {key: value for key, value in data.items() if search_term in key.lower()}

print(search_term)
print()

for key, branch in branches.items():
    print(f"Key: {key}")
    display_json_pretty(branch)

2025-04-04 00:35:34,015 - utils.generic_utils - INFO - Loaded data from C:\github\job_bot\input_output\preprocessing\extracted_job_requirements.json


takeda

Key: https://jobs.takeda.com/job/-/-/1113/79540724304


```json
{
  "status": "success",
  "message": "Job site data processed successfully.",
  "data": {
    "pie_in_the_sky": [
      "Leading market research and competitive intelligence initiatives to drive organizational action and maximize
commercial potential.",
      "Acting as a trusted advisor to brand teams and senior leadership, providing meaningful recommendations for
decision-making."
    ],
    "down_to_earth": [
      "6+ years of experience in advanced analytics, marketing, market research, or related roles within the bio-
pharmaceutical industry.",
      "Extensive knowledge and application of market research principles, methodologies, and practices."
    ],
    "bare_minimum": [
      "Bachelor’s Degree required; Advanced degree (MBA/PharmD) preferred.",
      "Strong quantitative analysis skills and experience in project planning and management."
    ],
    "cultural_fit": [
      "Collaborative and strategic partner to brand teams, contributing to brand strategy.",
      "Excellent communication and presentation skills, influencing cross-functional teams."
    ],
    "other": [
      "Proficient in creating persuasive presentations and influencing strategic decisions.",
      "Experience in vendor and budget management, with up to 20% travel domestically."
    ]
  }
}
```

In [61]:
json_input = JOB_REQUIREMENTS_JSON_FILE

data = load_and_clean_json_file(json_file=json_input)


urls = [
    "https://jobs.us.pwc.com/job/-/-/932/76741173104?utm_source=linkedin.com&utm_campaign=core_media&utm_medium=social_media&utm_content=job_posting&ss=paid&dclid=CjgKEAjwy46_BhD8-aeS9rzGtzsSJAAuE6pojXgWgT7LeiCns3H71Hqcb3dqchcqskpnFxz8njxwwPD_BwE",
    "https://salesforce.wd12.myworkdayjobs.com/External_Career_Site/job/California---San-Francisco/Vice-President--Product-Research---Insights_JR279859?source=LinkedIn_Jobs",
    "https://careers.thomsonreuters.com/us/en/job/THTTRUUSJREQ188456EXTERNALENUS/Director-of-AI-Content-Innovation?utm_source=linkedin&utm_medium=phenom-feeds",
    "https://bostonscientific.eightfold.ai/careers/job/563602800464180?domain=bostonscientific.com",
]
url_branches = {key: value for key, value in data.items() if key in urls}

for key, branch in url_branches.items():
    print(f"Key: {key}")
    display_json_pretty(branch)

2025-04-01 16:33:50,975 - utils.generic_utils - INFO - Loaded data from C:\github\job_bot\input_output\preprocessing\extracted_job_requirements.json


Key: https://jobs.us.pwc.com/job/-/-/932/76741173104?utm_source=linkedin.com&utm_campaign=core_media&utm_medium=social_media&utm_content=job_posting&ss=paid&dclid=CjgKEAjwy46_BhD8-aeS9rzGtzsSJAAuE6pojXgWgT7LeiCns3H71Hqcb3dqchcqskpnFxz8njxwwPD_BwE


```json
{
  "status": "success",
  "message": "Job site data processed successfully.",
  "data": {
    "pie_in_the_sky": [
      "Leverage influence, expertise, and network to deliver quality results as a strategic advisor.",
      "Motivate and coach others to solve complex problems through sound judgment and clear communication."
    ],
    "down_to_earth": [
      "Craft impactful messages and apply systems thinking.",
      "Validate outcomes with clients and direct technology workstreams.",
      "8+ years of experience in data engineering or related field.",
      "Technical proficiency with Azure, Azure Databricks, Azure Data Factory, Azure Fabric, Spark, and Python/SQL.",
      "Certification in a cloud platform provider (AWS, Azure, GCP, Snowflake, or Databricks)."
    ],
    "bare_minimum": [
      "Bachelor's Degree in a relevant field.",
      "Familiarity with CI/CD, cloud devops, and containerization."
    ],
    "cultural_fit": [
      "Collaborative, growth-oriented mindset.",
      "Ability to mentor and develop high-performing teams."
    ],
    "other": [
      "Experience working in professional services, Big Four firms, or consulting environments."
    ]
  }
}
```

Key: https://bostonscientific.eightfold.ai/careers/job/563602800464180?domain=bostonscientific.com


```json
{
  "status": "success",
  "message": "Job site data processed successfully.",
  "data": {
    "pie_in_the_sky": [
      "Develop, align, and execute an analytics strategy to support innovation in areas like GenAI, personalization, and
experimentation."
    ],
    "down_to_earth": [
      "Minimum 10 years of experience in Finance, Consulting, Business Strategy, Marketing Analytics, or Business
Intelligence.",
      "Minimum 10 years of experience demonstrating strong analytical skills, critical thinking, and quantitative
thinking.",
      "Minimum 5 years of experience working within Snowflake (or similar analytics platforms)."
    ],
    "bare_minimum": [
      "Bachelor's degree, preferably in Business, Engineering, or Life Sciences.",
      "Expert in SQL and proficient in Python.",
      "Exceptional communication and storytelling abilities, with a focus on explaining data and strategic implications
effectively.",
      "Strong attention to detail and focus on deliverable/analysis quality and accuracy.",
      "Excellent oral and written communication skills with proficiency in PowerPoint."
    ],
    "cultural_fit": [
      "Strong collaboration skills to lead cross-functional teams and drive projects.",
      "Team player with strong interpersonal skills."
    ],
    "other": [
      "Strong understanding of data architecture, including data ingestion, storage, and consumption.",
      "Experience using Business Intelligence tools like Tableau and Salesforce."
    ]
  }
}
```

Key: https://salesforce.wd12.myworkdayjobs.com/External_Career_Site/job/California---San-Francisco/Vice-President--Product-Research---Insights_JR279859?source=LinkedIn_Jobs


```json
{
  "status": "success",
  "message": "Job site data processed successfully.",
  "data": {
    "pie_in_the_sky": [
      "Proven track record of delivering insight work with measurable business outcomes",
      "Experience navigating complex organizational issues using collaborative and directive management styles"
    ],
    "down_to_earth": [
      "15+ years of experience leading research in product development, user experience, or technology contexts",
      "Expertise in executing research in a business environment",
      "Strong leadership and management expertise with executive-level presentation skills"
    ],
    "bare_minimum": [
      "University degree in a research or business-focused field",
      "At least 15 years of experience leading research in relevant contexts"
    ],
    "cultural_fit": [
      "Passionate about understanding and solving customer issues through data-driven insights",
      "Collaborative mindset to work with cross-functional teams"
    ],
    "other": [
      "Experience in CRM, enterprise business software, or analogous industries"
    ]
  }
}
```

Key: https://careers.thomsonreuters.com/us/en/job/THTTRUUSJREQ188456EXTERNALENUS/Director-of-AI-Content-Innovation?utm_source=linkedin&utm_medium=phenom-feeds


```json
{
  "status": "success",
  "message": "Job site data processed successfully.",
  "data": {
    "pie_in_the_sky": [
      "Serve as a thought leader in AI content innovation.",
      "Lead experimentation on machine learning models to optimize content engagement.",
      "Foster a culture of continuous learning and innovation within the team.",
      "Shape editorial AI strategy and drive innovation.",
      "Transform content assets to meet evolving customer needs."
    ],
    "down_to_earth": [
      "Advanced knowledge of programming languages like Python.",
      "Strong data analysis skills with large datasets.",
      "Proficiency in machine learning algorithms and platforms.",
      "Excellent communication skills for conveying complex ideas effectively.",
      "Demonstrated leadership ability to achieve strategic objectives."
    ],
    "bare_minimum": [
      "Bachelor's degree in law, computer science, or related field.",
      "In-depth knowledge of the practice of law/Juris Doctor strongly preferred.",
      "Commitment to continuous learning and staying updated with AI advancements."
    ],
    "cultural_fit": [
      "Collaborate with cross-functional teams including legal, editorial, and technology.",
      "Communicate complex AI concepts clearly to stakeholders at all levels.",
      "Encourage experimentation, innovation, and creativity in AI application."
    ],
    "other": [
      "Familiarity with editorial processes and content management systems.",
      "Advanced degree in law and computer science with a focus on law, technology, and content strategy."
    ]
  }
}
```

## Check Iteration 0

### File Path Imports

In [14]:
import json
from pathlib import Path
import textwrap
from IPython.display import display, Markdown
from project_config import (
    JOB_POSTING_URLS_FILE,
    JOB_DESCRIPTIONS_JSON_FILE,
    JOB_REQUIREMENTS_JSON_FILE,
    ITERATE_0_OPENAI_DIR,
    mapping_file_name,
    REQS_FILES_ITERATE_0_OPENAI_DIR,
    RESPS_FILES_ITERATE_0_OPENAI_DIR,
    SIMILARITY_METRICS_ITERATE_0_OPENAI_DIR,
    ITERATE_0_ANTHROPIC_DIR,
    REQS_FILES_ITERATE_0_ANTHROPIC_DIR,
    RESPS_FILES_ITERATE_0_ANTHROPIC_DIR,
    SIMILARITY_METRICS_ITERATE_0_ANTHROPIC_DIR,
    # URL_TO_FILE_MAPPING_FILE_ITERATE_0_OPENAI,
    # URL_TO_FILE_MAPPING_FILE_ITERATE_0_ANTHROPIC,
)

### OpenAI I/O Iterate 0

#### Mapping File (OpenAI pipeline)

##### Check all records from mapping file

In [8]:
from utils.get_file_names import get_file_names
from models.resume_job_description_io_models import JobFileMappings
from evaluation_optimization.create_mapping_file import load_mappings_model_from_json

# from project_config import URL_TO_FILE_MAPPING_FILE_ITERATE_0_OPENAI

directory = ITERATE_0_OPENAI_DIR
mapping_file = directory / mapping_file_name

file_mapping_model = load_mappings_model_from_json(mapping_file)

print("Job URLs:")
print(f"Number of URLs: {len(file_mapping_model.root.keys())}")

for index, url in enumerate(file_mapping_model.root.keys(), start=1):
    print(f"{index}. {url}")
    # print()

print("\n")

print("sim_metrics paths:")
for index, jobpaths in enumerate(file_mapping_model.root.values(), start=1):
    print(f"{index}. {Path(jobpaths.sim_metrics).name}")
    # print()

2025-04-04 14:07:48,720 - utils.generic_utils - INFO - Loaded data from C:\github\job_bot\input_output\evaluation_optimization\evaluation_optimization_by_openai\iteration_0\url_to_file_mapping.json
2025-04-04 14:07:48,723 - evaluation_optimization.create_mapping_file - INFO - Loaded and validated mapping file from C:\github\job_bot\input_output\evaluation_optimization\evaluation_optimization_by_openai\iteration_0\url_to_file_mapping.json


Job URLs:
Number of URLs: 38
1. https://www.google.com/about/careers/applications/jobs/results/113657145978692294-ai-market-intelligence-principal/?src=Online/LinkedIn/linkedin_us&utm_source=linkedin&utm_medium=jobposting&utm_campaign=contract&utm_medium=jobboard&utm_source=linkedin
2. https://www.capitalonecareers.com/job/-/-/234/66270465536?p_sid=ep3Sfxb&p_uid=sDBMWC5VxQ&source=rd_linkedin_job_posting_tm&ss=paid&utm_campaign=capone_all_jobs_24&utm_content=pj_board&utm_medium=jobad&utm_source=linkedin+slotted&dclid=CPGV3bef44gDFUEGTwgd4DoHPg
3. https://boards.greenhouse.io/embed/job_app?token=7600823002&gh_src=ab9f35b82
4. https://www.amazon.jobs/en/jobs/2696123/research-manager-strategy-and-insights-gca-marketing?cmpid=SPLICX0248M&utm_source=linkedin.com&utm_campaign=cxro&utm_medium=social_media&utm_content=job_posting&ss=paid
5. https://www.amazon.jobs/en/jobs/2742527/sr-generative-ai-strategist-generative-ai-innovation-center?cmpid=SPLICX0248M&utm_source=linkedin.com&utm_campaign=c

##### Check for specific records

In [48]:
search_terms = ["blend", "Amazon", "Blend", "Scientific"]


# Find URLs that contain the search term
matching_urls = [
    url
    for url in file_mapping_model.root.keys()
    if any(term.lower() in str(url).lower() for term in search_terms)
]

print("Matching URLs:")
for index, url in enumerate(matching_urls, start=1):
    print(f"{index}. {url}")
    print()

Matching URLs:
1. https://www.amazon.jobs/en/jobs/2696123/research-manager-strategy-and-insights-gca-marketing?cmpid=SPLICX0248M&utm_source=linkedin.com&utm_campaign=cxro&utm_medium=social_media&utm_content=job_posting&ss=paid

2. https://www.amazon.jobs/en/jobs/2742527/sr-generative-ai-strategist-generative-ai-innovation-center?cmpid=SPLICX0248M&utm_source=linkedin.com&utm_campaign=cxro&utm_medium=social_media&utm_content=job_posting&ss=paid

3. https://www.amazon.jobs/en/jobs/2684745/product-manager-artificial-general-intelligence-data-services?cmpid=SPLICX0248M&ss=paid&utm_campaign=cxro&utm_content=job_posting&utm_medium=social_media&utm_source=linkedin.com

4. https://jobs.smartrecruiters.com/Blend360/744000042638791-director-ai-strategy?trid=2d92f286-613b-4daf-9dfa-6340ffbecf73

5. https://www.amazon.jobs/en/jobs/2905092/senior-manger-partner-strategy-genai-innovation-center?cmpid=SPLICX0248M&utm_source=linkedin.com&utm_campaign=cxro&utm_medium=social_media&utm_content=job_posting

In [25]:
from pydantic import HttpUrl
from utils.get_file_names import get_file_names
from models.resume_job_description_io_models import JobFileMappings
from evaluation_optimization.create_mapping_file import load_mappings_model_from_json

# from project_config import URL_TO_FILE_MAPPING_FILE_ITERATE_0_OPENAI

directory = ITERATE_0_OPENAI_DIR
mapping_file = directory / mapping_file_name

file_mapping_model = load_mappings_model_from_json(mapping_file)

url = "https://jobs.takeda.com/job/-/-/1113/79540724304"

file_mapping_model.root[HttpUrl("https://jobs.takeda.com/job/-/-/1113/79540724304")]

2025-04-04 16:23:07,068 - utils.generic_utils - INFO - Loaded data from C:\github\job_bot\input_output\evaluation_optimization\evaluation_optimization_by_openai\iteration_0\url_to_file_mapping.json
2025-04-04 16:23:07,069 - evaluation_optimization.create_mapping_file - INFO - Loaded and validated mapping file from C:\github\job_bot\input_output\evaluation_optimization\evaluation_optimization_by_openai\iteration_0\url_to_file_mapping.json


JobFilePaths(reqs='C:\\github\\job_bot\\input_output\\evaluation_optimization\\evaluation_optimization_by_openai\\iteration_0\\requirements\\Takeda_Senior_Manager__Market_Research__RDBU_Commercial_Analytics_and_Insights_reqs_flat_iter0.json', resps='C:\\github\\job_bot\\input_output\\evaluation_optimization\\evaluation_optimization_by_openai\\iteration_0\\responsibilities\\Takeda_Senior_Manager__Market_Research__RDBU_Commercial_Analytics_and_Insights_resps_flat_iter0.json', sim_metrics='C:\\github\\job_bot\\input_output\\evaluation_optimization\\evaluation_optimization_by_openai\\iteration_0\\similarity_metrics\\Takeda_Senior_Manager__Market_Research__RDBU_Commercial_Analytics_and_Insights_sim_metrics_iter0.csv', pruned_resps='C:\\github\\job_bot\\input_output\\evaluation_optimization\\evaluation_optimization_by_openai\\iteration_0\\pruned_responsibilities\\Takeda_Senior_Manager__Market_Research__RDBU_Commercial_Analytics_and_Insights_pruned_resps_flat_iter0.json')

#### Sim Metrics (OpenAI Pipeline)

##### Check all records

In [16]:
from utils.get_file_names import get_file_names
from models.resume_job_description_io_models import JobFileMappings
from evaluation_optimization.create_mapping_file import (
    load_existing_or_create_new_mapping,
)

# from project_config import URL_TO_FILE_MAPPING_FILE_ITERATE_0_OPENAI

directory = SIMILARITY_METRICS_ITERATE_0_OPENAI_DIR
file_list = get_file_names(directory_path=directory)

print("sim_metrics files in sim_metrics folder")
print(f"Number of files: {len(file_list)}")

for index, file_name in enumerate(file_list, start=1):
    print(f"{index}. {file_name}")
    # print()

sim_metrics files in sim_metrics folder
Number of files: 38
1. Glean_Head_of_Competitive_Intelligence_sim_metrics_iter0.csv
2. Blend_Director__AI_Strategy_sim_metrics_iter0.csv
3. Veeva_Systems_Director_-_Crossix_Analytics_Services_sim_metrics_iter0.csv
4. Accenture_Enterprise_AI_Value_Strategy_Senior_Manager_sim_metrics_iter0.csv
5. Deloitte_Market_Research_Sr_Manager_sim_metrics_iter0.csv
6. Capital_One_Director__AI_Platforms_sim_metrics_iter0.csv
7. Deloitte_Global_Business_Services__GBS__Strategy_Manager_sim_metrics_iter0.csv
8. MongoDB_Director__Competitive_Intelligence_sim_metrics_iter0.csv
9. Figma_Researcher__Strategic_Growth_sim_metrics_iter0.csv
10. S_P_Global_Ratings_Director_of_Data_Science___RAG__NLP__LLM_and_GenAI__Hybrid_or_Virtual__sim_metrics_iter0.csv
11. Microsoft_Head_of_Partner_Intelligence_and_Strategy_sim_metrics_iter0.csv
12. Thermo_Fisher_Scientific_Market___Competitive_Intelligence_Manager_sim_metrics_iter0.csv
13. DEPT__Director_of_Applied_AI_Strategy__Media_

##### Missing records (similarity metrics files)

In [8]:
from pathlib import Path
from utils.get_file_names import get_file_names
from models.resume_job_description_io_models import JobFileMappings
from evaluation_optimization.create_mapping_file import load_mappings_model_from_json
from project_config import SIMILARITY_METRICS_ITERATE_0_OPENAI_DIR

# from project_config import URL_TO_FILE_MAPPING_FILE_ITERATE_0_OPENAI

sim_metrics_dir = SIMILARITY_METRICS_ITERATE_0_OPENAI_DIR
file_list_sim_metrics_dir = get_file_names(directory_path=sim_metrics_dir)

file_mapping_model = load_mappings_model_from_json(mapping_file)
file_list_mapping_file = [
    Path(jobpaths.sim_metrics).name for jobpaths in file_mapping_model.root.values()
]

missing_files = set(file_list_mapping_file) - set(
    file_list_sim_metrics_dir
)  # Elements in list1 but not in list2

print(f"Number of missing files in sim metrics folder: {len(missing_files)}")
for idx, f_name in enumerate(missing_files, start=1):
    print(f"{idx}. {f_name}")

2025-04-02 17:47:53,923 - utils.generic_utils - INFO - Loaded data from C:\github\job_bot\input_output\evaluation_optimization\evaluation_optimization_by_openai\iteration_0\url_to_file_mapping.json
2025-04-02 17:47:53,924 - evaluation_optimization.create_mapping_file - INFO - Loaded and validated mapping file from C:\github\job_bot\input_output\evaluation_optimization\evaluation_optimization_by_openai\iteration_0\url_to_file_mapping.json


Number of missing files in sim metrics folder: 0


In [None]:
from pathlib import Path
from utils.get_file_names import get_file_names
from models.resume_job_description_io_models import JobFileMappings, Requirements
from evaluation_optimization.create_mapping_file import load_mappings_model_from_json

# from project_config import URL_TO_FILE_MAPPING_FILE_ITERATE_0_OPENAI

sim_metrics_dir = SIMILARITY_METRICS_ITERATE_0_OPENAI_DIR
file_list_sim_metrics_dir = get_file_names(directory_path=sim_metrics_dir)

requirements_dir = REQS_FILES_ITERATE_0_OPENAI_DIR
file_list_reqs_dir = get_file_names(requirements_dir)
print(f"Files in reqs dir: {len(file_list_reqs_dir)}")

responsibilities_dir = RESPS_FILES_ITERATE_0_OPENAI_DIR
file_list_resps_dir = get_file_names(responsibilities_dir)
print(f"Files in resps dir: {len(file_list_resps_dir)}")

file_mapping_model = load_mappings_model_from_json(mapping_file)
file_list_mapping_file = [
    Path(jobpaths.sim_metrics).name for jobpaths in file_mapping_model.root.values()
]
print(f"Files in mapping file: {len(file_list_mapping_file)}")

print()

missing_files = set(file_list_mapping_file) - set(
    file_list_sim_metrics_dir
)  # Elements in list1 but not in list2

print(f"Number of missing files in sim metrics folder: {len(missing_files)}")
for idx, f_name in enumerate(missing_files, start=1):
    print(f"{idx}. {f_name}")

2025-03-10 10:10:16,354 - utils.generic_utils - INFO - Loaded data from C:\github\job_bot\input_output\evaluation_optimization\evaluation_optimization_by_openai\iteration_0\url_to_file_mapping.json
2025-03-10 10:10:16,355 - evaluation_optimization.create_mapping_file - INFO - Loaded and validated mapping file from C:\github\job_bot\input_output\evaluation_optimization\evaluation_optimization_by_openai\iteration_0\url_to_file_mapping.json


Files in reqs dir: 30
Files in resps dir: 30
Files in mapping file: 30

Number of missing files in sim metrics folder: 1
1. Liberty_Mutual_Insurance_Senior_Manager_II__Corporate_Strategy___Research_sim_metrics_iter0.csv


#### Responsibilities

##### All responsibilities files

In [9]:
from utils.get_file_names import get_file_names
from project_config import RESPS_FILES_ITERATE_0_OPENAI_DIR
import pandas as pd


resps_dir = RESPS_FILES_ITERATE_0_OPENAI_DIR


files = get_file_names(directory_path=RESPS_FILES_ITERATE_0_OPENAI_DIR)
files

['Accenture_Enterprise_AI_Value_Strategy_Senior_Manager_resps_flat_iter0.json',
 'Adobe_Sr__Director__Applied_AI_ML__Discovery__resps_flat_iter0.json',
 'Advisor360__Senior_Product_Manager_-_AI_Analytics___Insights_resps_flat_iter0.json',
 'Airtable_Product_Manager__AI_resps_flat_iter0.json',
 'Amazon_Product_Manager__Artificial_General_Intelligence_-_Data_Services_resps_flat_iter0.json',
 'Amazon_Research_Manager_-_Strategy_and_Insights_GCA_Marketing_resps_flat_iter0.json',
 'Amazon_Sr__Generative_AI_Strategist__Generative_AI_Innovation_Center_resps_flat_iter0.json',
 'Amazon_Web_Services__Inc__Senior_Manger__Partner_Strategy__GenAI_Innovation_Center_resps_flat_iter0.json',
 'Amplitude_Marketing_Strategy___Analytics_Manager_resps_flat_iter0.json',
 'Blend_Director__AI_Strategy_resps_flat_iter0.json',
 'Boston_Scientific_Hybrid_Sr__Manager_Marketing_Analytics_Strategy_resps_flat_iter0.json',
 'Capital_One_Director__AI_Platforms_resps_flat_iter0.json',
 'Deloitte_AI_Data_Specialist_resp

In [52]:
from utils.get_file_names import get_file_names
from project_config import RESPS_FILES_ITERATE_0_OPENAI_DIR
import pandas as pd

resps_dir = RESPS_FILES_ITERATE_0_OPENAI_DIR
file_glean = r"C:\\github\\job_bot\\input_output\\evaluation_optimization\\evaluation_optimization_by_openai\\iteration_0\\responsibilities\\Glean_Head_of_Competitive_Intelligence_resps_flat_iter0.json"

data = load_and_clean_json_file(file_glean)
print(len(data.get("responsibilities")))

csv_file = r"C:\github\job_bot\input_output\evaluation_optimization\evaluation_optimization_by_openai\iteration_1\similarity_metrics\Glean_Head_of_Competitive_Intelligence_sim_metrics_iter1.csv"
df = pd.read_csv(csv_file)
# print(df)
no_of_resps = set(df.responsibility_key)
print(len(no_of_resps))

print(set(df.responsibility_key))

2025-03-31 19:18:49,955 - utils.generic_utils - INFO - Loaded data from C:\github\job_bot\input_output\evaluation_optimization\evaluation_optimization_by_openai\iteration_0\responsibilities\Glean_Head_of_Competitive_Intelligence_resps_flat_iter0.json


30
30
{'2.responsibilities.1', '4.responsibilities.3', '1.responsibilities.3', '1.responsibilities.4', '2.responsibilities.6', '2.responsibilities.7', '4.responsibilities.4', '0.responsibilities.3', '2.responsibilities.2', '3.responsibilities.0', '2.responsibilities.5', '2.responsibilities.0', '2.responsibilities.3', '5.responsibilities.0', '1.responsibilities.2', '4.responsibilities.1', '1.responsibilities.1', '4.responsibilities.2', '0.responsibilities.4', '0.responsibilities.0', '4.responsibilities.0', '4.responsibilities.5', '1.responsibilities.5', '3.responsibilities.1', '0.responsibilities.2', '2.responsibilities.4', '1.responsibilities.6', '1.responsibilities.7', '0.responsibilities.1', '1.responsibilities.0'}


#### Requirements

##### All requirements files

In [10]:
from utils.get_file_names import get_file_names

reqs_dir = REQS_FILES_ITERATE_0_OPENAI_DIR

file_list = get_file_names(reqs_dir)

print(f"No. of files: {len(file_list)}")
display(file_list)

No. of files: 38


['Accenture_Enterprise_AI_Value_Strategy_Senior_Manager_reqs_flat_iter0.json',
 'Adobe_Sr__Director__Applied_AI_ML__Discovery__reqs_flat_iter0.json',
 'Advisor360__Senior_Product_Manager_-_AI_Analytics___Insights_reqs_flat_iter0.json',
 'Airtable_Product_Manager__AI_reqs_flat_iter0.json',
 'Amazon_Product_Manager__Artificial_General_Intelligence_-_Data_Services_reqs_flat_iter0.json',
 'Amazon_Research_Manager_-_Strategy_and_Insights_GCA_Marketing_reqs_flat_iter0.json',
 'Amazon_Sr__Generative_AI_Strategist__Generative_AI_Innovation_Center_reqs_flat_iter0.json',
 'Amazon_Web_Services__Inc__Senior_Manger__Partner_Strategy__GenAI_Innovation_Center_reqs_flat_iter0.json',
 'Amplitude_Marketing_Strategy___Analytics_Manager_reqs_flat_iter0.json',
 'Blend_Director__AI_Strategy_reqs_flat_iter0.json',
 'Boston_Scientific_Hybrid_Sr__Manager_Marketing_Analytics_Strategy_reqs_flat_iter0.json',
 'Capital_One_Director__AI_Platforms_reqs_flat_iter0.json',
 'Deloitte_AI_Data_Specialist_reqs_flat_iter0.

##### Check all files - how many requirements

In [11]:
from pydantic import ValidationError
from models.resume_job_description_io_models import Requirements
from project_config import REQS_FILES_ITERATE_0_OPENAI_DIR
from utils.get_file_names import get_file_names

reqs_dir = REQS_FILES_ITERATE_0_OPENAI_DIR


file_list = get_file_names(reqs_dir, True)

for idx, file in enumerate(file_list, start=1):
    try:
        # Load JSON data
        data = load_and_clean_json_file(file)

        # Validate using Pydantic model
        validated_data = Requirements(**data)

        # If validation passes, print results
        print(f"{idx}. {Path(file).name}")
        print(f"Number of requirements: {len(validated_data.requirements)}")

    except ValidationError as e:
        print(f"Skipping {file}: Validation failed - {e}")
    except json.JSONDecodeError as e:
        print(f"Skipping {file}: Invalid JSON format - {e}")
    except Exception as e:
        print(f"Skipping {file}: Unexpected error - {e}")

2025-04-04 14:08:35,865 - utils.generic_utils - INFO - Loaded data from C:\github\job_bot\input_output\evaluation_optimization\evaluation_optimization_by_openai\iteration_0\requirements\Accenture_Enterprise_AI_Value_Strategy_Senior_Manager_reqs_flat_iter0.json


2025-04-04 14:08:35,876 - utils.generic_utils - INFO - Loaded data from C:\github\job_bot\input_output\evaluation_optimization\evaluation_optimization_by_openai\iteration_0\requirements\Adobe_Sr__Director__Applied_AI_ML__Discovery__reqs_flat_iter0.json
2025-04-04 14:08:35,888 - utils.generic_utils - INFO - Loaded data from C:\github\job_bot\input_output\evaluation_optimization\evaluation_optimization_by_openai\iteration_0\requirements\Advisor360__Senior_Product_Manager_-_AI_Analytics___Insights_reqs_flat_iter0.json
2025-04-04 14:08:35,899 - utils.generic_utils - INFO - Loaded data from C:\github\job_bot\input_output\evaluation_optimization\evaluation_optimization_by_openai\iteration_0\requirements\Airtable_Product_Manager__AI_reqs_flat_iter0.json
2025-04-04 14:08:35,901 - utils.generic_utils - INFO - Loaded data from C:\github\job_bot\input_output\evaluation_optimization\evaluation_optimization_by_openai\iteration_0\requirements\Amazon_Product_Manager__Artificial_General_Intelligence_-

1. Accenture_Enterprise_AI_Value_Strategy_Senior_Manager_reqs_flat_iter0.json
Number of requirements: 10
2. Adobe_Sr__Director__Applied_AI_ML__Discovery__reqs_flat_iter0.json
Number of requirements: 7
3. Advisor360__Senior_Product_Manager_-_AI_Analytics___Insights_reqs_flat_iter0.json
Number of requirements: 12
4. Airtable_Product_Manager__AI_reqs_flat_iter0.json
Number of requirements: 8
Skipping C:\github\job_bot\input_output\evaluation_optimization\evaluation_optimization_by_openai\iteration_0\requirements\Amazon_Product_Manager__Artificial_General_Intelligence_-_Data_Services_reqs_flat_iter0.json: Validation failed - 2 validation errors for Requirements
url
  Field required [type=missing, input_value={'0.pie_in_the_sky.0': '1...e and Technical teams.'}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.10/v/missing
requirements
  Field required [type=missing, input_value={'0.pie_in_the_sky.0': '1...e and Technical teams.'}, input_type=dict]
    For fu

2025-04-04 14:08:36,055 - utils.generic_utils - INFO - Loaded data from C:\github\job_bot\input_output\evaluation_optimization\evaluation_optimization_by_openai\iteration_0\requirements\Liberty_Mutual_Insurance_Senior_Manager_I_-_Corporate_Strategy___Research_reqs_flat_iter0.json
2025-04-04 14:08:36,057 - utils.generic_utils - INFO - Loaded data from C:\github\job_bot\input_output\evaluation_optimization\evaluation_optimization_by_openai\iteration_0\requirements\Meta_Product_Strategy_Lead_reqs_flat_iter0.json
2025-04-04 14:08:36,058 - utils.generic_utils - INFO - Loaded data from C:\github\job_bot\input_output\evaluation_optimization\evaluation_optimization_by_openai\iteration_0\requirements\Microsoft_Head_of_Partner_Intelligence_and_Strategy_reqs_flat_iter0.json
2025-04-04 14:08:36,071 - utils.generic_utils - INFO - Loaded data from C:\github\job_bot\input_output\evaluation_optimization\evaluation_optimization_by_openai\iteration_0\requirements\MongoDB_Director__Competitive_Intelligen

23. Liberty_Mutual_Insurance_Senior_Manager_II__Corporate_Strategy___Research_reqs_flat_iter0.json
Number of requirements: 18
Skipping C:\github\job_bot\input_output\evaluation_optimization\evaluation_optimization_by_openai\iteration_0\requirements\Liberty_Mutual_Insurance_Senior_Manager_I_-_Corporate_Strategy___Research_reqs_flat_iter0.json: Validation failed - 2 validation errors for Requirements
url
  Field required [type=missing, input_value={'0.pie_in_the_sky.0': 'T... a plus (not required)'}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.10/v/missing
requirements
  Field required [type=missing, input_value={'0.pie_in_the_sky.0': 'T... a plus (not required)'}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.10/v/missing
Skipping C:\github\job_bot\input_output\evaluation_optimization\evaluation_optimization_by_openai\iteration_0\requirements\Meta_Product_Strategy_Lead_reqs_flat_iter0.json: Validation failed - 2 val

##### Check individual files - how many requirements

In [11]:
from pydantic import ValidationError
from models.resume_job_description_io_models import Requirements
from project_config import REQS_FILES_ITERATE_0_OPENAI_DIR, JOB_DESCRIPTIONS_JSON_FILE
from utils.get_file_names import get_file_names


# Set the directory containing requirement files
reqs_dir = REQS_FILES_ITERATE_0_OPENAI_DIR

# Get the list of files from the directory
file_list = get_file_names(reqs_dir, full_path=True)

# Choose the specific file (handle if not found)
airtable_file = next((file for file in file_list if "airtable" in file.lower()), None)
glean_file = next((file for file in file_list if "glean" in file.lower()), None)
lib_mut_file = next(
    (file for file in file_list if "liberty_mutual" in file.lower()), None
)
veeva_file = next((file for file in file_list if "veeva" in file.lower()), None)
zendesk_file = next((file for file in file_list if "zendesk" in file.lower()), None)
zendesk_file = next((file for file in file_list if "pwc" in file.lower()), None)

requirements_file = zendesk_file

if not requirements_file:
    raise FileNotFoundError("Airtable requirements file not found.")

# Load and display job requirements
print("Job Requirements:")
try:
    data = load_and_clean_json_file(requirements_file)
    display_json_pretty(data)
except Exception as e:
    print(f"Error loading requirements file: {e}")
    data = {}

# Extract URL from requirements file
url = data.get("url")
if not url:
    raise ValueError("No 'url' key found in requirements file.")

# Load and display corresponding job posting
try:
    job_postings = load_and_clean_json_file(JOB_DESCRIPTIONS_JSON_FILE)
    job_posting = job_postings.get(url)

    if not job_posting:
        raise KeyError(f"No job posting found for URL: {url}")

    print("Job Posting:")
    display_json_pretty(job_posting)
except Exception as e:
    print(f"Error loading job posting: {e}")

2025-03-31 13:18:38,945 - utils.generic_utils - INFO - Loaded data from C:\github\job_bot\input_output\evaluation_optimization\evaluation_optimization_by_openai\iteration_0\requirements\PwC_Strategy__Manager_-_Digital_Value_Transformation_Contact_Center_reqs_flat_iter0.json


Job Requirements:


```json
{
  "url": "https://jobs.us.pwc.com/job/-/-
/932/76064801072?utm_source=linkedin.com&utm_campaign=core_media&utm_medium=social_media&utm_content=job_posting&ss=paid
&dclid=CjgKEAiAwaG9BhCY3ayl47PW8lcSJAA_gCfjt-rzWhQetHLIbJdJBVocWQm2BRNcBgOARxhGyR9bgvD_BwE",
  "requirements": {
    "0.pie_in_the_sky.0": "Demonstrates extensive level abilities and/or a proven record of success in Digital Value
Creation and Contact Center/Service Excellence",
    "0.pie_in_the_sky.1": "Demonstrating executive presence and ability to become a trusted advisor to the c-suite level
clients",
    "1.down_to_earth.0": "Minimum 5 years of experience",
    "1.down_to_earth.1": "Degree Preferred: Master Degree",
    "1.down_to_earth.2": "Demonstrating extensive level leadership, strategic and creative thinking, problem solving,
individual initiative",
    "1.down_to_earth.3": "Identifying and addressing client needs, rapidly building credibility, and maintaining and
utilizing networks of client relationships",
    "1.down_to_earth.4": "Managing teams/multiple work streams to establish successful project conclusion - i.e.,
delivery of quality work on time and within budget",
    "1.down_to_earth.5": "Building productive relationships with team members and clients, both long term and day-to-
day, by using a collaborative approach with thorough listening skills and the ability to manage through influence",
    "1.down_to_earth.6": "Utilizing first principles thinking, and developing credible and pragmatic analytical
approaches, frameworks and methodologies",
    "1.down_to_earth.7": "Analyzing complex quantitative and qualitative data in an efficient manner and synthesizing
the output into meaningful and actionable insights",
    "1.down_to_earth.8": "Designing and conducting market research to understand consumer needs and purchasing
behaviors",
    "1.down_to_earth.9": "Communicating effectively in an organized and knowledgeable manner in written and verbal
formats to senior audiences and being able to deliver difficult messages with persuasiveness and sensitivity",
    "1.down_to_earth.10": "Identifying and pursuing new business opportunities, and leading client/market development",
    "1.down_to_earth.11": "Attracting, retaining, assessing and developing staff/team members",
    "1.down_to_earth.12": "Demonstrating flexibility and creativity in managing work-life balance of self and team
members",
    "1.down_to_earth.13": "Demonstrating Power User ability with MS Office suite of applications including Word,
PowerPoint and Excel",
    "2.cultural_fit.0": "Develop new skills outside of comfort zone",
    "2.cultural_fit.1": "Act to resolve issues which prevent the team working effectively",
    "2.cultural_fit.2": "Coach others, recognise their strengths, and encourage them to take ownership of their personal
development",
    "2.cultural_fit.3": "Uphold the firm's code of ethics and business conduct"
  }
}
```

2025-03-31 13:18:38,949 - utils.generic_utils - INFO - Loaded data from C:\github\job_bot\input_output\preprocessing\jobpostings.json


Job Posting:


```json
{
  "status": "success",
  "message": "Job site data processed successfully.",
  "data": {
    "url": "https://jobs.us.pwc.com/job/-/-
/932/76064801072?utm_source=linkedin.com&utm_campaign=core_media&utm_medium=social_media&utm_content=job_posting&ss=paid
&dclid=CjgKEAiAwaG9BhCY3ayl47PW8lcSJAA_gCfjt-rzWhQetHLIbJdJBVocWQm2BRNcBgOARxhGyR9bgvD_BwE",
    "job_title": "Strategy& Manager - Digital Value Transformation Contact Center",
    "company": "PwC",
    "location": "CA-Los Angeles, GA-Atlanta, CO-Denver, IL-Chicago, OH-Cincinnati, MA-Boston, MO-St. Louis, NY-New York,
TX-Dallas, CA-San Francisco, PA-Philadelphia, WA-Seattle, TX-Houston",
    "salary_info": "$100,000 - $232,000, plus individuals may be eligible for an annual discretionary bonus",
    "posted_date": null,
    "content": {
      "Overview": "As a member of PwC's Strategy& (Strategy Consulting) team, you will help clients seize essential
advantages by developing corporate and business unit strategies and building the differentiating capabilities they need
to outperform their competitors. PwC Strategy Consultants provide innovative solutions focused on corporate & business
strategy, people & organization strategy, operations strategy, customer strategy, and technology strategy.",
      "Job Type": "Full Time",
      "Level": "Experienced",
      "Travel": "Yes (Frequent)",
      "The PwC Professional": "At PwC, we support our people at every stage in their career, with year-round
transparency into how they are performing and action-oriented coaching to drive their development and help achieve their
aspirations. All our employees are recruited, developed and managed against the global PwC Professional framework that
focuses on two dimensions: Trusted Leadership and Distinctive Outcomes. It guides us on \"what to do\" and \"how to do
it\", emphasizing impact and behaviors. Together, Trusted Leadership and Distinctive Outcomes are how we deliver on our
purpose and strategy, serving our clients and living our values in every interaction, everyday.",
      "Our Culture": "Our values and behaviors define the expectations we have for working together and with clients. We
all contribute to the culture of PwC. At PwC, we cultivate an environment in which our differences are embraced and our
people feel comfortable bringing their whole selves to work. We act with integrity\nWe make a difference\nWe care\nWe
work together\nWe reimagine the possible",
      "Benefits / Rewards": "Health Care: We offer comprehensive medical coverage, vision care, dental and health
savings accounts.\nRetirement: PwC offers a 401(k) Savings Plan and a Wealth Builder retirement plan completely funded
by PwC.\nMaternity/Paternity Leave: Eligible new parents receive, within the first year from birth or adoption/foster
placement, 12 weeks of paid parental leave. Parents have the option to work 60% of hours, at full-time pay, for an
additional four weeks immediately following paid parental leave.\nPaid Time Off: The Firm recognizes 13 paid holidays
and provides extended Firm holidays around the July 4 holiday and Christmas and New Year holiday. You accrue vacation
time of between three weeks (15 work days) and one month (22 work days) per year, based upon staff classification and
length of service with PwC.\nInfinite Learning: PwC will provide you with a personalized learning experience — using on-
the-job training, real-time development, smart technology and data and analytics, giving you customized access to formal
and informal learning.\nBe Well, Work Well: Through our increased focus on well-being, we are shifting the mindset of
how we work. We will provide you with the skills and knowledge to prioritize your well-being with opportunities and
resources for your physical, emotional, mental and spiritual health.",
      "description": "A career within Technology Strategy services, will provide you with the opportunity to help
organisations develop strategies that transform their technology capabilities and solve their most critical challenges.
We focus on building technology enabled and agile operating models, planning their new enterprise architecture into a
differentiating capability system that helps them win in the market, leveraging digital analytics to enhance the
customer experience and optimising business operations, and using modern management techniques such as robotic process
automation and next generation sourcing strategies to help our clients get fit for growth.\n\nTo really stand out and
make us fit for the future in a constantly changing world, each and every one of us at PwC needs to be a purpose-led and
values-driven leader at every level. To help us achieve this we have the PwC Professional; our global leadership
development framework. It gives us a single set of expectations across our lines, geographies and career paths, and
provides transparency on the skills we need as individuals to be successful and progress in our careers, now and in the
future.\n\nAs a Manager, you'll work as part of a team of problem solvers, helping to solve complex business issues from
strategy to execution. PwC Professional skills and responsibilities for this management level include but are not
limited to:\n\n- Develop new skills outside of comfort zone.\n- Act to resolve issues which prevent the team working
effectively.\n- Coach others, recognise their strengths, and encourage them to take ownership of their personal
development.\n- Analyse complex ideas or proposals and build a range of meaningful recommendations.\n- Use multiple
sources of information including broader stakeholder views to develop solutions and recommendations.\n- Address sub-
standard work or work that does not meet firm's/client's expectations.\n- Use data and insights to inform conclusions
and support decision-making.\n- Develop a point of view on key global trends, and how they impact clients.\n- Manage a
variety of viewpoints to build consensus and create positive outcomes for all parties.\n- Simplify complex messages,
highlighting and summarising key points.\n- Uphold the firm's code of ethics and business conduct.",
      "Basic Qualifications": "Minimum Degree Required: Bachelor Degree\nMinimum Years of Experience: 5 year(s)",
      "Preferred Qualifications": "Degree Preferred: Master Degree\nPreferred Knowledge/Skills:\nDemonstrates extensive
level abilities and/or a proven record of success in Digital Value Creation and Contact Center/Service Excellence,
either in professional consulting services or corporate roles in the following areas:\n- Bringing together the best of
digital capabilities to help our clients use digital technology (AI, ML, Data and Analytics, etc.) to transform their
business;\n- Working on projects across all industries and functions to help our clients deliver breakthrough products,
experiences, and businesses, both on technology and non-technology topics; and,\n- Designing end to end customer /
process journeys across the business value chain, designing the product solution and working with a team of
technologists to build the solution and its eventual rollout to customers / employees.\n\nDemonstrates extensive level
leadership, strategic and creative thinking, problem solving, individual initiative, and the following abilities:\n-
Identifying and addressing client needs, rapidly building credibility, and maintaining and utilizing networks of client
relationships;\n- Managing teams / multiple work streams to establish successful project conclusion - i.e., delivery of
quality work on time and within budget;\n- Building productive relationships with team members and clients, both long
term and day-to-day, by using a collaborative approach with thorough listening skills and the ability to manage through
influence;\n- Utilizing first principles thinking, and developing credible and pragmatic analytical approaches,
frameworks and methodologies;\n- Analyzing complex quantitative and qualitative data in an efficient manner and
synthesizing the output into meaningful and actionable insights;\n- Designing and conducting market research to
understand consumer needs and purchasing behaviors;\n- Communicating effectively in an organized and knowledgeable
manner in written and verbal formats to senior audiences and being able to deliver difficult messages with
persuasiveness and sensitivity;\n- Demonstrating executive presence and ability to become a trusted advisor to the
c-suite level clients;\n- Identifying and pursuing new business opportunities, and leading client / market
development;\n- Attracting, retaining, assessing and developing staff / team members;\n- Demonstrating flexibility and
creativity in managing work-life balance of self and team members; and,\n- Demonstrating Power User ability with MS
Office suite of applications including Word, PowerPoint and Excel."
    }
  }
}
```

In [21]:
reqs_dir = REQS_FILES_ITERATE_0_OPENAI_DIR
file_list = get_file_names(reqs_dir, full_path=True)

src_term = "takeda"
file_names = [file for file in file_list if src_term in str(file).lower()]

for f in file_names:
    display_json_pretty(f)

# for key, value in requirements.items():
#     print(f"{key}: {value}")
#     print()

2025-04-04 14:35:06,573 - utils.generic_utils - INFO - Loaded data from C:\github\job_bot\input_output\evaluation_optimization\evaluation_optimization_by_openai\iteration_0\requirements\Takeda_Senior_Manager__Market_Research__RDBU_Commercial_Analytics_and_Insights_reqs_flat_iter0.json


```json
{
  "url": "https://jobs.takeda.com/job/-/-/1113/79540724304",
  "requirements": {
    "0.pie_in_the_sky.0": "Leading market research and competitive intelligence initiatives to drive organizational
action and maximize commercial potential.",
    "0.pie_in_the_sky.1": "Acting as a trusted advisor to brand teams and senior leadership, providing meaningful
recommendations for decision-making.",
    "1.down_to_earth.0": "6+ years of experience in advanced analytics, marketing, market research, or related roles
within the bio-pharmaceutical industry.",
    "1.down_to_earth.1": "Extensive knowledge and application of market research principles, methodologies, and
practices.",
    "2.cultural_fit.0": "Collaborative and strategic partner to brand teams, contributing to brand strategy.",
    "2.cultural_fit.1": "Excellent communication and presentation skills, influencing cross-functional teams.",
    "3.other.0": "Proficient in creating persuasive presentations and influencing strategic decisions.",
    "3.other.1": "Experience in vendor and budget management, with up to 20% travel domestically."
  }
}
```

### Anthropic I/O Iterate 0

#### Mapping File

##### Check all records from mapping file

In [None]:
from utils.get_file_names import get_file_names
from models.resume_job_description_io_models import JobFileMappings
from evaluation_optimization.create_mapping_file import load_mappings_model_from_json

# from project_config import URL_TO_FILE_MAPPING_FILE_ITERATE_0_OPENAI

directory = ITERATE_0_ANTHROPIC_DIR
mapping_file = directory / mapping_file_name

file_mapping_model = load_mappings_model_from_json(mapping_file)

print("Job URLs:")
print(f"Number of URLs: {len(file_mapping_model.root.keys())}")

for index, url in enumerate(file_mapping_model.root.keys(), start=1):
    print(f"{index}. {url}")
    # print()

print("\n")

print("sim_metrics paths:")
for index, jobpaths in enumerate(file_mapping_model.root.values(), start=1):
    print(f"{index}. {Path(jobpaths.sim_metrics).name}")
    # print()

2025-03-09 15:40:39,378 - utils.generic_utils - INFO - Loaded data from C:\github\job_bot\input_output\evaluation_optimization\evaluation_optimization_by_anthropic\iteration_0\url_to_file_mapping.json
2025-03-09 15:40:39,379 - evaluation_optimization.create_mapping_file - INFO - Loaded and validated mapping file from C:\github\job_bot\input_output\evaluation_optimization\evaluation_optimization_by_anthropic\iteration_0\url_to_file_mapping.json


Job URLs:
Number of URLs: 30
1. https://www.google.com/about/careers/applications/jobs/results/113657145978692294-ai-market-intelligence-principal/?src=Online/LinkedIn/linkedin_us&utm_source=linkedin&utm_medium=jobposting&utm_campaign=contract&utm_medium=jobboard&utm_source=linkedin
2. https://www.capitalonecareers.com/job/-/-/234/66270465536?p_sid=ep3Sfxb&p_uid=sDBMWC5VxQ&source=rd_linkedin_job_posting_tm&ss=paid&utm_campaign=capone_all_jobs_24&utm_content=pj_board&utm_medium=jobad&utm_source=linkedin+slotted&dclid=CPGV3bef44gDFUEGTwgd4DoHPg
3. https://boards.greenhouse.io/embed/job_app?token=7600823002&gh_src=ab9f35b82
4. https://www.amazon.jobs/en/jobs/2696123/research-manager-strategy-and-insights-gca-marketing?cmpid=SPLICX0248M&utm_source=linkedin.com&utm_campaign=cxro&utm_medium=social_media&utm_content=job_posting&ss=paid
5. https://www.amazon.jobs/en/jobs/2742527/sr-generative-ai-strategist-generative-ai-innovation-center?cmpid=SPLICX0248M&utm_source=linkedin.com&utm_campaign=c

##### Check for specific records

In [None]:
search_terms = ["blend", "Amazon", "Blend", "Snowflake"]


# Find URLs that contain the search term
matching_urls = [
    url
    for url in file_mapping_model.root.keys()
    if any(term.lower() in str(url).lower() for term in search_terms)
]

print("Matching URLs:")
for index, url in enumerate(matching_urls, start=1):
    print(f"{index}. {url}")
    print()

Matching URLs:
1. https://www.amazon.jobs/en/jobs/2696123/research-manager-strategy-and-insights-gca-marketing?cmpid=SPLICX0248M&utm_source=linkedin.com&utm_campaign=cxro&utm_medium=social_media&utm_content=job_posting&ss=paid

2. https://www.amazon.jobs/en/jobs/2742527/sr-generative-ai-strategist-generative-ai-innovation-center?cmpid=SPLICX0248M&utm_source=linkedin.com&utm_campaign=cxro&utm_medium=social_media&utm_content=job_posting&ss=paid

3. https://www.amazon.jobs/en/jobs/2684745/product-manager-artificial-general-intelligence-data-services?cmpid=SPLICX0248M&ss=paid&utm_campaign=cxro&utm_content=job_posting&utm_medium=social_media&utm_source=linkedin.com

4. https://jobs.smartrecruiters.com/Blend360/744000042638791-director-ai-strategy?trid=2d92f286-613b-4daf-9dfa-6340ffbecf73

5. https://www.amazon.jobs/en/jobs/2905092/senior-manger-partner-strategy-genai-innovation-center?cmpid=SPLICX0248M&utm_source=linkedin.com&utm_campaign=cxro&utm_medium=social_media&utm_content=job_posting

#### Sim Metrics

##### Check all records

In [None]:
from utils.get_file_names import get_file_names
from models.resume_job_description_io_models import JobFileMappings
from evaluation_optimization.create_mapping_file import load_mappings_model_from_json

# from project_config import URL_TO_FILE_MAPPING_FILE_ITERATE_0_OPENAI

directory = SIMILARITY_METRICS_ITERATE_0_ANTHROPIC_DIR
file_list = get_file_names(directory_path=directory)

print("sim_metrics files in sim_metrics folder")
print(f"Number of files: {len(file_list)}")

for index, file_name in enumerate(file_list, start=1):
    print(f"{index}. {file_name}")
    # print()

sim_metrics files in sim_metrics folder
Number of files: 27
1. Accenture_Enterprise_AI_Value_Strategy_Senior_Manager_sim_metrics_iter0.csv
2. Adobe_Sr__Director__Applied_AI_ML__Discovery__sim_metrics_iter0.csv
3. Airtable_Product_Manager__AI_sim_metrics_iter0.csv
4. Amazon_Product_Manager__Artificial_General_Intelligence_-_Data_Services_sim_metrics_iter0.csv
5. Amazon_Research_Manager_-_Strategy_and_Insights_GCA_Marketing_sim_metrics_iter0.csv
6. Amazon_Sr__Generative_AI_Strategist__Generative_AI_Innovation_Center_sim_metrics_iter0.csv
7. Amplitude_Marketing_Strategy___Analytics_Manager_sim_metrics_iter0.csv
8. Blend_Director__AI_Strategy_sim_metrics_iter0.csv
9. Capital_One_Director__AI_Platforms_sim_metrics_iter0.csv
10. Deloitte_AI_Data_Specialist_sim_metrics_iter0.csv
11. Deloitte_Global_Business_Services__GBS__Strategy_Manager_sim_metrics_iter0.csv
12. Deloitte_Market_Research_Sr_Manager_sim_metrics_iter0.csv
13. DEPT__Director_of_Applied_AI_Strategy__Media_sim_metrics_iter0.csv
1

##### Missing records (similarity metrics files)

In [None]:
from pathlib import Path
from utils.get_file_names import get_file_names
from models.resume_job_description_io_models import JobFileMappings
from evaluation_optimization.create_mapping_file import load_mappings_model_from_json

# from project_config import URL_TO_FILE_MAPPING_FILE_ITERATE_0_OPENAI

sim_metrics_dir = SIMILARITY_METRICS_ITERATE_0_ANTHROPIC_DIR
file_list_sim_metrics_dir = get_file_names(directory_path=sim_metrics_dir)

directory = ITERATE_0_ANTHROPIC_DIR
mapping_file = directory / mapping_file_name
file_mapping_model = load_mappings_model_from_json(mapping_file)
file_list_mapping_file = [
    Path(jobpaths.sim_metrics).name for jobpaths in file_mapping_model.root.values()
]

missing_files = set(file_list_mapping_file) - set(
    file_list_sim_metrics_dir
)  # Elements in list1 but not in list2

print(f"Number of missing files in sim metrics folder: {len(missing_files)}")
for idx, f_name in enumerate(missing_files, start=1):
    print(f"{idx}. {f_name}")

2025-03-09 15:41:24,273 - utils.generic_utils - INFO - Loaded data from C:\github\job_bot\input_output\evaluation_optimization\evaluation_optimization_by_anthropic\iteration_0\url_to_file_mapping.json
2025-03-09 15:41:24,275 - evaluation_optimization.create_mapping_file - INFO - Loaded and validated mapping file from C:\github\job_bot\input_output\evaluation_optimization\evaluation_optimization_by_anthropic\iteration_0\url_to_file_mapping.json


Number of missing files in sim metrics folder: 3
1. Amazon_Web_Services__Inc__Senior_Manger__Partner_Strategy__GenAI_Innovation_Center_sim_metrics_iter0.csv
2. Advisor360__Senior_Product_Manager_-_AI_Analytics___Insights_sim_metrics_iter0.csv
3. Snowflake_Director__Product_Marketing_-_Analytics_sim_metrics_iter0.csv


In [None]:
from pathlib import Path
from utils.get_file_names import get_file_names
from models.resume_job_description_io_models import JobFileMappings, Requirements
from evaluation_optimization.create_mapping_file import load_mappings_model_from_json

# from project_config import URL_TO_FILE_MAPPING_FILE_ITERATE_0_OPENAI

sim_metrics_dir = SIMILARITY_METRICS_ITERATE_0_ANTHROPIC_DIR
file_list_sim_metrics_dir = get_file_names(directory_path=sim_metrics_dir)

requirements_dir = REQS_FILES_ITERATE_0_ANTHROPIC_DIR
file_list_reqs_dir = get_file_names(requirements_dir)
print(f"Files in reqs dir: {len(file_list_reqs_dir)}")

responsibilities_dir = RESPS_FILES_ITERATE_0_ANTHROPIC_DIR
file_list_resps_dir = get_file_names(responsibilities_dir)
print(f"Files in resps dir: {len(file_list_resps_dir)}")

directory = ITERATE_0_ANTHROPIC_DIR
mapping_file = directory / mapping_file_name
file_mapping_model = load_mappings_model_from_json(mapping_file)
file_list_mapping_file = [
    Path(jobpaths.sim_metrics).name for jobpaths in file_mapping_model.root.values()
]
print(f"Files in mapping file: {len(file_list_mapping_file)}")

print()

missing_files = set(file_list_mapping_file) - set(
    file_list_sim_metrics_dir
)  # Elements in list1 but not in list2

print(f"Number of missing files in sim metrics folder: {len(missing_files)}")
for idx, f_name in enumerate(missing_files, start=1):
    print(f"{idx}. {f_name}")

2025-03-09 15:43:34,592 - utils.generic_utils - INFO - Loaded data from C:\github\job_bot\input_output\evaluation_optimization\evaluation_optimization_by_anthropic\iteration_0\url_to_file_mapping.json
2025-03-09 15:43:34,592 - evaluation_optimization.create_mapping_file - INFO - Loaded and validated mapping file from C:\github\job_bot\input_output\evaluation_optimization\evaluation_optimization_by_anthropic\iteration_0\url_to_file_mapping.json


Files in reqs dir: 30
Files in resps dir: 30
Files in mapping file: 30

Number of missing files in sim metrics folder: 3
1. Amazon_Web_Services__Inc__Senior_Manger__Partner_Strategy__GenAI_Innovation_Center_sim_metrics_iter0.csv
2. Advisor360__Senior_Product_Manager_-_AI_Analytics___Insights_sim_metrics_iter0.csv
3. Snowflake_Director__Product_Marketing_-_Analytics_sim_metrics_iter0.csv


#### Requirements

In [None]:
from models.resume_job_description_io_models import Requirements
from project_config import REQS_FILES_ITERATE_0_ANTHROPIC_DIR

reqs_dir = REQS_FILES_ITERATE_0_ANTHROPIC_DIR


file_list = get_file_names(reqs_dir, True)

for idx, file in enumerate(file_list, start=1):
    data = load_and_clean_json_file(file)
    validated_data = Requirements(**data)

    print(f"{idx}. {Path(file).name}")
    print(f"Number of requirements: {len(validated_data.requirements)}")

1. Accenture_Enterprise_AI_Value_Strategy_Senior_Manager_reqs_flat_iter0.json
Number of requirements: 10
2. Adobe_Sr__Director__Applied_AI_ML__Discovery__reqs_flat_iter0.json
Number of requirements: 6
3. Advisor360__Senior_Product_Manager_-_AI_Analytics___Insights_reqs_flat_iter0.json
Number of requirements: 12
4. Airtable_Product_Manager__AI_reqs_flat_iter0.json
Number of requirements: 5
5. Amazon_Product_Manager__Artificial_General_Intelligence_-_Data_Services_reqs_flat_iter0.json
Number of requirements: 8
6. Amazon_Research_Manager_-_Strategy_and_Insights_GCA_Marketing_reqs_flat_iter0.json
Number of requirements: 8
7. Amazon_Sr__Generative_AI_Strategist__Generative_AI_Innovation_Center_reqs_flat_iter0.json
Number of requirements: 12
8. Amazon_Web_Services__Inc__Senior_Manger__Partner_Strategy__GenAI_Innovation_Center_reqs_flat_iter0.json
Number of requirements: 12
9. Amplitude_Marketing_Strategy___Analytics_Manager_reqs_flat_iter0.json
Number of requirements: 13
10. Blend_Director__

## Check Iteration 1

### Iteration 1 Imports

In [19]:
import json
import textwrap
from IPython.display import display, Markdown
from project_config import (
    JOB_POSTING_URLS_FILE,
    JOB_DESCRIPTIONS_JSON_FILE,
    JOB_REQUIREMENTS_JSON_FILE,
    ITERATE_1_ANTHROPIC_DIR,
    mapping_file_name,
    REQS_FILES_ITERATE_1_ANTHROPIC_DIR,
    RESPS_FILES_ITERATE_1_ANTHROPIC_DIR,
    SIMILARITY_METRICS_ITERATE_1_ANTHROPIC_DIR,
    ITERATE_1_OPENAI_DIR,
    REQS_FILES_ITERATE_1_OPENAI_DIR,
    RESPS_FILES_ITERATE_1_OPENAI_DIR,
    SIMILARITY_METRICS_ITERATE_1_OPENAI_DIR,
)

### Anthropic Iterate 1

#### Mapping File

#### Responsibilities

In [None]:
from utils.get_file_names import get_file_names

directory = RESPS_FILES_ITERATE_1_ANTHROPIC_DIR

file_names = get_file_names(directory_path=directory)

print(f"Responsibilities file names: \n" + "\n".join(name for name in file_names))

Responsibilities file names: 
Accenture_Enterprise_AI_Value_Strategy_Senior_Manager_resps_nested_iter1.json
Adobe_Sr__Director__Applied_AI_ML__Discovery__resps_nested_iter1.json
Advisor360__Senior_Product_Manager_-_AI_Analytics___Insights_resps_nested_iter1.json
Airtable_Product_Manager__AI_resps_nested_iter1.json
Amazon_Product_Manager__Artificial_General_Intelligence_-_Data_Services_resps_nested_iter1.json
Amazon_Research_Manager_-_Strategy_and_Insights_GCA_Marketing_resps_nested_iter1.json
Amazon_Sr__Generative_AI_Strategist__Generative_AI_Innovation_Center_resps_nested_iter1.json
Amazon_Web_Services__Inc__Senior_Manger__Partner_Strategy__GenAI_Innovation_Center_resps_nested_iter1.json
Amplitude_Marketing_Strategy___Analytics_Manager_resps_nested_iter1.json
Blend_Director__AI_Strategy_resps_nested_iter1.json
Capital_One_Director__AI_Platforms_resps_nested_iter1.json
Deloitte_AI_Data_Specialist_resps_nested_iter1.json
Deloitte_Global_Business_Services__GBS__Strategy_Manager_resps_nes

In [None]:
from models.resume_job_description_io_models import NestedResponsibilities

# Load and validate the JSON data
# file_name = "Blend_Director__AI_Strategy_resps_nested_iter1.json"
file_name = (
    "Accenture_Enterprise_AI_Value_Strategy_Senior_Manager_resps_nested_iter1.json"
)

for file in file_names:
    # file_name = "Advisor360__Senior_Product_Manager_-_AI_Analytics___Insights_resps_nested_iter1.json"
    file_path = RESPS_FILES_ITERATE_1_ANTHROPIC_DIR / file

    data = load_and_clean_json_file(file_path)
    validated_data = NestedResponsibilities(**data)

    # Compute the number of matched requirements per responsibility
    num_requirements_per_responsibility = {
        resp_key: len(resp.optimized_by_requirements)
        for resp_key, resp in validated_data.responsibilities.items()
    }

    # Display some insights
    most_matched_resp = max(
        num_requirements_per_responsibility,
        key=lambda k: num_requirements_per_responsibility[k],
    )

    least_matched_resp = min(
        num_requirements_per_responsibility,
        key=lambda k: num_requirements_per_responsibility[k],
    )

    print(f"File: {file}")
    print(f"Total Responsibilities: {len(num_requirements_per_responsibility)}")
    print(
        f"Most Matched Responsibility: {most_matched_resp} -> Matches: {num_requirements_per_responsibility[most_matched_resp]}"
    )
    print(
        f"Least Matched Responsibility: {least_matched_resp} -> Matches: {num_requirements_per_responsibility[least_matched_resp]}"
    )
    print()

# Find responsibilities with zero matches
no_match_resps = [
    resp_key
    for resp_key, count in num_requirements_per_responsibility.items()
    if count == 0
]
# print(f"Responsibilities with no matched requirements: {len(no_match_resps)}")

# matches_list = validated_data.responsibilities["2.responsibilities.7"]
# match_list = matches_list.model_dump()
# match_list
# # Distribution of matches
# import matplotlib.pyplot as plt

# plt.hist(num_requirements_per_responsibility.values(), bins=10, edgecolor="black")
# plt.xlabel("Number of Matched Requirements")
# plt.ylabel("Number of Responsibilities")
# plt.title("Distribution of Requirement Matches per Responsibility")
# plt.show()

File: Accenture_Enterprise_AI_Value_Strategy_Senior_Manager_resps_nested_iter1.json
Total Responsibilities: 30
Most Matched Responsibility: 0.responsibilities.0 -> Matches: 10
Least Matched Responsibility: 0.responsibilities.0 -> Matches: 10

File: Adobe_Sr__Director__Applied_AI_ML__Discovery__resps_nested_iter1.json
Total Responsibilities: 26
Most Matched Responsibility: 3.responsibilities.0 -> Matches: 6
Least Matched Responsibility: 0.responsibilities.0 -> Matches: 1

File: Advisor360__Senior_Product_Manager_-_AI_Analytics___Insights_resps_nested_iter1.json
Total Responsibilities: 30
Most Matched Responsibility: 0.responsibilities.0 -> Matches: 12
Least Matched Responsibility: 3.responsibilities.0 -> Matches: 7

File: Airtable_Product_Manager__AI_resps_nested_iter1.json
Total Responsibilities: 26
Most Matched Responsibility: 3.responsibilities.3 -> Matches: 5
Least Matched Responsibility: 0.responsibilities.0 -> Matches: 1

File: Amazon_Product_Manager__Artificial_General_Intelligen

#### Requirements

In [None]:
from utils.get_file_names import get_file_names

directory = REQS_FILES_ITERATE_1_ANTHROPIC_DIR

file_names = get_file_names(directory_path=directory)
print(f"Files in requirements dir: \n", ".\n".join(names for names in file_names))

Files in requirements dir: 
 Accenture_Enterprise_AI_Value_Strategy_Senior_Manager_reqs_flat_iter1.json.
Adobe_Sr__Director__Applied_AI_ML__Discovery__reqs_flat_iter1.json.
Advisor360__Senior_Product_Manager_-_AI_Analytics___Insights_reqs_flat_iter1.json.
Airtable_Product_Manager__AI_reqs_flat_iter1.json.
Amazon_Product_Manager__Artificial_General_Intelligence_-_Data_Services_reqs_flat_iter1.json.
Amazon_Research_Manager_-_Strategy_and_Insights_GCA_Marketing_reqs_flat_iter1.json.
Amazon_Sr__Generative_AI_Strategist__Generative_AI_Innovation_Center_reqs_flat_iter1.json.
Amazon_Web_Services__Inc__Senior_Manger__Partner_Strategy__GenAI_Innovation_Center_reqs_flat_iter1.json.
Amplitude_Marketing_Strategy___Analytics_Manager_reqs_flat_iter1.json.
Blend_Director__AI_Strategy_reqs_flat_iter1.json.
Capital_One_Director__AI_Platforms_reqs_flat_iter1.json.
Deloitte_AI_Data_Specialist_reqs_flat_iter1.json.
Deloitte_Global_Business_Services__GBS__Strategy_Manager_reqs_flat_iter1.json.
Deloitte_Mar

In [None]:
from models.resume_job_description_io_models import Requirements

# Load and validate the JSON data

# file_name = "Accenture_Enterprise_AI_Value_Strategy_Senior_Manager_reqs_flat_iter1.json"
# file_name = "Blend_Director__AI_Strategy_reqs_flat_iter1.json"
# file_name = 'Advisor360__Senior_Product_Manager_-_AI_Analytics___Insights_reqs_flat_iter1.json'

file_list = get_file_names(REQS_FILES_ITERATE_1_ANTHROPIC_DIR, True)
print(file_list)

for idx, file in enumerate(file_list, start=1):
    data = load_and_clean_json_file(file)
    validated_data = Requirements(**data)

    print(f"{idx}. {Path(file).name}")
    print(f"Number of requirements: {len(validated_data.requirements)}")

['C:\\github\\job_bot\\input_output\\evaluation_optimization\\evaluation_optimization_by_anthropic\\iteration_1\\requirements\\Accenture_Enterprise_AI_Value_Strategy_Senior_Manager_reqs_flat_iter1.json', 'C:\\github\\job_bot\\input_output\\evaluation_optimization\\evaluation_optimization_by_anthropic\\iteration_1\\requirements\\Adobe_Sr__Director__Applied_AI_ML__Discovery__reqs_flat_iter1.json', 'C:\\github\\job_bot\\input_output\\evaluation_optimization\\evaluation_optimization_by_anthropic\\iteration_1\\requirements\\Advisor360__Senior_Product_Manager_-_AI_Analytics___Insights_reqs_flat_iter1.json', 'C:\\github\\job_bot\\input_output\\evaluation_optimization\\evaluation_optimization_by_anthropic\\iteration_1\\requirements\\Airtable_Product_Manager__AI_reqs_flat_iter1.json', 'C:\\github\\job_bot\\input_output\\evaluation_optimization\\evaluation_optimization_by_anthropic\\iteration_1\\requirements\\Amazon_Product_Manager__Artificial_General_Intelligence_-_Data_Services_reqs_flat_iter1

### OpenAI Iterate 1

#### Mapping File

In [22]:
from utils.get_file_names import get_file_names
from pathlib import Path
from models.resume_job_description_io_models import JobFileMappings
from evaluation_optimization.create_mapping_file import (
    load_job_file_mappings_model,
)
from project_config import ITERATE_1_OPENAI_DIR, mapping_file_name

# from project_config import URL_TO_FILE_MAPPING_FILE_ITERATE_0_OPENAI
directory = ITERATE_1_OPENAI_DIR
mapping_file = directory / mapping_file_name
file_mapping_model = load_job_file_mappings_model(mapping_file)

print("Job URLs:")
print(f"Number of URLs: {len(file_mapping_model.root.keys())}")


for index, url in enumerate(file_mapping_model.root.keys(), start=1):
    print(f"{index}. {url}")

print("\n")
print(
    *(
        url
        for index, url in enumerate(file_mapping_model.root.keys(), start=1)
        if "glean" in str(url)
    ),
    sep="\n",
)

print([key for key in file_mapping_model.root.keys() if "tekeda" in str(key)])

2025-05-04 10:28:48,299 - utils.generic_utils - INFO - Loaded data from /home/xzhang/dev/job_bot/input_output/evaluation_optimization/evaluation_optimization_by_openai/iteration_1/url_to_file_mapping.json
2025-05-04 10:28:48,309 - utils.pydantic_model_loaders_from_files - INFO - Loaded and validated mapping file from /home/xzhang/dev/job_bot/input_output/evaluation_optimization/evaluation_optimization_by_openai/iteration_1/url_to_file_mapping.json


Job URLs:
Number of URLs: 38
1. https://www.google.com/about/careers/applications/jobs/results/113657145978692294-ai-market-intelligence-principal/?src=Online/LinkedIn/linkedin_us&utm_source=linkedin&utm_medium=jobposting&utm_campaign=contract&utm_medium=jobboard&utm_source=linkedin
2. https://www.capitalonecareers.com/job/-/-/234/66270465536?p_sid=ep3Sfxb&p_uid=sDBMWC5VxQ&source=rd_linkedin_job_posting_tm&ss=paid&utm_campaign=capone_all_jobs_24&utm_content=pj_board&utm_medium=jobad&utm_source=linkedin+slotted&dclid=CPGV3bef44gDFUEGTwgd4DoHPg
3. https://boards.greenhouse.io/embed/job_app?token=7600823002&gh_src=ab9f35b82
4. https://www.amazon.jobs/en/jobs/2696123/research-manager-strategy-and-insights-gca-marketing?cmpid=SPLICX0248M&utm_source=linkedin.com&utm_campaign=cxro&utm_medium=social_media&utm_content=job_posting&ss=paid
5. https://www.amazon.jobs/en/jobs/2742527/sr-generative-ai-strategist-generative-ai-innovation-center?cmpid=SPLICX0248M&utm_source=linkedin.com&utm_campaign=c

#### Responsibilities

In [22]:
from utils.get_file_names import get_file_names
from project_config import RESPS_FILES_ITERATE_1_OPENAI_DIR

directory = RESPS_FILES_ITERATE_1_OPENAI_DIR

file_names = get_file_names(directory_path=directory)

print(f"Responsibilities file names: \n" + "\n".join(name for name in file_names))

Responsibilities file names: 
Accenture_Enterprise_AI_Value_Strategy_Senior_Manager_resps_nested_iter1.json
Adobe_Sr__Director__Applied_AI_ML__Discovery__resps_nested_iter1.json
Advisor360__Senior_Product_Manager_-_AI_Analytics___Insights_resps_nested_iter1.json
Airtable_Product_Manager__AI_resps_nested_iter1.json
Amazon_Product_Manager__Artificial_General_Intelligence_-_Data_Services_resps_iter1.json
Amazon_Research_Manager_-_Strategy_and_Insights_GCA_Marketing_resps_iter1.json
Amazon_Sr__Generative_AI_Strategist__Generative_AI_Innovation_Center_resps_iter1.json
Amazon_Web_Services__Inc__Senior_Manger__Partner_Strategy__GenAI_Innovation_Center_resps_nested_iter1.json
Amplitude_Marketing_Strategy___Analytics_Manager_resps_iter1.json
Blend_Director__AI_Strategy_resps_nested_iter1.json
Boston_Scientific_Hybrid_Sr__Manager_Marketing_Analytics_Strategy_resps_nested_iter1.json
Capital_One_Director__AI_Platforms_resps_iter1.json
Deloitte_AI_Data_Specialist_resps_nested_iter1.json
Deloitte_Gl

In [77]:
from models.resume_job_description_io_models import NestedResponsibilities
from pydantic import ValidationError

# Load and validate the JSON data
# file_name = "Blend_Director__AI_Strategy_resps_nested_iter1.json"
file_name = (
    "Accenture_Enterprise_AI_Value_Strategy_Senior_Manager_resps_nested_iter1.json"
)

for file in file_names:
    # file_name = "Advisor360__Senior_Product_Manager_-_AI_Analytics___Insights_resps_nested_iter1.json"
    file_path = RESPS_FILES_ITERATE_1_OPENAI_DIR / file

    try:
        data = load_and_clean_json_file(file_path)
        validated_data = NestedResponsibilities(**data)

    except ValidationError as e:
        print(e)
        print(e.json)

    # Compute the number of matched requirements per responsibility
    num_requirements_per_responsibility = {
        resp_key: len(resp.optimized_by_requirements)
        for resp_key, resp in validated_data.responsibilities.items()
    }

    # Display some insights
    most_matched_resp = max(
        num_requirements_per_responsibility,
        key=lambda k: num_requirements_per_responsibility[k],
    )

    least_matched_resp = min(
        num_requirements_per_responsibility,
        key=lambda k: num_requirements_per_responsibility[k],
    )

    print(f"File: {file}")
    print(f"Total Responsibilities: {len(num_requirements_per_responsibility)}")
    print(
        f"Most Matched Responsibility: {most_matched_resp} -> Matches: {num_requirements_per_responsibility[most_matched_resp]}"
    )
    print(
        f"Least Matched Responsibility: {least_matched_resp} -> Matches: {num_requirements_per_responsibility[least_matched_resp]}"
    )
    print()

2025-03-21 13:27:28,380 - utils.generic_utils - INFO - Loaded data from C:\github\job_bot\input_output\evaluation_optimization\evaluation_optimization_by_openai\iteration_1\responsibilities\Accenture_Enterprise_AI_Value_Strategy_Senior_Manager_resps_nested_iter1.json
2025-03-21 13:27:28,385 - utils.generic_utils - INFO - Loaded data from C:\github\job_bot\input_output\evaluation_optimization\evaluation_optimization_by_openai\iteration_1\responsibilities\Adobe_Sr__Director__Applied_AI_ML__Discovery__resps_nested_iter1.json
2025-03-21 13:27:28,388 - utils.generic_utils - INFO - Loaded data from C:\github\job_bot\input_output\evaluation_optimization\evaluation_optimization_by_openai\iteration_1\responsibilities\Advisor360__Senior_Product_Manager_-_AI_Analytics___Insights_resps_nested_iter1.json
2025-03-21 13:27:28,391 - utils.generic_utils - INFO - Loaded data from C:\github\job_bot\input_output\evaluation_optimization\evaluation_optimization_by_openai\iteration_1\responsibilities\Airtabl

File: Accenture_Enterprise_AI_Value_Strategy_Senior_Manager_resps_nested_iter1.json
Total Responsibilities: 30
Most Matched Responsibility: 0.responsibilities.0 -> Matches: 10
Least Matched Responsibility: 0.responsibilities.0 -> Matches: 10

File: Adobe_Sr__Director__Applied_AI_ML__Discovery__resps_nested_iter1.json
Total Responsibilities: 30
Most Matched Responsibility: 0.responsibilities.0 -> Matches: 7
Least Matched Responsibility: 0.responsibilities.0 -> Matches: 7

File: Advisor360__Senior_Product_Manager_-_AI_Analytics___Insights_resps_nested_iter1.json
Total Responsibilities: 30
Most Matched Responsibility: 0.responsibilities.0 -> Matches: 1
Least Matched Responsibility: 0.responsibilities.0 -> Matches: 1

File: Airtable_Product_Manager__AI_resps_nested_iter1.json
Total Responsibilities: 30
Most Matched Responsibility: 0.responsibilities.0 -> Matches: 8
Least Matched Responsibility: 0.responsibilities.0 -> Matches: 8

1 validation error for NestedResponsibilities
url
  Field re

In [78]:
from utils.generic_utils import read_from_json_file

file = r"C:\github\job_bot\input_output\evaluation_optimization\evaluation_optimization_by_openai\iteration_1\responsibilities\Accenture_Enterprise_AI_Value_Strategy_Senior_Manager_resps_nested_iter1.json"
data = read_from_json_file(file)

# Compute the number of matched requirements per responsibility
num_requirements_per_responsibility = {
    resp_key: len(resp.optimized_by_requirements)
    for resp_key, resp in validated_data.responsibilities.items()
}

# Display some insights
most_matched_resp = max(
    num_requirements_per_responsibility,
    key=lambda k: num_requirements_per_responsibility[k],
)

least_matched_resp = min(
    num_requirements_per_responsibility,
    key=lambda k: num_requirements_per_responsibility[k],
)

print(f"File: {file}")
print(f"Total Responsibilities: {len(num_requirements_per_responsibility)}")
print(
    f"Most Matched Responsibility: {most_matched_resp} -> Matches: {num_requirements_per_responsibility[most_matched_resp]}"
)
print(
    f"Least Matched Responsibility: {least_matched_resp} -> Matches: {num_requirements_per_responsibility[least_matched_resp]}"
)
print()

2025-03-21 13:30:54,372 - utils.generic_utils - INFO - Loaded data from C:\github\job_bot\input_output\evaluation_optimization\evaluation_optimization_by_openai\iteration_1\responsibilities\Accenture_Enterprise_AI_Value_Strategy_Senior_Manager_resps_nested_iter1.json


File: C:\github\job_bot\input_output\evaluation_optimization\evaluation_optimization_by_openai\iteration_1\responsibilities\Accenture_Enterprise_AI_Value_Strategy_Senior_Manager_resps_nested_iter1.json
Total Responsibilities: 30
Most Matched Responsibility: 0.responsibilities.0 -> Matches: 1
Least Matched Responsibility: 0.responsibilities.0 -> Matches: 1



In [79]:
from utils.generic_utils import read_from_json_file

file = r"C:\github\job_bot\input_output\evaluation_optimization\evaluation_optimization_by_anthropic\iteration_1\requirements\Accenture_Enterprise_AI_Value_Strategy_Senior_Manager_reqs_flat_iter1.json"
data = read_from_json_file(file)

data

2025-03-21 13:31:40,518 - utils.generic_utils - INFO - Loaded data from C:\github\job_bot\input_output\evaluation_optimization\evaluation_optimization_by_anthropic\iteration_1\requirements\Accenture_Enterprise_AI_Value_Strategy_Senior_Manager_reqs_flat_iter1.json


{'url': 'https://www.accenture.com/us-en/careers/jobdetails?id=R00251798_en&src=LINKEDINJP',
 'requirements': {'0.pie_in_the_sky.0': 'Shape vision and create opportunities for data & AI led business reinvention.',
  '0.pie_in_the_sky.1': 'Create strategy for AI-first products and develop commercialization opportunities.',
  '1.down_to_earth.0': '5+ years of experience in business development, client relationship management, or marketing.',
  '1.down_to_earth.1': 'Proficiency in CRM tools such as Salesforce for tracking and analyzing client interactions.',
  '1.down_to_earth.2': 'Ability to build client relationships and credibility as a trusted advisor on how to infuse Data & AI into the business processes or functions.',
  '2.cultural_fit.0': 'Collaborative leadership style with a growth-oriented mindset.',
  '2.cultural_fit.1': 'Ability to mentor and develop high-performing teams.',
  '2.cultural_fit.2': 'Infuse Responsible AI in vision and roadmap, develop plan for leveraging ecosys

#### Requirements

In [4]:
from utils.get_file_names import get_file_names
from project_config import REQS_FILES_ITERATE_1_OPENAI_DIR

directory = REQS_FILES_ITERATE_1_OPENAI_DIR

file_names = get_file_names(directory_path=directory)

print(f"Responsibilities file names: \n" + "\n".join(name for name in file_names))

Responsibilities file names: 
Accenture_Enterprise_AI_Value_Strategy_Senior_Manager_reqs_flat_iter1.json
Adobe_Sr__Director__Applied_AI_ML__Discovery__reqs_flat_iter1.json
Advisor360__Senior_Product_Manager_-_AI_Analytics___Insights_reqs_flat_iter1.json
Airtable_Product_Manager__AI_reqs_flat_iter1.json
Amazon_Product_Manager__Artificial_General_Intelligence_-_Data_Services_reqs_iter1.json
Amazon_Research_Manager_-_Strategy_and_Insights_GCA_Marketing_reqs_iter1.json
Amazon_Sr__Generative_AI_Strategist__Generative_AI_Innovation_Center_reqs_iter1.json
Amazon_Web_Services__Inc__Senior_Manger__Partner_Strategy__GenAI_Innovation_Center_reqs_flat_iter1.json
Amplitude_Marketing_Strategy___Analytics_Manager_reqs_iter1.json
Blend_Director__AI_Strategy_reqs_flat_iter1.json
Boston_Scientific_Hybrid_Sr__Manager_Marketing_Analytics_Strategy_reqs_flat_iter1.json
Capital_One_Director__AI_Platforms_reqs_iter1.json
Deloitte_AI_Data_Specialist_reqs_flat_iter1.json
Deloitte_Global_Business_Services__GBS__

#### Sim Metrics

##### All sim metrics files

In [11]:
from utils.get_file_names import get_file_names
from project_config import SIMILARITY_METRICS_ITERATE_1_OPENAI_DIR

sim_dir = SIMILARITY_METRICS_ITERATE_1_OPENAI_DIR

get_file_names(sim_dir)

['Accenture_Enterprise_AI_Value_Strategy_Senior_Manager_sim_metrics_iter1.csv',
 'Airtable_Product_Manager__AI_sim_metrics_iter1.csv',
 'Amazon_Product_Manager__Artificial_General_Intelligence_-_Data_Services_sim_metrics_iter1.csv',
 'Amazon_Research_Manager_-_Strategy_and_Insights_GCA_Marketing_sim_metrics_iter1.csv',
 'Amazon_Sr__Generative_AI_Strategist__Generative_AI_Innovation_Center_sim_metrics_iter1.csv',
 'Amplitude_Marketing_Strategy___Analytics_Manager_sim_metrics_iter1.csv',
 'Boston_Scientific_Hybrid_Sr__Manager_Marketing_Analytics_Strategy_sim_metrics_iter1.csv',
 'Glean_Head_of_Competitive_Intelligence_sim_metrics_iter1.csv',
 'Google_AI_Market_Intelligence_Principal_sim_metrics_iter1.csv',
 'Liberty_Mutual_Insurance_Senior_Manager_I_-_Corporate_Strategy___Research_sim_metrics_iter1.csv',
 'Meta_Product_Strategy_Lead_sim_metrics_iter1.csv',
 'Microsoft_Head_of_Partner_Intelligence_and_Strategy_sim_metrics_iter1.csv',
 'PwC_Data___Analytics-_Senior_Manager_sim_metrics_iter

In [15]:
import pandas as pd


csv_file = (
    sim_dir
    / "Salesforce_Vice_President__Product_Research___Insights_sim_metrics_iter1.csv"
)

df = pd.read_csv(csv_file)

set(df.requirement_key)
display(df.head())

Unnamed: 0,job_posting_url,responsibility_key,responsibility,requirement_key,requirement,bert_score_precision,soft_similarity,word_movers_distance,deberta_entailment_score,roberta_entailment_score,...,word_movers_distance_cat,deberta_entailment_score_cat,roberta_entailment_score_cat,scaled_bert_score_precision,scaled_soft_similarity,scaled_word_movers_distance,scaled_deberta_entailment_score,scaled_roberta_entailment_score,composite_score,pca_score
0,https://salesforce.wd12.myworkdayjobs.com/Exte...,0.responsibilities.0,Conversational AI & NLP: Engineered a dynamic ...,0.pie_in_the_sky.0,Proven track record of delivering insight work...,0.467233,0.603541,0.646567,0.067588,0.411333,...,Low,Low,Medium,0.209335,0.591926,0.646567,0.065988,0.409323,0.444589,-0.496784
1,https://salesforce.wd12.myworkdayjobs.com/Exte...,0.responsibilities.0,Conversational AI & NLP Expertise: Engineered ...,0.pie_in_the_sky.1,Experience navigating complex organizational i...,0.573688,0.725133,0.842715,0.982928,0.982277,...,Low,High,High,0.489615,0.777499,0.842715,0.989075,0.988075,0.864535,0.622396
2,https://salesforce.wd12.myworkdayjobs.com/Exte...,0.responsibilities.0,"With extensive experience, engineered research...",1.down_to_earth.0,15+ years of experience leading research in pr...,0.49728,0.502337,0.399821,0.009457,0.012183,...,Low,Low,Low,0.288446,0.437469,0.399821,0.007365,0.004715,0.231911,-0.854664
3,https://salesforce.wd12.myworkdayjobs.com/Exte...,0.responsibilities.0,Conversational AI & NLP Expertise: Engineered ...,1.down_to_earth.1,Expertise in executing research in a business ...,0.44501,0.472049,0.661639,0.304574,0.631551,...,Low,Medium,Medium,0.150826,0.391243,0.661639,0.304979,0.632552,0.465715,-0.211311
4,https://salesforce.wd12.myworkdayjobs.com/Exte...,0.responsibilities.0,Conversational AI & NLP Leadership: Engineered...,1.down_to_earth.2,Strong leadership and management expertise wit...,0.519765,0.739129,0.83319,0.988553,0.988718,...,Low,High,High,0.347646,0.79886,0.83319,0.994747,0.994604,0.867839,0.626461


## Get File List

In [1]:
from utils.get_file_names import get_file_names
from project_config import (
    ITERATE_1_ANTHROPIC_DIR,
    SIMILARITY_METRICS_ITERATE_1_ANTHROPIC_DIR,
)

files_dir = SIMILARITY_METRICS_ITERATE_1_ANTHROPIC_DIR

file_list = get_file_names(files_dir, True)
file_list

['C:\\github\\job_bot\\input_output\\evaluation_optimization\\evaluation_optimization_by_anthropic\\iteration_1\\similarity_metrics\\Accenture_Enterprise_AI_Value_Strategy_Senior_Manager_sim_metrics_iter1.csv',
 'C:\\github\\job_bot\\input_output\\evaluation_optimization\\evaluation_optimization_by_anthropic\\iteration_1\\similarity_metrics\\Adobe_Sr__Director__Applied_AI_ML__Discovery__sim_metrics_iter1.csv',
 'C:\\github\\job_bot\\input_output\\evaluation_optimization\\evaluation_optimization_by_anthropic\\iteration_1\\similarity_metrics\\Advisor360__Senior_Product_Manager_-_AI_Analytics___Insights_sim_metrics_iter1.csv',
 'C:\\github\\job_bot\\input_output\\evaluation_optimization\\evaluation_optimization_by_anthropic\\iteration_1\\similarity_metrics\\Airtable_Product_Manager__AI_sim_metrics_iter1.csv',
 'C:\\github\\job_bot\\input_output\\evaluation_optimization\\evaluation_optimization_by_anthropic\\iteration_1\\similarity_metrics\\Amazon_Product_Manager__Artificial_General_Intell

## Cross Tab Heatmap

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import textwrap

# Load similarity metrics CSV
file_path = file_list[1]  # Replace with your actual file path
df = pd.read_csv(file_path)

# Pivot the data to match heatmap format
pivot_df = df.pivot(
    index="responsibility", columns="requirement", values="composite_score"
)

# Create the heatmap
fig, ax = plt.subplots(figsize=(20, 12))
cmap = sns.color_palette("coolwarm", as_cmap=True)
sns.heatmap(
    pivot_df,
    annot=False,
    fmt=".2f",
    cmap=cmap,
    linewidths=1,
    linecolor="black",
    cbar=True,
    ax=ax,
)


# Function to wrap text inside heatmap cells
def wrap_text(text, width=20):
    return "\n".join(textwrap.wrap(str(text), width))


# Wrap y-axis labels
wrapped_y_labels = [textwrap.fill(label, width=20) for label in pivot_df.index]
ax.set_yticklabels(wrapped_y_labels, rotation=0)

# Overlay text inside each cell (display composite score + wrapped requirement)
for i, res in enumerate(pivot_df.index):
    for j, req in enumerate(pivot_df.columns):
        match = df[(df["responsibility"] == res) & (df["requirement"] == req)]
        if not match.empty:
            score = match.iloc[0]["composite_score"]
            req_text = wrap_text(match.iloc[0]["requirement"], width=20)
            display_text = f"{score:.2f}\n{req_text}"
            ax.text(
                j + 0.5,
                i + 0.5,
                display_text,
                ha="center",
                va="center",
                fontsize=8,
                color="black",
            )

# Formatting adjustments
ax.set_title("Responsibility vs Requirement Matching Grid (Text Inside Cells)")
ax.set_xlabel("Requirements")
ax.set_ylabel("Responsibilities")

plt.xticks(rotation=45, ha="right")

# Increase left margin
plt.subplots_adjust(left=0.5)
# box = ax.get_position()
# ax.set_position([box.x0 + 0.2, box.y0, box.width, box.height])

plt.show()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import textwrap
import numpy as np

# Sample Responsibilities and Requirements
responsibilities = [
    "Led strategic initiatives for IT transformation",
    "Managed global vendor relationships",
    "Optimized business intelligence reporting",
    "Developed AI-driven analytics models",
    "Implemented cloud security protocols",
]

requirements = [
    "Experience in strategic IT leadership",
    "Vendor management expertise",
    "Business intelligence reporting experience",
    "AI and machine learning proficiency",
    "Cloud security best practices",
]

# Generate random similarity scores between 0.5 and 1.0
np.random.seed(42)
data = []
for res in responsibilities:
    for req in requirements:
        data.append(
            {
                "responsibility": res,
                "requirement": req,
                "composite_score": round(np.random.uniform(0.5, 1.0), 2),
            }
        )

# Convert to DataFrame
df_dummy = pd.DataFrame(data)

# Pivot table for heatmap
pivot_df = df_dummy.pivot(
    index="responsibility", columns="requirement", values="composite_score"
)

# Create the heatmap
fig, ax = plt.subplots(figsize=(12, 8))
cmap = sns.color_palette("coolwarm", as_cmap=True)  # Define color scheme

# Generate heatmap
sns.heatmap(
    pivot_df,
    annot=False,
    fmt=".2f",
    cmap=cmap,
    linewidths=1,
    linecolor="black",
    cbar=True,
    ax=ax,
)


# Function to wrap text inside heatmap cells
def wrap_text(text, width=20):
    return "\n".join(textwrap.wrap(str(text), width))


# Overlay text inside each cell (score + requirement)
for i, res in enumerate(pivot_df.index):
    for j, req in enumerate(pivot_df.columns):
        match = df_dummy[
            (df_dummy["responsibility"] == res) & (df_dummy["requirement"] == req)
        ]
        if not match.empty:
            score = match.iloc[0]["composite_score"]
            req_text = wrap_text(
                match.iloc[0]["requirement"], width=20
            )  # Wrap text for better display
            display_text = f"{score:.2f}\n{req_text}"  # Display similarity score + wrapped requirement text

            ax.text(
                j + 0.5,
                i + 0.5,
                display_text,
                ha="center",
                va="center",
                fontsize=8,
                color="black",
            )

# Formatting adjustments
ax.set_title("Dummy Responsibility vs Requirement Heatmap (Text Inside Cells)")
ax.set_xlabel("Requirements")
ax.set_ylabel("Responsibilities")
plt.xticks(rotation=45, ha="right")  # Rotate x-axis labels for better readability
plt.yticks(rotation=0)

# Show the plot
plt.show()

In [None]:
import altair as alt
import pandas as pd
import textwrap

# Load similarity metrics CSV
file_path = file_list[1]  # Replace with your actual file path
df = pd.read_csv(file_path)

# Create a wrapped version of the requirement text (width=20)
df["wrapped_requirement"] = df["requirement"].apply(
    lambda x: "\n".join(textwrap.wrap(str(x), width=20))
)
df["score_text"] = df["composite_score"].apply(lambda x: f"{x:.2f}")
df["label"] = df["score_text"] + "\n" + df["wrapped_requirement"]

# Build the heatmap chart
heatmap = (
    alt.Chart(df)
    .mark_rect()
    .encode(
        x=alt.X("requirement:N", title="Requirements", axis=alt.Axis(labelAngle=45)),
        y=alt.Y("responsibility:N", title="Responsibilities"),
        color=alt.Color(
            "composite_score:Q",
            scale=alt.Scale(scheme="redblue"),
            title="Composite Score",
        ),
    )
)

# Build the text overlay chart
# The key here is using the "detail" encoding so that each row is rendered individually.
text_overlay = (
    alt.Chart(df)
    .mark_text(
        fontSize=8,
        color="black",
        align="left",  # Set left alignment (change to 'center' if preferred)
        baseline="middle",
    )
    .encode(
        x=alt.X("requirement:N"),
        y=alt.Y("responsibility:N"),
        text=alt.Text("label:N"),
        detail="label:N",  # Force each label to be treated as a distinct detail
    )
)

# Combine the heatmap and text overlay
chart = (
    (heatmap + text_overlay)
    .properties(
        width=600,
        height=400,
        title="Responsibility vs Requirement Matching Grid (Text Inside Cells)",
    )
    .configure_view(strokeWidth=0)
)

chart.display()

# Make Cross Tab in Excel Instead

In [None]:
import pandas as pd
import argparse


def create_pivot_table(sim_metrics_csv, output_csv):
    """
    Reads the similarity metrics CSV and creates a pivot table:
      - Index: responsibility_key
      - Columns: requirement_key
      - Values: responsibility
    Then saves it as a new CSV file.
    """
    # Load CSV file
    df = pd.read_csv(sim_metrics_csv)
    # display(df.head(5))

    # Pivot table with responsibility_key as index, requirement_key as columns, and responsibility as values
    # Multi-index for columns: (requirement_key, requirement)
    pivot_table = df.pivot(
        index="responsibility_key",
        columns=["requirement_key", "requirement"],  # Multi-index for columns
        values=["responsibility", "composite_score"],  # Multi-values in pivot
    )

    # Fill missing values with empty string
    pivot_table = pivot_table.fillna("")

    # Save to CSV
    pivot_table.to_csv(output_csv)

    print(f"Pivot table saved to {output_csv}")

    display(pivot_table.head(10))


def main():
    # Define input and output file paths
    input_csv = r"C:\github\job_bot\input_output\evaluation_optimization\evaluation_optimization_by_openai\iteration_1\similarity_metrics\Microsoft_Head_of_Partner_Intelligence_and_Strategy_sim_metrics_iter1.csv"
    output_csv = (
        r"C:\github\job_bot\data\matching_examples\resp_vs_reqs_pivot_output_1.csv"
    )

    # Call the function
    create_pivot_table(input_csv, output_csv)


if __name__ == "__main__":
    main()

### With Color Fromatting

In [None]:
import pandas as pd
from openpyxl import load_workbook
from openpyxl.styles import PatternFill
from openpyxl.formatting.rule import ColorScaleRule


def create_pivot_table(sim_metrics_csv, output_excel):
    """
    Reads the CSV and creates a pivot table:
      - Index: responsibility_key
      - Columns: (requirement_key, requirement)
      - Values: responsibility, composite_score
    Then saves it as an Excel file and applies conditional formatting.
    """
    # Load CSV file
    df = pd.read_csv(sim_metrics_csv)

    # Format responsibility text based on composite_score
    def format_responsibility(row):
        if pd.isna(row["composite_score"]):  # Handle NaN values
            return row["responsibility"]
        elif row["composite_score"] >= 0.75:
            return f"⭐ {row['responsibility']}"  # Highlight important ones
        elif row["composite_score"] < 0.3:
            return f"❌ {row['responsibility']}"  # Mark low ones
        return row["responsibility"]

    df["formatted_responsibility"] = df.apply(format_responsibility, axis=1)

    # Create pivot table
    pivot_table = df.pivot_table(
        index="responsibility_key",
        columns=["requirement_key", "requirement"],
        values=["formatted_responsibility", "composite_score"],
        aggfunc="first",
    )

    pivot_table = pivot_table.fillna("")
    pivot_table.to_excel(output_excel)

    # Apply Conditional Formatting
    apply_conditional_formatting(output_excel)
    print(f"Pivot table saved and formatted at: {output_excel}")


def apply_conditional_formatting(excel_file):
    """Finds composite_score columns in the pivot and applies a color scale formatting."""
    wb = load_workbook(excel_file)
    ws = wb.active

    # Define a Color Scale Rule (Red - Yellow - Green)
    color_rule = ColorScaleRule(
        start_type="num",
        start_value=0,
        start_color="FF6347",  # Red
        mid_type="num",
        mid_value=0.5,
        mid_color="FFFF00",  # Yellow
        end_type="num",
        end_value=1,
        end_color="00FF00",  # Green
    )

    # Detect composite_score columns explicitly
    for col in range(2, ws.max_column + 1):  # Columns start at 2
        header = ws.cell(row=1, column=col).value  # Get column header
        if header and "composite_score" in str(header):  # Ensure it's a valid column
            col_letter = ws.cell(row=1, column=col).column_letter
            ws.conditional_formatting.add(
                f"{col_letter}2:{col_letter}{ws.max_row}", color_rule
            )

    wb.save(excel_file)
    print("✅ Conditional formatting applied successfully!")


def main():
    input_csv = r"C:\github\job_bot\input_output\evaluation_optimization\evaluation_optimization_by_openai\iteration_1\similarity_metrics\Microsoft_Head_of_Partner_Intelligence_and_Strategy_sim_metrics_iter1.csv"
    output_excel = (
        r"C:\github\job_bot\data\matching_examples\resp_vs_reqs_pivot_output_1.xlsx"
    )

    create_pivot_table(input_csv, output_excel)


if __name__ == "__main__":
    main()

#### With Xlwings

In [None]:
import pandas as pd
import xlwings as xw


def create_pivot_table(sim_metrics_csv, output_excel):
    """
    Reads the CSV and creates a pivot table:
      - Index: responsibility_key
      - Columns: (requirement_key, requirement)
      - Values: responsibility, composite_score
    Then saves it as an Excel file and applies conditional formatting using xlwings.
    """
    # Load CSV file
    df = pd.read_csv(sim_metrics_csv)

    # Format responsibility text based on composite_score
    def format_responsibility(row):
        if pd.isna(row["composite_score"]):  # Handle NaN values
            return row["responsibility"]
        elif row["composite_score"] >= 0.75:
            return f"{row['responsibility']}"  # Highlight important ones
        elif row["composite_score"] < 0.3:
            return f"{row['responsibility']}"  # Mark low ones
        return row["responsibility"]

    df["formatted_responsibility"] = df.apply(format_responsibility, axis=1)

    # Create pivot table
    pivot_table = df.pivot_table(
        index="responsibility_key",
        columns=["requirement_key", "requirement"],
        values=["formatted_responsibility", "composite_score"],
        aggfunc="first",
    )

    pivot_table = pivot_table.fillna("")
    pivot_table.to_excel(output_excel)

    # Apply Conditional Formatting with xlwings
    apply_xlwings_formatting(output_excel)
    print(f"Pivot table saved and formatted at: {output_excel}")


def apply_xlwings_formatting(excel_file):
    """Applies conditional formatting to value cells (not headers) based on their composite_score."""
    app = xw.App(visible=True)  # Keep Excel open for debugging
    wb = xw.Book(excel_file)
    ws = wb.sheets[0]

    # Detect last row and last column
    last_row = ws.range("A1").expand("down").last_cell.row
    last_col = ws.range("A1").expand("right").last_cell.column

    # Iterate through all data cells (excluding headers)
    for row in range(2, last_row + 1):  # Start from row 2 to avoid header
        for col in range(2, last_col + 1):  # Start from col 2 to avoid row labels
            cell = ws.cells(row, col)
            try:
                value = float(cell.value)  # Convert value to float
                if value >= 0.75:
                    cell.api.Interior.Color = xw.utils.rgb_to_int(
                        (0, 255, 0)
                    )  # Green for high scores
                elif value < 0.3:
                    cell.api.Interior.Color = xw.utils.rgb_to_int(
                        (255, 0, 0)
                    )  # Red for low scores
                else:
                    cell.api.Interior.Color = xw.utils.rgb_to_int(
                        (255, 255, 0)
                    )  # Yellow for mid-range scores
            except (ValueError, TypeError):
                pass  # Ignore non-numeric values

    wb.save()
    wb.close()
    app.quit()
    print("✅ Conditional formatting applied to value cells!")


def main():
    input_csv = r"C:\github\job_bot\input_output\evaluation_optimization\evaluation_optimization_by_openai\iteration_1\similarity_metrics\Microsoft_Head_of_Partner_Intelligence_and_Strategy_sim_metrics_iter1.csv"
    output_excel = (
        r"C:\github\job_bot\data\matching_examples\resp_vs_reqs_pivot_output_1.xlsx"
    )

    create_pivot_table(input_csv, output_excel)


if __name__ == "__main__":
    main()

In [None]:
input_csv = r"C:\github\job_bot\input_output\evaluation_optimization\evaluation_optimization_by_anthropic\iteration_1\responsibilities\older_files\PwC_Strategy__Manager_-_Digital_Value_Transformation_Contact_Center_resps_nested_iter1.json"

with open(input_csv, "r", encoding="utf-8") as f:
    for _ in range(30):  # Print first 30 lines
        print(f.readline().strip())

In [None]:
#!/usr/bin/env python3
import pandas as pd
import argparse
import os
import xlsxwriter


def create_two_row_header_excel(sim_metrics_csv, output_file):
    """
    Reads the similarity metrics CSV and creates an Excel file with:
      - Row 1: "Resp Key / Req Key" + requirement keys
      - Row 2: "Requirements" + requirement texts
      - Rows 3+: One row per responsibility key, showing matched responsibility texts
    """
    # 1) Load the similarity metrics CSV
    df = pd.read_csv(sim_metrics_csv)

    # 2) Extract unique requirements and map to their texts
    req_map = df.groupby("requirement_key")["requirement"].first().to_dict()
    req_keys = sorted(req_map.keys())  # Ordered list of requirement keys
    req_texts = [req_map[k] for k in req_keys]  # Corresponding requirement texts

    # 3) Extract unique responsibilities
    resp_keys = sorted(df["responsibility_key"].unique())

    # 4) Create a dataframe to ensure all `requirement_keys` appear
    full_pivot = pd.DataFrame(index=resp_keys, columns=req_keys).fillna("")

    # 5) Pivot the table to get optimized_text per (responsibility, requirement) pair
    pivot = df.pivot(
        index="responsibility_key", columns="requirement_key", values="responsibility"
    )

    # 6) Merge the pivoted data into `full_pivot` to retain all columns
    full_pivot.update(pivot)

    # 7) Reset index so responsibility_key becomes a column
    full_pivot = full_pivot.reset_index()

    # 8) Prepare the first two header rows (Multi-layer Headers)
    header1 = ["Resp Key / Req Key"] + req_keys  # First row (Keys)
    header2 = ["Requirements"] + req_texts  # Second row (Descriptions)

    # 9) Write to Excel using xlsxwriter (Multi-layer Headers)
    workbook = xlsxwriter.Workbook(output_file)
    worksheet = workbook.add_worksheet("CrossTab")

    # Apply formatting
    bold_format = workbook.add_format(
        {"bold": True, "bg_color": "#002b36", "font_color": "white"}
    )
    wrap_format = workbook.add_format({"text_wrap": True, "align": "top"})

    # Merge header rows for multi-layer effect
    worksheet.write_row(0, 0, header1, bold_format)  # Row 1: Requirement Keys
    worksheet.write_row(1, 0, header2, wrap_format)  # Row 2: Requirement Texts

    # Write responsibilities data (row 3+)
    for row_idx, row in enumerate(full_pivot.itertuples(index=False), start=2):
        worksheet.write_row(row_idx, 0, row, wrap_format)

    # Adjust column widths for readability
    worksheet.set_column(0, 0, 25)  # Responsibility Key column
    worksheet.set_column(1, len(req_keys), 50)  # Requirement columns

    workbook.close()
    print(f"Excel file created: {output_file}")


def main():
    parser = argparse.ArgumentParser(
        description="Create a 2-row-header Excel from similarity metrics CSV."
    )
    parser.add_argument(
        "--sim_metrics_csv",
        required=True,
        help="Path to the similarity metrics CSV file",
    )
    parser.add_argument(
        "--output",
        required=True,
        help="Path to the output Excel file (e.g., output.xlsx)",
    )
    args = parser.parse_args()

    # Create the output directory if it doesn't exist
    output_dir = os.path.dirname(args.output)
    if output_dir and not os.path.exists(output_dir):
        os.makedirs(output_dir)

    create_two_row_header_excel(args.sim_metrics_csv, args.output)


if __name__ == "__main__":
    # Example file paths (adjust these as needed)
    input_csv = r"C:\github\job_bot\input_output\evaluation_optimization\evaluation_optimization_by_anthropic\iteration_1\similarity_metrics\Thermo_Fisher_Scientific_Market___Competitive_Intelligence_Manager_sim_metrics_iter1.csv"
    output_excel = (
        r"C:\github\job_bot\data\matching_examples\resp_vs_reqs_crosstab_output_1.xlsx"
    )

    # Directly call the function with desired column names:
    create_two_row_header_excel(
        sim_metrics_csv=input_csv,
        output_file=output_excel,
    )

In [None]:
from pathlib import Path
import pandas as pd


def create_cross_tab(sim_metric_csv_file: Path, output_excel_file: Path):
    # Load the CSV file
    df = pd.read_csv(sim_metric_csv_file)

    # Create a pivot table (cross-tab) based on responsibility_key and requirement_key
    cross_tab = pd.pivot_table(
        df,
        values="responsibility",
        index="responsibility_key",
        columns="requirement_key",
        aggfunc=lambda x: " ".join(x),
    )

    # Extract unique requirements and their corresponding keys
    requirements = (
        df[["requirement_key", "requirement"]]
        .drop_duplicates()
        .set_index("requirement_key")["requirement"]
    )

    # Create a DataFrame for the requirements row with the same columns as cross_tab
    requirements_row = pd.DataFrame([requirements], columns=cross_tab.columns)

    # Combine the requirements row with the cross-tab table
    cross_tab_with_requirements = pd.concat([requirements_row, cross_tab], axis=0)

    # Reset the index to make the table cleaner
    cross_tab_with_requirements.reset_index(drop=True, inplace=True)

    # Save the cross-tab table with requirements to an Excel file
    cross_tab_with_requirements.to_excel(output_excel_file, index=False)

    print(f"Cross-tab table with requirements saved to {output_excel_file}")


# Input and output file paths
input_csv = r"C:\github\job_bot\input_output\evaluation_optimization\evaluation_optimization_by_anthropic\iteration_1\similarity_metrics\Thermo_Fisher_Scientific_Market___Competitive_Intelligence_Manager_sim_metrics_iter1.csv"
output_excel = (
    r"C:\github\job_bot\data\matching_examples\resp_vs_reqs_crosstab_output_1.xlsx"
)

# Create the cross-tab table
create_cross_tab(Path(input_csv), Path(output_excel))

In [None]:
responsibilities="{'0.responsibilities.0': ResponsibilityMatch(optimized_by_requirements={'1.down_to_earth.0': OptimizedText(optimized_text='Led strategic initiatives to optimize the service partner network for a prominent international IT company in the Asia Pacific market, resulting in enhanced local execution outcomes. Possess extensive experience in analytical roles.'), '1.down_to_earth.1': OptimizedText(optimized_text="Led the optimization of a major global IT vendor's service partner ecosystem in the Asia Pacific region, resulting in improved local implementation outcomes. Leveraged extensive client-facing experience to drive these strategic enhancements."), '1.down_to_earth.2': OptimizedText(optimized_text='Led strategic consulting and analytics initiatives for a leading global IT vendor, driving enhancements to their partner ecosystem in the Asia Pacific region and delivering improved local implementation outcomes.'), '1.down_to_earth.3': OptimizedText(optimized_text="Led the optimization of a major global IT vendor's service partner ecosystem in the Asia Pacific region, driving improved local implementation results."), '2.other.0': OptimizedText(optimized_text='Led strategic initiatives that optimized the service partner network of a leading global IT vendor in the Asia Pacific region, resulting in enhanced local implementation and improved client outcomes.')}), '0.responsibilities.1': ResponsibilityMatch(optimized_by_requirements={'1.down_to_earth.0': OptimizedText(optimized_text='Led the strategic growth of a leading international services provider by identifying and scaling new engineering service opportunities in key emerging markets.'), '1.down_to_earth.1': OptimizedText(optimized_text="Led the evaluation and scaling of new engineering service opportunities in vital emerging markets to support a leading international services provider's growth strategy."), '1.down_to_earth.2': OptimizedText(optimized_text='Led strategic analysis to identify and capitalize on new engineering service opportunities in key emerging markets, driving growth for a leading international services provider.'), '1.down_to_earth.3': OptimizedText(optimized_text='Led the evaluation and scaling of new engineering service opportunities in vital emerging markets to support the growth strategy of a U.S.-based international services provider.'), '2.other.0': OptimizedText(optimized_text='Led the expansion strategy for a leading international services provider by identifying and scaling new engineering service opportunities in key emerging markets.')}), '0.responsibilities.2': ResponsibilityMatch(optimized_by_requirements={'1.down_to_earth.0': OptimizedText(optimized_text='Authored impactful industry reports analyzing engineering services merger and acquisition trends, providing strategic insights into deal sizes, capability gaps, and emerging opportunities to inform decisions on IT and operational technology convergence.'), '1.down_to_earth.1': OptimizedText(optimized_text='Led the co-authorship of an industry-recognized report on M&A trends in the engineering services sector, providing strategic insights into deal sizes, capability gaps, and emerging opportunities to guide decision-making on IT and operational technology convergence.'), '1.down_to_earth.2': OptimizedText(optimized_text='Authored insightful industry reports analyzing mergers and acquisitions in the engineering services sector. Provided comprehensive insights into deal sizes, capability gaps, and emerging opportunities, informing strategic decisions on IT and operational technology convergence.'), '1.down_to_earth.3': OptimizedText(optimized_text='Led the development of an industry-recognized report on mergers and acquisitions in the engineering services sector, delivering in-depth analysis of deal dynamics, capability gaps, and emerging opportunities at the intersection of IT and operational technology. Leveraged these insights to drive strategic planning and execution.'), '2.other.0': OptimizedText(optimized_text='Authored a comprehensive industry report on mergers and acquisitions in the engineering services sector, providing in-depth analysis of deal sizes, capability gaps, and emerging opportunities. The report informed strategic decisions regarding the convergence of information technology and operational technology.')}), '0.responsibilities.3': ResponsibilityMatch(optimized_by_requirements={'1.down_to_earth.0': OptimizedText(optimized_text='Led efforts to enhance data quality and consistency through thorough financial analysis, standardized methodologies, and collaborative vendor engagements.'), '1.down_to_earth.1': OptimizedText(optimized_text='Drove the enhancement of data quality and consistency by integrating thorough financial analysis, standardizing methodologies, and conducting in-depth vendor engagements.'), '1.down_to_earth.2': OptimizedText(optimized_text='Led consultative analytics engagements, leveraging financial analysis, standardized methodologies, and vendor collaborations to enhance data quality and consistency.'), '1.down_to_earth.3': OptimizedText(optimized_text='Transformed data quality and consistency by integrating financial analysis, standardizing methodologies, and engaging vendors.'), '2.other.0': OptimizedText(optimized_text='Drove impactful improvements in data quality and consistency by integrating thorough financial analysis, standardizing methodologies, and conducting in-depth vendor engagements.')}), '0.responsibilities.4': ResponsibilityMatch(optimized_by_requirements={'1.down_to_earth.0': OptimizedText(optimized_text='Led the optimization of resource allocation through centralization of tasks, transitioning a significant portion of work to an offshore team in India, which resulted in increased efficiency and cost savings.'), '1.down_to_earth.1': OptimizedText(optimized_text='Led offshore teams, optimized resource utilization, and enhanced operational efficiency. Extensive client-facing experience.'), '1.down_to_earth.2': OptimizedText(optimized_text='Led the centralization of over 40% of tasks to an offshore team in India, optimizing resource allocation and driving significant improvements in team productivity and efficiency.'), '1.down_to_earth.3': OptimizedText(optimized_text='Centralized over 40% of tasks to an offshore team, optimizing resource allocation for enhanced efficiency.'), '2.other.0': OptimizedText(optimized_text='Centralized over 40% of tasks to an offshore team, optimizing resource allocation and supporting product development through client services feedback.')}), '0.responsibilities.5': ResponsibilityMatch(optimized_by_requirements={'1.down_to_earth.0': OptimizedText(optimized_text='Automated and streamlined internal processes using Python, driving over 40% improvements in report preparation and data analysis.'), '1.down_to_earth.1': OptimizedText(optimized_text='Led the development of custom Python tools that automated and optimized internal workflows, resulting in a 40% decrease in report generation and data analysis time. Leveraged extensive experience collaborating with clients to deliver tailored solutions.'), '1.down_to_earth.2': OptimizedText(optimized_text='Developed custom Python tools that streamlined and accelerated internal processes, delivering over 40% reduction in report preparation and data analysis time. Led consultative analytics engagements with clients.'), '1.down_to_earth.3': OptimizedText(optimized_text='Led the development of custom Python tools that streamlined and accelerated internal processes, driving significant improvements in efficiency across report preparation and data analysis.'), '2.other.0': OptimizedText(optimized_text='Leveraged advanced Python programming skills to create custom tools that streamlined internal operations, driving over 40% improvements in report preparation and data analysis efficiency.')}), '0.responsibilities.6': ResponsibilityMatch(optimized_by_requirements={'1.down_to_earth.0': OptimizedText(optimized_text='Analytical leader with a proven track record of pioneering groundbreaking industry research and publications. Collaborated extensively with engineering services teams to develop market forecasts, analyze the impact of COVID-19, and identify emerging trends in mergers and acquisitions within the engineering services sector.'), '1.down_to_earth.1': OptimizedText(optimized_text='Led the engineering services research team in pioneering the engineering services tracker, authored impactful publications on market forecasts, the impact of COVID-19 on services, and trends in mergers and acquisitions within the engineering services industry. Demonstrated substantial client-facing experience.'), '1.down_to_earth.2': OptimizedText(optimized_text='Led the development of the engineering services tracker and authored influential publications on market forecasts, the impact of COVID-19 on services, and trends in M&A within the engineering services industry.'), '1.down_to_earth.3': OptimizedText(optimized_text='Pioneered industry-leading engineering services tracker and authored impactful publications on market forecasts, COVID-19 impact, and engineering services M&A trends. Demonstrated ability to effectively manage complex projects and deliver valuable insights to stakeholders.'), '2.other.0': OptimizedText(optimized_text='Led engineering services research team to pioneer engineering services tracker, authored influential publications on market forecasts, the impact of COVID-19 on services, and industry trends. Leveraged these insights to drive product development through client services feedback.')}), '0.responsibilities.7': ResponsibilityMatch(optimized_by_requirements={'1.down_to_earth.0': OptimizedText(optimized_text='Seasoned data analyst who led global teams (US, Canada, Latin America, Europe, MEA, APAC) to maintain data integrity, achieve objectives, and share expertise. Adept at identifying and implementing innovative analytical tools and techniques.'), '1.down_to_earth.1': OptimizedText(optimized_text='Led collaborative efforts with global analyst teams to ensure data quality, meet deadlines, share knowledge, and implement best practices and new tools.'), '1.down_to_earth.2': OptimizedText(optimized_text='Led consultative analytics engagements with global analyst teams. Ensured data quality, met deadlines, and shared knowledge, best practices, and methodology to procure new tools.'), '1.down_to_earth.3': OptimizedText(optimized_text='Collaborated with global analyst teams to ensure data quality, meet deadlines, share knowledge, implement best practices, and procure new tools.'), '2.other.0': OptimizedText(optimized_text='Led international analyst teams to maintain data integrity, achieve project milestones, exchange expertise and best practices, and acquire innovative tools in support of product development initiatives.')}), '1.responsibilities.0': ResponsibilityMatch(optimized_by_requirements={'1.down_to_earth.0': OptimizedText(optimized_text='Seasoned leader with a proven record in full P&L oversight, including budgeting, HR, vendor relations, partnerships, research, and business development. Drove significant growth, expanding program bookings by over 50%. Extensive experience in analytical roles, with a strong foundation in data-driven decision-making and strategic planning.'), '1.down_to_earth.1': OptimizedText(optimized_text='Skilled leader with a proven track record in full P&L management, overseeing budgeting, HR, vendor relations, partnerships, research, and business development. Drove significant growth, expanding program bookings by over 50%. Excelled in client-facing roles, delivering exceptional service and driving impactful business results.'), '1.down_to_earth.2': OptimizedText(optimized_text='Led full P&L management, including budgeting, HR, vendor relationships, partnerships, research, and business development. Drove significant expansion, growing program bookings by over 50%.'), '1.down_to_earth.3': OptimizedText(optimized_text='Adept leader with a proven record of full P&L management, overseeing diverse responsibilities including budgeting, human resources, vendor relationships, strategic partnerships, research, and business development. Drove significant growth, expanding program bookings by over 50%, showcasing exceptional project management skills.'), '2.other.0': OptimizedText(optimized_text='Led full profit and loss responsibilities, excelling at budgeting, human resources, vendor relationships, partnerships, research, and business development. Drove significant growth, increasing program bookings by over 50%.')}), '1.responsibilities.1': ResponsibilityMatch(optimized_by_requirements={'1.down_to_earth.0': OptimizedText(optimized_text='Directed and expanded diverse global research teams, leading a global team of over 15 professionals across the US, India, and Mexico.'), '1.down_to_earth.1': OptimizedText(optimized_text='Led a global research team, leveraging diverse perspectives to drive innovation across multiple locations.'), '1.down_to_earth.2': OptimizedText(optimized_text='Led and grew a diverse global research team spanning multiple locations.'), '1.down_to_earth.3': OptimizedText(optimized_text='Led and managed diverse, global research teams to drive successful project outcomes.'), '2.other.0': OptimizedText(optimized_text='Led and expanded a diverse, global research team across strategic locations, enabling informed product development through client feedback.')}), '1.responsibilities.2': ResponsibilityMatch(optimized_by_requirements={'1.down_to_earth.0': OptimizedText(optimized_text='Managed cross-functional teams to deliver innovative software solutions.'), '1.down_to_earth.1': OptimizedText(optimized_text='Managed cross-functional teams to deliver innovative software solutions for clients.'), '1.down_to_earth.2': OptimizedText(optimized_text='Spearheaded cross-functional teams to ideate, build, and deploy innovative software solutions that addressed client needs. Led an external software development team to build and implement new tools.'), '1.down_to_earth.3': OptimizedText(optimized_text='Spearheaded the development and implementation of innovative software tools and solutions by leading an external software development team.'), '2.other.0': OptimizedText(optimized_text='Collaborated with an external software development team to build and implement new tools that enhanced product development efforts.')}), '1.responsibilities.3': ResponsibilityMatch(optimized_by_requirements={'1.down_to_earth.0': OptimizedText(optimized_text='Experienced leader who championed cutting-edge technology initiatives, including machine learning, natural language processing, chatbots, ontologies, web scraping, APIs, and user experience design. Demonstrated strong analytical skills with a data-driven approach honed over 8+ years.'), '1.down_to_earth.1': OptimizedText(optimized_text='Led impactful and innovative technology projects leveraging cutting-edge tools like machine learning, natural language processing, chatbots, ontologies, web scraping, APIs, and user experience design. Collaborated extensively with stakeholders to deliver tailored solutions that exceeded expectations.'), '1.down_to_earth.2': OptimizedText(optimized_text='Led innovative technology initiatives that leveraged cutting-edge tools and techniques, including machine learning, natural language processing, chatbots, ontologies, web scraping, APIs, and user experience design.'), '1.down_to_earth.3': OptimizedText(optimized_text='Led the successful implementation of cutting-edge technology initiatives, including machine learning, natural language processing, chatbots, ontology development, web scraping, API integration, and user experience design. Extensive experience in managing complex technology projects and delivering innovative solutions that drive business growth.'), '2.other.0': OptimizedText(optimized_text='Spearheaded innovative product development and enhanced user experience by leveraging cutting-edge technologies, including machine learning, natural language processing, chatbots, ontologies, web scraping, and APIs.')}), '1.responsibilities.4': ResponsibilityMatch(optimized_by_requirements={'1.down_to_earth.0': OptimizedText(optimized_text='Developed and led a team to create Python-based automated tools that streamlined report preparation, driving 40% time savings.'), '1.down_to_earth.1': OptimizedText(optimized_text='Drove development of automated Python tools, reducing report preparation time by 40%.'), '1.down_to_earth.2': OptimizedText(optimized_text='Developed and implemented automated Python tools, driving a 40% reduction in report preparation time.'), '1.down_to_earth.3': OptimizedText(optimized_text='Adept project manager who led teams in developing custom Python-based tools, enhancing reporting efficiency by 40%.'), '2.other.0': OptimizedText(optimized_text='Developed Python-based automated solutions that streamlined report generation and enhanced overall operational efficiency.')}), '1.responsibilities.5': ResponsibilityMatch(optimized_by_requirements={'1.down_to_earth.0': OptimizedText(optimized_text='Pioneered cutting-edge technologies, leading the implementation of machine learning, chatbots, APIs, and ontology development. Excels in analytical roles, delivering impactful solutions that drive business success.'), '1.down_to_earth.1': OptimizedText(optimized_text='Pioneered cutting-edge technology projects, including deploying machine learning, chatbots, APIs, and ontology development. Delivered client-focused solutions with a proven track record.'), '1.down_to_earth.2': OptimizedText(optimized_text='Pioneering technology executive with a track record of leading transformative initiatives, including the implementation of cutting-edge solutions such as machine learning, chatbots, APIs, and ontology development. Excels at driving strategic analytics engagements and delivering impactful consultative services to clients.'), '1.down_to_earth.3': OptimizedText(optimized_text='Innovative technology leader who pioneered cutting-edge solutions including machine learning, chatbots, APIs, and ontology development. Skilled at driving high-impact projects and delivering measurable results.'), '2.other.0': OptimizedText(optimized_text='Led the implementation of machine learning, launch of a chatbot, development of APIs, and construction of an ontology to drive innovation with emerging technologies. Collaborated closely with clients to provide valuable feedback that informed and supported ongoing product development efforts.')}), '1.responsibilities.6': ResponsibilityMatch(optimized_by_requirements={'1.down_to_earth.0': OptimizedText(optimized_text='Leveraged analytical expertise to advise services firms on deal pursuit and sales orchestration strategies, driving business development and sales execution.'), '1.down_to_earth.1': OptimizedText(optimized_text='Guided professional services firms in developing their deal pursuit and sales strategy.'), '1.down_to_earth.2': OptimizedText(optimized_text='Advised services firms on deal pursuit and sales orchestration strategies, providing strategic guidance and expertise.'), '1.down_to_earth.3': OptimizedText(optimized_text='Proven leader who advised services firms on developing and executing effective deal pursuit and sales strategies.'), '2.other.0': OptimizedText(optimized_text='Drove deal pursuit and sales orchestration strategies for professional services firms, leveraging client insights to inform product development.')}), '1.responsibilities.7': ResponsibilityMatch(optimized_by_requirements={'1.down_to_earth.0': OptimizedText(optimized_text='Advised software vendors on partnership strategy, leveraging extensive analytical expertise to provide strategic advisory services.'), '1.down_to_earth.1': OptimizedText(optimized_text='Advised software vendors on strategic partnerships and delivered client-facing services.'), '1.down_to_earth.2': OptimizedText(optimized_text='Guided software vendors on partnership opportunities and drove consultative analytics engagements with clients.'), '1.down_to_earth.3': OptimizedText(optimized_text='Guided software vendors on partnership strategy, leveraging extensive experience to deliver strategic guidance and drive successful initiatives.'), '2.other.0': OptimizedText(optimized_text='Guided software vendors on strategic services partnerships, driving successful client engagements.')}), '1.responsibilities.8': ResponsibilityMatch(optimized_by_requirements={'1.down_to_earth.0': OptimizedText(optimized_text='Prolific content creator who authored reports, blogs, presentations, and custom research. Leveraged industry insights to drive strategic decision-making, analyzing go-to-market strategies, deal signing, renewal analysis, buyer studies, and technology trends (cloud, AI, ML, digital, etc.). Demonstrated extensive experience in an analytical role.'), '1.down_to_earth.1': OptimizedText(optimized_text='Prolific author of reports, blogs, presentations, and custom research. Adept at analyzing go-to-market strategies, deal signing, renewal trends, buyer behavior, and the adoption of emerging technologies such as cloud, AI, ML, and digital solutions. Proven track record of providing valuable industry insights and trend analysis to clients.'), '1.down_to_earth.2': OptimizedText(optimized_text='Seasoned professional who authors reports, blogs, presentations, and custom research. Adept at analyzing go-to-market strategies, deal signing, renewal trends, buyer behavior, and technology adoptions (cloud, AI, ML, digital, etc.), as well as identifying industry trends. Skilled in delivering consultative analytics engagements to clients.'), '1.down_to_earth.3': OptimizedText(optimized_text='Authored impactful reports, blogs, presentations, and custom research on go-to-market strategy, deal analysis, buyer studies, and industry trends. Leveraged emerging technologies like cloud, AI, and ML to deliver actionable insights.'), '2.other.0': OptimizedText(optimized_text='Accomplished professional who has authored impactful reports, blogs, presentations, and custom research projects. Expertise spans developing go-to-market strategies, conducting deal and renewal analyses, executing buyer studies, and analyzing technology adoption trends (e.g., cloud, AI, ML, digital). Regularly provided valuable insights and feedback to drive product development efforts.')}), '2.responsibilities.0': ResponsibilityMatch(optimized_by_requirements={'1.down_to_earth.0': OptimizedText(optimized_text='Led quarterly webinars analyzing industry trends in outsourcing and managed services.'), '1.down_to_earth.1': OptimizedText(optimized_text='Conducted quarterly webinars on outsourcing and managed services trends.'), '1.down_to_earth.2': OptimizedText(optimized_text='Led quarterly webinar series to present industry insights and best practices to clients.'), '1.down_to_earth.3': OptimizedText(optimized_text='Seasoned professional who has delivered quarterly webinars showcasing industry insights and best practices on outsourcing and managed services signing trends.'), '2.other.0': OptimizedText(optimized_text='Conducted quarterly webinars to share industry insights and client feedback, driving product development initiatives.')}), '2.responsibilities.1': ResponsibilityMatch(optimized_by_requirements={'1.down_to_earth.0': OptimizedText(optimized_text='Authored insightful pursuit strategy reports and industry trend research.'), '1.down_to_earth.1': OptimizedText(optimized_text='Authored compelling pursuit strategy reports and conducted in-depth industry trend research, leveraging a strong background in client-facing roles.'), '1.down_to_earth.2': OptimizedText(optimized_text='Led consultative analytics engagements and produced industry-leading research reports.'), '1.down_to_earth.3': OptimizedText(optimized_text='Authored strategic planning and industry analysis reports, demonstrating a proven track record in project management.'), '2.other.0': OptimizedText(optimized_text='Drove product development by leveraging client feedback and industry research.')}), '3.responsibilities.0': ResponsibilityMatch(optimized_by_requirements={'1.down_to_earth.0': OptimizedText(optimized_text='Designed and architected complex database systems, integrating diverse data sources and enhancing data quality through deduplication initiatives. Demonstrated a proven track record in analytical roles over multiple years.'), '1.down_to_earth.1': OptimizedText(optimized_text='Designed and architected a complex company database, integrating external and internal data sources to significantly reduce data duplication. Led client-facing initiatives throughout my career, demonstrating extensive experience in this area.'), '1.down_to_earth.2': OptimizedText(optimized_text='Led the technical design and architecture of a large-scale enterprise database, integrating multiple data sources to enhance data integrity and streamline operations. Demonstrated extensive expertise in complex data architecture and integration, collaborating with stakeholders to deliver impactful solutions that reduced data duplication by 50%.'), '1.down_to_earth.3': OptimizedText(optimized_text='Designed and architected sophisticated database solutions, integrating diverse data sources to enhance integrity and optimize management processes. Reduced data duplication by 50% in a complex company database with 100K+ unique records, seamlessly integrating DnB API and internal databases.'), '2.other.0': OptimizedText(optimized_text='Led the design and implementation of a comprehensive database system, consolidating diverse data sources to enhance data quality and streamline operations.')}), '3.responsibilities.1': ResponsibilityMatch(optimized_by_requirements={'1.down_to_earth.0': OptimizedText(optimized_text='Drove implementation of process automation solutions that boosted productivity across multiple industries.'), '1.down_to_earth.1': OptimizedText(optimized_text='Results-driven professional with a proven track record of leading successful client-facing projects, including managing the implementation of two Appian solutions that enhanced team productivity by 20 to 30%.'), '1.down_to_earth.2': OptimizedText(optimized_text='Driven leader who managed multiple Appian implementations that delivered substantial productivity gains for client teams.'), '1.down_to_earth.3': OptimizedText(optimized_text='Accomplished project manager who led two successful Appian implementations that drove 20-30% improvements in team productivity.'), '2.other.0': OptimizedText(optimized_text='Led two Appian implementations that drove 20-30% improvements in team productivity.')}), '3.responsibilities.2': ResponsibilityMatch(optimized_by_requirements={'1.down_to_earth.0': OptimizedText(optimized_text='Managed daily content operations, including leading a team of offshore and nearshore content team, as well as other sales and research related activities. Seasoned professional with a proven track record in managing content operations, leading cross-functional teams, and supporting sales and research initiatives.'), '1.down_to_earth.1': OptimizedText(optimized_text='Directed daily content operations, leading a team of offshore and nearshore content specialists, and supporting sales and research initiatives. Demonstrated extensive client-facing expertise.'), '1.down_to_earth.2': OptimizedText(optimized_text='Managed daily content operations, leading a team of offshore and nearshore content professionals, and overseeing a range of sales and research-related initiatives.'), '1.down_to_earth.3': OptimizedText(optimized_text='Managed daily content operations, including leading a team of offshore and nearshore content professionals as well as sales and research-related initiatives. Skilled at project management and delivering high-quality results.'), '2.other.0': OptimizedText(optimized_text='Led a talented content team to drive daily operations, collaborating with sales and research to support strategic initiatives.')}), '3.responsibilities.3': ResponsibilityMatch(optimized_by_requirements={'1.down_to_earth.0': OptimizedText(optimized_text='Analyzed and modeled financials for 20-30 IT vendors and over 1,500 services contracts, delivering critical insights to support vendor and contract management.'), '1.down_to_earth.1': OptimizedText(optimized_text="Analyzed and modeled financials for 20 to 30 IT vendors' diverse portfolios. Reviewed and negotiated over 1,500 service contracts with extensive client-facing experience."), '1.down_to_earth.2': OptimizedText(optimized_text='Analyzed and modeled financials for 20 to 30 IT vendors and over 1,500 service contracts to support consultative engagements.'), '1.down_to_earth.3': OptimizedText(optimized_text='Analyzed and modeled financial data for 20 to 30 IT vendors and over 1,500 services contracts, leveraging insights to drive successful contract management.'), '2.other.0': OptimizedText(optimized_text='Analyzed and modeled financial data for 20 to 30 IT vendors and over 1,500 service contracts to support product development efforts by gathering and incorporating client services feedback.')}), '3.responsibilities.4': ResponsibilityMatch(optimized_by_requirements={'1.down_to_earth.0': OptimizedText(optimized_text='Experienced data integration project manager who led the successful delivery of mission-critical platform initiatives.'), '1.down_to_earth.1': OptimizedText(optimized_text='Led three major data integration projects critical to the successful launch of a new platform. Adept at delivering high-impact client-facing solutions.'), '1.down_to_earth.2': OptimizedText(optimized_text='Led multiple data integration projects critical to the successful launch of a new enterprise platform, demonstrating strong experience managing client-facing analytics engagements.'), '1.down_to_earth.3': OptimizedText(optimized_text='Led the successful delivery of three mission-critical data integration projects instrumental in launching the new platform.'), '2.other.0': OptimizedText(optimized_text='Led the successful implementation of three mission-critical data integration projects that were instrumental in launching a new platform.')}), '3.responsibilities.5': ResponsibilityMatch(optimized_by_requirements={'1.down_to_earth.0': OptimizedText(optimized_text='Established research leader who advanced to Research Manager role, drawing on over 10 years of analytical expertise.'), '1.down_to_earth.1': OptimizedText(optimized_text='Results-driven Research Manager with a proven track record of client engagement. Led teams and implemented strategies to deliver insights and drive improvements.'), '1.down_to_earth.2': OptimizedText(optimized_text='Managed analytical initiatives and partnered with clients to deliver impactful solutions. Promoted to Research Manager in 2007.'), '1.down_to_earth.3': OptimizedText(optimized_text='Promoted research professional with a decade of career advancement, culminating in a Research Manager role.'), '2.other.0': OptimizedText(optimized_text='Managed research initiatives and oversaw product development, driving continuous improvements based on client feedback. Transitioned from Senior Research Analyst to Research Manager.')}), '4.responsibilities.0': ResponsibilityMatch(optimized_by_requirements={'1.down_to_earth.0': OptimizedText(optimized_text='Drove strategic product decisions through extensive market research and analysis.'), '1.down_to_earth.1': OptimizedText(optimized_text='Researched market dynamics to drive strategic product development and strengthen client relationships.'), '1.down_to_earth.2': OptimizedText(optimized_text='Conducted market research and data analysis to inform strategic product decisions and improve client engagements.'), '1.down_to_earth.3': OptimizedText(optimized_text='Leveraged market research insights to develop and execute effective product strategies.'), '2.other.0': OptimizedText(optimized_text='Guided product strategy and development efforts by leveraging market research insights.')})}"
print(responsibilities)    


# Random Stuff

In [16]:
import pandas as pd

f_path = r"C:\Users\xzhan\My Drive\Job Search\Job Search 2025.xlsx"

df = pd.read_excel(f_path)
for co, job in zip(df.Company, df["Job Title"]):
    text = "_".join([co, job])
    text = text.replace(" ", "_")
    text = text.replace(",", "")
    print(text)

MongoDB_Director_Competitive_Intelligence
Adobe_Sr._Director_Applied_AI/ML_(Discovery)
Flextronics_Sr._Manager_AI_Strategy
TRACE3_Senior_Consultant_AI_Strategy_(Remote)
PwC_Strategy&_Senior_Manager_-_Digital_Value_Transformation_Contact_Center
Glean_Head_of_Competitive_Intelligence
Airtable_Product_Manager_AI
Veeva_Director_-_Crossix_Analytics_Services
ThermoFisher_Scientific_Market_&_Competitive_Intelligence_Manager
DigitialOcean_Director_Product_Management_(AI/ML)
Figma_Researcher_Strategic_Growth
DEPT_Director_of_Applied_AI_Strategy_Media
Deloitte_Market_Research_Sr_Manager_Boston
Deloitte_AI_Data_Specialist_Boston
Deloitte_Global_Business_Services_(GBS)_Strategy_Manager_Boston
Amazon_Senior_Manger_Partner_Strategy_GenAI_Innovation_Center
Blend_Director_AI_Strategy
Snowflake_Director_Product_Marketing_-_Analytics
Advisor360_Degrees_Sr._Product_Manager_–_AI_Analytics_&Insights
Oracle_Senior_AI_Product_Marketing_Manager
Liberty_Mutual_Insurance_Senior_Manager_II_Corporate_Strategy_&_R

TypeError: sequence item 0: expected str instance, float found

In [26]:
# Cell 1: Set style
from IPython.display import display, HTML

display(
    HTML(
        """
<style>
.monaco-workbench .notebook-cell .output pre {
    font-size: 24px !important;
}
</style>
"""
    )
)

# Cell 2: Test
display("This text should be bigger")
display({"key": "value"})

'This text should be bigger'

{'key': 'value'}