# 1. Prepare data with synthetic queries

In [1]:
import pandas as pd
from openai import OpenAI
import json
from datasets import load_dataset
from google.colab import userdata
from sentence_transformers import SentenceTransformer
import numpy as np
client = OpenAI(api_key=userdata.get('openai_api_key'))

## Helper functions

In [2]:
import re

def remove_irrelevant_sections(description):
    """
    Strip company, benefits and responsibilities sections from a job description.

    Each section starts at one of the listed headings and runs up to (but not
    including) the next "Qualifications", "Requirements", "Skills" or
    "Experience" keyword, or to the end of the text when none follows.
    Matching is case-insensitive and spans line breaks.

    Args:
        description (str): The job description as a string.

    Returns:
        str: The description with those sections removed and outer
             whitespace stripped.
    """
    section_patterns = (
        r"(About the Company:|Our Mission:).*?(?=(Qualifications|Requirements|Skills|Experience|$))",
        r"(Perks & Benefits:|What We Offer:).*?(?=(Qualifications|Requirements|Skills|Experience|$))",
        r"(Responsibilities:).*?(?=(Qualifications|Requirements|Skills|Experience|$))",
    )
    regex_flags = re.IGNORECASE | re.DOTALL

    # Apply the removals one after another, in the order listed above
    cleaned = description
    for section_pattern in section_patterns:
        cleaned = re.compile(section_pattern, regex_flags).sub("", cleaned)

    return cleaned.strip()

def extract_qualifications_from_html(description):
    """
    Return the tail of a job description starting at the first qualification keyword.

    The keywords are "Qualifications", "Requirements", "Skills" and
    "Experience", matched case-insensitively anywhere in the text. Everything
    from the first occurrence to the end of the string is kept.

    Args:
        description (str): The job description as a string.

    Returns:
        str: The text from the first keyword onwards, or the unchanged
             description when no keyword occurs.
    """
    found = re.search(
        r"(Qualifications|Requirements|Skills|Experience).*",
        description,
        flags=re.IGNORECASE | re.DOTALL,
    )
    # No keyword found: fall back to the full description
    return found.group(0) if found else description

def remove_eoe_notes(description):
    """
    Removes Equal Opportunity Employer (EOE) notes and similar boilerplate text
    from a job description.

    Each pattern matches case-insensitively from its trigger phrase up to the
    end of the line it appears on (or the end of the text); the text before the
    phrase on that line is kept.

    Args:
        description (str): The job description as a string.

    Returns:
        str: The cleaned job description with EOE notes removed and outer
             whitespace stripped.
    """
    # Define regex patterns for common EOE notes
    patterns = [
        r"an equal opportunity employer.*?(?=\n|$)",  # Common phrasing
        r"EOE.*?(?=\n|$)",  # Short form
        r"EEO.*?(?=\n|$)",
        # Previously "employment*?" (missing "."), which quantified the final
        # "t" and so only matched when the phrase ended the line
        r"equal employment.*?(?=\n|$)",  # Full boilerplate
        r"Equal employment opportunity.*?(?=\n|$)"  # Variations
    ]

    # Remove each pattern from the description
    for pattern in patterns:
        description = re.sub(pattern, "", description, flags=re.IGNORECASE | re.DOTALL)

    return description.strip()


## Load data

In [3]:
def prompt_template(job_description):
    """Build the LLM prompt that asks for a short search query for one job description.

    Args:
        job_description (str): Job description text inserted at the end of the prompt.

    Returns:
        str: The instruction paragraph, a blank line, then the job description.
    """
    return f"""Read the following job description and create a concise job search query with at most 3 specialized skills or \
areas of expertise that are distinct to the role. Exclude generic data science or software engineering skills like AI, machine \
learning, and coding languages unless they are explicitly highlighted as unique or advanced. Keep the query short and human-like, \
suitable for typing into a search engine.

Here's the job description: {job_description}"""

In [4]:
def generate_query(job_description):
    """
    Generate a synthetic search query for one job description.

    Sends the prompt built by `prompt_template` as a single user message to the
    chat completions endpoint of the module-level `client` and returns the text
    of the first choice.
    """
    # single user message carrying the prompt
    user_message = {"role": "user", "content": prompt_template(job_description)}

    # make api call
    completion = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[user_message],
        temperature=0.7,
    )

    # the generated query is the content of the first returned choice
    first_choice = completion.choices[0]
    return first_choice.message.content

In [5]:
# Download the job listings dataset from the Hugging Face hub
ds = load_dataset("datastax/linkedin_job_listings")

# Convert the train split to pandas, keeping only the title and description columns
df = ds['train'].to_pandas()[['title', 'description']]
df.shape

postings.csv:   0%|          | 0.00/517M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/123849 [00:00<?, ? examples/s]

(123849, 2)

In [6]:
# Job-title phrases that identify data/ML roles
search_terms = [
    "Data Scientist", "Data Analyst", "Machine Learning Engineer",
    "Data Engineer", "AI Engineer", "Deep Learning",
]

# Regex alternation matching any one of the phrases
pattern = '|'.join(search_terms)

# Keep rows whose title contains a phrase (case-insensitive; missing titles do not match)
title_matches = df['title'].str.contains(pattern, case=False, na=False)
df = df[title_matches]
df.shape

(1179, 2)

In [7]:
# Preview the first five filtered postings
df.head(5)

Unnamed: 0,title,description
283,Sr Data Engineer with Kafka,Data Engineer with Kafka (W2 Only)💯% Remote\nM...
360,Cloud Platform/ Big Data Engineer,About Subaru Research and Development:Do you c...
367,Data Engineer/ETL,"Responsibilities:Develop new features, fix bug..."
389,Data Analyst,Job Title: Data AnalystDuration: ContractLocat...
483,Senior Data Engineer/Analyst - Full Time,"Job Type: Full-Time, Permanent \nResponsibilit..."


In [None]:
# Save the filtered postings to job_data.csv.
# The DataFrame index is written too (no index=False), so it comes back as an
# extra unnamed column when the file is read again.
df.to_csv('job_data.csv')

## Generate synthetic queries with OpenAI

In [None]:
# Raw (uncleaned) job descriptions, in DataFrame row order, used to build the batch prompts
job_description_list = df['description'].to_list()

In [None]:
# Create one chat-completions batch request per job description.
# custom_id is "request-<n>" with n starting at 1, following the order of
# job_description_list, so a result can be traced back to its input row.
batch_requests = [
    {
        "custom_id": f"request-{i+1}",  # Custom ID for tracking
        "method": "POST",
        "url": "/v1/chat/completions",
        "body": {
            "model": "gpt-4o-mini",
            "messages": [
                {"role": "user", "content": prompt_template(job_description)}
            ],
            "temperature": 0.7
        }
    }
    for i, job_description in enumerate(job_description_list)
]

In [None]:
# Serialize every request as one JSON object per line (newline-joined, no trailing newline)
batch_jsonl = "\n".join(map(json.dumps, batch_requests))
# Write the JSONL payload to disk for upload
with open("batch_requests.jsonl", "w") as jsonl_file:
    jsonl_file.write(batch_jsonl)

In [None]:
# Upload the JSONL request file so it can be referenced by a batch job.
# The context manager closes the local file handle once the upload call returns
# (the original passed a bare open(...) that was never closed).
with open("batch_requests.jsonl", "rb") as request_file:
    batch_input_file = client.files.create(
        file=request_file,
        purpose="batch"
    )

print(batch_input_file)

FileObject(id='file-CyDYmncueRgP3pbZ3mYst1', bytes=5047825, created_at=1757582678, filename='batch_requests.jsonl', object='file', purpose='batch', status='processed', expires_at=1760174678, status_details=None)


In [None]:
# Create the batch job over the uploaded request file.
# The endpoint matches the "url" of every request in the file; the job is given
# a 24h completion window and a free-text description in its metadata.
batch_object = client.batches.create(
    input_file_id=batch_input_file.id,
    endpoint="/v1/chat/completions",
    completion_window="24h",
    metadata={
        "description": "synthetic queries from job descriptions"
    }
)

In [None]:
# Show the newly created batch (its id and initial status)
print(batch_object)

Batch(id='batch_68c2955a2e5c8190a21fc8ee7c0625ff', completion_window='24h', created_at=1757582682, endpoint='/v1/chat/completions', input_file_id='file-CyDYmncueRgP3pbZ3mYst1', object='batch', status='validating', cancelled_at=None, cancelling_at=None, completed_at=None, error_file_id=None, errors=None, expired_at=None, expires_at=1757669082, failed_at=None, finalizing_at=None, in_progress_at=None, metadata={'description': 'synthetic queries from job descriptions'}, output_file_id=None, request_counts=BatchRequestCounts(completed=0, failed=0, total=0), usage={'input_tokens': 0, 'output_tokens': 0, 'total_tokens': 0, 'input_tokens_details': {'cached_tokens': 0}, 'output_tokens_details': {'reasoning_tokens': 0}})


## Process batch results

In [None]:
# Reload the saved postings
df_jobs = pd.read_csv("job_data.csv")
# df_jobs = df_jobs.drop_duplicates()

# Keep only qualification-related text: drop irrelevant sections, cut to the
# first qualification keyword, then remove EOE boilerplate (in that order)
df_jobs['description_cleaned'] = (
    df_jobs['description']
    .apply(remove_irrelevant_sections)
    .apply(extract_qualifications_from_html)
    .apply(remove_eoe_notes)
)

# Cleaned descriptions as a list, in row order
job_description_list = df_jobs['description_cleaned'].to_list()

In [None]:
from openai import OpenAI


# Retrieve the batch by a hard-coded id (the trailing comment shows the
# batch_object.id alternative) and print its status and output file id
batch = client.batches.retrieve("batch_68c2955a2e5c8190a21fc8ee7c0625ff")#batch_object.id)
print(batch.status, "-", batch.output_file_id)

completed - file-TsrwPivJ5Y3zh1Y3B8esMX


In [None]:
from openai import OpenAI

# Download the batch output by a hard-coded file id (the trailing comment shows
# the batch.output_file_id alternative)
file_response = client.files.content("file-TsrwPivJ5Y3zh1Y3B8esMX")#batch.output_file_id)
# print(file_response.text)

# Save the raw response text to output.jsonl
with open("output.jsonl", 'w') as file:
    file.write(file_response.text)

In [None]:
# Extract the synthetic queries from the batch output and store them in a list.
# Batch output lines are not guaranteed to be in input order, so each result is
# keyed by the number in its custom_id ("request-<n>") and the list is built in
# ascending <n>, which is the order the requests were created in.
file_path = 'output.jsonl'
queries_by_request = {}

with open(file_path, 'r') as file:
    for line in file:
        if not line.strip():
            continue  # tolerate blank lines in the file
        record = json.loads(line)
        request_number = int(record['custom_id'].rsplit('-', 1)[1])
        # Double quotes are stripped from the generated query text
        query = record['response']['body']['choices'][0]['message']['content'].replace('"', '')
        queries_by_request[request_number] = query

query_list = [queries_by_request[n] for n in sorted(queries_by_request)]

In [None]:
# Pair each synthetic query with a cleaned job description by list position
# (both lists must have the same length). This rebinds `df`.
df = pd.DataFrame({"query" : query_list, "job_description_pos" : job_description_list})

In [None]:
# Preview the first five (query, positive job description) pairs
df.head(5)

Unnamed: 0,query,job_description_pos
0,"Kafka data injection, Snowflake SQL expertise,...",experience neededVery strong experience in Kaf...
1,"automotive engineering, big data analysis, clo...",requirements to determine feasibility of desig...
2,"React development, AWS Lambda, Test Driven Dev...","experienceAccountable for code quality, includ..."
3,"Data analytics, statistical analysis software,...","QualificationsAnalytical skills, including the..."
4,mortgage banking data management predictive mo...,requirements and industry practices for mortga...


In [None]:
# Drop rows with a repeated job description, then rows with a repeated query,
# reporting the shape after each step
print("Original shape:", df.shape)
for column, label in (('job_description_pos', "Unique JDs:"), ('query', "Unique queries:")):
    df = df.drop_duplicates(subset=[column])
    print(label, df.shape)

Original shape: (1179, 2)
Unique JDs: (1020, 2)
Unique queries: (1017, 2)


## Create a negative job description for each query

In [None]:
# Load the embedding model used to compare job descriptions with each other.
# NOTE(review): trust_remote_code=True runs Python files downloaded from the
# model repository (see the warning in this cell's output); consider pinning a revision.
# NOTE(review): the model name indicates a Vietnamese document model, while the
# job descriptions previewed above are English - confirm this choice is intended.
model = SentenceTransformer("dangvantuan/vietnamese-document-embedding",trust_remote_code=True)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/171 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/54.0 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

configuration.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/dangvantuan/Vietnamese_impl:
- configuration.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling.py: 0.00B [00:00, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/dangvantuan/Vietnamese_impl:
- modeling.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model.safetensors:   0%|          | 0.00/1.22G [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/964 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/296 [00:00<?, ?B/s]

In [None]:
# Encode all positive job descriptions (in DataFrame row order) and show the
# shape of the resulting embedding matrix
job_embeddings = model.encode(df['job_description_pos'].to_list())
print(job_embeddings.shape)

(1017, 768)


In [None]:
# Pairwise similarity of every job description against every other one
similarities = model.similarity(job_embeddings, job_embeddings)
print(similarities.shape)


torch.Size([1017, 1017])


In [None]:
# For each row, pick the least similar job description as its negative example.
# Each description is used as a negative at most once where possible, and a row
# is never paired with itself (the original fallback could select the row's own index).
similarities_argsorted = np.argsort(similarities.numpy(), axis=1)  # ascending similarity per row
negative_pair_index_list = []
used_indices = set()  # set membership keeps the uniqueness check O(1)

for i, ranked in enumerate(similarities_argsorted):

    # Walk candidates from least to most similar until an unused, non-self one is found
    chosen = None
    for candidate in ranked:
        candidate = int(candidate)
        if candidate != i and candidate not in used_indices:
            chosen = candidate
            break

    if chosen is None:
        # Every other row is already taken: reuse the least similar non-self row.
        # Only a single-row matrix falls back to the row itself.
        chosen = next((int(c) for c in ranked if int(c) != i), i)

    used_indices.add(chosen)
    negative_pair_index_list.append(chosen)

In [None]:
# Add the negative job description for each row.
# negative_pair_index_list holds positional indices, so .iloc is used, and
# .values drops the index so the assignment is by position.
df['job_description_neg'] = df['job_description_pos'].iloc[negative_pair_index_list].values

In [None]:
# Preview the (query, positive, negative) rows
df.head()

Unnamed: 0,query,job_description_pos,job_description_neg
0,"Kafka data injection, Snowflake SQL expertise,...",experience neededVery strong experience in Kaf...,requirements including Terms amp; Condi
1,"automotive engineering, big data analysis, clo...",requirements to determine feasibility of desig...,Requirements: We're looking for a candidate wi...
2,"React development, AWS Lambda, Test Driven Dev...","experienceAccountable for code quality, includ...",Resource should be able to visualize and expla...
3,"Data analytics, statistical analysis software,...","QualificationsAnalytical skills, including the...",experienced and boldOne of the founders and th...
4,mortgage banking data management predictive mo...,requirements and industry practices for mortga...,experience as a lead full stack Java developer...


In [None]:
# Save the triplets to final_df.csv without the index, encoded as UTF-8 with a BOM
df.to_csv("final_df.csv", index=False, encoding="utf-8-sig")

# 2. Train

## Train test split

In [8]:
import pandas as pd

# Load the triplets and shuffle them reproducibly (fixed seed), renumbering rows from 0
df = (
    pd.read_csv("final_df.csv", encoding="utf-8-sig")
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

# Split fractions: 80% train, 10% validation, 10% test
train_frac, valid_frac, test_frac = 0.8, 0.1, 0.1

# Row counts for train and validation; the test set takes the remainder
train_size = int(train_frac * len(df))
valid_size = int(valid_frac * len(df))
valid_end = train_size + valid_size

# Consecutive positional slices of the shuffled frame
df_train = df.iloc[:train_size]
df_valid = df.iloc[train_size:valid_end]
df_test = df.iloc[valid_end:]

In [9]:
# Display the validation split
df_valid

Unnamed: 0,query,job_description_pos,job_description_neg
813,"Data Engineer with AWS Big Data Services, Orac...",experience. Excellent knowledge of database co...,requirements and industry practices.Build high...
814,"Big Data Engineer, Spark, Hadoop, AWS/GCP",Skills • Expertise and hands-on experience on ...,requirements and provide data-driven recommend...
815,"Data scientist time series analysis, condition...",Experience in Production Operations or Well En...,QUALIFICATIONSMust-Have:Bachelor’s Degree in C...
816,"Senior Data Analyst, healthcare data analysis,...",requirements.Reporting and Dashboard Developme...,experience with speech interfaces Lead and eva...
817,"Data analysis for operations, SQL expertise, d...","requirements, determine technical issues, and ...","experiences, beliefs, backgrounds, expertise, ..."
...,...,...,...
909,"Computer Vision algorithms, behavioral dynamic...",QualificationsRequirementsPh.D. in Computer Vi...,experience in:\n-Expert level SQL skills.-Very...
910,"Business Data Analyst, KPI analysis, data visu...",requirements and provide data-driven recommend...,Skills - Nice to Havessnowflakebig dataJob Des...
911,Data pipelines KNIME SharePoint financial serv...,"Skills:-SQL, SharePoint, Financial Services, E...",experienced Data Engineer to join our world le...
912,"Data Governance, Financial Services Analytics,...","experience for yourself, and a better working ...",skills : AI/ML models using Google Cloud Platf...


In [10]:
# Convert the pandas DataFrames back to Hugging Face Datasets
from datasets import Dataset, DatasetDict

# One Dataset per split
train_ds = Dataset.from_pandas(df_train)
valid_ds = Dataset.from_pandas(df_valid)
test_ds = Dataset.from_pandas(df_test)

# Combine into a DatasetDict keyed by split name
dataset = DatasetDict({
    'train': train_ds,
    'validation': valid_ds,
    'test': test_ds
})


# Fine-tune

In [11]:
# Upgrade sentence-transformers and install transformers from the tagged GitHub ref below.
# NOTE(review): sentence_transformers is already imported earlier in this notebook,
# before this install runs; a runtime restart may be needed for the new versions
# to take effect - confirm.
!pip install -U -q sentence-transformers git+https://github.com/huggingface/transformers@v4.56.0-Embedding-Gemma-preview

  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
  Building wheel for transformers (pyproject.toml) ... [?25l[?25hdone


In [None]:
import torch
from sentence_transformers import SentenceTransformer

# Use the GPU when one is available, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Load the base embedding model with the Hugging Face token and move it to the device
model_id = "google/embeddinggemma-300M"
model = SentenceTransformer(model_id, token=userdata.get('HF_TOKEN')).to(device=device)

# Report where the model lives, its architecture and its parameter count
print(f"Device: {model.device}")
print(model)
print("Total number of parameters in the model:", sum(p.numel() for _, p in model.named_parameters()))

In [13]:
# Prompt key: used to look up the prompt text in model.prompts for training and passed to get_scores' encode calls
task_name = "STS"

def get_scores(query, documents):
  """Print the similarity score between a query and each candidate document.

  Uses the module-level `model` and `task_name`.

  Args:
      query (str): The search query to embed.
      documents (list[str]): Candidate documents to score against the query.

  Returns:
      None. One block per document is printed with its text and score.
  """
  # Compute embeddings with model.encode().
  # `prompt_name` selects the stored prompt text from model.prompts; passing the
  # key through `prompt=` would instead prepend the literal string "STS", which
  # does not match the model.prompts[task_name] prompt used for training.
  query_embeddings = model.encode(query, prompt_name=task_name)
  doc_embeddings = model.encode(documents, prompt_name=task_name)

  # Similarity between the query embedding and every document embedding
  similarities = model.similarity(query_embeddings, doc_embeddings)
  scores = similarities.numpy()[0]  # row 0 holds the scores for the single query

  for idx, doc in enumerate(documents):
    print("*"*30,"\n")
    print("📕Document: ", doc, "\n\n🤖 Điểm số: ", scores[idx])

# Use the first test example as a fixed probe: its query plus its positive and
# negative job descriptions
test_example = dataset["test"][0]
query = test_example["query"]
print("🚩Truy vấn = {}".format(query))
documents = [test_example["job_description_pos"], test_example["job_description_neg"]]

# Scores before fine-tuning
get_scores(query, documents)


🚩Truy vấn = Data Migration Specialist SAP MDG data quality
****************************** 

📕Document:  requirements, collect data, lead cleansing efforts, and load/support data into SAPthe gap between business and IT teams, effectively communicating data models and setting clear expectations of deliverablesand maintain trackers to showcase progress and hurdles to Project Managers and Stakeholders
Qualifications
knowledge of SAP and MDGcommunication skillsto manage multiple high-priority, fast-paced projects with attention to detail and organizationan excellent opportunity to learn an in-demand area of SAP MDGa strong willingness to learn, with unlimited potential for growth and plenty of opportunities to expand skills
This role offers a dynamic environment where you can directly impact IT projects and contribute to the company’s success. You will work alongside a supportive team of professionals, with ample opportunities for personal and professional development. 
If you’re ready to t

In [None]:
from sentence_transformers import SentenceTransformerTrainer, SentenceTransformerTrainingArguments
from sentence_transformers.losses import MultipleNegativesRankingLoss
from transformers import TrainerCallback

# Define the loss function for the text-matching task
loss = MultipleNegativesRankingLoss(model)

# Training configuration
# NOTE(review): with a per-device batch size of 1 there are no other in-batch
# examples for MultipleNegativesRankingLoss to use as negatives; presumably only
# the job_description_neg column contributes - confirm this is intended.
training_args = SentenceTransformerTrainingArguments(
    output_dir="custom-embedding-model",      # directory where training results are saved
    prompts=model.prompts[task_name],         # prompt text taken from the model, used for training
    num_train_epochs=1,                       # number of epochs
    per_device_train_batch_size=1,            # batch size per device
    learning_rate=2e-5,                       # learning rate
    warmup_ratio=0.1,                         # warmup ratio
    logging_steps=dataset["train"].num_rows,  # log once every num_rows steps
    report_to="none",                         # do not report logs to external trackers
)

# Callback that runs an evaluation every time the trainer emits a log event
class EvalCallback(TrainerCallback):
    """Trainer callback that calls a user-supplied evaluation function on each log."""

    def __init__(self, eval_func):
        # Zero-argument callable invoked from on_log
        self.eval_func = eval_func

    def on_log(self, args, state, control, **kwargs):
        # Announce the current global step, then run the evaluation
        step_message = f"✅ Step {state.global_step} hoàn tất. Bắt đầu evaluate..."
        print(step_message)
        self.eval_func()

# Evaluation function (name kept unchanged)
def evaluate():
    # Prints the scores of the module-level probe query against its two documents
    get_scores(query, documents)

# Initialise the Trainer and run training
trainer = SentenceTransformerTrainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    loss=loss,
    callbacks=[EvalCallback(evaluate)]  # evaluate() runs on every log event
)

trainer.train()


Computing widget examples:   0%|          | 0/1 [00:00<?, ?example/s]

Step,Training Loss
813,0.1509


✅ Step 813 hoàn tất. Bắt đầu evaluate...
****************************** 

📕Document:  requirements, collect data, lead cleansing efforts, and load/support data into SAPthe gap between business and IT teams, effectively communicating data models and setting clear expectations of deliverablesand maintain trackers to showcase progress and hurdles to Project Managers and Stakeholders
Qualifications
knowledge of SAP and MDGcommunication skillsto manage multiple high-priority, fast-paced projects with attention to detail and organizationan excellent opportunity to learn an in-demand area of SAP MDGa strong willingness to learn, with unlimited potential for growth and plenty of opportunities to expand skills
This role offers a dynamic environment where you can directly impact IT projects and contribute to the company’s success. You will work alongside a supportive team of professionals, with ample opportunities for personal and professional development. 
If you’re ready to take on new challen

TrainOutput(global_step=813, training_loss=0.15087221528038033, metrics={'train_runtime': 605.3707, 'train_samples_per_second': 1.343, 'train_steps_per_second': 1.343, 'total_flos': 0.0, 'train_loss': 0.15087221528038033, 'epoch': 1.0})

In [None]:
# Scores for the same probe query and documents after fine-tuning
get_scores(query, documents)

****************************** 

📕Document:  requirements, collect data, lead cleansing efforts, and load/support data into SAPthe gap between business and IT teams, effectively communicating data models and setting clear expectations of deliverablesand maintain trackers to showcase progress and hurdles to Project Managers and Stakeholders
Qualifications
knowledge of SAP and MDGcommunication skillsto manage multiple high-priority, fast-paced projects with attention to detail and organizationan excellent opportunity to learn an in-demand area of SAP MDGa strong willingness to learn, with unlimited potential for growth and plenty of opportunities to expand skills
This role offers a dynamic environment where you can directly impact IT projects and contribute to the company’s success. You will work alongside a supportive team of professionals, with ample opportunities for personal and professional development. 
If you’re ready to take on new challenges and grow your career in data analytic

In [None]:
# Save the fine-tuned model to a local directory and report the path
save_path = "saved-embedding-model"
trainer.save_model(save_path)

print(f"📂 Mô hình đã được lưu tại: {save_path}")

📂 Mô hình đã được lưu tại: saved-embedding-model


In [None]:
from sentence_transformers import SentenceTransformer
# Reload the saved model from disk; this rebinds the module-level `model` that
# get_scores reads, so the scores below come from the reloaded model
model = SentenceTransformer("saved-embedding-model")
get_scores(query, documents)

****************************** 

📕Document:  requirements, collect data, lead cleansing efforts, and load/support data into SAPthe gap between business and IT teams, effectively communicating data models and setting clear expectations of deliverablesand maintain trackers to showcase progress and hurdles to Project Managers and Stakeholders
Qualifications
knowledge of SAP and MDGcommunication skillsto manage multiple high-priority, fast-paced projects with attention to detail and organizationan excellent opportunity to learn an in-demand area of SAP MDGa strong willingness to learn, with unlimited potential for growth and plenty of opportunities to expand skills
This role offers a dynamic environment where you can directly impact IT projects and contribute to the company’s success. You will work alongside a supportive team of professionals, with ample opportunities for personal and professional development. 
If you’re ready to take on new challenges and grow your career in data analytic