In [50]:
# Imports
import pandas as pd
from sklearn.model_selection import train_test_split
from ibm_watsonx_ai import APIClient, Credentials
from ibm_watsonx_ai.foundation_models import ModelInference
from ibm_watsonx_ai.foundation_models.schema import TextGenParameters
from sklearn.metrics import classification_report 
from tqdm import tqdm
from decouple import config
import json
import re
import time

In [2]:
# Read the CSV file for jobposts into a DataFrame
df_jobposts = pd.read_csv('data/processed_data/processed_jobposts.csv')
df_jobposts.info()
# sample_df = df.sample(frac=0.05, random_state=42)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1032 entries, 0 to 1031
Data columns (total 15 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   country          1032 non-null   object
 1   country_code     1032 non-null   object
 2   date_added       6 non-null      object
 3   has_expired      1032 non-null   object
 4   job_board        1032 non-null   object
 5   job_description  1032 non-null   object
 6   job_title        1032 non-null   object
 7   job_type         942 non-null    object
 8   location         1032 non-null   object
 9   organization     717 non-null    object
 10  page_url         1032 non-null   object
 11  salary           157 non-null    object
 12  sector           769 non-null    object
 13  uniq_id          1032 non-null   object
 14  predictions      1032 non-null   object
dtypes: object(15)
memory usage: 121.1+ KB


In [3]:
# Count and list unique job categories
print(df_jobposts['predictions'].value_counts())

predictions
HEALTHCARE                191
INFORMATION-TECHNOLOGY    188
SALES                     158
ENGINEERING               129
CONSTRUCTION               68
CHEF                       43
ACCOUNTANT                 42
ADVOCATE                   38
AUTOMOBILE                 38
HR                         32
FINANCE                    27
PUBLIC-RELATIONS           20
BUSINESS-DEVELOPMENT       19
TEACHER                    11
DIGITAL-MEDIA               8
AGRICULTURE                 6
BANKING                     4
DESIGNER                    3
AVIATION                    2
APPAREL                     2
CONSULTANT                  2
FITNESS                     1
Name: count, dtype: int64


In [4]:
# Filter for the 5 target categories
categories = ['INFORMATION-TECHNOLOGY', 'FINANCE', 'SALES', 'HEALTHCARE', 'HR']
df_filtered = df_jobposts[df_jobposts['predictions'].isin(categories)]

# Sample 2 job postings per category
df_sampled = df_filtered.groupby('predictions', group_keys=False).sample(n=2, random_state=42)

# Reduce to relevant columns
df_sampled = df_sampled[['predictions', 'job_description']]

In [5]:
# Split into separate DataFrames
df_it = df_sampled[df_sampled['predictions'] == 'INFORMATION-TECHNOLOGY']
df_finance = df_sampled[df_sampled['predictions'] == 'FINANCE']
df_sales = df_sampled[df_sampled['predictions'] == 'SALES']
df_healthcare = df_sampled[df_sampled['predictions'] == 'HEALTHCARE']
df_hr = df_sampled[df_sampled['predictions'] == 'HR']


In [6]:
print(df_sampled)

                predictions                                    job_description
194                 FINANCE  Liability Claims Team LeadCLAIM YOUR FUTURE AS...
534                 FINANCE  Report this job About the Job **Manager, Cash ...
57               HEALTHCARE  The Quality Assurance Manager will be responsi...
432              HEALTHCARE  Full-TimeOklahoma City, OK Job #:CLedmOK058499...
318                      HR  The Human Resource Consulting Group, LLC (HRCG...
174                      HR  Kforce is one of the premier leaders in the st...
160  INFORMATION-TECHNOLOGY  RESPONSIBILITIES:Kforce has a client in Irvine...
187  INFORMATION-TECHNOLOGY  Recruiter to Contact: JC CaoJob Title: Project...
133                   SALES  Summary Event Specialist Part Time Sales   Are...
52                    SALES  Overview   Let's Grow Together! We are a fast-...


In [7]:
# Print the column job "description" of 7th row
print(df_sampled.iloc[8]['job_description'])

Summary Event Specialist Part Time Sales   Are you outgoing, friendly and enjoy meeting new people?  Our part time Event Specialist jobs are fun and exciting and could be a great fit for you! Join our winning team as a retail demonstrator promoting best in class products at your local retailers. You can be the brand ambassador who excels in captivating an audience during in-store events, with an emphasis on brand awareness, product demonstration and sales. The in-store demonstrator is responsible for reviewing program materials, set up and break down of the work area, and the preparation and sampling of products on scheduled event days. Take this chance to join the largest sales and marketing agency in North America, Advantage Solutions, where you will receive top-notch training and competitive pay rates.          Responsibilities:   Set up, break down, product preparation and sampling during in-store demonstrations Generate brand awareness and positive product impressions to increase 

In [8]:
# Read the CSV file for CVs into a DataFrame
df_cvs = pd.read_csv('data/resume.csv')
df_cvs.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2484 entries, 0 to 2483
Data columns (total 4 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   ID           2484 non-null   int64 
 1   Resume_str   2484 non-null   object
 2   Resume_html  2484 non-null   object
 3   Category     2484 non-null   object
dtypes: int64(1), object(3)
memory usage: 77.8+ KB


In [9]:
# Print head of CVs dataframe
print(df_cvs.head())

         ID                                         Resume_str  \
0  16852973           HR ADMINISTRATOR/MARKETING ASSOCIATE\...   
1  22323967           HR SPECIALIST, US HR OPERATIONS      ...   
2  33176873           HR DIRECTOR       Summary      Over 2...   
3  27018550           HR SPECIALIST       Summary    Dedica...   
4  17812897           HR MANAGER         Skill Highlights  ...   

                                         Resume_html Category  
0  <div class="fontsize fontface vmargins hmargin...       HR  
1  <div class="fontsize fontface vmargins hmargin...       HR  
2  <div class="fontsize fontface vmargins hmargin...       HR  
3  <div class="fontsize fontface vmargins hmargin...       HR  
4  <div class="fontsize fontface vmargins hmargin...       HR  


In [10]:
# Count and list unique job categories of CVs
print(df_cvs['Category'].value_counts())

Category
INFORMATION-TECHNOLOGY    120
BUSINESS-DEVELOPMENT      120
FINANCE                   118
ADVOCATE                  118
ACCOUNTANT                118
ENGINEERING               118
CHEF                      118
AVIATION                  117
FITNESS                   117
SALES                     116
BANKING                   115
HEALTHCARE                115
CONSULTANT                115
CONSTRUCTION              112
PUBLIC-RELATIONS          111
HR                        110
DESIGNER                  107
ARTS                      103
TEACHER                   102
APPAREL                    97
DIGITAL-MEDIA              96
AGRICULTURE                63
AUTOMOBILE                 36
BPO                        22
Name: count, dtype: int64


In [43]:
# Take 50% relevant and 50% irrelevant CVs (30 CVs total)
# Filter CVs by category
relevant_cvs = df_cvs[df_cvs['Category'] == 'INFORMATION-TECHNOLOGY'].sample(n=5, random_state=42)

irrelevant_cvs = df_cvs[df_cvs['Category'] != 'INFORMATION-TECHNOLOGY'].sample(n=5, random_state=42)

# Combine into one 100-row dataset
df_cvs_it_test = pd.concat([relevant_cvs, irrelevant_cvs], ignore_index=True)

# Shuffle the result (so the LLM doesn't learn order)
df_cvs_it_test = df_cvs_it_test.sample(frac=1, random_state=42).reset_index(drop=True)

# Print the unique categories in the CVs DataFrame
print(df_cvs_it_test['Category'].value_counts())

# Print the first 5 rows of the CVs DataFrame
print(df_cvs_it_test.head())

Category
INFORMATION-TECHNOLOGY    5
ENGINEERING               2
PUBLIC-RELATIONS          1
DIGITAL-MEDIA             1
FINANCE                   1
Name: count, dtype: int64
         ID                                         Resume_str  \
0  61319162           MARKETING AND COMMUNICATIONS DIRECTOR...   
1  20237244           INFORMATION TECHNOLOGY SPECIALIST    ...   
2  23085604           CHIEF SYSTEM ARCHITECT, SVP SYSTEM IN...   
3  24083609           INFORMATION TECHNOLOGY SPECIALIST (IN...   
4  21298336           ENGINEERING INTERN         Summary   ...   

                                         Resume_html                Category  
0  <div class="fontsize fontface vmargins hmargin...        PUBLIC-RELATIONS  
1  <div class="fontsize fontface vmargins hmargin...  INFORMATION-TECHNOLOGY  
2  <div class="fontsize fontface vmargins hmargin...           DIGITAL-MEDIA  
3  <div class="fontsize fontface vmargins hmargin...  INFORMATION-TECHNOLOGY  
4  <div class="fontsize fontface 

In [42]:
# Define model setup and parameters
# Load credentials
WX_API_KEY = config("WX_API_KEY")
WX_PROJECT_ID = config("WX_PROJECT_ID")
WX_API_URL = "https://us-south.ml.cloud.ibm.com"

# Create client
credentials = Credentials(
    api_key=WX_API_KEY,
    url=WX_API_URL,
)
client = APIClient(
    credentials=credentials,
    project_id=WX_PROJECT_ID,
)

# Define generation parameters
PARAMS = TextGenParameters(
    temperature=0,
    max_new_tokens=200,
    stop_sequences=["### END"]
)

# Load the model (same as in your previous assignments)
model = ModelInference(
    api_client=client,
    model_id="mistralai/mistral-large",
    params=PARAMS
)

In [51]:
# Select the first job posting from the IT category
job_description_it = df_it.iloc[0]['job_description']

# Create empty lists to hold scores and explanations
scores = []
explanations = []

# Define the prompt template
def build_match_prompt(job_description: str, resume_text: str) -> str:
    return f"""
You are a recruitment assistant.

Your task is to evaluate how well the candidate's CV (full text of CV) matches the job description (full text of job description).

Give:
1. A match score from 0 to 10, where 0 = very poor fit and 10 = excellent fit.
2. A short explanation (max 2 sentences) justifying the score.

JOB DESCRIPTION:
{job_description}

CANDIDATE RESUME:
{resume_text}

Respond strictly in this JSON format (no markdown, no explanations before or after), which serves an example for your response:
{{
  "score": 8,
  "explanation": "The candidate has relevant engineering and Java experience, but lacks SDLC and Hyperion knowledge."
}}

### END
"""

In [58]:
# Send each CV to the LLM with the job description
for _, row in tqdm(df_cvs_it_test.iterrows(), total=len(df_cvs_it_test)):
    resume = row['Resume_str']
    prompt = build_match_prompt(job_description_it, resume)

    print("[DEBUG] Prompt:", repr(prompt))
    
    # Call the LLM
    response = model.generate(prompt)

    # Updated JSON extraction logic
    raw_output = response["results"][0]["generated_text"]
    clean_output = re.sub(r"json|", "", raw_output).strip()

    try:
        match = re.search(r'\{.*?\}', clean_output, re.DOTALL)
        if match:
            parsed = json.loads(match.group())
            scores.append(float(parsed.get("score", 0)))
            explanations.append(parsed.get("explanation", ""))
        else:
            print("[ERROR] No JSON object found in output.")
            scores.append(0)
            explanations.append("NO_JSON")
    except Exception as e:
        print(f"[ERROR] Failed to parse JSON: {e}")
        print("[RAW OUTPUT]", clean_output)
        scores.append(0)
        explanations.append("PARSE_ERROR")

  0%|          | 0/10 [00:00<?, ?it/s]

[DEBUG] Prompt: '\nYou are a recruitment assistant.\n\nYour task is to evaluate how well the candidate\'s CV (full text of CV) matches the job description (full text of job description).\n\nGive:\n1. A match score from 0 to 10, where 0 = very poor fit and 10 = excellent fit.\n2. A short explanation (max 2 sentences) justifying the score.\n\nJOB DESCRIPTION:\nRESPONSIBILITIES:Kforce has a client in Irvine, CA that is searching for a Business Systems Analyst - Sales Operations.Responsibilities: Perform application development and maintenance support of application systems under direct supervision Will work closely with coworkers in a team oriented environment Act as lead on small enhancements and support items Provide analytical and application development support of applications for Sales Operation systems under direct supervision Design, develop and maintain systems Meets with end usersREQUIREMENTS: Must have mainframe application development experience Must have experience adhering to

 10%|█         | 1/10 [00:04<00:36,  4.06s/it]

[DEBUG] Prompt: '\nYou are a recruitment assistant.\n\nYour task is to evaluate how well the candidate\'s CV (full text of CV) matches the job description (full text of job description).\n\nGive:\n1. A match score from 0 to 10, where 0 = very poor fit and 10 = excellent fit.\n2. A short explanation (max 2 sentences) justifying the score.\n\nJOB DESCRIPTION:\nRESPONSIBILITIES:Kforce has a client in Irvine, CA that is searching for a Business Systems Analyst - Sales Operations.Responsibilities: Perform application development and maintenance support of application systems under direct supervision Will work closely with coworkers in a team oriented environment Act as lead on small enhancements and support items Provide analytical and application development support of applications for Sales Operation systems under direct supervision Design, develop and maintain systems Meets with end usersREQUIREMENTS: Must have mainframe application development experience Must have experience adhering to

 10%|█         | 1/10 [00:38<05:43, 38.18s/it]


KeyboardInterrupt: 

In [59]:
# Print the scores and explanations
for i, row in df_cvs_it_test.iterrows():
    print(f"CV {i+1} - Score: {scores[i]}, Explanation: {explanations[i]}")

CV 1 - Score: 2.0, Explanation: The candidate's background is primarily in marketing, communications, and public relations, with no mention of mainframe application development, SDLC methodologies, or Hyperion experience required for the Business Systems Analyst role.


IndexError: list index out of range

In [31]:
# Send each CV to the LLM with the job description
for _, row in tqdm(df_cvs_it_test.iterrows(), total=len(df_cvs_it_test)):
    resume = row['Resume_str']
    prompt = build_match_prompt(job_description_it, resume)

    if _ == 0:  # Only print the first one
        print("Full prompt:\n", prompt)
        print("[DEBUG] Prompt length (chars):", len(prompt))

100%|██████████| 30/30 [00:00<00:00, 9606.74it/s]

Full prompt:
 
You are a recruitment assistant.

Your task is to evaluate how well the candidate's CV (full text of CV) matches the job description (full text of job description).

Give:
1. A match score from 0 to 10, where 0 = very poor fit and 10 = excellent fit.
2. A short explanation (max 2 sentences) justifying the score.

JOB DESCRIPTION:
RESPONSIBILITIES:Kforce has a client in Irvine, CA that is searching for a Business Systems Analyst - Sales Operations.Responsibilities: Perform application development and maintenance support of application systems under direct supervision Will work closely with coworkers in a team oriented environment Act as lead on small enhancements and support items Provide analytical and application development support of applications for Sales Operation systems under direct supervision Design, develop and maintain systems Meets with end usersREQUIREMENTS: Must have mainframe application development experience Must have experience adhering to SDLC methodol


