In [1]:
# Recreate the job posts CSV with the correct classification

In [2]:
import time
from pathlib import Path

import pandas as pd
from ibm_watsonx_ai import APIClient
from ibm_watsonx_ai import Credentials
from ibm_watsonx_ai.foundation_models import ModelInference
from ibm_watsonx_ai.foundation_models.schema import TextGenParameters
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from tqdm import tqdm

In [12]:

# Load every job post from disk into a DataFrame.
df2 = pd.read_csv('data/jobposts.csv')

# Keep a random half of the posts; the fixed seed makes the draw repeatable.
sample_df = df2.sample(random_state=42, frac=0.5)

In [4]:

# Categories accepted as a valid model answer.
# Keep every label plain ASCII: look-alike letters from other alphabets
# (e.g. a Cyrillic spelling of BPO) render identically but never compare
# equal to the Latin text the model returns, so those rows would be dropped.
label_map = [
    'INFORMATION-TECHNOLOGY',
    'BUSINESS-DEVELOPMENT',
    'FINANCE',
    'ADVOCATE',
    'ACCOUNTANT',
    'ENGINEERING',
    'CHEF',
    'AVIATION',
    'FITNESS',
    'SALES',
    'BANKING',
    'HEALTHCARE',
    'CONSULTANT',
    'CONSTRUCTION',
    'PUBLIC-RELATIONS',
    'HR',
    'DESIGNER',
    'ARTS',
    'TEACHER',
    'APPAREL',
    'DIGITAL-MEDIA',
    'AGRICULTURE',
    'AUTOMOBILE',
    'BPO',
]

# Categories listed in the prompt: the accepted ones plus a catch-all 'OTHER'.
# Derived from label_map (as a new list) so the two can never drift apart.
label_map2 = label_map + ['OTHER']

In [64]:
from decouple import config


# Secrets and deployment-specific settings are read through python-decouple
# (environment variables / .env file). WX_API_KEY is required; the URL and
# project id fall back to the values this notebook was originally written with.
WX_API_KEY = config('WX_API_KEY')
WX_URL = config('WX_URL', default="https://us-south.ml.cloud.ibm.com")
WX_PROJECT_ID = config('WX_PROJECT_ID', default="c8d96942-6948-4a06-9909-2ea6394aca25")

credentials = Credentials(
    url=WX_URL,
    api_key=WX_API_KEY
)

# API client bound to the watsonx.ai project the model calls run under.
client = APIClient(
    credentials=credentials,
    project_id=WX_PROJECT_ID
)

In [7]:

# Text-generation settings for a short, repeatable answer:
#   stop_sequences - generation stops at the first "." or newline
#   max_new_tokens - at most 10 tokens are generated
#   temperature    - 0, because no randomness is wanted here
PARAMS = TextGenParameters(
    stop_sequences=[".", "\n"],
    max_new_tokens=10,
    temperature=0,
)

# Inference handle for the Llama 3.3 70B instruct model, using the settings above.
model = ModelInference(
    model_id="meta-llama/llama-3-3-70b-instruct",
    params=PARAMS,
    api_client=client,
)

In [11]:
# Prompt template sent to the model for each job post.
# Placeholders: {categories} - the list of allowed categories,
#               {text}       - the job post to classify.
# The template ends right after the answer cue so the model's first words are the category.
SYSTEM_PROMPT = """Your task is to classify the job descriptions into one of the given categories. If you think the data doesn't fit in any of them assign it to OTHER. Please assign the correct category to the text. Answer with the correct category and nothing else.

CATEGORIES:
{categories}

JOB POST:
{text}

The correct job category for the job post is:
"""

In [67]:
CATEGORIES = "- " + "\n- ".join(label_map2)  # Bullet list with one category per line

MAX_ATTEMPTS = 3         # model calls per row before the row is given up on
RETRY_DELAY_SECONDS = 5  # pause after a failed model call

predictions = []  # never populated; kept so the name still exists for other cells

refined_df = []   # rows (as Series) whose prediction is one of the accepted categories


# iterrows() has no length, so pass total= to get a real progress bar and ETA.
for index, row in tqdm(sample_df.iterrows(), total=len(sample_df)):
    # Missing fields arrive as NaN; build the text from whatever is present
    # instead of letting a TypeError be mistaken for an API failure.
    parts = [str(row[col]) for col in ('job_title', 'job_description') if pd.notna(row[col])]
    if not parts:
        print(f"Row {index}: no job title or description, skipped")
        continue
    text = ". ".join(parts)

    # format the prompt with the categories and the text
    prompt = SYSTEM_PROMPT.format(categories=CATEGORIES, text=text)

    prediction = None
    for attempt in range(1, MAX_ATTEMPTS + 1):
        try:
            # generate the response from the model
            response = model.generate(prompt)

            # extract the generated text and drop surrounding whitespace and any "."
            prediction = response["results"][0]["generated_text"].strip().replace(".", "")
            break
        except Exception as e:
            # Best effort: report which row failed and why, wait, then try again.
            print(f"Row {index}, attempt {attempt}/{MAX_ATTEMPTS}. The error is: ", e)
            if attempt < MAX_ATTEMPTS:
                time.sleep(RETRY_DELAY_SECONDS)

    if prediction is None:
        # every attempt failed; leave this row out of the result
        continue

    if prediction in label_map:
        # store a labelled copy so the Series handed out by iterrows() is not mutated
        labelled_row = row.copy()
        labelled_row['predictions'] = prediction
        refined_df.append(labelled_row)
    else:
        # 'OTHER' or anything outside the accepted list: show it and drop the row
        print(prediction)



0it [00:00, ?it/s]

1100it [08:03,  2.28it/s]


In [30]:
# Original columns followed by the new 'predictions' column.
# Built by concatenation: list.append() returns None, which would silently
# pass columns=None and ignore the intended column list.
column_names = list(df2.columns) + ['predictions']

# One row per accepted prediction. Naming the columns also keeps the frame
# well-formed (with a 'predictions' column) when no row was accepted.
updated_df = pd.DataFrame(refined_df, columns=column_names)
updated_df.head()

Unnamed: 0,country,country_code,date_added,has_expired,job_board,job_description,job_title,job_type,location,organization,page_url,salary,sector,uniq_id,predictions
13035,United States of America,US,,No,jobs.monster.com,The QC Lead position requires an understanding...,Quality Control Lead Job in Santa Cruz,Full Time,"Santa Cruz, CA 95060",Manufacturing - Other,http://jobview.monster.com/Quality-Control-Lea...,,Experienced (Non-Manager),c44af81ceb38d2b6fe00baf15d0dffe6,HEALTHCARE
8732,United States of America,US,,No,jobs.monster.com,POSITION PURPOSE The Senior Manager Foundation...,Sr Manager,Full Time,"Atlanta, GA 30301",Retail,http://jobview.monster.com/sr-manager-foundati...,,Manager (Manager/Supervisor of Staff),ef7db9ba10dd8bf7754ecf6ddd09a5aa,PUBLIC-RELATIONS
7591,United States of America,US,,No,jobs.monster.com,GroundskeeperWest Knox Apart need a F/T Ground...,Groundskeeper Job in Knoxville,Full Time,"Knoxville, TN 37923",,http://jobview.monster.com/Groundskeeper-Job-K...,,,cd3adb94fc647ca5ea3af731dc1da065,CONSTRUCTION
17295,United States of America,US,,No,jobs.monster.com,Financial Additions has partnered up with a la...,Staff Accountant Job in Dallas,Full Time,"Dallas, TX",Accounting and Auditing Services,http://jobview.monster.com/Staff-Accountant-Jo...,,Entry Level,02eb906a2bfbca24bf1e9bd26596d603,ACCOUNTANT
5857,United States of America,US,,No,jobs.monster.com,"Plastipak Packaging, Inc. is seeking a Legal A...",Legal Assistant Job in Plymouth,Full Time,"Plymouth, MI 48170",,http://jobview.monster.com/Legal-Assistant-Job...,,Legal,c89fc1a032e406b38eee2cd202cab910,ADVOCATE


In [34]:
# Make sure the output folder exists first: to_csv does not create missing
# directories, and failing here would throw away the classified rows.
output_path = Path('data/processed/jobposts.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)

# Write the classified posts without the DataFrame index.
updated_df.to_csv(output_path, index=False)

In [33]:
# Number of posts per predicted category, most frequent first.
updated_df['predictions'].value_counts(sort=True, ascending=False)

predictions
SALES                     1365
HEALTHCARE                1193
ENGINEERING               1107
INFORMATION-TECHNOLOGY    1056
CONSTRUCTION               710
CHEF                       419
ACCOUNTANT                 386
HR                         341
AUTOMOBILE                 283
ADVOCATE                   250
FINANCE                    211
BUSINESS-DEVELOPMENT       140
PUBLIC-RELATIONS           125
AGRICULTURE                 81
TEACHER                     68
AVIATION                    52
CONSULTANT                  42
APPAREL                     33
BANKING                     31
DESIGNER                    29
DIGITAL-MEDIA               27
FITNESS                     10
ARTS                         5
Name: count, dtype: int64

***

# Let's manually classify some job posts so we can see how well the AI is doing and whether the results are acceptable

In [None]:
import pandas as pd

In [None]:
# Load the dataset
df = pd.read_csv('data/processed/jobposts.csv')

# Take the first 100 rows as an independent copy, so adding and filling the
# new column writes to `subset` itself rather than to a slice of `df`
# (which raises SettingWithCopyWarning and may not stick).
subset = df.head(100).copy()

# Create a new column to store user input
subset['manually_classified_category'] = ''

# Loop through each row and prompt the user for input
for i, row in subset.iterrows():
    print(f"\nJob #{i+1}")
    print("Title:", row['job_title'])
    print("Description:", row['job_description'])
    # Normalise the typed label so it can be compared with the upper-case
    # category names stored in 'predictions'.
    category = input("Enter job category: ").strip().upper()
    # The model's answer is shown only after the reviewer has typed theirs.
    print("Model prediction:", row['predictions'])
    subset.at[i, 'manually_classified_category'] = category

# Save the updated DataFrame to a new CSV
subset.to_csv('data/jobposts_with_human_classification.csv', index=False)
print("\nSaved to data/jobposts_with_human_classification.csv")