In [10]:
# recreate the job posts csv with correct classification

In [11]:
import pandas as pd
from sklearn.model_selection import train_test_split
from ibm_watsonx_ai import APIClient
from ibm_watsonx_ai import Credentials
from ibm_watsonx_ai.foundation_models import ModelInference
from ibm_watsonx_ai.foundation_models.schema import TextGenParameters
from sklearn.metrics import classification_report 
from tqdm import tqdm

In [12]:

# Read the CSV file into a DataFrame
df2 = pd.read_csv('/Users/nikostsatsabas/Desktop/cbs courses/semester 3/ai and machine learning/final project/aiml_exam/data/jobposts.csv')

sample_df = df2.sample(frac=0.02, random_state=42)

In [13]:

label_map = [
     'INFORMATION-TECHNOLOGY',
     'BUSINESS-DEVELOPMENT',
     'FINANCE',
     'ADVOCATE',
     'ACCOUNTANT',
     'ENGINEERING',
     'CHEF',
     'AVIATION',
     'FITNESS',
     'SALES',
     'BANKING',
     'HEALTHCARE',
     'CONSULTANT',
     'CONSTRUCTION',
     'PUBLIC-RELATIONS',
     'HR',
     'DESIGNER',
     'ARTS',
     'TEACHER',
     'APPAREL',
     'DIGITAL-MEDIA',
     'AGRICULTURE',
     'AUTOMOBILE',
     'ВРО',
     'null' 
]

In [14]:
from decouple import config


WX_API_KEY = config('WX_API_KEY')

credentials = Credentials(
    url = "https://us-south.ml.cloud.ibm.com",
    api_key = WX_API_KEY
)

client = APIClient(
    credentials=credentials, 
    project_id="c8d96942-6948-4a06-9909-2ea6394aca25"
)

In [15]:

PARAMS = TextGenParameters(
    temperature=0,              # Higher temperature means more randomness - In this case we don't want randomness
    max_new_tokens=10,          # Maximum number of tokens to generate
    stop_sequences=[".", "\n"], # Stop generating text when these sequences are encountered
)

model = ModelInference(
    api_client=client,
    model_id="meta-llama/llama-3-3-70b-instruct",  
    params=PARAMS
)

In [16]:
SYSTEM_PROMPT = """You task is to classify the job descriptions into one of the twenty five categories, check the first twenty four ones and if nothing matches assign it to the last one

CATEGORIES:
{categories}

TEXT:
{text}

Please assign the correct category to the text. Answer with the correct category and nothing else.

Category:
"""

In [17]:
CATEGORIES = "- " + "\n- ".join(label_map)  # Create a string with all categories

predictions = []

sample_df['text'] = sample_df['job_title'] + ". " + sample_df['job_description']

for text in tqdm(sample_df["text"]):

    # format the prompt with the categories and the text
    prompt = SYSTEM_PROMPT.format(categories=CATEGORIES, text=text)
    
    # generate the response from the model
    response = model.generate(prompt)

    # extract the generated text from the response
    prediction = response["results"][0]["generated_text"].strip()

    # append the prediction to the list of predictions
    predictions.append(prediction)

sample_df['predicted_label'] = predictions


100%|██████████| 440/440 [02:55<00:00,  2.51it/s]


In [18]:
print(classification_report(sample_df.predicted_label, predictions))

                                               precision    recall  f1-score   support

                                   ACCOUNTANT       1.00      1.00      1.00        18
                                     ADVOCATE       1.00      1.00      1.00        14
                                  AGRICULTURE       1.00      1.00      1.00         4
                                      APPAREL       1.00      1.00      1.00         4
                     APPAREL  is not correct,       1.00      1.00      1.00         3
                                   AUTOMOBILE       1.00      1.00      1.00        16
                                     AVIATION       1.00      1.00      1.00         3
                                      BANKING       1.00      1.00      1.00         1
                         BUSINESS-DEVELOPMENT       1.00      1.00      1.00         5
                                         CHEF       1.00      1.00      1.00        16
                                 CONSTRUCT

In [None]:
refined