In [60]:
# recreate the job posts csv with correct classification

In [61]:
import pandas as pd
from sklearn.model_selection import train_test_split
from ibm_watsonx_ai import APIClient
from ibm_watsonx_ai import Credentials
from ibm_watsonx_ai.foundation_models import ModelInference
from ibm_watsonx_ai.foundation_models.schema import TextGenParameters
from sklearn.metrics import classification_report 
from tqdm import tqdm

In [62]:

# Load the full job-postings export from disk.
JOBPOSTS_CSV = '/Users/nikostsatsabas/Desktop/cbs courses/semester 3/ai and machine learning/final project/aiml_exam/data/jobposts.csv'
df2 = pd.read_csv(JOBPOSTS_CSV)

# Keep a 5% random subset of the rows; the fixed seed makes the draw repeatable.
sample_df = df2.sample(random_state=42, frac=0.05)

In [63]:

# Categories a prediction must match exactly to be kept.
# NOTE: the last entry used to be written with Cyrillic look-alike characters
# instead of the ASCII letters B, P, O, so an ASCII "BPO" answer from the model
# could never pass a `prediction in label_map` check. All labels are ASCII now.
label_map = [
    'INFORMATION-TECHNOLOGY',
    'BUSINESS-DEVELOPMENT',
    'FINANCE',
    'ADVOCATE',
    'ACCOUNTANT',
    'ENGINEERING',
    'CHEF',
    'AVIATION',
    'FITNESS',
    'SALES',
    'BANKING',
    'HEALTHCARE',
    'CONSULTANT',
    'CONSTRUCTION',
    'PUBLIC-RELATIONS',
    'HR',
    'DESIGNER',
    'ARTS',
    'TEACHER',
    'APPAREL',
    'DIGITAL-MEDIA',
    'AGRICULTURE',
    'AUTOMOBILE',
    'BPO',
]

# Same categories plus a catch-all OTHER option. Derived from label_map so the
# two lists cannot drift apart.
label_map2 = [*label_map, 'OTHER']

In [64]:
from decouple import config


# The API key is looked up through python-decouple's config() rather than
# being written into the notebook.
WX_API_KEY = config('WX_API_KEY')

# Credentials for the us-south watsonx.ai endpoint.
credentials = Credentials(api_key=WX_API_KEY, url="https://us-south.ml.cloud.ibm.com")

# API client bound to the project that the model calls run under.
client = APIClient(
    project_id="c8d96942-6948-4a06-9909-2ea6394aca25",
    credentials=credentials,
)

In [65]:

# Text-generation settings used for every classification request.
PARAMS = TextGenParameters(
    stop_sequences=[".", "\n"],  # generation ends once one of these sequences is produced
    max_new_tokens=10,           # upper bound on the number of generated tokens
    temperature=0,               # randomness is not wanted for this task
)

# Inference handle for the hosted Llama 3.3 70B instruct model.
model = ModelInference(
    model_id="meta-llama/llama-3-3-70b-instruct",
    params=PARAMS,
    api_client=client,
)

In [66]:
# Prompt template for a single job post. `{categories}` and `{text}` are
# filled in with str.format before the prompt is sent to the model.
# The first sentence was corrected ("You task" -> "Your task") and punctuated.
SYSTEM_PROMPT = """Your task is to classify the job descriptions into one of the categories. If you think the data doesn't fit in any of them, assign it to OTHER.

CATEGORIES:
{categories}

TEXT:
{text}

Please assign the correct category to the text. Answer with the correct category and nothing else.

Category:
"""

In [67]:
# Bullet list of every category (including OTHER) that is shown to the model.
CATEGORIES = "- " + "\n- ".join(label_map2)

predictions = []

# Rows whose prediction matched a known category; turned into a DataFrame later.
refined_df = []

# Passing `total` lets tqdm draw a real progress bar with an ETA instead of a bare counter.
for index, row in tqdm(sample_df.iterrows(), total=len(sample_df)):

    # Missing values come back from pandas as NaN (a float); concatenating that
    # with a str raises TypeError, so treat any non-string field as empty text.
    job_title = row['job_title'] if isinstance(row['job_title'], str) else ""
    job_description = row['job_description'] if isinstance(row['job_description'], str) else ""
    text = job_title + ". " + job_description

    # Format the prompt with the categories and the text.
    prompt = SYSTEM_PROMPT.format(categories=CATEGORIES, text=text)

    # Generate the response from the model.
    response = model.generate(prompt)

    # Extract the generated text. strip() removes whitespace/newlines; a trailing
    # "." (one of the configured stop sequences) is dropped as well so that an
    # answer such as "SALES." still matches the label list.
    prediction = response["results"][0]["generated_text"].strip().rstrip(".").strip()

    row['predictions'] = prediction

    # Keep only rows assigned to a known category; OTHER and anything
    # unrecognised are discarded.
    if prediction in label_map:
        refined_df.append(row)




0it [00:00, ?it/s]

1100it [08:03,  2.28it/s]


In [68]:
# list.append() mutates in place and returns None, so the previous
# `list(df2).append('predictions')` always left column_names as None.
# Build the intended column list explicitly: the source columns plus the new one.
column_names = [*df2.columns, 'predictions']

# With explicit columns the frame has the expected schema even when no rows
# were kept (an empty refined_df).
updated_df = pd.DataFrame(refined_df, columns=column_names)
updated_df.head()

Unnamed: 0,country,country_code,date_added,has_expired,job_board,job_description,job_title,job_type,location,organization,page_url,salary,sector,uniq_id,predictions
13035,United States of America,US,,No,jobs.monster.com,The QC Lead position requires an understanding...,Quality Control Lead Job in Santa Cruz,Full Time,"Santa Cruz, CA 95060",Manufacturing - Other,http://jobview.monster.com/Quality-Control-Lea...,,Experienced (Non-Manager),c44af81ceb38d2b6fe00baf15d0dffe6,HEALTHCARE
3115,United States of America,US,,No,jobs.monster.com,JOB DESCRIPTION ...,Case Manager Job in Sunset Point,Full Time Employee,"Sunset Point, FL 33765",Healthcare Services,http://jobview.monster.com/Case-Manager-Job-Su...,,Medical/Health,217600308ffc96a1a81eb8afba5ee522,HEALTHCARE
8732,United States of America,US,,No,jobs.monster.com,POSITION PURPOSE The Senior Manager Foundation...,Sr Manager,Full Time,"Atlanta, GA 30301",Retail,http://jobview.monster.com/sr-manager-foundati...,,Manager (Manager/Supervisor of Staff),ef7db9ba10dd8bf7754ecf6ddd09a5aa,PUBLIC-RELATIONS
7591,United States of America,US,,No,jobs.monster.com,GroundskeeperWest Knox Apart need a F/T Ground...,Groundskeeper Job in Knoxville,Full Time,"Knoxville, TN 37923",,http://jobview.monster.com/Groundskeeper-Job-K...,,,cd3adb94fc647ca5ea3af731dc1da065,CONSTRUCTION
221,United States of America,US,,No,jobs.monster.com,Combined Insurance is seeking Sales Agents to ...,Sales Agent Job in Knoxville,Full Time,"Knoxville, TN",Insurance,http://jobview.monster.com/Sales-Agent-Job-Kno...,,,ec59e3f419b851259632b757183d7365,SALES


In [69]:
# Write the filtered, labelled rows to disk without the DataFrame index.
PROCESSED_JOBPOSTS_CSV = '/Users/nikostsatsabas/Desktop/cbs courses/semester 3/ai and machine learning/final project/aiml_exam/data/processed_data/processed_jobposts.csv'
updated_df.to_csv(PROCESSED_JOBPOSTS_CSV, index=False)

In [70]:
# Number of kept rows per predicted category, most frequent first.
prediction_counts = updated_df['predictions'].value_counts()
prediction_counts

predictions
HEALTHCARE                191
INFORMATION-TECHNOLOGY    188
SALES                     158
ENGINEERING               129
CONSTRUCTION               68
CHEF                       43
ACCOUNTANT                 42
ADVOCATE                   38
AUTOMOBILE                 38
HR                         32
FINANCE                    27
PUBLIC-RELATIONS           20
BUSINESS-DEVELOPMENT       19
TEACHER                    11
DIGITAL-MEDIA               8
AGRICULTURE                 6
BANKING                     4
DESIGNER                    3
AVIATION                    2
APPAREL                     2
CONSULTANT                  2
FITNESS                     1
Name: count, dtype: int64