In [14]:
import openai
import pandas as pd
import numpy as np
import os
import re
from tqdm import tqdm
# Placeholder: replace with the working directory; the relative file paths used for reading/writing below resolve against it.
os.chdir("""INSERT DIRECTORY HERE""")

# Data

In [37]:
# Load the skills workbook; the 'skill' column holds the skill names that get classified later.
df = pd.read_excel('lightcast_skills.xlsx')
# Bare expression so the notebook renders a preview of the frame.
df

Unnamed: 0,skill
0,Regulation Creation
1,Leadership
2,Conflict Resolution
3,Multitasking
4,Revisions
...,...
2067,Electrical Theory
2068,Programming Tools
2069,Product Defect
2070,Self-Regulation


# Prompt

In [12]:
# Read the API key from the environment instead of pasting it into the notebook,
# so the secret is never saved alongside the code or its outputs.
# Set OPENAI_API_KEY before starting the kernel; a missing variable raises KeyError here.
client = openai.OpenAI(
    api_key=os.environ["OPENAI_API_KEY"]
)

In [48]:
# Collects each non-empty GPT response as it is returned (a running backup of progress)
chosen_type_list = []

# Run the cell above only once (i.e. before the cell below has been run for the first time): re-running it resets the list and WILL OVERWRITE ANY SAVED PROGRESS

In [None]:
def get_chat_session(model="gpt-4o"):
    """Build the opening messages of a classification chat session.

    The returned list primes the conversation with three messages: a system
    instruction, a user message embedding the text from ``process_reading()``,
    and a canned assistant acknowledgement.

    Args:
        model: Accepted for signature compatibility; not used by this function.

    Returns:
        A new list of ``{"role": ..., "content": ...}`` dicts on every call.
    """
    system_message = {
        "role": "system",
        "content": "You are an AI assistant tasked with classifying skills based on skills.",
    }
    # The classification scheme is injected once here so later prompts can refer back to it.
    priming_message = {
        "role": "user",
        "content": f"Please remember the following skill classification system for future responses:\n\n{process_reading()}",
    }
    acknowledgement = {
        "role": "assistant",
        "content": "Understood. I will remember this classification system for our session.",
    }
    return [system_message, priming_message, acknowledgement]

def gpt_response(prompt, chat_history, model="gpt-4o", max_retries=3):
    """Send ``prompt`` together with the session history and return the reply text.

    The request is attempted up to ``max_retries`` times using the module-level
    ``client``. ``chat_history`` is only modified after a non-empty reply has
    been received: the user prompt and the assistant reply are then appended
    together. Failed or retried attempts therefore never leave duplicated or
    unanswered user messages in the history.

    Args:
        prompt: Text of the user message to send.
        chat_history: List of ``{"role", "content"}`` message dicts; extended
            in place on success.
        model: Model name passed to the chat completions call.
        max_retries: Maximum number of attempts.

    Returns:
        The stripped reply text, or the string ``"Error"`` when the reply is
        empty or missing. Returns ``None`` if ``max_retries`` is less than 1,
        because no attempt is made.

    Raises:
        Exception: Whatever the client call raised on the final attempt.
    """
    user_message = {"role": "user", "content": prompt}

    for attempt in range(max_retries):
        try:
            # Send a temporary copy of the history plus the new prompt, so a failed
            # attempt cannot pollute the shared history.
            response = client.chat.completions.create(
                model=model,
                messages=chat_history + [user_message]
            )

            # The message content may be None; treat that the same as an empty reply.
            response_content = (response.choices[0].message.content or "").strip()
            if not response_content:
                print(f"Empty response detected for prompt: {prompt}, assigning Error.")
                return "Error"

            # Commit the exchange to the history only once it is complete (preserves memory)
            chat_history.append(user_message)
            chat_history.append({"role": "assistant", "content": response_content})
            return response_content

        except Exception as e:
            if attempt == max_retries - 1:
                print(f"Error on final attempt: {e}")
                raise
            print(f"Error occurred, retrying... ({attempt + 1}/{max_retries})")

def process_reading():
    """Return the reference text describing the three skill categories.

    The text defines Green Skills, Green Potential Skills and Non-Green Skills,
    each with a definition, examples and key characteristics.
    """
    # Plain literal: the text contains no placeholders, so no formatting is needed.
    classification_text = """
    1. Green Skills (Strictly Green)
        A. Definition: Skills that directly contribute to reducing environmental impact by performing tasks aimed at: 
            a. Producing greener outputs (e.g., renewable energy, recycling).
            b. Reducing a firm’s environmental footprint (e.g., pollution control, waste management).
        B. Examples: Environmental Engineers, Solar Panel Technicians, Recycling Plant Operators, Water Treatment Specialists.
        C. Key Characteristics: Involves core green tasks, exists across industries, requires specialized green skills.
    
    2. Green Potential Skills
        A. Definition: Skills that do not currently require green tasks but could become green as industries transition.
        B. Examples: Energy Engineers, Repair Technicians, Construction Workers, Farmers & Foresters.
        C. Key Characteristics: No inherent green tasks but can transition based on industry shifts.

    3. Non-Green Skills
        A. Definition: Skills unrelated to environmental sustainability or green transition.
        B. Examples: Office Administrators, Cashiers, Delivery Drivers, Sales Representatives.
        C. Key Characteristics: Environmentally neutral, unaffected by green industry shifts.
    """
    return classification_text

def custom_prompt(skill):
    """Build the classification prompt for a single skill.

    Args:
        skill: The skill to classify; it is formatted into the prompt text.

    Returns:
        The prompt string, asking for exactly one of the three skill types.
    """
    # Template with a single named slot; filled in below with the given skill.
    template = """
    You are an AI assistant tasked with matching skills to skill types. Skill types include Green Skills, Green Potential Skills, and Non-Green Skills.

    **Instructions:**
    1. Classify the following skill into the most appropriate category: **{skill}** using the information you previously memorized;
    2. Provide the chosen skill type.

    **Response Format:**
    'Green skill' or 'Green Potential skills' or 'Non-Green skills'

    Ensure the response strictly follows those format and only return one of those three you see most fit.
    """
    return template.format(skill=skill)

def process_row(skill, chat_history):
    """Classify one skill and return the outcome as a one-field Series.

    Builds the prompt with ``custom_prompt``, sends it through ``gpt_response``
    and wraps the outcome in a Series. Every path returns the same shape, so a
    row-wise ``apply`` over this function yields a single-column DataFrame.

    Args:
        skill: The skill to classify.
        chat_history: The session message list handed on to ``gpt_response``.

    Returns:
        ``pd.Series`` with the single key ``'chosen_type'``, holding either the
        model's response or an error marker string.
    """
    prompt = custom_prompt(skill)
    try:
        response = gpt_response(prompt, chat_history)
    except Exception as e:
        # Best effort: report the failure and mark the row instead of aborting the whole run.
        print(f"Error processing prompt at {skill}:\n{e}")
        chosen_type = 'Error in prompting at process_row'
    else:
        if response:
            chosen_type_list.append(response) # To save progress as later on progress_apply does not do it
            chosen_type = response
        else:
            print(f"Response was none for {skill}:\n{response}")
            chosen_type = 'Error in receiving response at process_row'

    return pd.Series({
            'chosen_type': chosen_type
        })

# Classify the skills row by row, showing a progress bar
tqdm.pandas(desc="Processing rows")

# Trial run on the first five rows only; switch to the full frame to process everything
results_df = df.iloc[:5].copy()

# One shared session so every prompt is sent with the same primed history
chat_history = get_chat_session()

results_df[['skill_type']] = results_df.progress_apply(
    lambda record: process_row(record['skill'], chat_history),
    axis=1,
)

# Persist the results, then echo them
results_df.to_csv('Lightcast Green Skills Type.csv', index=False)
print("\nResults with GPT Analysis:")
print(results_df[['skill', 'skill_type']])