In [None]:
import openai
import pandas as pd
import numpy as np
import os
import re
from tqdm import tqdm

# Data

In [None]:
# Load the raw skill data from an Excel source.
# Template: replace the ##-...-## placeholder with the workbook location before running.
df = pd.read_excel(##-Insert Data Here-##)

In [None]:
local_skill = pd.DataFrame({'skill': df[##--Insert Skill Column Here--##].unique().tolist()})
local_skill['skill'] = local_skill['skill'].apply(lambda x: f"<skill>{x}</skill>" if pd.notna(x) else x) # better readibility for prompt
print(local_skill.head(5))

In [None]:
# Load the reference table that process_reading() turns into the classification prompt.
# Template: replace the ##-...-## placeholder with the CSV location before running.
reading_df = pd.read_csv(##-Insert Data Here-##)

# Prompt

In [None]:
# ChatGPT API
# Read the key from the OPENAI_API_KEY environment variable instead of pasting it
# into the notebook, so the secret is never saved or shared together with the file.
client = openai.OpenAI(
    api_key=os.environ.get("OPENAI_API_KEY")
)

# Enable tqdm for pandas (adds Series.progress_apply, used by the main loop)
tqdm.pandas(desc="Processing skill categorizing")

# In-memory record of every result produced by process_row, in processing order
chosen_reason_list = []
chosen_type_list = []

# Establishing prompt
def get_chat_session(model="gpt-4o"):
    """Build the opening messages of a new classification conversation.

    The classification text produced by ``process_reading(reading_df)`` is
    embedded in the first user turn, followed by a canned assistant
    acknowledgement.

    Parameters
    ----------
    model : str
        Accepted for interface compatibility; not used by this function.

    Returns
    -------
    list[dict]
        A fresh list of three ``{"role", "content"}`` messages
        (system, user, assistant).
    """
    classification_text = process_reading(reading_df)

    system_turn = {
        "role": "system",
        "content": "You are an AI assistant tasked with classifying skills based on skills.",
    }
    priming_turn = {
        "role": "user",
        "content": f"Please remember the following skill classification system for future responses:\n\n{classification_text}",
    }
    acknowledgement_turn = {
        "role": "assistant",
        "content": "Understood. I will remember this classification system for our session.",
    }
    return [system_turn, priming_turn, acknowledgement_turn]

def get_gpt4_response(prompt, chat_history, model="gpt-4o", max_retries=3):
    """Send ``prompt`` as the next user turn and return the model's reply.

    The request contains the whole ``chat_history`` plus the new user turn.
    ``chat_history`` is only mutated after a successful, non-empty reply
    (user turn, then assistant turn), so failed attempts and retries never
    leave duplicate or unanswered user messages in the shared history.

    Parameters
    ----------
    prompt : str
        Text of the user message to send.
    chat_history : list[dict]
        Conversation so far; extended in place on success.
    model : str
        Model name passed to the chat completions endpoint.
    max_retries : int
        Maximum number of attempts. With a value <= 0 no request is made
        and ``None`` is returned.

    Returns
    -------
    str
        The stripped reply text, or the literal ``"Error"`` when the reply
        is empty or missing.

    Raises
    ------
    Exception
        Whatever the final attempt raised, once all retries are used up.
    """
    # NOTE(review): every successful call grows chat_history by two messages and
    # the full history is resent each time, so request size keeps increasing
    # over a long run.
    user_message = {"role": "user", "content": prompt}

    for attempt in range(max_retries):
        try:
            # Build the outgoing messages without touching chat_history yet
            response = client.chat.completions.create(
                model=model,
                messages=chat_history + [user_message]
            )
            # content can be None (no text returned); treat it like an empty reply
            response_content = (response.choices[0].message.content or "").strip()
        except Exception as e:
            if attempt == max_retries - 1:
                print(f"Error on final attempt: {e}")
                raise
            print(f"Error occurred, retrying... ({attempt + 1}/{max_retries})")
            continue

        if not response_content:
            print(f"Empty response detected for prompt: {prompt}, assigning Error.")
            return "Error"

        # Commit the exchange to the shared history (preserves memory)
        chat_history.append(user_message)
        chat_history.append({"role": "assistant", "content": response_content})
        return response_content


# Write the prompt
def process_reading(reading_df):
    """Turn the reference table into the text embedded in the session's first user message.

    Template placeholder: until the ##-...-## marker below is replaced with a
    real expression, this function returns None.
    """
    return ##-Insert Prompt Here-##

def custom_prompt(skill):
    """Build the per-skill prompt that process_row sends to the model.

    process_row looks for the labels 'Reason Tag: ' and
    'Chosen Skill Type Tag: ' in the reply, so the prompt presumably needs to
    request that output format -- TODO confirm when filling in the template.

    Template placeholder: until the ##-...-## marker below is replaced with a
    real expression, this function returns None.
    """
    return ##-Insert Prompt Here-##


# Extract the skill inside the <skill>x</skill> format
def extract_skill(skill_text):
    """Return the text wrapped in the first ``<skill>...</skill>`` pair.

    Parameters
    ----------
    skill_text : object
        Usually a tagged string. Non-string values (e.g. NaN or None for a
        missing skill) are returned unchanged instead of raising.

    Returns
    -------
    object
        The inner text of the first tag pair, or ``skill_text`` itself when
        it is not a string or contains no complete tag pair.
    """
    # Missing skills reach this function as NaN/None; re.search would raise on them
    if not isinstance(skill_text, str):
        return skill_text

    # DOTALL so a skill that spans several lines is still unwrapped
    match = re.search(r'<skill>(.*?)</skill>', skill_text, re.DOTALL)
    if match:
        return match.group(1)
    return skill_text


# Processing rows function
def process_row(skill_text, chat_history):
    """Classify one skill and return its reason and type.

    The skill is unwrapped with ``extract_skill``, turned into a prompt with
    ``custom_prompt`` and sent through ``get_gpt4_response``. The reply is
    searched for a ``Reason Tag: ...`` line and a ``Chosen Skill Type Tag: ...``
    line; each may appear on any line of the reply, including the last one.

    Parameters
    ----------
    skill_text : str
        The (possibly ``<skill>``-tagged) skill to classify.
    chat_history : list[dict]
        Shared conversation passed through to ``get_gpt4_response``.

    Returns
    -------
    pandas.Series
        With keys ``'chosen_reason'`` and ``'chosen_type'``. Fallback values:
        ``'Reason was none'`` / ``'Type was none'`` when a tag is absent,
        ``'Error in receiving response'`` when no usable reply came back, and
        ``'Error in requesting GPT'`` when the request raised.

    Side effects
    ------------
    Appends both results to the module-level ``chosen_reason_list`` and
    ``chosen_type_list``.
    """
    skill = extract_skill(skill_text)

    # Obtain prompt result
    prompt = custom_prompt(skill)
    try:
        response = get_gpt4_response(prompt, chat_history)
        # get_gpt4_response reports an empty reply with the literal "Error",
        # which must not be parsed as if it were model output
        if response and response != "Error":
            # MULTILINE lets `$` end the capture at the end of any line, so the
            # tags are found regardless of their order or position in the reply
            reason_match = re.search(r'Reason Tag: (.*?)$', response, re.MULTILINE)
            type_match = re.search(r'Chosen Skill Type Tag: (.*?)$', response, re.MULTILINE)
            chosen_reason = reason_match.group(1).strip() if reason_match else 'Reason was none'
            chosen_type = type_match.group(1).strip() if type_match else 'Type was none'
        else:
            print(f"Non-direct response detected for {skill_text}:\n{response}")
            chosen_reason, chosen_type = 'Error in receiving response', 'Error in receiving response'
    except Exception as e:
        print(f"Error processing {skill_text}: {e}")
        chosen_reason, chosen_type = 'Error in requesting GPT', 'Error in requesting GPT'

    # Store results in cache
    chosen_reason_list.append(chosen_reason)
    chosen_type_list.append(chosen_type)

    return pd.Series({
        'chosen_reason': chosen_reason,
        'chosen_type': chosen_type
    })

# Resuming
output_path = ##-Insert Output Path (File) Here-##
if os.path.exists(output_path):
    print(f"Resuming from {output_path}...")
    existing_data = pd.read_csv(output_path)
    completed_rows = max(existing_data["chosen_type"].notna().sum(), existing_data["chosen_reason"].notna().sum())
    local_skill[['chosen_reason', 'chosen_type']] = existing_data[['chosen_reason', 'chosen_type']] 
else:
    print("Starting fresh processing...")
    completed_rows = 0
    local_skill[['chosen_reason', 'chosen_type']] = None, None

# Initialize GPT-4o chat session (memorizes process_reading only once)
# This single list is shared by every request made in the main loop below.
chat_history = get_chat_session()

# Total rows to process
total_rows = len(local_skill)
# Number of rows handled between progress saves; it is used as the step of range()
# in the main loop, so it must be a non-zero integer.
# Template: replace the ##-...-## placeholder before running.
batch_size = ##-Adjust Batch Size Here-##

# Main Process Loop
# Walks the rows in windows of batch_size, starting after the rows already
# completed by a previous run, and saves progress after every window.
for i in tqdm(range(completed_rows, total_rows, batch_size), desc="Processing in batches"):
    batch_end = min(i + batch_size, total_rows)
    print(f"Processing in batch.. from row {i} to {batch_end}...")

    # Process rows i .. batch_end - 1.
    # .loc label slices include their end label, so slicing up to batch_end would
    # send row batch_end to the API again at the start of the next batch.
    last_row = batch_end - 1
    results_batch = local_skill.loc[i:last_row, 'skill']
    processed_batch = results_batch.progress_apply(lambda skill: process_row(skill, chat_history)) # lambda for more than 1 argument function
    local_skill.loc[i:last_row, ['chosen_reason', 'chosen_type']] = processed_batch

    # Save progress every batch
    local_skill.to_csv(f"skill_type_{batch_end}.csv", index=False) # for per batch save, otherwise batch saves will always be replaced as loop goes
    local_skill.to_csv(output_path, index=False) # for resuming
    print(f"Saved progress up to row {batch_end}.")

# Save final results (also writes the file when no batch had to be processed)
local_skill.to_csv(output_path, index=False)
print("Processing complete and saved.")