In [8]:
import os
import pandas as pd
import datetime as datetime
import time as time
# AI
import tiktoken
import json
from openai import OpenAI
from dotenv import load_dotenv

In [9]:
# OpenAI API setup: read the API key from a local env file and build the client.
load_dotenv('keys.env')  # load variables defined in 'keys.env' into the process environment
api_key = os.getenv('OPENAI_API_KEY')  # None when the variable is not set

client = OpenAI(api_key=api_key)  # shared client used by process_batch() and transform()

In [None]:
# Tokenizer for the GPT model; batch_process() uses it to count the tokens of each row.
tokenizer = tiktoken.encoding_for_model("gpt-4o-mini")

# Parameters
# NOTE(review): 16384 is treated here as the total per-request token budget — confirm
# against the current limits of the model in use.
max_tokens_per_request = 16384  # Max tokens allowed by the model
# Half of the budget is reserved for the input rows; batch_process() starts a new batch
# before this limit is exceeded. Only row tokens are counted, not the prompt instructions.
max_tokens_input = max_tokens_per_request // 2  # Reserve half for input

# Function to process a batch
def process_batch(json):
    """Send one batch of records to the model and return the enriched rows.

    Parameters
    ----------
    json : str
        JSON text (an array of row objects) that is embedded verbatim in the
        prompt. Note that this parameter name hides the ``json`` module inside
        the function body.

    Returns
    -------
    pandas.DataFrame
        One row per object found in the model's JSON reply. An empty DataFrame
        is returned when the reply contains no parseable JSON.

    Raises
    ------
    Exception
        Errors raised by the API call itself are not caught here; they
        propagate to the caller.
    """
    # The `json` parameter shadows the module in this scope, so bind it under another name.
    import json as json_lib

    print(json)
    # Create a prompt
    prompt = f"""
                Here is a JSON data: 
                {json}
                
                Please process it according to the following instructions:

                1. Add the following new keys to each object in the dataset:
                - "skills": A list of strings extracted from the "description", each skill is a one or two word keywords.
                - "min_year_of_experience": An integer extracted from the "description" or left blank if not found.
                - "preferred_year_of_experience": An integer extracted from the "description" or left blank if not found.
                - "salary_low": An integer. If the key "salary" in the original dataset has a value, use it; otherwise, extract the information from the "description". If not found, leave it blank.
                - "salary_high": An integer. Process it similarly to "salary_low".
                - "avg_salary": An integer calculated as the average of "salary_low" and "salary_high" (if both are available; otherwise, leave it blank).
                - "location_st": A string standardized from the "location" field in the original dataset. Use the format "City, State". For example:
                    - "New York Metropolitan Area" → "New York, NY".
                    - "California, United States" → "None, CA".
                    - If the original value is simply "United States", leave it blank.
                - "min_degree": A string extracted from the "description" or left blank if not found.
                - "preferred_degree": A string extracted from the "description" or left blank if not found.

                2. Extract all additional information from the "description" field if not directly available in other fields.

                3. Ensure all new keys are included in the output, even if their values are blank.

                print the new JSON dataset without the "description" and without any characters that is not part of a JSON data structure.
            """

    # Call OpenAI API
    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": "You are a data analyst."},
            {"role": "user", "content": prompt}
        ]
    )

    # Extract and parse the response
    content = response.choices[0].message.content
    print(content)
    try:
        text = content or ""
        # The reply may be wrapped in Markdown fences or surrounded by prose, so keep
        # only the span from the first opening bracket to the last closing bracket.
        starts = [pos for pos in (text.find('['), text.find('{')) if pos != -1]
        ends = [pos for pos in (text.rfind(']'), text.rfind('}')) if pos != -1]
        if not starts or not ends:
            raise ValueError("no JSON array or object found in the response")
        parsed = json_lib.loads(text[min(starts):max(ends) + 1])
        # A single object is treated as a one-row dataset.
        if isinstance(parsed, dict):
            parsed = [parsed]
        return pd.DataFrame(parsed)
    except Exception as e:
        print(f"Error parsing API response: {e}")
        return pd.DataFrame()

# Function for batch processing
def batch_process(df):
    """Split ``df`` into token-bounded batches and run each through process_batch().

    Rows are accumulated until adding the next row would push the batch over
    ``max_tokens_input`` tokens (counted with ``tokenizer`` on each row's JSON
    text); the accumulated batch is then sent and a new one is started. A
    single row that is larger than the limit on its own is still sent, alone
    in its batch.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows to process; each row is converted to a dict and serialized to JSON.

    Returns
    -------
    pandas.DataFrame
        Concatenation of all per-batch results, or an empty DataFrame when no
        batch produced a result (including when ``df`` has no rows).
    """
    results = []

    def _send(batch, announcement, error_label):
        # Serialize and submit one batch; a failing batch is reported and skipped
        # so that the remaining batches are still processed.
        print(announcement)
        try:
            # default=str keeps values json cannot encode natively (e.g. timestamps)
            # from aborting the whole run.
            batch_json = json.dumps(batch, default=str)
            results.append(process_batch(batch_json))
        except Exception as e:
            print(f"{error_label}: {e}")

    current_batch = []
    current_tokens = 0

    for index, row in df.iterrows():
        # Convert row to JSON and measure its token cost
        row_json = row.to_dict()
        row_tokens = len(tokenizer.encode(json.dumps(row_json, default=str)))

        # Flush when the next row would not fit. The `current_batch` guard avoids
        # sending an empty batch when the very first (or an oversized) row alone
        # exceeds the limit.
        if current_batch and current_tokens + row_tokens > max_tokens_input:
            _send(
                current_batch,
                f"Processing batch with {len(current_batch)} rows at index {index}...",
                "Error processing batch",
            )
            # Reset for the next batch
            current_batch = []
            current_tokens = 0

        # Add row to the current batch
        current_batch.append(row_json)
        current_tokens += row_tokens

    # Process the last batch
    if current_batch:
        _send(
            current_batch,
            f"Processing final batch with {len(current_batch)} rows...",
            "Error processing final batch",
        )

    # Concatenate all results into a single DataFrame
    return pd.concat(results, ignore_index=True) if results else pd.DataFrame()

In [None]:
def process(df):
    """Unfinished stub: lists the names of the columns to be derived.

    The function only builds a local list of column names; ``df`` is not read
    or modified and nothing is returned (the result is ``None``).
    """
    derived_columns = [
        'skills',
        'location_st',
        'salary_low',
        'salary_high',
        'min_year_of_experience',
        'preferred_year_of_experience',
    ]
# This function calls the OpenAI API with the prompt and returns the response as a string.
# The string is trimmed so that only the Python dictionary text remains.
def transform(row):
    """Ask the model to enrich one record and return the result as dictionary text.

    The row is serialized with ``row.to_json()`` and embedded in the prompt.
    The text between the first ``{`` and the last ``}`` of the reply is parsed
    (as JSON first, then as a Python literal) and must yield a dictionary.
    The API call is attempted up to 11 times (one initial attempt plus 10
    retries), pausing 4 seconds between attempts.

    Parameters
    ----------
    row : pandas.Series
        The record to enrich; anything exposing ``to_json()`` works.

    Returns
    -------
    str or None
        Python-literal text of the parsed dictionary (JSON ``null``/``true``/
        ``false`` appear as ``None``/``True``/``False``), which
        ``ast.literal_eval`` can turn back into a dict. ``None`` when no
        attempt produced a usable dictionary.
    """
    # Local imports: the notebook binds `datetime` to the module (so the bare
    # `datetime.now()` used previously fails), and `ast` is not imported at the top.
    import ast
    from datetime import datetime as _datetime

    json_data = row.to_json()
    print(json_data)
    # concat directions and the input text
    # NOTE(review): some keys below are spelled with spaces ("min year of experience",
    # "salary low", "salary high") while the rest of the notebook uses underscores —
    # confirm which spelling downstream code expects before changing the prompt.
    prompt = f"""
                Here is a JSON data: 
                {json_data}
                
                Add values to the following keys:
                - "skills": A list of strings extracted from the "description", each skill is a one or two word keywords.
                - "min year of experience": An integer extracted from the "description" or left blank if not found.
                - "preferred_year_of_experience": An integer extracted from the "description" or left blank if not found.
                - "salary low": An integer. If the key "salary" in the original dataset has a value, use it; otherwise, extract the information from the "description". If not found, leave it blank.
                - "salary high": An integer. Process it similarly to "salary_low".
                - "location_st": A string standardized from the "location" key. Use the format "City, State". State must be two letter. For example:
                    - "New York Metropolitan Area" → "New York, NY".
                    - "California, United States" → "None, CA".
                    - If the original value is simply "United States", leave it blank.
                - "min_degree": A string extracted from the "description" or left blank if not found.
                - "preferred_degree": A string extracted from the "description" or left blank if not found.

                output the new JSON object minus "description".
            """

    max_retries = 10
    retry_delay_seconds = 4

    for attempt in range(max_retries + 1):
        print(f"Start API call at {_datetime.now()}")
        try:
            # Call API
            response = client.chat.completions.create(
                model="gpt-4o-mini",
                messages=[{"role": "user", "content": prompt}],
                timeout=60
            )
            content = str(response.choices[0].message.content)

            # Extract JSON-like content between first `{` and last `}`
            start = content.find('{')
            end = content.rfind('}')
            if start != -1 and end > start:
                snippet = content[start:end + 1]
                # Parse instead of eval(): the reply is untrusted text, and a blanket
                # "null" -> "None" replacement would also rewrite string values.
                try:
                    parsed = json.loads(snippet)
                except ValueError:
                    # Accept Python-style dictionaries (single quotes, None) as well.
                    parsed = ast.literal_eval(snippet)
                if not isinstance(parsed, dict):
                    raise ValueError("response did not contain a dictionary")
                literal = repr(parsed)
                # Round-trip check: guarantees callers can evaluate the returned text
                # (rejects values such as NaN that have no literal form).
                ast.literal_eval(literal)
                print(f"retrieved dictionary at {_datetime.now()}")
                return literal

            print(f"No valid JSON-like content found in response at {_datetime.now()}")

        except Exception as e:
            print(f"Error during API call or processing: {e}")

        # Do not announce a retry or sleep after the final attempt.
        if attempt < max_retries:
            print(f"Retrying {attempt + 1}...")
            time.sleep(retry_delay_seconds)

    print("Failed to retrieve dictionary after maximum retries.")
    return None