# Formatting Data for Submission

In order to cut down on API requests and costs, one step that we considered was to identify the 10 most popular job titles in our data and omit records with those titles from our API submission. Instead, we categorize these jobs ourselves. To accomplish this, first determine which job titles are the most popular.

In [None]:
import os
import json
from collections import Counter

def process_jsonl_folder(folder_path, output_file, top_n=100):
    """Tally job titles across a folder of JSONL files and save the most common.

    Every file in ``folder_path`` whose name ends with ``.jsonl`` is read line
    by line. The ``title.name`` value of each record is lowercased and
    counted, and the ``top_n`` most frequent titles are written to
    ``output_file`` as JSONL records with the keys ``title.name``,
    ``frequency`` and ``percentage`` (share of all counted titles).

    Args:
        folder_path: Directory containing the input ``.jsonl`` files.
        output_file: Path of the JSONL file to create or overwrite.
        top_n: How many of the most frequent titles to write (default 100).

    Blank lines are skipped silently. Lines that are not valid JSON are
    reported and skipped. Records that are not JSON objects, or whose
    ``title.name`` is missing, empty or not a string, are not counted.
    """
    title_counter = Counter()
    total_lines_read = 0

    # Sorted so that ties in most_common() do not depend on directory order.
    for file_name in sorted(os.listdir(folder_path)):
        if not file_name.endswith('.jsonl'):
            continue
        file_path = os.path.join(folder_path, file_name)
        with open(file_path, 'r', encoding='utf-8') as file:
            for line in file:
                total_lines_read += 1
                stripped = line.strip()
                if not stripped:
                    continue
                try:
                    record = json.loads(stripped)
                except json.JSONDecodeError:
                    print(f"Error decoding line in file {file_name}: {stripped}")
                    continue
                # A line can be valid JSON without being an object (e.g. a list).
                if not isinstance(record, dict):
                    continue
                title_name = record.get('title.name')
                if title_name and isinstance(title_name, str):
                    # Normalize to lowercase for case-insensitivity
                    title_counter[title_name.lower()] += 1

    total_titles = sum(title_counter.values())
    top_titles = title_counter.most_common(top_n)

    # Write results to the output JSONL file
    with open(output_file, 'w', encoding='utf-8') as out_file:
        for title, frequency in top_titles:
            percentage = f"{(frequency / total_titles) * 100:.2f}%"
            output_record = {
                "title.name": title,
                "frequency": frequency,
                "percentage": percentage,
            }
            out_file.write(json.dumps(output_record) + '\n')

    print(f"Top {top_n} job titles written to {output_file}")
    print(f"Total lines read from all input files: {total_lines_read}")

# Example usage: point both paths at real locations before running this cell,
# because process_jsonl_folder is called immediately with whatever is set here.
folder_path = "path/to/folder/with/your/data"  # Replace with your folder path
output_file = "path/to/jsonl/output/file"   # Replace with your desired output file path
process_jsonl_folder(folder_path, output_file)


The above code will find the 100 most popular jobs across all provided data. We used the 10 most popular as our filter, but this can be changed as needed. Furthermore, when applying our filter, we use a map to associate each popular job title with a seniority level: Entry, Mid, Senior, or Executive. This gives us a seniority level to assign manually to each such position, so that our manually categorized data resembles what OpenAI will send us.

## Data Concatenation

This step is entirely optional depending on how your data was handled, but when we worked on the project, our data was split into multiple partitions. However, for the next step, only a single file is used as input. Below is a simple concatenation script which will write all lines of the jsonl files in a directory into a single new file.

In [None]:
import os

def combine_jsonl_files(directory="path/to/data/directory", output_filename="combined.jsonl"):
    """Concatenate every ``.jsonl`` file in a directory into one new file.

    Args:
        directory: Folder holding the partitioned ``.jsonl`` files. The
            combined file is created inside this same folder.
        output_filename: Name of the combined file. A file with this name is
            never read as an input, so the output does not include itself.

    Returns:
        The path of the combined file that was written.
    """
    output_filepath = os.path.join(directory, output_filename)

    with open(output_filepath, "w", encoding="utf-8") as outfile:
        # Sorted so the order of records in the output is reproducible.
        for filename in sorted(os.listdir(directory)):
            # Only process .jsonl files and skip the output file itself
            if not filename.endswith(".jsonl") or filename == output_filename:
                continue

            print(f"Writing data from {filename}")
            file_path = os.path.join(directory, filename)

            with open(file_path, "r", encoding="utf-8") as infile:
                last_line = ""
                for line in infile:
                    outfile.write(line)
                    last_line = line

            # A file without a trailing newline would otherwise have its last
            # record glued onto the first record of the next file.
            if last_line and not last_line.endswith("\n"):
                outfile.write("\n")

    return output_filepath


if __name__ == "__main__":
    combine_jsonl_files()


## Preparing Data for Submission

In order for OpenAI to be able to interpret the data that we send them, we must create a series of jsonl files that adhere to how they attempt to read our data. The code below will do the following:

- Separate data into 2 categories, those whose title.name is or isn't found in our popular job map
- Partition unpopular job group into files containing at most 50000 lines (the maximum for OpenAI batch submissions)
- Create custom ids for each unpopular job entry, based on their employee_id
- Create a single jsonl file of the entries with popular jobs, who we manually analyzed

Note that the prompt & model should be changed as needed. For our project, we used the gpt-4o-mini model. It is also possible that the API endpoint is changed in the future, but as of the time of this project, we used the chat completions endpoint. 

### Necessary files & inputs

- A single jsonl file containing all data to be processed.
- An output directory for popular jobs (manually reviewed data)
- An output directory for unpopular jobs (formatted data to be submitted to OpenAI)
- Popular jobs to filter through (change as needed)
- A prompt that you would like the AI to read.
- Optionally, change chunk size depending on how many divisions you would like data to be (maximum 50000)

In [None]:
import json

# Input file paths
# Single JSONL file holding every record to be split (one JSON object per line).
input_file = "/Users/will/research/DataRetrieval/filtered_data/combined.jsonl"

# Directories for output
# popular_dir receives popular_jobs.jsonl (manually labelled records);
# unpopular_dir receives the numbered unpopular_jobs_<n>.jsonl batch files.
popular_dir = "/Users/will/research/DataRetrieval/splitData/popular"
unpopular_dir = "/Users/will/research/DataRetrieval/splitData/unpopular"

# Popular titles and their seniority levels
# Keys are lowercase because each record's title.name is lowercased before
# it is compared against them; values are the manually assigned seniority.
popular_titles = {
    "personal banker": "Entry",
    "vice president": "Executive",
    "manager": "Mid",
    "associate": "Mid",
    "teller": "Entry",
    "financial advisor": "Mid",
    "bank teller": "Entry",
    "analyst": "Mid",
    "project manager": "Senior",
    "business analyst": "Mid"
}

# File limits for unpopular jobs
# The buffer is flushed to a new file once it holds this many records.
chunk_size = 50000

# Initialize counters and buffers
# unpopular_buffer collects (custom_id, record) tuples awaiting a flush;
# unpopular_file_index is the number used in the next output file name.
unpopular_buffer = []
unpopular_file_index = 1

# Track occurrences of each employee_id
# Maps an employee_id to how many times it has been seen so far.
employee_id_counts = {}

# System message for preparing unpopular jobs
# Sent as the "system" message of every request written to the batch files.
system_message = """Your prompt here."""

def write_unpopular_jobs():
    """Flush the buffered unpopular jobs to the next numbered batch file.

    Each ``(custom_id, record_data)`` pair in the module-level
    ``unpopular_buffer`` becomes one chat-completions request line in
    ``<unpopular_dir>/unpopular_jobs_<index>.jsonl``. Afterwards the buffer
    is emptied and ``unpopular_file_index`` is advanced by one. Nothing is
    written, and the index is left alone, when the buffer is empty.
    """
    global unpopular_buffer, unpopular_file_index

    if not unpopular_buffer:
        return

    # Imported here because this cell only imports json at the top.
    import os

    # Create the output folder if needed; otherwise open() raises
    # FileNotFoundError for a problem unrelated to the input file.
    os.makedirs(unpopular_dir, exist_ok=True)

    output_file = f"{unpopular_dir}/unpopular_jobs_{unpopular_file_index}.jsonl"
    print(f"Writing {len(unpopular_buffer)} unpopular jobs to {output_file}")

    # ensure_ascii=False emits raw non-ASCII characters, so the file must be
    # opened as UTF-8 explicitly instead of in the platform default encoding.
    with open(output_file, 'w', encoding='utf-8') as f:
        for custom_id, record_data in unpopular_buffer:
            output_line = {
                "custom_id": custom_id,
                "method": "POST",
                "url": "/v1/chat/completions",
                "body": {
                    "model": "gpt-4o-mini",
                    "messages": [
                        {"role": "system", "content": system_message},
                        # The record itself, serialized as JSON text, is the user content
                        {"role": "user", "content": json.dumps(record_data, ensure_ascii=False)}
                    ],
                    "max_tokens": 1000
                }
            }
            f.write(json.dumps(output_line, ensure_ascii=False) + "\n")

    unpopular_file_index += 1
    unpopular_buffer = []

def get_or_make_employee_id(original_id):
    """Return a custom_id for a record that is unique across the whole run.

    The first time an ID is seen it is returned unchanged (as a string).
    Repeats get a numeric suffix, e.g. ``'asdf_2'``, ``'asdf_3'``. Records
    with no employee_id share the base ``'no_employee_id'`` and are numbered
    the same way, so they do not all end up with an identical custom_id.

    Args:
        original_id: The record's ``employee_id`` value; may be empty or None.

    Returns:
        A string that this function has not returned before, given the
        current contents of the module-level ``employee_id_counts``.
    """
    # Fallback if no employee_id is present; str() keeps non-string IDs
    # usable as a custom_id.
    base_id = str(original_id) if original_id else "no_employee_id"

    if base_id not in employee_id_counts:
        employee_id_counts[base_id] = 1
        return base_id

    # Every ID handed out is also recorded as a key, so a generated
    # 'x_2' can never clash with a record whose real ID is 'x_2'.
    while True:
        employee_id_counts[base_id] += 1
        candidate = f"{base_id}_{employee_id_counts[base_id]}"
        if candidate not in employee_id_counts:
            employee_id_counts[candidate] = 1
            return candidate

# Process the input file.
#
# Each line of input_file is one JSON record. A record whose lowercased
# title.name contains a popular title is written straight to
# popular_jobs.jsonl with the manually assigned seniority; every other record
# is buffered and flushed to numbered batch files of at most chunk_size lines.
try:
    # Imported here because this cell only imports json at the top.
    import os

    # Create the output folder up front so a missing directory is not
    # reported below as a missing input file.
    os.makedirs(popular_dir, exist_ok=True)
    popular_output_file = f"{popular_dir}/popular_jobs.jsonl"

    # Check the longest titles first, so that a more specific entry such as
    # "project manager" is not shadowed by the shorter "manager".
    titles_by_specificity = sorted(popular_titles, key=len, reverse=True)

    # The popular output is opened once (still in append mode) rather than
    # being reopened for every matching record.
    with open(input_file, 'r', encoding='utf-8') as infile, \
            open(popular_output_file, 'a', encoding='utf-8') as popular_out:
        for line_num, line in enumerate(infile, start=1):
            line = line.strip()
            if not line:
                continue

            try:
                data = json.loads(line)
            except json.JSONDecodeError as e:
                print(f"Error decoding JSON on line {line_num}: {e}")
                continue

            # Valid JSON that is not an object has no fields to read; skip it
            # instead of letting it abort the whole run.
            if not isinstance(data, dict):
                print(f"Skipping line {line_num}: expected a JSON object")
                continue

            title_name = data.get("title.name")
            title_name = title_name.lower() if isinstance(title_name, str) else ""

            # Extract and prepare custom_id
            final_employee_id = get_or_make_employee_id(data.get("employee_id", ""))

            matching_title = next(
                (title for title in titles_by_specificity if title in title_name),
                None,
            )

            if matching_title is not None:
                output_line = {
                    "custom_id": final_employee_id,
                    "all results": [
                        {
                            "job_function": matching_title,
                            "job_function_certainty": "manual",
                            "seniority": popular_titles[matching_title],
                            "seniority_certainty": "manual"
                        }
                    ]
                }
                popular_out.write(json.dumps(output_line) + "\n")
            else:
                # Remove employee_id from the content we'll send to "user"
                # but keep the rest
                data_no_id = {k: v for k, v in data.items() if k != "employee_id"}

                # Store the custom_id and the data-no-ID together
                unpopular_buffer.append((final_employee_id, data_no_id))

                # Write out if we hit chunk_size
                if len(unpopular_buffer) >= chunk_size:
                    write_unpopular_jobs()

    # Write any remaining unpopular jobs
    write_unpopular_jobs()

    print("Processing complete. Files saved to output directories.")

except FileNotFoundError as e:
    print(f"Input file not found: {e}")
except Exception as e:
    print(f"An unexpected error occurred: {e}")

## Additional Token Utility

One important additional detail to consider is how many tokens your data contains. Everyone using the API has a rate limit, which stops excessive use of the API. While this limit can be increased by additional spending, it is important to consider how many tokens your data takes before submitting.

Luckily, OpenAI uses a Python library called "tiktoken" in order to tokenize their data, meaning that we can use this same library to get a good estimate of how many tokens our data will take. While we can calculate the input tokens exactly, it is impossible to know exactly how many tokens will be in the output. So, I recommend running small tests and calculating an average token count for the response, and basing your output calculations on that. For our data, our responses were always less than 100 tokens each, so I used 100 as a worst case per data point. Also, there are some additional "overhead" tokens which OpenAI adds to the API calls which we account for (note that these rules only apply to the GPT-4 models, and may likely change in the future, although the overhead is very small). 

It is recommended that you be aware of how many tokens your data will use before submitting to OpenAI. Exceeding the daily rate limit will cause the submission, and all subsequent submissions that day, to fail. Also, there is a maximum of 50000 lines in the jsonl input file and a maximum of 200MB of data per file as of the time of writing. Ensure that your rate limit (visible in your OpenAI dashboard) is greater than the sum of the data that you will submit, and that no individual file exceeds 200MB in size.

### Input

- Provide a directory containing all files you wish to analyze.

In [None]:
pip install tiktoken

In [None]:
import os
import json
import tiktoken

def count_tokens_gpt4(messages, model="gpt-4o-mini"):
    """Estimate the prompt tokens used by a list of chat messages.

    Counts 3 overhead tokens per message, the encoded length of every value
    in each message, 1 extra token when a message carries a ``name`` key,
    and 3 final tokens for the start of the assistant's reply.

    Args:
        messages: Iterable of message dicts (e.g. ``role`` / ``content``).
        model: Model whose tokenizer should be used. Defaults to the model
            named in the batch request files; if tiktoken cannot map the
            model to an encoding, ``cl100k_base`` is used instead.

    Returns:
        The estimated number of prompt tokens as an int.
    """
    try:
        encoding = tiktoken.encoding_for_model(model)
    except KeyError:
        # Unknown model name: fall back to the GPT-4 encoding.
        encoding = tiktoken.get_encoding("cl100k_base")

    tokens_per_message = 3
    tokens_per_name = 1

    # Start with the 3 tokens for the assistant's start-of-reply overhead
    total_tokens = 3

    for message in messages:
        # Add overhead tokens per message
        total_tokens += tokens_per_message

        for key, value in message.items():
            if key == "name":
                total_tokens += tokens_per_name
            # Ensure we're encoding a string
            text = value if isinstance(value, str) else str(value)
            # disallowed_special=() makes text that merely looks like a
            # special token count as ordinary text instead of raising.
            total_tokens += len(encoding.encode(text, disallowed_special=()))

    return total_tokens

def process_directory(directory_path, max_response_tokens=100, verbose=True):
    """Estimate token usage for every batch request in a directory.

    Reads each ``.jsonl`` file in ``directory_path``, takes the
    ``body.messages`` list of every request line, counts its prompt tokens
    with ``count_tokens_gpt4`` and adds ``max_response_tokens`` as the
    assumed worst-case completion size. Totals are printed at the end.

    Args:
        directory_path: Folder containing the batch ``.jsonl`` files.
        max_response_tokens: Tokens assumed for each response (default 100).
        verbose: When True (default), print the figures for every request.

    Returns:
        A tuple ``(request_count, total_prompt_tokens, total_estimated_tokens)``.

    Blank lines are skipped. Lines that are not valid JSON, or are not JSON
    objects, are reported and skipped rather than aborting the run.
    """
    grand_total_prompt = 0
    grand_total_estimated = 0
    request_count = 0

    # Sorted so request numbering is reproducible between runs.
    for filename in sorted(os.listdir(directory_path)):
        if not filename.endswith(".jsonl"):
            continue

        file_path = os.path.join(directory_path, filename)
        print(f"Processing file: {filename}")

        with open(file_path, "r", encoding="utf-8") as f:
            for line_num, line in enumerate(f, start=1):
                line = line.strip()
                if not line:
                    continue

                try:
                    data = json.loads(line)
                except json.JSONDecodeError as e:
                    print(f"  Error decoding JSON on line {line_num} of {filename}: {e}")
                    continue
                if not isinstance(data, dict):
                    print(f"  Skipping line {line_num} of {filename}: expected a JSON object")
                    continue

                # Extract the messages array; tolerate a missing or null body
                body = data.get("body") or {}
                messages = body.get("messages") or []

                prompt_tokens = count_tokens_gpt4(messages)
                estimated_total_tokens = prompt_tokens + max_response_tokens

                request_count += 1
                if verbose:
                    print(f"  Request {request_count} prompt tokens: {prompt_tokens}")
                    print(f"  Request {request_count} estimated total with response: {estimated_total_tokens}\n")

                grand_total_prompt += prompt_tokens
                grand_total_estimated += estimated_total_tokens

    print("-" * 50)
    print(f"Total requests processed: {request_count}")
    print(f"Grand total prompt tokens (across all requests): {grand_total_prompt}")
    print(f"Grand total estimated tokens (prompt + {max_response_tokens} each): {grand_total_estimated}")

    return request_count, grand_total_prompt, grand_total_estimated

def main():
    """Run the token estimate over the configured batch-file directory."""
    # Update this path as needed
    process_directory("path/to/your/directory")

if __name__ == "__main__":
    main()
