In [3]:
# Ensure pandas is imported
import pandas as pd


def clean_messy_csv(file_path):
    """
    Reads a CSV file with tab-separated values, cleans it, and returns a pandas DataFrame.

    Parameters:
    file_path (str): The path to the CSV file.

    Returns:
    pd.DataFrame: Cleaned data as a pandas DataFrame.
    """
    # Read the file as a text file to manually process the lines
    with open(file_path, "r", encoding='utf-8') as file:  # Added encoding to handle potential UnicodeDecodeErrors
        lines = file.readlines()

    # Split the header and data lines
    header = lines[0].strip().split("\t")
    data_lines = [line.strip().split("\t") for line in lines[1:]]

    # Create a DataFrame from the processed data
    cleaned_data = pd.DataFrame(data_lines, columns=header)

    # Remove any leading/trailing whitespace characters from the headers
    cleaned_data.columns = cleaned_data.columns.str.strip()

    # Optionally, remove any rows with entirely empty values
    cleaned_data.dropna(how="all", inplace=True)

    return cleaned_data


# Now, let's use the function and display the first few rows of the cleaned DataFrame
clean_data = clean_messy_csv('data/Spend_Intake_010124_063024.csv')
# Extract a smaller number of fields from a dataframe
smaller_df = clean_data[['supplier_id', 'supplier_name', 'dba', 'gl_account_code', 'gl_account_desc',
                         'cost_centre_code', 'cost_centre_code_desc', 'internal_classification_code',
                         'internal_classification_desc', 'item_code', 'order_type']]
smaller_df.head()

Unnamed: 0,supplier_id,supplier_name,dba,gl_account_code,gl_account_desc,cost_centre_code,cost_centre_code_desc,internal_classification_code,internal_classification_desc,item_code,order_type
0,776691,RHEEM SALES COMPANY INC,RHEEM SALES COMPANY INC,92020210.5207,Serialized Equipment,92020210,HVAC HDepot AOR McCarthy Heat,31450447,Supplier Invoice Number,UP18AZ48AJVCA (W18231586,JOB/FIELD TICKET PURCHASE
1,776691,RHEEM SALES COMPANY INC,RHEEM SALES COMPANY INC,92020210.5207,Serialized Equipment,92020210,HVAC HDepot AOR McCarthy Heat,31450447,Supplier Invoice Number,RHMVZ6021SEACAJ (W162370,JOB/FIELD TICKET PURCHASE
2,776691,RHEEM SALES COMPANY INC,RHEEM SALES COMPANY INC,92020210.5205,Parts and Materials,92020210,HVAC HDepot AOR McCarthy Heat,31450447,Supplier Invoice Number,Sales tax,JOB/FIELD TICKET PURCHASE
3,776691,RHEEM SALES COMPANY INC,RHEEM SALES COMPANY INC,92020210.5209,Vendor Rebates Earned,92020210,HVAC HDepot AOR McCarthy Heat,31450447,Supplier Invoice Number,Vendor Rebates Earned,JOB/FIELD TICKET PURCHASE
4,649327,C & J FAMILY TRUST,C & J FAMILY TRUST,81010000.721,Rent Expense,81010000,Admn RR Orange #560,0124BASE1,Supplier Invoice Number,C&J Family Trust (Base Rent),


In [18]:
from openai import OpenAI
import os
import json


def show_json(obj):
    display(json.loads(obj.model_dump_json()))


api_key = os.environ.get("OPENAI_API_KEY")
if not api_key:
    raise ValueError("OPENAI_API_KEY environment variable is not set.")
client = OpenAI(api_key=api_key)
print("OpenAI API key is set.")

thread = client.beta.threads.create()
show_json(thread)

supplier_name_field = smaller_df['supplier_name']
test_supplier_name = supplier_name_field.iloc[0]

message = client.beta.threads.messages.create(
    thread_id=thread.id,
    role="user",
    content=f"I would like to validate and classify the organization: {test_supplier_name}, please include a "
            f"confidence score for each classification and any sources cited. If results are inconclusive, please "
            f"indicate that is the case. Please produce your response in a tablular format with the following "
            f"columns: Validated?, Confidence Score, Source, Classification and Comments."
)

run = client.beta.threads.runs.create(
    thread_id=thread.id,
    assistant_id='asst_5KogpF8Tb1u7tTcLAEDH98UM',
)

show_json(run)

OpenAI API key is set.


{'id': 'thread_3hkUPdmksifVldeAHxqfLlHY',
 'created_at': 1721193192,
 'metadata': {},
 'object': 'thread',
 'tool_resources': {'code_interpreter': None, 'file_search': None}}

{'id': 'run_ElZlrOLxNwRqRTpEwk76PPYh',
 'assistant_id': 'asst_5KogpF8Tb1u7tTcLAEDH98UM',
 'cancelled_at': None,
 'completed_at': None,
 'created_at': 1721193192,
 'expires_at': 1721193792,
 'failed_at': None,
 'incomplete_details': None,
 'instructions': "You are a supplier relationship manager looking to validate the validity of a provided company based on its name and other details provided in a query. Your instructions are to use any information provided to confirm the existence of a company based on it's web presence and then assign them a classification based upon the UNSPSC category tree. Your responses should be short and succinct indicating a Yes/No value to the discovery of a web presence as well as the code and description of the UNSPSC category code, for example 43231503 - Procurement software.",
 'last_error': None,
 'max_completion_tokens': None,
 'max_prompt_tokens': None,
 'metadata': {},
 'model': 'gpt-3.5-turbo-16k',
 'object': 'thread.run',
 'parallel_tool_calls': Tru

In [19]:
import time


def wait_on_run(run, thread):
    while run.status == "queued" or run.status == "in_progress":
        run = client.beta.threads.runs.retrieve(
            thread_id=thread.id,
            run_id=run.id,
        )
        time.sleep(0.5)
    return run


run = wait_on_run(run, thread)
show_json(run)

{'id': 'run_ElZlrOLxNwRqRTpEwk76PPYh',
 'assistant_id': 'asst_5KogpF8Tb1u7tTcLAEDH98UM',
 'cancelled_at': None,
 'completed_at': None,
 'created_at': 1721193192,
 'expires_at': 1721193792,
 'failed_at': None,
 'incomplete_details': None,
 'instructions': "You are a supplier relationship manager looking to validate the validity of a provided company based on its name and other details provided in a query. Your instructions are to use any information provided to confirm the existence of a company based on it's web presence and then assign them a classification based upon the UNSPSC category tree. Your responses should be short and succinct indicating a Yes/No value to the discovery of a web presence as well as the code and description of the UNSPSC category code, for example 43231503 - Procurement software.",
 'last_error': None,
 'max_completion_tokens': None,
 'max_prompt_tokens': None,
 'metadata': {},
 'model': 'gpt-3.5-turbo-16k',
 'object': 'thread.run',
 'parallel_tool_calls': Tru

In [20]:
tool_outputs = []

for tool in run.required_action.submit_tool_outputs.tool_calls:
    if tool.function.name == "get_vendor_classification":
        tool_outputs.append({
            "tool_call_id": tool.id,
            "validity": tool.function.arguments[0],
            "classification_code": tool.function.arguments[1],
            "classification_desc": tool.function.arguments[2],
            "comments": tool.function.arguments[3],
        })
    elif tool.function.name == "get_item_classification":
        tool_outputs.append({
            "tool_call_id": tool.id,
            "classification_code": tool.function.arguments[1],
            "classification_desc": tool.function.arguments[2],
            "comments": tool.function.arguments[3],
        })
print(tool_outputs)

if tool_outputs:
    try:
        run = client.beta.threads.runs.submit_tool_outputs_and_poll(
            thread_id=thread.id,
            run_id=run.id,
            tool_outputs=tool_outputs
        )
        print("Tool outputs submitted successfully.")
    except Exception as e:
        print("Failed to submit tool outputs:", e)
    else:
        print("No tool outputs to submit.")
if run.status == 'completed':
    messages = client.beta.threads.messages.list(
        thread_id=thread.id
    )
    print(messages)
else:
    print(run.status)

[{'tool_call_id': 'call_1bT0Bz1GCqWDnxhfhSVANu08', 'validity': '{', 'classification_code': '\n', 'classification_desc': ' ', 'comments': ' '}]
Failed to submit tool outputs: Error code: 400 - {'error': {'message': "Missing required parameter: 'tool_outputs[0].output'.", 'type': 'invalid_request_error', 'param': 'tool_outputs[0].output', 'code': 'missing_required_parameter'}}
requires_action
