In [None]:
import pandas as pd

# Path to the CSV file with the evaluation dataset.
# NOTE(review): this is an absolute, machine-specific path; anyone running the
# notebook on another machine has to edit it.
file_path = '/home/azureuser/llm_equivalence_evaluation/Accuracy Calculation Dataset - Accuracy Calculation Dataset.csv'

# Load the CSV file into a DataFrame. Later cells read its 'Conversation History',
# 'User Response' and 'Human Evaluation' columns and add new columns to `df`.
df = pd.read_csv(file_path)

In [None]:
import os
from openai import OpenAI

# Use the key already exported in the environment when there is one. The
# placeholder below is only a fallback for this session: replace it with a real
# key to run the notebook, and never commit a real key.
# (An unconditional assignment would overwrite a valid exported key with the
# placeholder, so setdefault is used instead.)
os.environ.setdefault('OPENAI_API_KEY', 'MY_API_TOKEN')

# With no api_key argument, the client picks the key up from OPENAI_API_KEY.
client = OpenAI()



In [None]:
def get_text_from_image_url(image_url):
    """
    Ask the OpenAI chat model what an image shows and return its answer.

    Args:
        image_url (str): The URL of the image to send to the model.

    Returns:
        str: The model's reply to the question "What's in this image?"
            (the request caps the reply at 300 tokens).
    """
    # A single user turn made of two content parts: the question and the image.
    question_part = {"type": "text", "text": "What's in this image?"}
    image_part = {"type": "image_url", "image_url": {"url": image_url}}
    user_turn = {"role": "user", "content": [question_part, image_part]}

    completion = client.chat.completions.create(
        model="gpt-4-turbo",
        messages=[user_turn],
        max_tokens=300,
    )

    first_choice = completion.choices[0]
    return first_choice.message.content

In [None]:
import ast


def _format_user_message(content):
    """
    Flatten one user message into plain text for the prompt.

    Args:
        content (str | list): Either the message text itself, or a list of
            parts where each part is a dict whose 'type' is 'text' (text stored
            under 'text') or 'image_url' (URL stored under ['image_url']['url']).

    Returns:
        `content` unchanged when it is not a list; otherwise the parts joined
        by single spaces, with every image part replaced by
        "User provided image: <description>".
    """
    if not isinstance(content, list):
        return content

    parts = []
    for item in content:
        if item['type'] == 'text':
            parts.append(item['text'])
        elif item['type'] == 'image_url':
            # The evaluation prompt is text-only, so each image is replaced by
            # the description returned by get_text_from_image_url (one API
            # request per image).
            description = get_text_from_image_url(item['image_url']['url'])
            parts.append(f"User provided image: {description}")
        # Parts with any other 'type' are skipped.
    return ' '.join(parts)


def get_prompt(conversation_history, user_response):
    """
    Generates a prompt for the LLM based on the conversation history and user response.

    Every entry except the last is rendered as context ("User:" / "Bot:" lines);
    the 'bot' text of the last entry is the final question that the user
    response is judged against.
    NOTE(review): a 'user' message stored in the last entry is not rendered at
    all -- confirm against the dataset that this is intended.

    Args:
        conversation_history (str | list): The conversation as a list of dicts
            with optional 'user' and 'bot' keys, or the string representation
            of such a list (as read from the CSV).
        user_response (str): The user's response to the final question.

    Returns:
        str: The generated prompt for the LLM.

    Raises:
        ValueError, SyntaxError: If a string history is not a valid Python literal.
        IndexError: If the history is empty.
        KeyError: If the last entry has no 'bot' key.
    """
    # Anything that is not already a parsed sequence goes through literal_eval,
    # which only accepts Python literals and therefore never executes CSV content.
    if not isinstance(conversation_history, (list, tuple)):
        conversation_history = ast.literal_eval(conversation_history)

    lines = [
        "Below is a conversation history, a final question, and user response. Treat the conversation history as context, and check whether the user response is the correct answer to the final question. If it is correct then just return the word EQUIVALENT. If it is incorrect return the word NOT_EQUIVALENT. Do not return any other text.\n\n",
        "Conversation History:\n",
    ]

    # The last entry is excluded here: it carries the final question.
    for message in conversation_history[:-1]:
        if 'user' in message:
            user_msg = _format_user_message(message['user'])
            lines.append(f"\tUser: {user_msg.strip()}\n")

        if 'bot' in message:
            lines.append(f"\tBot: {message['bot']}\n")

    final_question = conversation_history[-1]['bot']
    lines.append(f"\nFinal Question: {final_question}\n")
    lines.append(f"User Response: {user_response}\n")

    return "".join(lines)


def process_dataframe(df):
    """
    Build the LLM prompt for every row and store it in a 'Prompt' column.

    The column is added to the DataFrame that was passed in; the same object is
    returned. A row whose prompt cannot be built gets None and the error is
    printed, so one bad row does not stop the run.

    Args:
        df (pandas.DataFrame): Frame with 'Conversation History' and
            'User Response' columns.

    Returns:
        pandas.DataFrame: The input DataFrame with the additional 'Prompt' column.
    """

    def _prompt_or_none(record):
        # Best-effort per row: report the failure and fall through to None.
        try:
            return get_prompt(record['Conversation History'], record['User Response'])
        except Exception as e:
            print(f"Error processing row: {e}")
        return None

    prompts = df.apply(_prompt_or_none, axis=1)
    df['Prompt'] = prompts
    return df

In [None]:
# Generate the prompts to be sent to the LLM.
# process_dataframe adds the 'Prompt' column to `df` itself and returns that same
# frame; as the last expression of the cell, the returned frame is what gets displayed.
process_dataframe(df=df)

In [None]:
import time

def get_llm_response(prompt):
    """
    Sends a prompt to the OpenAI API and retrieves the LLM response.

    Any exception raised by the request is printed and reported to the caller
    as a tuple of three None values, so a failed request does not abort a
    batch run.

    Args:
        prompt (str): The prompt to send to the OpenAI API.

    Returns:
        tuple: A tuple containing the LLM response, time taken, and cost.
            - llm_response (str): The response generated by the LLM.
            - time_taken (float): The time taken (in seconds) to generate the response.
            - cost (float): The cost of generating the response, computed
              from the token counts reported in the response.
            All three are None when the request failed.
    """
    # Dollars per token. Presumably the gpt-4-turbo list prices at the time of
    # writing -- TODO confirm against current pricing before trusting the totals.
    completion_token_price = 0.00003
    prompt_token_price = 0.00001

    try:
        # perf_counter is a monotonic clock meant for measuring intervals; a
        # difference of time.time() values can be skewed (even negative) when
        # the system clock is adjusted during the request.
        start_time = time.perf_counter()
        response = client.chat.completions.create(
            model="gpt-4-turbo",
            messages=[
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": prompt}
            ]
        )
        time_taken = time.perf_counter() - start_time

        usage = response.usage
        cost = usage.completion_tokens * completion_token_price + usage.prompt_tokens * prompt_token_price
        return response.choices[0].message.content, time_taken, cost
    except Exception as e:
        # Deliberately broad: report the failure and let the caller carry on.
        print(f"Error processing prompt: {e}")
        return None, None, None

def process_llm_response(row):
    """
    Query the LLM for one DataFrame row and package the outcome as a Series.

    Args:
        row (pandas.Series): A row of the DataFrame containing the 'Prompt' column.

    Returns:
        pandas.Series: Three values -- the LLM response, the time taken and the
            cost. All three are None when the row has no prompt; no request is
            sent in that case.
    """
    prompt = row['Prompt']

    # Rows whose prompt could not be built are skipped rather than sent.
    if pd.isnull(prompt):
        return pd.Series([None, None, None])

    answer, seconds, dollars = get_llm_response(prompt)
    return pd.Series([answer, seconds, dollars])

In [None]:
# Generate the final results: process_llm_response is applied to every row in turn
# (one LLM request per row that has a prompt), and the response text, the time
# taken and the cost are stored in three new columns. Rows without a prompt, and
# rows whose request failed, get None in all three.
df[['LLM Equivalence Evaluation (Response)', 'Time taken to complete the request', 'Cost in dollars for the request']] = df.apply(process_llm_response, axis=1)


In [None]:
def evaluate_equivalence(row):
    """
    Compare the human label with the LLM verdict for one row.

    Both values are normalized before comparison (surrounding whitespace
    removed, upper-cased), so a verdict such as "EQUIVALENT\\n" or "Equivalent"
    is not misreported as a mismatch just because of formatting.

    Args:
        row (pandas.Series): A row with the 'Human Evaluation' and
            'LLM Equivalence Evaluation (Response)' columns.

    Returns:
        str: 'True Negative' when both say NOT_EQUIVALENT, 'True Positive' when
            both say EQUIVALENT, and 'Mismatch' for everything else -- that
            includes disagreements in either direction as well as rows with a
            missing or unrecognized label (e.g. a failed LLM request).
    """

    def _normalize(value):
        # Missing values (None / NaN) are not strings and can never match a label.
        if not isinstance(value, str):
            return None
        return value.strip().upper()

    human_label = _normalize(row['Human Evaluation'])
    llm_label = _normalize(row['LLM Equivalence Evaluation (Response)'])

    if human_label == llm_label == 'NOT_EQUIVALENT':
        return 'True Negative'
    if human_label == llm_label == 'EQUIVALENT':
        return 'True Positive'
    return 'Mismatch'

# Label every row by comparing its human evaluation with the LLM's verdict.
df['Evaluation Result'] = df.apply(evaluate_equivalence, axis=1)

In [None]:
# Remove the intermediate 'Prompt' column from df before exporting.
# errors='ignore' keeps this cell re-runnable: on a second run the column is
# already gone, and a plain drop would raise KeyError.
df = df.drop(columns='Prompt', errors='ignore')

# Save the updated DataFrame to a CSV file (index column omitted)
df.to_csv('final_results.csv', index=False)