In [1]:
import pandas as pd
import error_prompts as p
import os
from dotenv import load_dotenv
from langfuse.callback import CallbackHandler
from langchain.chains import LLMChain
from langchain_openai import OpenAI
from langchain.chains import LLMChain, SimpleSequentialChain
from langfuse.callback import CallbackHandler
from langchain_openai import ChatOpenAI
import pandas as pd

# Load variables from a local .env file into the process environment.
load_dotenv()

# CallbackHandler() below is constructed without explicit keys, so the Langfuse
# credentials must already be present in the environment (presumably the OpenAI
# client reads OPENAI_API_KEY the same way — confirm against the library docs).
# Fail early with a readable message instead of the opaque
# "TypeError: str expected, not NoneType" that assigning
# os.environ[name] = os.environ.get(name) produces for an unset variable.
REQUIRED_ENV_VARS = ("LANGFUSE_PUBLIC_KEY", "LANGFUSE_SECRET_KEY", "OPENAI_API_KEY")
missing_env_vars = [name for name in REQUIRED_ENV_VARS if not os.environ.get(name)]
if missing_env_vars:
    raise RuntimeError(
        "Missing required environment variable(s): "
        + ", ".join(missing_env_vars)
        + ". Define them in the .env file or in the shell environment."
    )

handler = CallbackHandler()
handler.auth_check()



# --- Experiment configuration -------------------------------------------------

# Prompt template taken from the error_prompts module, plus the label that is
# written to the 'Prompt_Name' column of the results.
prompt = p.error_class_LLM
prompt_name = "error_class_LLM"

# Base name (without extension) of the evaluation CSV to analyse; also written
# to the 'Database' column of the results.
file = "structured_unstructured_singleterm_zeroshot_3.5-turbo"

# The input CSV and the accumulated results CSV live in the same directory.
_ERROR_ANALYSIS_DIR = "../../Datasets/Evaluations/Sentiment Analysis/Error_Analysis"
DATA_PATH = _ERROR_ANALYSIS_DIR + "/" + file + ".csv"
OUTPUT_PATH = _ERROR_ANALYSIS_DIR + "/error_class_LLM_analysis.csv"

# Model name and sampling temperature passed to the LLM constructor.
MODEL, TEMP = "gpt-4-turbo-preview", 0

# Column of the input CSV whose text is shown as the "Explanation" of each
# example; also written to the 'Context' column of the results.
struct_unstruct = "structured_analysis"

In [3]:
import os
# Maximum number of rows drawn from each group, and the seed that makes the
# draw reproducible.
N_CORRECT_SAMPLES = 5
N_ERROR_SAMPLES = 20
SAMPLE_SEED = 42

df = pd.read_csv(DATA_PATH)

# Split the evaluation rows by the value of the 'error' column.
df_correct = df[df['error'] == 'correct']
df_error = df[df['error'] == 'error']

# DataFrame.sample raises ValueError when n exceeds the number of rows, so cap
# the request at what is available. When enough rows exist the draw is
# unchanged (same n, same seed).
df_correct_samples = df_correct.sample(
    n=min(N_CORRECT_SAMPLES, len(df_correct)), random_state=SAMPLE_SEED
)
df_error_samples = df_error.sample(
    n=min(N_ERROR_SAMPLES, len(df_error)), random_state=SAMPLE_SEED
)

# The printed shapes show how many rows were actually drawn.
print(df_correct_samples.shape, df_error_samples.shape)
print(df_correct_samples.head())
print(df_error_samples.head())

# Continue from previously saved answers if the results file exists; otherwise
# start from an empty frame with the expected columns.
if os.path.exists(OUTPUT_PATH):
    df_answer = pd.read_csv(OUTPUT_PATH)
else:
    df_answer = pd.DataFrame(columns=['Prompt_Name', 'Prompt', 'Model', 'Database', 'Answer', 'Context'])





(5, 12) (20, 12)
    Unnamed: 0  text_id  term_id  \
63         228       86      228   
89         374      142      374   
80         113       43      113   
95          73       27       73   
67         318      121      318   

                                                 text         term  polarity  \
63  The mango salsa with fish cake was too sour, t...  apple suace  negative   
89  My only advice is: increase portion sizes; and...      portion  positive   
80  The staff was courteous and explained the menu...        staff  positive   
95  We were so excited since I was reading great r...        place  negative   
67  Once at the table our waitress managed on two ...       drinks   neutral   

   polarity_pred                    prompt_name  \
63      negative  prompt_3_zeroshot_single_term   
89      positive  prompt_3_zeroshot_single_term   
80      positive  prompt_3_zeroshot_single_term   
95      negative  prompt_3_zeroshot_single_term   
67       neutral  prompt_3_zer

In [4]:
def _format_examples(samples, first_task_number, explanation_column):
    """Render every row of `samples` as a numbered "Task" entry for the prompt.

    Args:
        samples: DataFrame with the columns 'prompt', 'polarity_pred',
            'polarity' and `explanation_column`.
        first_task_number: Number given to the first row; later rows count up.
        explanation_column: Name of the column shown as "Explanation".

    Returns:
        One string containing all entries, each line terminated by a newline.
    """
    entries = []
    for number, (_, row) in enumerate(samples.iterrows(), start=first_task_number):
        # f-strings stringify every cell, so a non-string value (e.g. NaN)
        # cannot break the concatenation.
        entries.append(
            f"Task {number}:\n"
            f"Prompt: {row['prompt']}\n"
            f"Sentiment Decision: {row['polarity_pred']}\n"
            f"Explanation: {row[explanation_column]}\n"
            f"Actual Sentiment: {row['polarity']}\n"
        )
    return "".join(entries)


def execute(df, prompt, df_answer):
    """Ask the configured LLM to categorise the sampled wrong predictions.

    Builds an example listing from the module-level `df_error_samples` and
    `df_correct_samples`, runs `prompt` through an LLMChain, prints the
    rendered prompt and the answer, and appends one row describing the run to
    `df_answer`.

    Args:
        df: Currently unused; the examples come from the module-level sample
            frames. Kept so existing calls keep working.
        prompt: Prompt template with an `examples` input variable.
        df_answer: DataFrame of earlier runs to which the new row is appended.

    Returns:
        Tuple `(result, df_answer)`: the model's answer and a new DataFrame
        containing the earlier rows plus the row for this run.

    Raises:
        Exception: Whatever `chain.run` raises is re-raised after the error
            and the rendered prompt have been printed.
    """
    if MODEL == "gpt-3.5-turbo-instruct":
        llm = OpenAI(model_name=MODEL, temperature=TEMP, timeout=150)
    else:
        llm = ChatOpenAI(model_name=MODEL, temperature=TEMP, timeout=150)

    # Task numbers run continuously across both groups: wrong decisions first,
    # then the correct reference decisions.
    examples = (
        "False Decisions:\n"
        + _format_examples(df_error_samples, 1, struct_unstruct)
        + "\nCorrect Decisions (Only as a reference):\n"
        + _format_examples(df_correct_samples, len(df_error_samples) + 1, struct_unstruct)
    )

    # NOTE(review): the handler is registered on the chain and again on the
    # run call, as in the original code — confirm this does not duplicate traces.
    chain = LLMChain(llm=llm, prompt=prompt, callbacks=[handler])
    prompt_text = chain.prompt.format_prompt(examples=examples).to_string()
    try:
        result = chain.run(examples=examples, callbacks=[handler])
    except Exception as e:
        # Show what was sent, then surface the real error. Previously execution
        # continued and failed later on the unbound `result`.
        print(e)
        print(prompt_text)
        raise
    finally:
        # Flush pending Langfuse events whether or not the call succeeded.
        handler.langfuse.flush()

    print(prompt_text)
    print("\n")
    print(result)
    print("\n")

    new_row = {
        'Prompt_Name': prompt_name,
        'Prompt': prompt_text,
        'Model': MODEL,
        'Database': file,
        'Answer': result,
        'Context': struct_unstruct,
    }
    # ignore_index gives the combined frame a fresh 0..n-1 index instead of
    # repeating label 0 for every appended row.
    df_answer = pd.concat([df_answer, pd.DataFrame([new_row])], ignore_index=True)
    return result, df_answer


# Run the analysis once and rebind df_answer to the frame that includes the new
# row. Re-running this cell appends another row to df_answer each time.
answer, df_answer = execute(df, prompt, df_answer)
    



  warn_deprecated(
  warn_beta(


In the following I will give you a few Aspect Based Sentiment Analysis tasks together with a sentiment decision, details about the decision and the sentiment towards each term.
The Prediction was made by an LLM.
Can you please group the wrong decisions into fault categories? Please also indicate how often each one occurs.
There are also some correct decisions in the examples. Please just use them as a reference and don't categorize them.
False Decisions:
Task 1:
Prompt: The round pizza doesn't taste as good as I recall, it's still wonderful, but the Sicilian is to die for.

    What is the sentiment on 'round pizza'? Only respond with "positive", "negative" or "neutral" as one word.
Sentiment Decision: neutral
Explanation: [{"adjective":"wonderful","importance":"1.0"}]
Actual Sentiment: negative
Task 2:
Prompt: Bouley has done an excellent job creating this dining establishment with almost a museum sense, the artworks on the wall have a very Klimty feeling and the gold ceiling complete

In [7]:
# Show the accumulated answers (earlier rows plus any rows added by execute).
print(df_answer)

       Prompt_Name                                             Prompt  \
0  error_class_LLM  In the following I will give you a few Aspect ...   
0  error_class_LLM  In the following I will give you a few Aspect ...   

                 Model                                           Database  \
0        gpt-3.5-turbo  structured_unstructured_singleterm_zeroshot_3....   
0  gpt-4-turbo-preview  structured_unstructured_singleterm_zeroshot_3....   

                                              Answer              Context  
0  Fault Categories:\n1. Incorrect sentiment due ...  structured_analysis  
0  Based on the provided false decisions, we can ...  structured_analysis  


In [24]:
# Write all accumulated answers to OUTPUT_PATH, replacing the file's previous
# contents; index=False leaves the DataFrame index out of the CSV.
df_answer.to_csv(OUTPUT_PATH, index=False)