In [1]:
import pandas as pd
import error_prompts as p
import os
from dotenv import load_dotenv
from langfuse.callback import CallbackHandler
from langchain.chains import LLMChain
from langchain_openai import OpenAI
from langchain.chains import LLMChain, SimpleSequentialChain
from langfuse.callback import CallbackHandler
from langchain_openai import ChatOpenAI
import pandas as pd

# Load variables from a local .env file into the process environment.
load_dotenv()

# Credentials that must be present in the environment (presumably consumed by
# the Langfuse callback handler and the OpenAI clients created further down --
# confirm against their documentation).
REQUIRED_ENV_VARS = ("LANGFUSE_PUBLIC_KEY", "LANGFUSE_SECRET_KEY", "OPENAI_API_KEY")

# Fail early with a readable message. Writing a missing value back into
# os.environ would otherwise raise an opaque "TypeError: str expected, not NoneType".
missing_env_vars = [name for name in REQUIRED_ENV_VARS if os.environ.get(name) is None]
if missing_env_vars:
    raise EnvironmentError(
        "Missing required environment variable(s): "
        + ", ".join(missing_env_vars)
        + ". Define them in the shell or in the .env file."
    )

# Callback handler passed to the LLM chain below; auth_check() is called right
# away so that credential problems surface before any model call is made.
handler = CallbackHandler()
handler.auth_check()

# Prompt template (from the local error_prompts module) and the label stored with each result.
prompt_name = 'error_class_ML'
prompt = p.error_class_ML

# Base name of the evaluation CSV; also recorded in the 'Database' column of the results.
file = "logistic_regression"

DATA_PATH = f'../../Datasets/Evaluations/Sentiment Analysis/{file}.csv'
OUTPUT_PATH = "../../Datasets/Evaluations/Sentiment Analysis/Error_Analysis/error_class_LLM_analysis.csv"

# Model name and sampling temperature used when the LLM client is constructed.
MODEL = "gpt-4-turbo-preview"
TEMP = 0


In [3]:
import os
# How many rows to draw from each group, and the seed that makes the draw reproducible.
N_CORRECT_SAMPLES = 5
N_ERROR_SAMPLES = 20
SAMPLE_SEED = 42

df = pd.read_csv(DATA_PATH)

# Label every row by whether the predicted polarity equals the actual polarity.
df['error'] = df['polarity'].eq(df['polarity_pred']).map({True: 'correct', False: 'error'})

# Clamp the sample size to the group size: DataFrame.sample raises ValueError
# when asked for more rows than exist (without replacement).
df_correct = df[df['error'] == 'correct']
df_correct_samples = df_correct.sample(
    n=min(N_CORRECT_SAMPLES, len(df_correct)), random_state=SAMPLE_SEED
)

df_error = df[df['error'] == 'error']
df_error_samples = df_error.sample(
    n=min(N_ERROR_SAMPLES, len(df_error)), random_state=SAMPLE_SEED
)

print(df_correct_samples.shape, df_error_samples.shape)
print(df_correct_samples.head())
print(df_error_samples.head())

# Continue an existing results file if there is one; otherwise start an empty
# table with the columns that execute() fills in.
if os.path.exists(OUTPUT_PATH):
    df_answer = pd.read_csv(OUTPUT_PATH)
else:
    df_answer = pd.DataFrame(columns=['Prompt_Name', 'Prompt', 'Model', 'Database', 'Answer', 'Context'])





(5, 9) (20, 9)
     Unnamed: 0  text_id  term_id  \
124         124       47      124   
400         400      152      400   
71           71       26       71   
275         275      105      275   
220         220       83      220   

                                                  text        term  polarity  \
124  Not including the tip for our server, who we s...      server  negative   
400  THe bartendars were right on top of getting us...  bartendars  positive   
71   Ive had better burgers and fries at the local ...       fries  positive   
275  The Scene Shun Lee Palace is popular with midt...       lunch   neutral   
220  The service was non-existant, the manager spen...         bar   neutral   

    polarity_pred                                      token_weights    error  
124      negative  not : 2.476956248275856, including : -0.154560...  correct  
400      positive  the : 0.1599567509788442, bartendars : -0.0945...  correct  
71       positive  ive : -0.2945337453645

In [4]:
def _format_examples(samples, first_task_number):
    """Render each row of `samples` as a numbered task description.

    Reads the 'text', 'term', 'polarity_pred', 'token_weights' and 'polarity'
    columns. Values are interpolated via f-strings, so a missing (NaN) or
    non-string cell is rendered as text instead of raising TypeError.
    """
    parts = []
    for task_number, row in enumerate(samples.to_dict("records"), start=first_task_number):
        parts.append(
            f"Task {task_number}:\n"
            f"Text: {row['text']}\n"
            f"Sentiment Decision towards the term '{row['term']}': {row['polarity_pred']}\n"
            f"Explanation: {row['token_weights']}\n"
            f"Actual Sentiment: {row['polarity']}\n"
        )
    return "".join(parts)


def execute(df, prompt, df_answer):
    """Send the sampled decisions to the LLM and append its answer to `df_answer`.

    The examples are built from the module-level `df_error_samples` and
    `df_correct_samples`; the model is chosen from the module-level `MODEL` and
    `TEMP`, and the run is traced through the module-level `handler`.

    Parameters:
        df: Currently unused; kept so existing calls keep working.
        prompt: Prompt template with an `examples` input variable.
        df_answer: DataFrame of earlier results to which one row is appended.

    Returns:
        (result, df_answer): the model's answer and a new DataFrame consisting
        of the rows of `df_answer` plus one row for this run.

    Raises:
        Exception: whatever the chain raises is re-raised after the error and
        the rendered prompt have been printed.
    """
    if MODEL == "gpt-3.5-turbo-instruct":
        llm = OpenAI(model_name=MODEL, temperature=TEMP, timeout=150)
    else:
        llm = ChatOpenAI(model_name=MODEL, temperature=TEMP, timeout=150)

    # Task numbers run continuously across both sections.
    examples = (
        "False Decisions:\n"
        + _format_examples(df_error_samples, 1)
        + "\nCorrect Decisions (Only as a reference):\n"
        + _format_examples(df_correct_samples, 1 + len(df_error_samples))
    )

    chain = LLMChain(llm=llm, prompt=prompt, callbacks=[handler])
    # Render the prompt once; it is needed for logging and for the result row.
    prompt_text = chain.prompt.format_prompt(examples=examples).to_string()

    try:
        result = chain.run(examples=examples, callbacks=[handler])
    except Exception as e:
        # Show what was sent, then surface the real error. Swallowing it here
        # would only fail later with an unbound `result`.
        print(e)
        print(prompt_text)
        raise
    finally:
        # Flush the trace whether or not the call succeeded.
        handler.langfuse.flush()

    print(prompt_text)
    print("\n")
    print(result)
    print("\n")

    new_row = {'Prompt_Name': prompt_name, 'Prompt': prompt_text, 'Model': MODEL, 'Database': file, 'Answer': result, 'Context': 'logistic_regression'}
    df_new_row = pd.DataFrame([new_row])
    # ignore_index avoids duplicate index labels when appending to earlier results.
    df_answer = pd.concat([df_answer, df_new_row], ignore_index=True)
    return result, df_answer


# Run the analysis once. df_answer is rebound to the table returned by
# execute(), so re-running this cell appends another row for the same prompt.
answer, df_answer = execute(df, prompt, df_answer)
    



  warn_deprecated(
  warn_beta(


In the following I will give you a few Aspect Based Sentiment Analysis tasks together with the sentiment decision towards a specific term, details about the decision and the actualsentiment towards each term.
The decision was made by a Logistic Regression Machine Learning Model and the additional information shows the weights the model assigned to the different tokens. 
Can you please group the wrong decisions into distinct fault categories? Please also indicate how often each one occurs.
There are also some correct decisions in the examples. Please just use them as a reference and don't categorize them.
False Decisions:
Task 1:
Text: I'm totally into the space-age vibe of this place, but I wish they had more tables to sit down at.
Sentiment Decision towards the term 'tables': negative
Explanation: totally : 0.6466147231399947, into : 0.24452865139758487, the : 0.1599567509788442, space : -0.5407900591076409, age : -0.037666791864239324, vibe : 0.529004771560355, of : -0.67211332398387

In [5]:
# Inspect the accumulated results table before it is written to OUTPUT_PATH.
print(df_answer)

      Prompt_Name                                             Prompt  \
0  error_class_ML  In the following I will give you a few Aspect ...   

                 Model             Database  \
0  gpt-4-turbo-preview  logistic_regression   

                                              Answer              Context  
0  Based on the provided false decisions, we can ...  logistic_regression  


In [24]:
# Persist the results. Create the target directory first: to_csv raises
# OSError when the parent folder does not exist yet.
output_dir = os.path.dirname(OUTPUT_PATH)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)
df_answer.to_csv(OUTPUT_PATH, index=False)