In [42]:
import pandas as pd
import error_prompts as p
import os
from dotenv import load_dotenv
from langfuse.callback import CallbackHandler
from langchain.chains import LLMChain
from langchain_openai import OpenAI
from langchain.chains import LLMChain, SimpleSequentialChain
from langfuse.callback import CallbackHandler
from langchain_openai import ChatOpenAI
import pandas as pd

# Load variables from a local .env file into the process environment.
load_dotenv()

# The Langfuse handler and the OpenAI clients are created below without explicit
# credentials, so the keys are expected in the environment. Fail fast with a
# readable message instead of the opaque "TypeError: str expected, not NoneType"
# that assigning a missing value back into os.environ produces.
REQUIRED_ENV_VARS = ("LANGFUSE_PUBLIC_KEY", "LANGFUSE_SECRET_KEY", "OPENAI_API_KEY")
missing_env_vars = [name for name in REQUIRED_ENV_VARS if os.environ.get(name) is None]
if missing_env_vars:
    raise EnvironmentError(
        "Missing required environment variables: " + ", ".join(missing_env_vars)
    )

# Tracing callback shared by every chain in this notebook.
handler = CallbackHandler()
handler.auth_check()


# Prompt templates from the local error_prompts module, with the names used for them.
prompt_name_structured = 'structured_analysis'
prompt_structured = p.structured_analysis
prompt_name_unstructured = 'unstructured_analysis'
prompt_unstructured = p.unstructured_analysis

# Base name of the evaluation run to analyse; used for both the input and output file.
file = "zero_shot_multi_table_gpt-3.5-turbo"

DATA_PATH = f'../../Datasets/Evaluations/Schema Matching/{file}.csv'
OUTPUT_PATH = f"../../Datasets/Evaluations/Schema Matching/Error_Analysis/structured_unstructured_{file}.csv"

# Model and sampling temperature used by the execute() cells.
MODEL = "gpt-4-turbo-preview"
TEMP = 0





In [43]:
# Read the evaluation results and print the first rows as a sanity check.
df = pd.read_csv(filepath_or_buffer=DATA_PATH)
print(df.head(n=5))

   Unnamed: 0  table_index                      table_name  column_index_left  \
0           0            0  29021592_3_2299138476894681059                  0   
1           1            1  29021592_3_2299138476894681059                  3   
2           2            2  29021592_3_2299138476894681059                  4   
3           3            3  29021592_3_2299138476894681059                  2   
4           4            4  29021592_3_2299138476894681059                  0   

   column_index_right  y_true  y_pred            prompt_name  \
0                   0    True    True  zero_shot_multi_table   
1                   0   False   False  zero_shot_multi_table   
2                   0   False   False  zero_shot_multi_table   
3                   0   False   False  zero_shot_multi_table   
4                   3   False   False  zero_shot_multi_table   

                                              prompt  \
0  System: Description: Please identify the match...   
1  System: Descr

In [44]:

# Label every prediction as 'correct' or 'error' by comparing it with the ground truth.
df['error'] = ['correct' if y_true == y_pred else 'error' for y_true, y_pred in zip(df['y_true'], df['y_pred'])]

print(df['error'].value_counts())

df_correct = df[df['error'] == 'correct']
df_error = df[df['error'] == 'error']

# Draw the review sample: mostly wrong predictions plus a few correct ones.
# A fixed seed makes "Restart & Run All" pick the same rows every time, and
# capping n at the number of available rows avoids the ValueError that
# sample() raises when asked for more rows than exist.
SAMPLE_SEED = 42
N_ERROR_SAMPLES = 20
N_CORRECT_SAMPLES = 5

df_error_sample = df_error.sample(n=min(N_ERROR_SAMPLES, len(df_error)), random_state=SAMPLE_SEED)
df_correct_sample = df_correct.sample(n=min(N_CORRECT_SAMPLES, len(df_correct)), random_state=SAMPLE_SEED)

# From here on `df` is the sample only. The analysis columns start empty and the
# execute() cells address rows by position, so the index is reset to 0..n-1.
df = pd.concat([df_error_sample, df_correct_sample])
df['structured_analysis'] = None
df['unstructured_analysis'] = None
df = df.reset_index(drop=True)




error
correct    379
error       71
Name: count, dtype: int64


In [45]:
# Show the size of the sampled frame followed by its first rows.
print(df.shape, df.head(), sep="\n")

(25, 17)
   Unnamed: 0  table_index                      table_name  column_index_left  \
0         183          183  36039980_4_4283009829602711082                  1   
1         137          137  86747932_0_7532457067740920052                  3   
2         215          215  86297395_0_6919201319699354263                  2   
3         294          294  39173938_0_7916056990138658530                  4   
4         438          438  90593344_0_8311455501234425088                  2   

   column_index_right  y_true  y_pred            prompt_name  \
0                   4    True   False  zero_shot_multi_table   
1                   0    True   False  zero_shot_multi_table   
2                   2    True   False  zero_shot_multi_table   
3                   0    True   False  zero_shot_multi_table   
4                   2    True   False  zero_shot_multi_table   

                                              prompt  \
0  System: Description: Please identify the match...   
1  Syst

In [46]:
#unstructured Analysis
def execute(df, prompt, start = 0):
    """Ask the LLM for an analysis of each row and store it in 'unstructured_analysis'.

    Rows are addressed with ``df.loc[i, ...]`` for ``i`` in ``range(len(df))``,
    so ``df`` must have a default 0..n-1 index. Rows that already hold a result
    other than 'error' are skipped, which lets the cell be re-run to retry
    only the failed or missing rows.

    Args:
        df: DataFrame with the columns 'prompt', 'ai_answer', 'y_pred',
            'column_index_left', 'column_index_right' and
            'unstructured_analysis'. It is modified in place.
        prompt: Prompt template handed to ``LLMChain``; it is formatted with
            the variables ``user_prompt``, ``ai_answer``, ``match`` and
            ``columns``.
        start: Index of the first row to process; earlier rows are left untouched.

    Returns:
        The same DataFrame, with 'unstructured_analysis' set to the model
        output, or to 'error' for rows whose request raised an exception.
    """
    if(MODEL == "gpt-3.5-turbo-instruct"):
        llm = OpenAI(model_name = MODEL, temperature = TEMP, timeout=100)
    else:
        llm = ChatOpenAI(model_name = MODEL, temperature = TEMP, timeout=100)

    # The chain does not depend on the row, so it is built once.
    chain = LLMChain(llm=llm, prompt=prompt, callbacks=[handler])

    for i in range(start, len(df)):
        previous = df.loc[i, 'unstructured_analysis']
        # pd.notna treats both None and NaN as "not analysed yet"; a plain
        # `!= None` check would wrongly skip NaN cells.
        if pd.notna(previous) and previous != "error":
            continue
        print(i)

        # Template variables for this row.
        inputs = {
            'user_prompt': df.loc[i, 'prompt'],
            'ai_answer': str(df.loc[i, 'ai_answer']),
            'match': "not match" if str(df.loc[i, 'y_pred']).lower() == 'false' else "match",
            'columns': "Column A-" + str(df.loc[i, 'column_index_left']) + " and Column B-" + str(df.loc[i, 'column_index_right']),
        }

        try:
            result = chain.run(callbacks=[handler], **inputs)
        except Exception as e:
            # Best effort: report the failure with the prompt that caused it,
            # mark the row so a later re-run retries it, and keep going.
            print(e)
            print(chain.prompt.format_prompt(**inputs).to_string())
            df.loc[i, 'unstructured_analysis'] = 'error'
            continue
        handler.langfuse.flush()

        # Echo the first few prompt/answer pairs for a visual sanity check.
        if (i < 5):
            print(chain.prompt.format_prompt(**inputs).to_string())
            print("\n")
            print(result)
            print("\n")

        if (i % 50 == 0):
            print(str(i) + " of  " + str(len(df)))
        df.loc[i, 'unstructured_analysis'] = result
    return df


# Fill 'unstructured_analysis' for every sampled row.
df = execute(df=df, prompt=prompt_unstructured)

    



0
System: You are a helpful AI.
Human: System: Description: Please identify the matching columns between Table A and Table B. 
     For each column in Table A, specify the corresponding column in Table B. 
     If a column in A has no corresponding column in Table B, you can map it to 'None'. 
     Represent each column mapping using a pair of column headers in a list, i.e., [Table A Column, Table B column or None]. 
     Provide the mapping for each column in Table A and return all mappings in a list. Return the final result as JSON in the format {"column_mappings": "<a list of column pairs>"}.
Human: Question:
Table A:
|   Column A-0 | Column A-1   | Column A-2   |   Column A-3 | Column A-4           |
|-------------:|:-------------|:-------------|-------------:|:---------------------|
|        189   | GR           | 19.03 km     |         2578 | Forcel               |
|        190.1 | GR/TI        | 15.44 km     |         2739 | Motton               |
|        191.8 | GR/TI        |

In [27]:
#structured Analysis
def execute(df, prompt, start = 0):
    """Ask the LLM for an analysis of each row and store it in 'structured_analysis'.

    NOTE(review): this redefines ``execute`` and shadows any earlier
    definition with the same name in the kernel.
    NOTE(review): the answer is read from a 'polarity_pred' column; confirm
    the frame passed in actually has that column.

    Rows are addressed with ``df.loc[i, ...]`` for ``i`` in ``range(len(df))``,
    so ``df`` must have a default 0..n-1 index. Rows that already hold a result
    other than 'error' are skipped, which lets the cell be re-run to retry
    only the failed or missing rows.

    Args:
        df: DataFrame with the columns 'prompt', 'polarity_pred' and
            'structured_analysis'. It is modified in place.
        prompt: Prompt template handed to ``LLMChain``; it is formatted with
            the variables ``user_prompt`` and ``ai_answer``.
        start: Index of the first row to process; earlier rows are left untouched.

    Returns:
        The same DataFrame, with 'structured_analysis' set to the model
        output, or to 'error' for rows whose request raised an exception.
    """
    if(MODEL == "gpt-3.5-turbo-instruct"):
        llm = OpenAI(model_name = MODEL, temperature = TEMP, timeout=10)
    else:
        llm = ChatOpenAI(model_name = MODEL, temperature = TEMP, timeout=10)

    # The chain does not depend on the row, so it is built once.
    chain = LLMChain(llm=llm, prompt=prompt, callbacks=[handler])

    for i in range(start, len(df)):
        previous = df.loc[i, 'structured_analysis']
        # pd.notna treats both None and NaN as "not analysed yet"; a plain
        # `!= None` check would wrongly skip NaN cells.
        if pd.notna(previous) and previous != "error":
            continue
        print(i)

        # Template variables for this row.
        inputs = {
            'user_prompt': df.loc[i, 'prompt'],
            'ai_answer': str(df.loc[i, 'polarity_pred']),
        }

        try:
            result = chain.run(callbacks=[handler], **inputs)
        except Exception as e:
            # Best effort: report the failure with the prompt that caused it,
            # mark the row so a later re-run retries it, and keep going.
            print(e)
            print(chain.prompt.format_prompt(**inputs).to_string())
            # Single .loc write: chained `df[col][i] = ...` may assign to a
            # temporary copy and leave the frame unchanged.
            df.loc[i, 'structured_analysis'] = 'error'
            continue
        handler.langfuse.flush()

        # Echo the first few prompt/answer pairs for a visual sanity check.
        if (i < 5):
            print(chain.prompt.format_prompt(**inputs).to_string())
            print("\n")
            print(result)
            print("\n")

        if (i % 50 == 0):
            print(str(i) + " of  " + str(len(df)))
        df.loc[i, 'structured_analysis'] = result
    return df

# Fill 'structured_analysis' using the structured prompt. The unstructured
# template was passed here before, leaving prompt_structured unused.
# NOTE(review): confirm p.structured_analysis takes user_prompt and ai_answer.
df = execute(df, prompt_structured)

0
System: You are a helpful AI.
Human: There were a lot of scensters who couldnt afford dinner hanging in the waiting area so we got bumped around a lot.

    What is the sentiment on 'dinner'? Only respond with "positive", "negative" or "neutral" as one word.
AI: neutral
Human: Now explain concisely how you made your prediction and explicitly mention the adjectives that had a high influence on your decision.


The sentiment analysis on the word "dinner" in the provided text is determined to be "neutral" because the context in which "dinner" is mentioned does not inherently carry a positive or negative connotation towards the meal itself. The focus of the sentence is on the situation of people who couldn't afford dinner and the inconvenience caused by being bumped around, rather than the quality or enjoyment of the dinner. There are no adjectives directly modifying "dinner" to suggest a positive or negative sentiment. The sentiment is derived more from the situation (people not affordi

In [5]:
# to_csv does not create missing folders, so make sure the output directory
# exists before writing the analysed sample.
os.makedirs(os.path.dirname(OUTPUT_PATH) or ".", exist_ok=True)
df.to_csv(OUTPUT_PATH, index=False)