In [4]:
import pandas as pd
import error_prompts as p
import os
from dotenv import load_dotenv
from langfuse.callback import CallbackHandler
from langchain.chains import LLMChain
from langchain_openai import OpenAI
from langchain.chains import LLMChain, SimpleSequentialChain
from langfuse.callback import CallbackHandler
from langchain_openai import ChatOpenAI
import pandas as pd
import json

# Load the .env file
load_dotenv()

# CallbackHandler() is constructed without arguments and no API key is passed
# to the OpenAI clients, so the credentials are presumably picked up from the
# environment. Stop here with a readable message if any of them is absent
# instead of failing later with an opaque error.
REQUIRED_ENV_VARS = ("LANGFUSE_PUBLIC_KEY", "LANGFUSE_SECRET_KEY", "OPENAI_API_KEY")
missing_env_vars = [name for name in REQUIRED_ENV_VARS if name not in os.environ]
if missing_env_vars:
    raise EnvironmentError(
        "Missing required environment variables (set them in .env): "
        + ", ".join(missing_env_vars)
    )

# Langfuse tracing callback; auth_check() is called right away so that a
# credential problem shows up before any data is processed.
handler = CallbackHandler()
handler.auth_check()



# Run settings. `prompt_name`, `file` and `MODEL` are combined into the output
# file name below; `MODEL` and `TEMP` are read by execute().
prompt_name = 'confidence'
prompt = p.confidence
MODEL = "gpt-3.5-turbo"
TEMP = 0

# Base name (without extension) of the evaluation CSV to analyse.
file = "zero_shot_multi_table_gpt-3.5-turbo"

DATA_PATH = f'../../Datasets/Evaluations/Schema Matching/{file}.csv'
OUTPUT_PATH = f"../../Datasets/Evaluations/Schema Matching/Error_Analysis/{prompt_name}_{file}_{MODEL}.csv"






In [5]:
# Read the evaluation CSV and (re)initialise the `analysis` column to None for
# every row, so that no row counts as analysed yet.
df = pd.read_csv(DATA_PATH).assign(analysis=None)


In [6]:
def _parse_column_mappings(raw_reply):
    """Decode an LLM reply into ``(reply_text, column_mappings)``.

    If the reply is not valid JSON as-is, every ``None`` in it is replaced by
    the string ``"None"`` and decoding is attempted once more. The returned
    ``reply_text`` is the text that was actually decoded.

    Raises:
        ValueError: the reply is not valid JSON even after the replacement
            (``json.JSONDecodeError`` is a ``ValueError``).
        KeyError / TypeError: the decoded value has no ``"column_mappings"``
            entry.
    """
    try:
        data = json.loads(raw_reply)
    except ValueError:
        print("replacing Nones")
        raw_reply = raw_reply.replace('None', '"None"')
        data = json.loads(raw_reply)
    return raw_reply, data["column_mappings"]


def execute(df, prompt, prompt_name, start = 0):
    """Run ``prompt`` once per table and write the outcome back into ``df``.

    Rows are visited in positional order starting at ``start``. Whenever
    ``table_name`` differs from the previous row's, the row's ``prompt`` and
    ``ai_answer`` values are sent through an ``LLMChain`` and the reply is
    applied to every row of ``df`` that has the same ``table_name``. Tables
    whose first visited row already holds an ``analysis`` value other than
    ``"error"`` are skipped, so an interrupted run can be resumed.

    Args:
        df: DataFrame with the columns ``table_name``, ``analysis``,
            ``prompt``, ``ai_answer``, ``column_index_left`` and
            ``column_index_right``. It is modified in place.
        prompt: Prompt template handed to ``LLMChain``; it is formatted with
            ``user_prompt`` and ``ai_answer``.
        prompt_name: Label written to the ``prompt_name`` column.
        start: Positional index of the first row to visit.

    Returns:
        The same ``df`` object. On success the rows of a table get ``y_pred``,
        ``prompt_name``, ``prompt`` (the formatted analysis prompt),
        ``ai_answer`` and ``analysis`` (both the LLM reply) written. If the LLM
        call fails or the reply cannot be decoded, only ``analysis`` is set, to
        ``"error"``.
    """
    if MODEL == "gpt-3.5-turbo-instruct":
        llm = OpenAI(model_name = MODEL, temperature = TEMP, timeout=10)
    else:
        llm = ChatOpenAI(model_name = MODEL, temperature = TEMP, timeout=10)
    # The chain does not depend on the row, so it is built once.
    chain = LLMChain(llm=llm, prompt=prompt, callbacks=[handler])

    last = ""
    for i in range(start, len(df)):
        row = df.iloc[i]
        if last == row['table_name']:
            continue
        last = row['table_name']

        # pd.notna treats both None and NaN (a blank cell after a CSV
        # round-trip) as "not analysed yet".
        previous = row["analysis"]
        if pd.notna(previous) and previous != "error":
            continue
        print(i)

        # Read the inputs from the positional row so they stay correct even if
        # the index is not the default 0..n-1 range.
        user_prompt = row['prompt']
        ai_answer = row['ai_answer']
        sub_table = df[df['table_name'] == last]

        try:
            result = chain.run(user_prompt = user_prompt, ai_answer = ai_answer, callbacks=[handler])
        except Exception as e:
            # No reply exists in this branch, so only the exception is printed.
            print(e)
            df.loc[sub_table.index, 'analysis'] = 'error'
            continue
        handler.langfuse.flush()

        prompt_text = chain.prompt.format_prompt(user_prompt = user_prompt, ai_answer = ai_answer).to_string()
        if i < 5:
            print(prompt_text)
            print("\n")
            print(result)
            print("\n")

        if i % 50 == 0:
            print(str(i) + " of  " + str(len(df)))

        try:
            result, column_mappings = _parse_column_mappings(result)
        except (ValueError, KeyError, TypeError) as e:
            # One malformed reply must not abort the whole run; the table is
            # marked so that a later call retries it.
            print(e)
            df.loc[sub_table.index, 'analysis'] = 'error'
            continue

        for x in sub_table.index:
            # NOTE(review): check_output is not defined in the visible part of
            # this notebook - confirm it is available before running.
            y_pred = check_output(sub_table['column_index_left'][x], sub_table['column_index_right'][x], column_mappings)
            df.loc[x, 'y_pred'] = y_pred
            df.loc[x, 'prompt_name'] = prompt_name
            df.loc[x, 'prompt'] = prompt_text
            df.loc[x, 'ai_answer'] = result
            # Record the reply so the skip check above recognises finished
            # tables on a later call.
            df.loc[x, 'analysis'] = result
    return df

# Pass the template bound to `prompt` in the configuration cell; no `PROMPT`
# name is defined in this notebook, so the previous call raised NameError on a
# fresh kernel.
df = execute(df, prompt, prompt_name)
    



0
System: You are a helpful AI.
Human: System: Description: Please identify the matching columns between Table A and Table B. 
     For each column in Table A, specify the corresponding column in Table B. 
     If a column in A has no corresponding column in Table B, you can map it to 'None'. 
     Represent each column mapping using a pair of column headers in a list, i.e., [Table A Column, Table B column or None]. 
     Provide the mapping for each column in Table A and return all mappings in a list. Return the final result as JSON in the format {"column_mappings": "<a list of column pairs>"}.
Human: Question:
Table A:
| Column A-0   | Column A-1        |   Column A-2 | Column A-3                        | Column A-4   |
|:-------------|:------------------|-------------:|:----------------------------------|:-------------|
| Domitian     | son of Vespasian  |          nan | Titus Flavius Domitianus          | 81-96        |
| Nerva        | nan               |          nan | Marcus Coc

In [6]:
# Show the `analysis` column to check how many rows have been filled in.
print(df.loc[:, 'analysis'])

0      {"column_mappings": [["Column A-0", "Column B-...
1      {"column_mappings": [["Column A-0", "Column B-...
2      {"column_mappings": [["Column A-0", "Column B-...
3      {"column_mappings": [["Column A-0", "Column B-...
4      {"column_mappings": [["Column A-0", "Column B-...
                             ...                        
445                                                 None
446                                                 None
447                                                 None
448                                                 None
449                                                 None
Name: analysis, Length: 450, dtype: object


In [5]:
# Create the output folder first so that a missing directory cannot discard
# the results of the finished run.
output_dir = os.path.dirname(OUTPUT_PATH)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)
df.to_csv(OUTPUT_PATH, index=False)
