In [1]:
import os
from dotenv import load_dotenv
from langfuse.callback import CallbackHandler
from langchain.chains import LLMChain
from langchain_openai import OpenAI
from langchain.chains import LLMChain, SimpleSequentialChain
from langchain.prompts import PromptTemplate
from langfuse.callback import CallbackHandler
from langchain_openai import ChatOpenAI
import pandas as pd
import prompts as p


# Load the .env file so the credentials checked below are present in os.environ.
load_dotenv()

# Fail fast with a readable message when a credential is missing.
# (Re-assigning os.environ[X] = os.environ.get(X) is a no-op when X is set and
# raises "TypeError: str expected, not NoneType" when it is not.)
_REQUIRED_ENV_VARS = ("LANGFUSE_PUBLIC_KEY", "LANGFUSE_SECRET_KEY", "OPENAI_API_KEY")
_missing_env_vars = [name for name in _REQUIRED_ENV_VARS if not os.environ.get(name)]
if _missing_env_vars:
    raise RuntimeError(
        "Missing required environment variable(s): "
        + ", ".join(_missing_env_vars)
        + ". Define them in the .env file or in the shell environment."
    )

# Langfuse callback handler; it is attached to every chain call in this notebook.
handler = CallbackHandler()
handler.auth_check()

# Experiment configuration.
PROMPT = p.zero_shot_single_table       # prompt template taken from the local prompts module
prompt_name = "zero_shot_single_table"  # label stored next to each prediction and used in the output file name
MODEL = "gpt-3.5-turbo"                 # model name handed to the LLM wrapper
MAX_TOKENS = 15                         # passed as max_tokens to the LLM wrapper
TEMP = 0                                # passed as temperature to the LLM wrapper

# Input / output locations, relative to the notebook's working directory.
OUTPUT_PATH = f"../../Datasets/Evaluations/Schema Matching/{prompt_name}_{MODEL}.csv"
CORRESPONDENCES_PATH = "../../Datasets/t2d_sm_nh/test/test_correspondences_sample.csv"





In [2]:
# Load the column correspondences and append three empty columns that the
# evaluation loop fills in: the prediction, the prompt label and the prompt text.
df = pd.read_csv(CORRESPONDENCES_PATH).assign(
    y_pred=None,
    prompt_name=None,
    prompt=None,
)
print(df.head())

   table_index                      table_name  column_index_left  \
0            0  29021592_3_2299138476894681059                  0   
1            1  29021592_3_2299138476894681059                  3   
2            2  29021592_3_2299138476894681059                  4   
3            3  29021592_3_2299138476894681059                  2   
4            4  29021592_3_2299138476894681059                  0   

   column_index_right  y_true y_pred prompt_name prompt  \
0                   0    True   None        None   None   
1                   0   False   None        None   None   
2                   0   False   None        None   None   
3                   0   False   None        None   None   
4                   3   False   None        None   None   

                                      column_table_A  \
0        Domitian | Nerva | Hadrian | Trajan | Titus   
1  Titus Flavius Domitianus | Marcus Cocceius Ner...   
2              81-96 | 96-8 | 117-8 | 98-117 | 79-81   
3     

In [3]:
def execute(df, prompt, prompt_name, start = 0):
    """Ask the configured LLM, row by row, whether two columns correspond.

    For every row at position >= ``start`` that still needs a prediction, the
    prompt is filled with the row's two column indices and with the tables of
    the current ``table_name`` group, sent to the model, and the parsed answer
    is written back into ``df`` in place.

    Rows whose ``y_pred`` already holds a value other than ``"error"`` are
    skipped, so an interrupted run can be resumed by calling the function
    again on the same frame. Missing values (``None`` or NaN) count as
    "not predicted yet".

    Args:
        df: DataFrame providing the columns ``table_name``, ``table_A``,
            ``table_B``, ``column_index_left``, ``column_index_right``,
            ``y_true`` and ``y_pred``. ``table_A``/``table_B`` are read from
            the first row of each consecutive run of equal ``table_name``
            values. Index labels are assumed to be unique.
        prompt: Prompt template accepting the variables ``Column_A``,
            ``Column_B``, ``Table_A`` and ``Table_B``.
        prompt_name: Label stored in the ``prompt_name`` column of every
            processed row.
        start: Position of the first row to process.

    Returns:
        The same ``df`` object with ``y_pred``, ``prompt_name`` and ``prompt``
        filled in for the processed rows. ``y_pred`` is ``"True"`` or
        ``"False"`` when the answer could be parsed, ``"Error"`` when it
        contained neither word (not retried on a later call), and ``"error"``
        when the model call raised (retried on a later call).
    """
    if MODEL == "gpt-3.5-turbo-instruct":
        llm = OpenAI(model_name = MODEL, temperature = TEMP, max_tokens=MAX_TOKENS, timeout=10)
    else:
        llm = ChatOpenAI(model_name = MODEL, temperature = TEMP, max_tokens=MAX_TOKENS, timeout=10)

    # The chain does not depend on the row, so it is built once.
    chain = LLMChain(llm=llm, prompt=prompt, callbacks=[handler])

    def _show_exchange(prompt_text, y_true, result):
        # Print the prompt, the expected label and the raw model answer.
        print(prompt_text)
        print("Actual Result:\n" + str(y_true) + "\n")
        print("\n")
        print(result)
        print("\n")

    last = None  # first row of the current run of equal table_name values
    for i in range(len(df)):
        row = df.iloc[i]
        # Track the table group for EVERY row, including rows before `start`
        # and rows that are skipped, so a resumed run still picks up the
        # tables from the first row of the group.
        if last is None or row["table_name"] != last["table_name"]:
            last = row
        if i < start:
            continue

        label = df.index[i]
        previous = df.at[label, "y_pred"]
        # pd.isna covers both None and NaN (the latter appears when the frame
        # was reloaded from CSV).
        if not pd.isna(previous) and previous != "error":
            continue
        print(i)

        inputs = {
            "Column_A": str(row["column_index_left"]),
            "Column_B": str(row["column_index_right"]),
            "Table_A": last["table_A"],
            "Table_B": last["table_B"],
        }
        # to_string() is available on both string and chat prompt values,
        # whereas .text only exists on string prompt values.
        prompt_text = chain.prompt.format_prompt(**inputs).to_string()

        try:
            result = chain.run(**inputs, callbacks=[handler])
        except Exception as e:
            # Record the failure and move on; the row is retried on the next call.
            print(e)
            df.at[label, 'y_pred'] = 'error'
            df.at[label, 'prompt_name'] = prompt_name
            df.at[label, 'prompt'] = prompt_text
            continue
        handler.langfuse.flush()

        if i < 5:
            _show_exchange(prompt_text, row["y_true"], result)

        if i % 50 == 0:
            print(str(i) + " of  " + str(len(df)))

        answer = result.lower()
        if "true" in answer:
            column_pred = "True"
        elif "false" in answer:
            column_pred = "False"
        else:
            # Neither word found: show the exchange for manual inspection.
            _show_exchange(prompt_text, row["y_true"], result)
            column_pred = "Error"
        df.at[label, 'y_pred'] = column_pred
        df.at[label, 'prompt_name'] = prompt_name
        df.at[label, 'prompt'] = prompt_text
    return df

# Run the evaluation over the whole frame; predictions are written into df.
df = execute(df, prompt=PROMPT, prompt_name=prompt_name)
    



0


  warn_deprecated(
  warn_beta(


Human: Question:
Table A:
| Column A-0   | Column A-1        |   Column A-2 | Column A-3                        | Column A-4   |
|:-------------|:------------------|-------------:|:----------------------------------|:-------------|
| Domitian     | son of Vespasian  |          nan | Titus Flavius Domitianus          | 81-96        |
| Nerva        | nan               |          nan | Marcus Cocceius Nerva             | 96-8         |
| Hadrian      | Kinsman of Trajan |          nan | Publius Aelius Hadrianus          | 117-8        |
| Trajan       | nan               |          nan | Marcus Ulpius Trajanus            | 98-117       |
| Titus        | son of Vespasian  |          nan | Titus Flavius Sabinus Vespasianus | 79-81        |

Table B:
| Column B-0            |   Column B-1 | Column B-2                                          | Column B-3                                  | Column B-4                | Column B-5                 |
|:----------------------|-------------:|:----

KeyboardInterrupt: 

In [4]:
# Preview the first five rows (head() defaults to n=5).
df.head()

Unnamed: 0,table_index,table_name,column_index_left,column_index_right,y_true,y_pred,prompt_name,prompt,column_table_A,column_table_B,table_A,table_B
0,0,29021592_3_2299138476894681059,0,0,True,True,zero_shot_single_table,System: Description: Please identify if Column...,Domitian | Hadrian | Nerva | Titus | Trajan,Marcus Silius Messala | Titus | Hadrian | Vesp...,| Column A-0 | Column A-1 | Column ...,| Column B-0 | Column B-1 | Colum...
1,1,29021592_3_2299138476894681059,3,0,False,True,zero_shot_single_table,System: Description: Please identify if Column...,Titus Flavius Domitianus | Publius Aelius Hadr...,Marcus Silius Messala | Titus | Hadrian | Vesp...,,
2,2,29021592_3_2299138476894681059,4,0,False,True,zero_shot_single_table,System: Description: Please identify if Column...,81-96 | 117-8 | 96-8 | 79-81 | 98-117,Marcus Silius Messala | Titus | Hadrian | Vesp...,,
3,3,29021592_3_2299138476894681059,2,0,False,True,zero_shot_single_table,System: Description: Please identify if Column...,nan | nan | nan | nan | nan,Marcus Silius Messala | Titus | Hadrian | Vesp...,,
4,4,29021592_3_2299138476894681059,0,3,False,True,zero_shot_single_table,System: Description: Please identify if Column...,Domitian | Hadrian | Nerva | Titus | Trajan,nan | Imperator Titus Caesar Vespasianus Augus...,,


In [6]:
# Persist the frame, including its index, to the configured output file.
df.to_csv(path_or_buf=OUTPUT_PATH, index=True)