In [1]:
import os
from dotenv import load_dotenv
from langfuse.callback import CallbackHandler
from langchain.chains import LLMChain
from langchain_openai import OpenAI
from langchain.chains import LLMChain, SimpleSequentialChain
from langchain.prompts import PromptTemplate
from langfuse.callback import CallbackHandler
from langchain_openai import ChatOpenAI
import pandas as pd
import prompts as p
import json

# Load the .env file
load_dotenv()

# Access the environment variables
os.environ["LANGFUSE_PUBLIC_KEY"] = os.environ.get("LANGFUSE_PUBLIC_KEY")
os.environ["LANGFUSE_SECRET_KEY"] = os.environ.get("LANGFUSE_SECRET_KEY")
os.environ["OPENAI_API_KEY"] = os.environ.get("OPENAI_API_KEY")

handler = CallbackHandler()
handler.auth_check()

PROMPT = p.zero_shot_multi_table
prompt_name = "zero_shot_multi_table"
MODEL = "gpt-3.5-turbo"
TEMP = 0

DB_PEDIA_PATH = "../../Datasets/t2d_sm_nh/test/dbpedia_tables/{name}.csv"
WEBTABLES_PATH = "../../Datasets/t2d_sm_nh/test/webtables/{name}.csv"
OUTPUT_PATH = f"../../Datasets/Evaluations/Schema Matching/{prompt_name}_{MODEL}.csv"
CORRESPONDENCES_PATH = "../../Datasets/t2d_sm_nh/test/test_correspondences_sample.csv"


In [2]:
# Read the CSV file into a DataFrame
df = pd.read_csv(CORRESPONDENCES_PATH)
df['y_pred'] = None
df['prompt_name'] = None
df['prompt'] = None

In [3]:
def extract_column(s):
    if "none" in s.lower() or "null" in s.lower():
        return -1
    else: 
        return int(s.split('-')[1])

def check_output(left, right, output):
    for x in output:
        if extract_column(x[0]) == left and extract_column(x[1]) == right:
            return True
    return False

In [5]:
def execute(df, prompt, prompt_name, start = 0):
    last = ""
    if(MODEL == "gpt-3.5-turbo-instruct"):
        llm = OpenAI(model_name = MODEL, temperature = TEMP, timeout=10)
    else:
        llm = ChatOpenAI(model_name = MODEL, temperature = TEMP, timeout=10)
    for i in range(start, len(df)):
        row = df.iloc[i]
        if last != row['table_name']:
            last = row['table_name']
            if(row["y_pred"] != None and row["y_pred"] != "error"):
                continue
            print(i)
            chain = LLMChain(llm=llm, prompt=prompt, callbacks=[handler])

            print(df["table_name"][i])
            webtables_table =  row['table_A']
            db_pedia_table = row['table_B']
            sub_table = df[df['table_name'] == last]
            try: 
                result = chain.run(Table_A = webtables_table, Table_B = db_pedia_table, callbacks=[handler])
            except Exception as e:
                print(e)
                print(result)
                for j in sub_table.index:
                    df.at[j, 'y_pred'] =  'error'
                    df.at[j, 'prompt_name'] = prompt_name
                    df.at[j, 'prompt'] = chain.prompt.format_prompt(Table_A = webtables_table, Table_B = db_pedia_table).to_string()
                continue
            handler.langfuse.flush()
            prompt_text = chain.prompt.format_prompt(Table_A = webtables_table, Table_B = db_pedia_table).to_string()
            if (i < 500):
                print(prompt_text)
                print("\n")
                print(result)
                print("\n")
            
            if (i % 50 == 0):
                print(str(i) + " of  " + str(len(df)))
             
            try: 
                data = json.loads(result)
            except Exception as e:
                print("replacing Nones")
                result = result.replace('None', '"None"')
                data = json.loads(result)
            
            column_mappings = data["column_mappings"]
            
            for x in sub_table.index:
                y_pred = check_output(sub_table['column_index_left'][x], sub_table['column_index_right'][x], column_mappings)
                df.loc[x, 'y_pred'] =  y_pred
                df.loc[x, 'prompt_name'] = prompt_name
                df.loc[x, 'prompt'] = prompt_text
    return df

df = execute(df, PROMPT, prompt_name)
    



26
55961337_0_6548713781034932742
System: Description: Please identify the matching columns between Table A and Table B. 
     For each column in Table A, specify the corresponding column in Table B. 
     If a column in A has no corresponding column in Table B, you can map it to None. 
     Represent each column mapping using a pair of column headers in a list, i.e., [Table A Column, Table B column or None]. 
     Provide the mapping for each column in Table A and return all mappings in a list. Return the final result as JSON in the format {"column_mappings": "<a list of column pairs>"}.
Human: Question:
Table A:
| Column A-0                        | Column A-1   |   Column A-2 | Column A-3     | Column A-4   | Column A-5         |
|:----------------------------------|:-------------|-------------:|:---------------|:-------------|:-------------------|
| Congo, The Democratic Republic Of | nan          |          976 | 100 centimes   | CDF          | Franc Congolais    |
| French Southe

In [6]:
df.head(5)
print(df[df['y_pred'] == True])


     table_index                      table_name  column_index_left  \
0              0  29021592_3_2299138476894681059                  0   
12            12  34899692_0_6530393048033763438                  0   
19            19  34899692_0_6530393048033763438                  1   
82            82  66009064_0_9148652238372261251                  0   
96            96  84548468_0_5955155464119382182                  1   
149          149  48944826_0_2321751364268052533                  3   
173          173  36039980_4_4283009829602711082                  3   
185          185  48456557_0_3760853481322708783                  0   
197          197  48456557_0_3760853481322708783                  1   
210          210  86297395_0_6919201319699354263                  3   
254          254  46646666_0_5802598112171303204                  1   
274          274   3917335_0_7791699395300625164                  0   
280          280   3917335_0_7791699395300625164                  2   
285   

In [6]:
df.to_csv(OUTPUT_PATH, index = True)