In [1]:
import os
from dotenv import load_dotenv

from langchain import PromptTemplate, LLMChain
from langchain.chat_models import ChatOpenAI
from tqdm.notebook import tqdm

import json
import pickle
import string

from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score

import tiktoken
from pdb import set_trace

# Load environment variables (notably OPENAI_API_KEY) from a local .env file,
# so the API key does not have to be written into this notebook.
load_dotenv()

True

In [2]:
# Prices per 1,000 tokens used for the cost estimate written to the result
# files; prompt and completion tokens are billed at different rates.
# NOTE(review): presumably USD and tied to MODEL_ID — confirm against the
# provider's price list before trusting the reported cost.
PRICE_PER_1K_TOKENS_PROMPT = 0.0015
PRICE_PER_1K_TOKENS_COMPLETE = 0.002

# Model queried for every task. It also selects the tiktoken encoding used for
# token counting and is embedded in the names of all output files.
MODEL_ID = "gpt-3.5-turbo-0301"

def num_tokens_from_string(string):
    """Count the tokens `string` is split into by the tokenizer for MODEL_ID.

    Args:
        string: The text to tokenize.

    Returns:
        The length of the token sequence produced by the tiktoken encoding
        that tiktoken associates with MODEL_ID.
    """
    tokenizer = tiktoken.encoding_for_model(MODEL_ID)
    return len(tokenizer.encode(string))

In [3]:
# Every task name has the form '<dataset>_<prompt variant>'. The list is
# generated instead of spelled out so that the dataset/variant grid is visible
# at a glance; the resulting order is the order in which tasks are evaluated.
_WDC_DATASET = 'wdcproducts-80cc-seen-sampled-250-gs-2'
_DATASETS = (
    _WDC_DATASET,
    'abt-buy-sampled-gs',
    'amazon-google-sampled-gs',
    'walmart-amazon-sampled-gs',
    'dblp-scholar-sampled-gs',
    'dblp-acm-sampled-gs',
)

# Eight core prompt designs: (domain|general) x (free|force) x (simple|complex),
# with the simple/complex switch varying fastest.
_CORE_VARIANTS = [
    f'{scope}-{complexity}-{mode}'
    for scope in ('domain', 'general')
    for mode in ('free', 'force')
    for complexity in ('simple', 'complex')
]
_RULE_VARIANTS = [
    'domain-simple-force-rules-heterogeneity',
    'domain-simple-force-rules-gen-handpicked',
]
_RELATEDWORK_VARIANTS = [
    'relatedwork-simple',
    'relatedwork-complex',
]

tasks = []
for _dataset in _DATASETS:
    # The WDC dataset lists its rule-based prompts before the related-work
    # prompts; every other dataset lists them the other way round.
    if _dataset == _WDC_DATASET:
        _extra_variants = _RULE_VARIANTS + _RELATEDWORK_VARIANTS
    else:
        _extra_variants = _RELATEDWORK_VARIANTS + _RULE_VARIANTS
    tasks.extend(
        f'{_dataset}_{_variant}' for _variant in _CORE_VARIANTS + _extra_variants
    )

In [4]:
attributes = ['default']

# Translation table that deletes every ASCII punctuation character.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def parse_yes_no(raw_answer):
    """Reduce a free-text model answer to a yes/no verdict.

    The answer is stripped, punctuation is removed and the text is lower-cased.
    The verdict is the first *whole word* equal to 'yes' or 'no'. Matching
    whole words (rather than substrings) keeps words such as 'eyeshadow',
    'not' or 'technology' from being mistaken for a verdict, and taking the
    first match respects answers that lead with the verdict and then explain.

    Args:
        raw_answer: The text returned by the model.

    Returns:
        A tuple ``(cleaned, verdict)`` where ``cleaned`` is the normalized
        answer text and ``verdict`` is 'yes', 'no', or None when the answer
        contains neither word.
    """
    cleaned = raw_answer.strip().translate(_PUNCTUATION_TABLE).lower()
    verdict = next((word for word in cleaned.split() if word in ('yes', 'no')), None)
    return cleaned, verdict


# The prompt template, model client and chain do not depend on the task, so
# they are built once rather than once per task.
template = """{task_prefix}{input_string}"""
prompt = PromptTemplate(
    template=template,
    input_variables=['task_prefix', 'input_string']
)
model = ChatOpenAI(model_name=MODEL_ID, temperature=0)
llm_chain = LLMChain(
    prompt=prompt,
    llm=model
)

for task in tasks:
    for attribute in attributes:

        # Only the 'default' setting is implemented. Fail before any file or
        # API access instead of scoring stale or undefined predictions.
        if attribute != 'default':
            raise ValueError(f'Unsupported attribute: {attribute}')

        # Load the task definition (shared prefix plus the individual examples).
        with open(f'../tasks/{task}.json', 'r', encoding='utf-8') as f:
            task_dict = json.load(f)

        task_prefix = task_dict['task_prefix']
        examples = task_dict['examples']

        # Render every prompt up front so it can be stored and token-counted.
        prompts = [
            prompt.format(task_prefix=task_prefix, input_string=example['input'])
            for example in examples
        ]
        targets = [example['target_scores'] for example in examples]

        # Token counts cover only the prompt/answer text as tokenized by
        # tiktoken, so the reported cost is an estimate.
        token_count_prompt = sum(num_tokens_from_string(text) for text in prompts)

        # Query the model once per example.
        preds = [
            llm_chain.run({'task_prefix': task_prefix, 'input_string': example['input']})
            for example in tqdm(examples)
        ]

        # Persist prompts and raw answers before any post-processing, so the
        # paid-for model output survives an error in the evaluation below.
        with open(f'../prompts/{task}_{attribute}_{MODEL_ID}_run-1.pickle', 'wb') as handle:
            pickle.dump(prompts, handle, protocol=pickle.HIGHEST_PROTOCOL)
        with open(f'../answers/{task}_{attribute}_{MODEL_ID}_run-1.pickle', 'wb') as handle:
            pickle.dump(preds, handle, protocol=pickle.HIGHEST_PROTOCOL)

        token_count_completion = sum(num_tokens_from_string(pred) for pred in preds)

        # Convert targets and answers into binary labels (1 = match).
        truth = [1 if target['Yes'] == 1 else 0 for target in targets]
        predictions = []
        unclear_answers = 0
        num_long_answers = 0
        for pred in preds:
            processed_pred, verdict = parse_yes_no(pred)

            # Anything other than a bare yes/no is logged for inspection.
            if processed_pred not in ('yes', 'no'):
                print(f'Overlong Answer: {processed_pred}')
                num_long_answers += 1

            # An answer without any verdict is counted as a non-match.
            if verdict is None:
                unclear_answers += 1
                verdict = 'no'

            predictions.append(1 if verdict == 'yes' else 0)

        print(f'{task}: {num_long_answers} overlong answers, {unclear_answers} unclear answers')

        precision = precision_score(truth, predictions)
        recall = recall_score(truth, predictions)
        f1 = f1_score(truth, predictions)
        accuracy = accuracy_score(truth, predictions)

        price = (token_count_prompt/1000)*PRICE_PER_1K_TOKENS_PROMPT + (token_count_completion/1000)*PRICE_PER_1K_TOKENS_COMPLETE

        results = {
            'Accuracy': accuracy,
            'Precision': precision,
            'Recall': recall,
            'F1': f1,
            'cost': price
        }

        with open(f"../results/result_{task}_{attribute}_{MODEL_ID}_run-1.json", "w", encoding='utf-8') as outfile:
            json.dump(results, outfile, indent=2)

  warn_deprecated(


  0%|          | 0/1239 [00:00<?, ?it/s]

  warn_deprecated(


Overlong Answer: no the two product descriptions do not match product 1 is a video security system while product 2 is an infrared range extender for a video system
Overlong Answer: no the two product descriptions do not match the first product is a chicago bulls trucker hat while the second product is a new york yankees 9fifty hat
Overlong Answer: no the two product descriptions do not match the first product is a black on white glossy tape with a size of 19cm x 7m and costs gbp 1349 for one roll the second product is a d1 tape with a size of 12mm x 7m and is sorted in red and black costing dkk 13900
Overlong Answer: yes both product descriptions refer to the samsung t7 portable ssd with a storage capacity of 2tb however there is a difference in color black vs indigo blue and currency gbp vs nzd additionally the first product description mentions a fingerprint id feature which is not mentioned in the second description
Overlong Answer: no the two product descriptions do not match produ

  0%|          | 0/1239 [00:00<?, ?it/s]

Overlong Answer: no the two product descriptions refer to different products product 1 is a video security system while product 2 is an infrared range extender for a video system
Overlong Answer: no the two product descriptions refer to different realworld products product 1 is a chicago bulls trucker hat while product 2 is a new york yankees 9fifty hat
Overlong Answer: yes both product descriptions refer to the same realworld product which is the dymo d1 tape
Overlong Answer: yes both product descriptions refer to the same realworld product which is the samsung t7 portable ssd with a storage capacity of 2tb the only difference is the color and the currency used in the pricing
Overlong Answer: no the two product descriptions do not refer to the same realworld product product 1 is a micro sd card made by kingston while product 2 is a solidstate drive made by transcend
Overlong Answer: no the two product descriptions refer to different products the first product is the jabra biz 2400 ii 

  0%|          | 0/1239 [00:00<?, ?it/s]

KeyboardInterrupt: 