In [10]:
import os
# Restrict which GPUs this process may see; this cell runs before the cell that imports torch.
# NOTE(review): the value contains a space after the comma ("0, 3") - confirm the CUDA runtime
# parses it as devices 0 and 3 rather than stopping at the first entry.
os.environ["CUDA_VISIBLE_DEVICES"] = "0, 3"

# Code

In [11]:
from transformers import AutoTokenizer
import transformers
import torch
import json
import pandas as pd
from tqdm import tqdm
import logging
import time
import code
import code_ablation
from collections import defaultdict
from datasets import load_dataset
from datasets import Dataset
# Hugging Face model id shared by the tokenizer and the generation pipeline below.
model = "meta-llama/Llama-2-7b-chat-hf"
# model = "meta-llama/Llama-2-13b-hf"
# Standalone tokenizer; later cells read `tokenizer.eos_token_id` from it (see `ask`).
tokenizer = AutoTokenizer.from_pretrained(model)

# Text-generation pipeline built from the same model id, with float16 weights.
# NOTE(review): `tokenizer` is not passed in here, so the pipeline presumably loads its own
# tokenizer from `model`; device_map="auto" presumably lets the library place the weights on
# the visible GPUs - confirm against the transformers documentation.
pipeline = transformers.pipeline(
    "text-generation",
    model=model,
    torch_dtype=torch.float16,
    device_map="auto",
)


Loading checkpoint shards: 100%|██████████| 2/2 [00:04<00:00,  2.48s/it]


In [12]:
def ask(question:str) -> str:
    """Send `question` through the module-level `pipeline` and return the first generated text.

    Generation runs with sampling disabled, a single returned sequence, the tokenizer's
    EOS token id, and `max_length=400`.

    Args:
        question: Prompt text handed to the pipeline unchanged.

    Returns:
        The 'generated_text' field of the first sequence the pipeline returns, or None
        when the pipeline returns no sequences.
    """
    generation_options = dict(
        do_sample=False,
        num_return_sequences=1,
        eos_token_id=tokenizer.eos_token_id,
        max_length=400,
    )
    outputs = pipeline(question, **generation_options)
    # Only the first sequence is ever used; an empty result falls through to None.
    return next((entry['generated_text'] for entry in outputs), None)

In [13]:
def list_2_str_num(facts:list) -> str:
    """Render `facts` as a 1-based numbered list, one "N. fact" entry per line.

    Every entry, including the last one, is terminated by a newline; an empty
    list yields the empty string.
    """
    return "".join(
        f"{position}. {item}\n" for position, item in enumerate(facts, start=1)
    )
def list_2_str(facts:list) -> str:
    """Render `facts` one per line, without numbering.

    Every entry, including the last one, is terminated by a newline; an empty
    list yields the empty string.
    """
    return "".join(f"{item}\n" for item in facts)


In [14]:
"123".split('q')

['123']

In [15]:
def result_extractor_inference(prompt, result):
    """Parse a raw model generation into a yes/no label and a conclusion.

    The generation is expected to contain the label on its first line and,
    optionally, a second line of the form "Produce: <conclusion>". The echoed
    prompt and anything from the first '</Answer>' onwards are discarded first.

    Args:
        prompt: The prompt that was sent to the model; removed from `result`
            when present.
        result: The raw generated text.

    Returns:
        A tuple `(answer_label, answer_conclusion, answer_conclusion_line)`:
        - answer_label: 'yes' or 'no' when exactly one of those words appears
          on the first line, otherwise 'error'.
        - answer_conclusion: the stripped text following 'Produce:' on the
          second line, or the stripped second line when there is no 'Produce:'.
        - answer_conclusion_line: the second line as found ('' when absent).
    """
    # Local import keeps this cell self-contained.
    import re

    result = result.strip()
    if prompt in result:
        result = result.replace(prompt, "")
    if '</Answer>' in result:
        result = result.split('</Answer>')[0]
    # Strip again AFTER removing the prompt: the continuation usually begins
    # with a space or newline, which would otherwise make line 0 empty and
    # shift the label onto the conclusion line.
    result = result.strip()

    result_lines = result.split('\n')
    answer_label_line = result_lines[0]
    answer_conclusion_line = result_lines[1] if len(result_lines) > 1 else ''

    # Whole-word matching, so that "know", "not", "nothing", ... are not
    # mistaken for the label "no".
    labels_found = set(re.findall(r'\b(?:yes|no)\b', answer_label_line.lower()))
    if labels_found == {'yes'}:
        answer_label = 'yes'
    elif labels_found == {'no'}:
        answer_label = 'no'
    else:
        # Neither label, or both at once: treat as unparseable.
        answer_label = 'error'

    conclusion_parts = answer_conclusion_line.split('Produce:')
    if len(conclusion_parts) > 1:
        answer_conclusion = conclusion_parts[1].strip()
    else:
        # Strip here as well so both branches return a normalized conclusion.
        answer_conclusion = conclusion_parts[0].strip()

    return answer_label, answer_conclusion, answer_conclusion_line
    

In [16]:
from datasets import load_dataset
from datasets import Dataset


def evaluate_dev(prompt_answer_dict_list:list):
    """Query the model on every example and print label/conclusion accuracy counts.

    Each example is a dict read through the keys 'prompt', 'answer' (gold
    label, compared against 'yes' / 'no') and 'conclusion' (gold conclusion
    text). The prompt is sent through `ask`, the generation is parsed with
    `result_extractor_inference`, and counters are kept overall and split by
    gold label for: correct label, correct conclusion, both correct, and
    predicted conclusion equal to 'NOTHING'.

    Args:
        prompt_answer_dict_list: List of example dicts as described above.

    Returns:
        None. The summary is printed.

    NOTE(review): the report hard-codes denominators of 100 / 50 / 50 and
    derives percentages with `* 2`, i.e. it assumes 100 examples split evenly
    between 'yes' and 'no'. `n` counts the examples actually seen but is not
    used in the report.
    """
    p_acc   = 0
    p_acc_T = 0
    p_acc_F = 0
    c_acc   = 0
    c_acc_T = 0
    c_acc_F = 0
    b_acc   = 0
    b_acc_T = 0
    b_acc_F = 0
    n_err   = 0
    n       = 0
    num_NOTHING = 0
    num_NOTHING_T = 0
    num_NOTHING_F = 0

    print("[nr] [pred] [conc]; [gold] [conc] [full]")
    for prompt_answer in prompt_answer_dict_list:
        prompt      = prompt_answer['prompt']
        gold_answer = prompt_answer['answer']
        conclusion  = prompt_answer['conclusion']

        answer = ask(prompt)
        answer_label, answer_conclusion, answer_full = result_extractor_inference(prompt, answer)

        # An unparseable label is counted as an error, never as a correct prediction.
        error = answer_label == 'error'
        correct_pred = (not error) and answer_label == gold_answer
        correct_conc = answer_conclusion == conclusion

        # print(f"[{n}] [{'correct' if correct_pred else 'error' if error else 'wrong'}] [{'correct' if correct_conc else 'wrong'}] [{gold_answer}/{answer_label}] [{conclusion}/{answer_conclusion}] {answer_full}")
        n += 1

        if answer_conclusion == 'NOTHING':
            num_NOTHING += 1
            if gold_answer == 'yes':
                num_NOTHING_T += 1
            elif gold_answer == 'no':
                num_NOTHING_F += 1

        if error:
            n_err += 1

        if correct_pred:
            p_acc += 1
            if gold_answer == 'yes':
                p_acc_T += 1
            elif gold_answer == 'no':
                p_acc_F += 1

        if correct_conc:
            c_acc += 1
            if gold_answer == 'yes':
                c_acc_T += 1
            # Gold labels are 'yes' / 'no' everywhere else in this function;
            # the previous comparison against 'False' could never match, so
            # Conc_FAcc was always reported as 0.
            elif gold_answer == 'no':
                c_acc_F += 1

        if correct_pred and correct_conc:
            b_acc += 1
            if gold_answer == 'yes':
                b_acc_T += 1
            elif gold_answer == 'no':
                b_acc_F += 1

    # Built line by line so the exact report text (including the trailing
    # space after the first line's comma) is explicit.
    report_lines = [
        f"Pred_Acc*:\t{p_acc}/100, ",
        f"Pred_TAcc*:\t{p_acc_T}/50, {p_acc_T*2}%",
        f"Pred_FAcc*:\t{p_acc_F}/50, {p_acc_F*2}%",
        "",
        f"Conc_Acc:\t{c_acc}/100,",
        f"Conc_TAcc*:\t{c_acc_T}/50, {c_acc_T*2}%",
        f"Conc_FAcc:\t{c_acc_F}/50, {c_acc_F*2}%",
        "",
        f"Both_Acc:\t{b_acc}/100,",
        f"Both_TAcc:\t{b_acc_T}/50, {b_acc_T*2}%",
        f"Both_FAcc:\t{b_acc_F}/50, {b_acc_F*2}%",
        "",
        f"num_NOTHING:\t{num_NOTHING}/100,",
        f"num_NOTHING_T:\t{num_NOTHING_T}/50, {num_NOTHING_T*2}%",
        f"num_NOTHING_F:\t{num_NOTHING_F}/50, {num_NOTHING_F*2}%",
        "",
        f"Err:\t{n_err}/100",
    ]
    print("\n".join(report_lines))



# Few shot

In [18]:
def proofwriter_get_prompt(theory, question) -> str:
    """Build the one-example prompt asking whether a statement follows from a theory.

    The prompt consists of a task sentence, one worked example (theory, query,
    answer), then the caller's `theory` and `question`, and ends with
    "Your Answer:" so the model's continuation is the answer.

    NOTE(review): the task sentence contains the typo "ture"; it is kept
    verbatim because changing the prompt text changes what the model sees.

    Args:
        theory: Facts and rules text inserted after "Facts and Rules: ".
        question: Statement text inserted after "Statements: ".

    Returns:
        The assembled prompt string.
    """
    # Fixed preamble: task description followed by a single worked example.
    preamble = "\n".join([
        "Task: all of the facts and rules are ture. Based on the provided Facts and Rules, please answer: is the Statement true or false?",
        "Example Theory: Dave is big. Dave is blue. Dave is furry. Dave is nice. Dave is rough. Dave is round. Dave is white. If Dave is blue and Dave is not furry then Dave is white. If someone is round and not blue then they are nice.",
        "Example Query: Dave is nice.",
        "Example Answer: True",
    ])
    return f"{preamble}\nFacts and Rules: {theory}\nStatements: {question}\nYour Answer:"

# Smoke test: build the prompt for a one-fact, one-rule theory and print the raw generation.
# The printed text recorded below this cell starts with the prompt itself, followed by the
# model's continuation.
facts='Bob is good. If Bob is good then Bob is nice.'
query='Bob is nice.'
prompt = proofwriter_get_prompt(facts, query)
answer = ask(prompt)
print(answer)

Task: all of the facts and rules are ture. Based on the provided Facts and Rules, please answer: is the Statement true or false?
Example Theory: Dave is big. Dave is blue. Dave is furry. Dave is nice. Dave is rough. Dave is round. Dave is white. If Dave is blue and Dave is not furry then Dave is white. If someone is round and not blue then they are nice.
Example Query: Dave is nice.
Example Answer: True
Facts and Rules: Bob is good. If Bob is good then Bob is nice.
Statements: Bob is nice.
Your Answer: True

Please provide the Statement you would like me to evaluate, and I will let you know if it is true or false based on the provided Facts and Rules.
