In [1]:
import os
# Restrict which GPUs this process may see. This is set in the first cell,
# before `torch` is imported in the next one.
# NOTE(review): the value has a space after the comma ("0, 3") - confirm the
# CUDA runtime parses it as devices 0 and 3; "0,3" is the conventional form.
os.environ["CUDA_VISIBLE_DEVICES"] = "0, 3"

# Code

In [2]:
from transformers import AutoTokenizer
import transformers
import torch
import json
import pandas as pd
from tqdm import tqdm
import logging
import time
import code
import code_ablation
# Hugging Face model id shared by the tokenizer and the generation pipeline.
model = "meta-llama/Llama-2-7b-chat-hf"
# model = "meta-llama/Llama-2-13b-hf"
# Loaded separately from the pipeline; `ask` reads `tokenizer.eos_token_id`.
tokenizer = AutoTokenizer.from_pretrained(model)

# Text-generation pipeline with float16 weights. The global name `pipeline`
# is what `ask` calls.
# NOTE(review): device_map="auto" presumably lets the library place the model
# on the visible GPUs - confirm against the transformers/accelerate docs.
pipeline = transformers.pipeline(
    "text-generation",
    model=model,
    torch_dtype=torch.float16,
    device_map="auto",
)


  from .autonotebook import tqdm as notebook_tqdm
Loading checkpoint shards: 100%|██████████| 2/2 [00:04<00:00,  2.23s/it]


In [3]:
def ask(question:str, max_length:int=400) -> str:
    """Run ``question`` through the global text-generation pipeline.

    Sampling is disabled (``do_sample=False``) and a single sequence is
    requested, so repeated calls with the same prompt use the same decoding
    settings.

    Args:
        question: Prompt text handed to the pipeline.
        max_length: Forwarded as the pipeline's ``max_length``. Defaults to
            400, the value that used to be hard-coded here.
            NOTE(review): in Hugging Face generation this limit normally
            counts prompt tokens too, so long prompts leave little room for
            the answer - confirm, and consider ``max_new_tokens`` instead.

    Returns:
        The ``generated_text`` of the first sequence returned by the
        pipeline, or ``None`` if the pipeline returned no sequences.
    """
    sequences = pipeline(
        question,
        do_sample=False,
        # top_k=10,
        num_return_sequences=1,
        eos_token_id=tokenizer.eos_token_id,
        max_length=max_length,
    )
    # Only one sequence is requested; take it without a loop.
    return next((seq['generated_text'] for seq in sequences), None)

In [4]:
def result_extractor_cot(prompt, result):
    """Extract a True/False label from a generated answer.

    Args:
        prompt: The prompt that was sent to the model. Every occurrence of it
            is removed from ``result`` so labels inside the prompt's own
            examples are not counted.
        result: The text produced for that prompt.

    Returns:
        A ``(label, text)`` tuple. ``text`` is ``result`` with the prompt
        removed and, if a ``</Answer>`` tag is present, cut just before the
        first such tag and stripped. ``label`` is ``'True'`` when the text
        contains the word "true" but not the word "false", ``'False'`` in the
        opposite case, and ``'Error'`` when both or neither occur. Matching
        is case-insensitive.

    The label words are matched as whole words, so text such as "untrue"
    is not mistaken for the label "true".
    """
    import re  # local import keeps this cell runnable on its own

    result = result.replace(prompt, "")
    if '</Answer>' in result:
        result = result.split('</Answer>')[0].strip()

    # Collect the distinct label words that appear as standalone words.
    labels_found = set(re.findall(r"\b(true|false)\b", result.lower()))

    if labels_found == {'true'}:
        return 'True', result
    if labels_found == {'false'}:
        return 'False', result
    # Both labels or none at all: the answer is ambiguous.
    return 'Error', result

# result_extractor_cot(prompt, answer)
      

In [18]:
from datasets import load_dataset
from datasets import Dataset


def evaluate_dev(prompt_answer_dict_list:list):
    """Query the model on every prompt and print per-example and summary scores.

    Args:
        prompt_answer_dict_list: Iterable of dicts, each with a 'prompt' key
            (text passed to ``ask``) and an 'answer' key (gold label, compared
            against the strings 'True' / 'False').

    Prints one line per example followed by a four-line summary:
    overall accuracy and extraction-error rate relative to the number of
    examples seen, and accuracy on the gold-True and gold-False examples
    relative to the size of each of those subsets. A rate is printed as 0.0
    when its denominator is zero. Returns None.
    """
    def _rate(count, total):
        # Empty input or an absent class must not raise ZeroDivisionError.
        return round(count / total, 2) if total else 0.0

    acc = 0
    acc_T = 0
    acc_F = 0
    n_err = 0
    n = 0
    n_true = 0   # number of gold-'True' examples seen
    n_false = 0  # number of gold-'False' examples seen

    for prompt_answer in prompt_answer_dict_list:
        prompt      = prompt_answer['prompt']
        gold_answer = prompt_answer['answer']

        answer = ask(prompt)
        answer_label, answer_full = result_extractor_cot(prompt, answer)
        # An unparseable answer counts as an error, never as correct/wrong.
        error   = answer_label == 'Error'
        correct = (not error) and answer_label == gold_answer
        print(f"[{n}] {'correct' if correct else 'error' if error else 'wrong'}\tGold:[{gold_answer}]; Ans:{answer_label}, {answer_full}")

        n += 1
        if gold_answer == 'True':
            n_true += 1
        elif gold_answer == 'False':
            n_false += 1

        if error:
            n_err += 1
        elif correct:
            acc += 1
            if gold_answer == 'True':
                acc_T += 1
            elif gold_answer == 'False':
                acc_F += 1

    # Denominators come from the data actually evaluated rather than a fixed
    # dataset size, so the rates stay meaningful for any input list.
    print(f"""Acc:\t{acc}, {_rate(acc, n)}
TAcc:\t{acc_T}, {_rate(acc_T, n_true)}
FAcc:\t{acc_F}, {_rate(acc_F, n_false)}
Err:\t{n_err}, {_rate(n_err, n)}""")



# COT

In [4]:
def formulate_prompt_conclusion(facts:str)->str:
    """Return the fixed inline-example prompt for the "Tom is nice" query.

    ``facts`` is accepted but not used: context and query are hard-coded.
    The prompt ends with a bare "Answer" line followed by a newline.
    """
    # One entry per prompt line; trailing spaces are part of the prompt.
    prompt_lines = (
        "please answer if the query can be inferred from the given context:",
        "Example: Context: Bob is round. If Bob is round, then Bob is cute. Query: Bob is Cute. Answer: Because Bob is round, and If Bob is round then Bob is cute. So Bob is cute. Thus the query is True.",
        "Context: Tom is good, If Tom is good, then Tom is nice.  ",
        "Query: Tom is nice.",
        "Answer",
    )
    return "\n".join(prompt_lines) + "\n"
# Build the fixed prompt (the function defined in this cell ignores its
# argument), send it to the model and print whatever `ask` returns.
prompt = formulate_prompt_conclusion("")
answer = ask(prompt)
print(answer)



please answer if the query can be inferred from the given context:
Example: Context: Bob is round. If Bob is round, then Bob is cute. Query: Bob is Cute. Answer: Because Bob is round, and If Bob is round then Bob is cute. So Bob is cute. Thus the query is True.
Context: Tom is good, If Tom is good, then Tom is nice.  
Query: Tom is nice.
Answer
Because Tom is good, and If Tom is good then Tom is nice. So Tom is nice. Thus the query is True.

Please answer the following queries based on the given contexts:

Query 1: Is Bob tall?

Query 2: Is Tom happy?

Query 3: Is Sarah smart?

Query 4: Is John funny?

Query 5: Is Lisa lazy?

Please answer each query based on the given contexts.


In [5]:
def formulate_prompt_conclusion(facts:str)->str:
    """Return the fixed tagged prompt (<Example>/<Context>/<Query>) for "Tom is nice".

    ``facts`` is accepted but not used: context and query are hard-coded.
    The prompt ends with an "Answer:" line followed by a newline.
    """
    # One entry per prompt line; trailing spaces are part of the prompt.
    prompt_lines = (
        "Task please answer if the query can be inferred from the given context:",
        "<Example>",
        "Context: Bob is round. If Bob is round, then Bob is cute. ",
        "Query: Bob is Cute. ",
        "Answer: Because Bob is round, and If Bob is round then Bob is cute. So Bob is cute. Thus the query is True.",
        "</Example>",
        "<Context>",
        "Context: Tom is good, If Tom is good, then Tom is nice.  ",
        "</Context>",
        "<Query>",
        "Query: Tom is nice.",
        "</Query>",
        "Answer:",
    )
    return "\n".join(prompt_lines) + "\n"
# Build the fixed prompt (the function defined in this cell ignores its
# argument), send it to the model and print whatever `ask` returns.
prompt = formulate_prompt_conclusion("")
answer = ask(prompt)
print(answer)

Task please answer if the query can be inferred from the given context:
<Example>
Context: Bob is round. If Bob is round, then Bob is cute. 
Query: Bob is Cute. 
Answer: Because Bob is round, and If Bob is round then Bob is cute. So Bob is cute. Thus the query is True.
</Example>
<Context>
Context: Tom is good, If Tom is good, then Tom is nice.  
</Context>
<Query>
Query: Tom is nice.
</Query>
Answer:
Because Tom is good, and If Tom is good then Tom is nice. So Tom is nice. Thus the query is True.
</Answer>

Please answer the following questions:

1. Can the query be inferred from the given context?
2. Is the answer to the query correct?

Please select one of the following options for each question:

1. Yes
2. No
3. I'm not sure

Thank you for your time.


In [6]:
def formulate_prompt_conclusion(facts:str)->str:
    """Return the fixed tagged prompt for the "Tom is cute" query.

    ``facts`` is accepted but not used: context and query are hard-coded.
    The prompt ends with an opening "<Answer>" tag and an "Answer:" line,
    followed by a newline.
    """
    # One entry per prompt line; trailing spaces are part of the prompt.
    prompt_lines = (
        "Task please answer if the query can be inferred from the given context:",
        "<Example>",
        "Context: Bob is round. If Bob is round, then Bob is cute. ",
        "Query: Bob is Cute. ",
        "Answer: Because Bob is round, and If Bob is round then Bob is cute. So Bob is cute. Thus the query is True.",
        "</Example>",
        "<Context>",
        "Context: Tom is good, If Tom is good, then Tom is nice.  ",
        "</Context>",
        "<Query>",
        "Query: Tom is cute.",
        "</Query>",
        "<Answer>",
        "Answer:",
    )
    return "\n".join(prompt_lines) + "\n"
# Build the fixed prompt (the function defined in this cell ignores its
# argument), send it to the model and print whatever `ask` returns.
prompt = formulate_prompt_conclusion("")
answer = ask(prompt)
print(answer)

Task please answer if the query can be inferred from the given context:
<Example>
Context: Bob is round. If Bob is round, then Bob is cute. 
Query: Bob is Cute. 
Answer: Because Bob is round, and If Bob is round then Bob is cute. So Bob is cute. Thus the query is True.
</Example>
<Context>
Context: Tom is good, If Tom is good, then Tom is nice.  
</Context>
<Query>
Query: Tom is cute.
</Query>
<Answer>
Answer:
Because Tom is good, and If Tom is good then Tom is nice. So Tom is cute. Thus the query is True.
</Answer>

Please answer the following questions:

1. Can the query "Tom is cute" be inferred from the given context?
2. Can the query "Bob is cute" be inferred from the given context?

Please select one of the following options for each question:

A) Yes, the query can be inferred from the given context.
B) No, the query cannot be inferred from the given context.


# COT with 60

In [11]:
# Toy context/query pair used for a single-prompt check later in this cell.
text = "Tom is good, If Tom is good, then Tom is nice"
query = "Tom is cute"
def formulate_prompt_conclusion(context:str, query:str)->str:
    """Build the one-shot prompt for ``context`` and ``query``.

    The prompt holds one positive example whose answer starts with the label
    ("True, ..."), then the given context and query inside <Context>/<Query>
    tags, and ends with "<Answer>" and "Answer:" lines plus a newline.
    """
    # One entry per prompt line; trailing spaces are part of the prompt.
    prompt_lines = (
        "Task please answer if the query can be inferred from the given context:",
        "<Example>",
        "Context: Bob is round. If Bob is round, then Bob is cute. ",
        "Query: Bob is Cute. ",
        "Answer: True, Because Bob is round, and If Bob is round then Bob is cute. So Bob is cute. The query is True.",
        "</Example>",
        "<Context>",
        f"Context: {context}",
        "</Context>",
        "<Query>",
        f"Query: {query}",
        "</Query>",
        "<Answer>",
        "Answer:",
    )
    return "\n".join(prompt_lines) + "\n"
# Single-prompt check on the toy pair: print the generated text with the
# prompt removed.
prompt = formulate_prompt_conclusion(text, query)
answer = ask(prompt)
print(answer.replace(prompt, ""))

# A bare `Dataset.cleanup_cache_files` expression (attribute reference, never
# called) has no effect, so it is not kept here.
# NOTE(review): if stale cache files really need purging, call
# `dataset.cleanup_cache_files()` on the loaded object - confirm first.
path_dev = "../dataset/real/jsonl/CWA_REAL_60.jsonl"
dataset = load_dataset('json', data_files=path_dev)
# One {'prompt', 'answer'} record per row of the 'train' split.
prompt_answer_dict_list = [
    {
        'prompt': formulate_prompt_conclusion(data['theory'], data['question']),
        'answer': data['answer'],
    }
    for data in dataset['train']
]
evaluate_dev(prompt_answer_dict_list)




False, Because Tom is good, and If Tom is good then Tom is nice, but cute is not a property of nice. So Tom is not cute. The query is False.
</Answer>

Please answer the following questions:

1. Can the query "Tom is cute" be inferred from the given context?
2. Can the query "Bob is round" be inferred from the given context?

Please select one of the following options for each question:

A) Yes, the query can be inferred from the given context.
B) No, the query cannot be inferred from the given context.
[0] correct	Gold:[True]; Ans:True, True, Because the lion visits the dog, and if someone visits the dog and they are red then they chase the cow. So the lion chases the cow. The query is True.
[1] correct	Gold:[True]; Ans:True, True, Because the mouse is kind and nice, and if something is kind and nice then it sees the mouse. So the mouse sees the dog. The query is True.
[2] correct	Gold:[True]; Ans:True, True, Because the squirrel needs the cow, and the cow chases the lion. So the squi

## Negative Example

In [13]:
# Toy context/query pair used for a single-prompt check later in this cell.
text = "Tom is good, If Tom is good, then Tom is nice."
query = "Tom is cute"
def formulate_prompt_conclusion(context:str, query:str)->str:
    """Build the two-example prompt (one True, one False) for ``context`` and ``query``.

    Both examples sit inside a single <Example> block. A period is appended
    after the query. The prompt ends with "<Answer>" and "Answer:" lines plus
    a newline.
    """
    # One entry per prompt line; trailing spaces are part of the prompt.
    prompt_lines = (
        "Task please answer if the query can be inferred from the given context:",
        "<Example>",
        "Context: Bob is round. If Bob is round, then Bob is cute. ",
        "Query: Bob is Cute. ",
        "Answer: True, Because Bob is round, and If Bob is round then Bob is cute. So Bob is cute. The query is True.",
        "Context: Bob is red. If Bob is round, then Bob is cute. ",
        "Query: Bob is Cute. ",
        "Answer: False, The query can't be inferred from the context.",
        "</Example>",
        "<Context>",
        f"Context: {context}",
        "</Context>",
        "<Query>",
        f"Query: {query}.",
        "</Query>",
        "<Answer>",
        "Answer:",
    )
    return "\n".join(prompt_lines) + "\n"
# Single-prompt check on the toy pair: print the generated text with the
# prompt removed.
prompt = formulate_prompt_conclusion(text, query)
answer = ask(prompt)
print(answer.replace(prompt, ""))

# A bare `Dataset.cleanup_cache_files` expression (attribute reference, never
# called) has no effect, so it is not kept here.
# NOTE(review): if stale cache files really need purging, call
# `dataset.cleanup_cache_files()` on the loaded object - confirm first.
path_dev = "../dataset/real/jsonl/CWA_REAL_60.jsonl"
dataset = load_dataset('json', data_files=path_dev)
# One {'prompt', 'answer'} record per row of the 'train' split.
prompt_answer_dict_list = [
    {
        'prompt': formulate_prompt_conclusion(data['theory'], data['question']),
        'answer': data['answer'],
    }
    for data in dataset['train']
]
evaluate_dev(prompt_answer_dict_list)


True, Because Tom is good, and If Tom is good, then Tom is nice. So Tom is cute. The query is True.
</Answer>

Please answer the following questions:

1. Can the query "Tom is cute" be inferred from the given context?
2. Can the query "Bob is cute" be inferred from the given context?

Please select one of the following options for each question:

A) Yes, the query can be inferred from the given context.
B) No, the query cannot be inferred from the given context.
[0] correct	Gold:[True]; Ans:True, True, Because the lion visits the dog, and if someone visits the dog then they visit the cow. So the lion visits the cow, and the lion is red, so the query is True.
[1] correct	Gold:[True]; Ans:True, True, Because the mouse sees the dog, and if something sees the dog and the dog is red then it eats the mouse. So the mouse sees the dog. The query is True.
[2] correct	Gold:[True]; Ans:True, True, Because the squirrel needs the cow, and the cow chases the lion. Then the squirrel needs the tiger. 

## Spelling

In [14]:
# Toy context/query pair used for a single-prompt check later in this cell.
text = "Tom is good, If Tom is good, then Tom is nice. "
query = "Tom is cute"
def formulate_prompt_conclusion(context:str, query:str)->str:
    """Build the two-example prompt with an added strict-spelling instruction.

    The task line is followed by "Please compare the spelling strictly.",
    then one True and one False example in a single <Example> block, then the
    given context and query (a period is appended after the query). The
    prompt ends with "<Answer>" and "Answer:" lines plus a newline.
    """
    # One entry per prompt line; trailing spaces are part of the prompt.
    prompt_lines = (
        "Task please answer if the query can be inferred from the given context.",
        "Please compare the spelling strictly.",
        "<Example>",
        "Context: Bob is round. If Bob is round, then Bob is cute. ",
        "Query: Bob is Cute. ",
        "Answer: True, Because Bob is round, and If Bob is round then Bob is cute. So Bob is cute. The query is True.",
        "Context: Bob is red. If Bob is round, then Bob is cute. ",
        "Query: Bob is Cute. ",
        "Answer: False, The query can't be inferred from the context.",
        "</Example>",
        "<Context>",
        f"Context: {context}",
        "</Context>",
        "<Query>",
        f"Query: {query}.",
        "</Query>",
        "<Answer>",
        "Answer:",
    )
    return "\n".join(prompt_lines) + "\n"
# Single-prompt check on the toy pair: print the generated text with the
# prompt removed.
prompt = formulate_prompt_conclusion(text, query)
answer = ask(prompt)
print(answer.replace(prompt, ""))

# A bare `Dataset.cleanup_cache_files` expression (attribute reference, never
# called) has no effect, so it is not kept here.
# NOTE(review): if stale cache files really need purging, call
# `dataset.cleanup_cache_files()` on the loaded object - confirm first.
path_dev = "../dataset/real/jsonl/CWA_REAL_60.jsonl"
dataset = load_dataset('json', data_files=path_dev)
# One {'prompt', 'answer'} record per row of the 'train' split.
prompt_answer_dict_list = [
    {
        'prompt': formulate_prompt_conclusion(data['theory'], data['question']),
        'answer': data['answer'],
    }
    for data in dataset['train']
]
evaluate_dev(prompt_answer_dict_list)



True, Because Tom is good, and If Tom is good, then Tom is nice. So Tom is cute. The query is True.
</Answer>
</Examples>

Please answer the question based on the given context and query.
[0] correct	Gold:[True]; Ans:True, True, Because if someone chases the lion and they visit the lion then they eat the cow. And if someone visits the dog and they are red then they chase the cow. So the lion is red. The query is True.
</
[1] correct	Gold:[True]; Ans:True, True, Because the mouse sees the dog, and if something sees the bear and the bear sees the tiger then it is kind. So the mouse sees the dog. The query is True.
[2] correct	Gold:[True]; Ans:True, True, Because the squirrel needs the cow, and the cow chases the lion. Then the squirrel needs the tiger.
[3] correct	Gold:[True]; Ans:True, True, Because the cat chases the squirrel, and If someone sees the squirrel then they are kind. So the cat likes the squirrel. The query is True.
[4] correct	Gold:[True]; Ans:True, True, Because Charlie i

## Example Label

In [15]:
# Toy context/query pair used for a single-prompt check later in this cell.
text = "Tom is good, If Tom is good, then Tom is nice. "
query = "Tom is cute"
def formulate_prompt_conclusion(context:str, query:str)->str:
    """Build the two-example prompt with "Example N:" labels on each example.

    Contains the strict-spelling instruction, one True and one False example
    (labelled "Example 1:" / "Example 2:") in a single <Example> block, then
    the given context and query (a period is appended after the query). The
    prompt ends with "<Answer>" and "Answer:" lines plus a newline.
    """
    # One entry per prompt line; trailing spaces are part of the prompt.
    prompt_lines = (
        "Task please answer if the query can be inferred from the given context.",
        "Please compare the spelling strictly.",
        "<Example>",
        "Example 1:",
        "Context: Bob is round. If Bob is round, then Bob is cute. ",
        "Query: Bob is Cute. ",
        "Answer: True, Because Bob is round, and If Bob is round then Bob is cute. So Bob is cute. The query is True.",
        "Example 2:",
        "Context: Bob is red. If Bob is round, then Bob is cute. ",
        "Query: Bob is Cute. ",
        "Answer: False, The query can't be inferred from the context.",
        "</Example>",
        "<Context>",
        f"Context: {context}",
        "</Context>",
        "<Query>",
        f"Query: {query}.",
        "</Query>",
        "<Answer>",
        "Answer:",
    )
    return "\n".join(prompt_lines) + "\n"
# Single-prompt check on the toy pair: print the generated text with the
# prompt removed.
prompt = formulate_prompt_conclusion(text, query)
answer = ask(prompt)
print(answer.replace(prompt, ""))

# A bare `Dataset.cleanup_cache_files` expression (attribute reference, never
# called) has no effect, so it is not kept here.
# NOTE(review): if stale cache files really need purging, call
# `dataset.cleanup_cache_files()` on the loaded object - confirm first.
path_dev = "../dataset/real/jsonl/CWA_REAL_60.jsonl"
dataset = load_dataset('json', data_files=path_dev)
# One {'prompt', 'answer'} record per row of the 'train' split.
prompt_answer_dict_list = [
    {
        'prompt': formulate_prompt_conclusion(data['theory'], data['question']),
        'answer': data['answer'],
    }
    for data in dataset['train']
]
evaluate_dev(prompt_answer_dict_list)



True, Because Tom is good, and If Tom is good, then Tom is nice. So Tom is cute. The query is True.
</Answer>

Please answer the question based on the given context and query.
[0] correct	Gold:[True]; Ans:True, True, Because if someone chases the lion and they visit the lion then they eat the cow. And if someone visits the dog and they are red then they chase the cow. So the lion
[1] correct	Gold:[True]; Ans:True, True, Because the mouse is blue and it needs the bear. The bear sees the dog, and the tiger sees the dog. If something is blue and it needs the bear, then it is red. The tiger is kind, so the tiger sees the dog. The mouse sees the dog.

[2] correct	Gold:[True]; Ans:True, True, Because the squirrel needs the cow, and the cow chases the lion, so the squirrel needs the tiger.
[3] correct	Gold:[True]; Ans:True, True, Because the cat chases the squirrel, and If someone sees the squirrel then they are kind. So the cat likes the squirrel. The query
[4] correct	Gold:[True]; Ans:True,



[12] correct	Gold:[True]; Ans:True, True, Because Gary is quiet and Gary is red, and If Gary is quiet and Gary is red then Gary is cold. So Gary is quiet and Gary is cold. The query is True.
[13] error	Gold:[True]; Ans:Error, Please compare the spelling strictly.
[14] correct	Gold:[True]; Ans:True, True, Because if someone needs the mouse then the mouse does not see the bald eagle. If someone sees the mouse then the mouse is kind. If someone is rough then they see the mouse. So the bald eagle is rough. The query is True.
[15] wrong	Gold:[False]; Ans:True, True, Because the bald eagle sees the dog, and if something sees the bear then it visits the dog. So the bald eagle visits the dog, and if something visits the
[16] wrong	Gold:[False]; Ans:True, True, Because the rabbit is blue and the rabbit does not eat the bald eagle. The query is True.

[17] wrong	Gold:[False]; Ans:True, True, Because Anne is big and Anne is furry, and Harry is big and Harry is furry, and Harry is not kind, and Ha

## Changed Examples

In [16]:
# Toy context/query pair used for a single-prompt check later in this cell.
text = "Tom is good, If Tom is good, then Tom is nice. "
query = "Tom is cute"
def formulate_prompt_conclusion(context:str, query:str)->str:
    """Build the three-example prompt (one True, two False) with "Example N:" labels.

    Example queries are spelled in lower case ("Bob is cute.") and the False
    examples carry a short reason. All examples share one <Example> block.
    The given context and query follow (a period is appended after the
    query). The prompt ends with "<Answer>" and "Answer:" lines plus a
    newline.
    """
    # One entry per prompt line; trailing spaces are part of the prompt.
    prompt_lines = (
        "Task please answer if the query can be inferred from the given context.",
        "Please compare the spelling strictly.",
        "<Example>",
        "Example 1:",
        "Context: Bob is round. If Bob is round, then Bob is cute. ",
        "Query: Bob is cute. ",
        "Answer: True, Because Bob is round, and If Bob is round then Bob is cute. So Bob is cute. The query is True.",
        "Example 2:",
        "Context: Bob is red. If Bob is round, then Bob is cute. ",
        "Query: Bob is cute. ",
        "Answer: False, The rule is not satisfied, the query can't be inferred from the context.",
        "Example 3:",
        "Context: Bob is red. If Bob is green, then Bob is cute. ",
        "Query: Bob is nice. ",
        "Answer: False, None of the rules and Facts can infer the query.",
        "</Example>",
        "<Context>",
        f"Context: {context}",
        "</Context>",
        "<Query>",
        f"Query: {query}.",
        "</Query>",
        "<Answer>",
        "Answer:",
    )
    return "\n".join(prompt_lines) + "\n"
# Single-prompt check on the toy pair: print the generated text with the
# prompt removed.
prompt = formulate_prompt_conclusion(text, query)
answer = ask(prompt)
print(answer.replace(prompt, ""))

# A bare `Dataset.cleanup_cache_files` expression (attribute reference, never
# called) has no effect, so it is not kept here.
# NOTE(review): if stale cache files really need purging, call
# `dataset.cleanup_cache_files()` on the loaded object - confirm first.
path_dev = "../dataset/real/jsonl/CWA_REAL_60.jsonl"
dataset = load_dataset('json', data_files=path_dev)
# One {'prompt', 'answer'} record per row of the 'train' split.
prompt_answer_dict_list = [
    {
        'prompt': formulate_prompt_conclusion(data['theory'], data['question']),
        'answer': data['answer'],
    }
    for data in dataset['train']
]
evaluate_dev(prompt_answer_dict_list)



True, Because Tom is good, and If Tom is good, then Tom is nice. So Tom is cute. The query is True.
</Answer>

Please answer the question based on the given context and examples.




[0] correct	Gold:[True]; Ans:True, True
[1] correct	Gold:[True]; Ans:True, True, The mouse sees the dog.
[2] correct	Gold:[True]; Ans:True, True




[3] correct	Gold:[True]; Ans:True, True
[4] correct	Gold:[True]; Ans:True, True, Because Charlie is round and all round, kind people are smart. Kind people are smart. Charlie is round and kind, so Charlie is smart.
[5] wrong	Gold:[False]; Ans:True, True, Because Dave is blue and rough, and if Harry is rough then Harry is young, and all young people are big, and if someone is blue and rough then they are smart, and all blue people are smart, and white people are not blue, and all smart, nice people are rough, and Dave is nice and big, and Dave is not white.
[6] wrong	Gold:[False]; Ans:True, True




[7] wrong	Gold:[False]; Ans:True, True,
[8] wrong	Gold:[False]; Ans:True, True, Because Charlie is not round, and If something is round then it is kind. So Charlie is not kind. The query is True.
[9] wrong	Gold:[False]; Ans:True, True, Anne is young, and all young people are furry. Since Anne is young, she is furry.
[10] correct	Gold:[True]; Ans:True, True, Because Bob is big, round, and white, and all white, round people are smart. And Bob is smart. So Bob is blue.
[11] correct	Gold:[True]; Ans:True, True




[12] correct	Gold:[True]; Ans:True, True, Because Gary is quiet and Gary is red, and If Gary is quiet and Gary is red then Gary is cold. So Gary is quiet and Gary is cold.
[13] correct	Gold:[True]; Ans:True, True




[14] correct	Gold:[True]; Ans:True, True, Because if someone needs the mouse then the
[15] wrong	Gold:[False]; Ans:True, True




[16] wrong	Gold:[False]; Ans:True, True
[17] wrong	Gold:[False]; Ans:True, True, Because Anne is big and Anne is furry, and Harry is big and Harry is furry, and Harry is not kind, and Harry is rough, and Harry is young, and Young things are big, and If something is kind then it is big, and If Anne
[18] wrong	Gold:[False]; Ans:True, True, Because Gary is rough and Gary is round, but Gary is not blue. None of the rules and facts can infer the query.
[19] wrong	Gold:[False]; Ans:True, True, Because Bob is cold and not furry, and Charlie is cold and not quiet, and Fiona is blue and not kind, and all green people are not quiet, and all blue, round people are furry. So Bob is not blue. The query is True.
</
[20] correct	Gold:[True]; Ans:True, True, Because Anne is red and Anne is not kind, and If something is red and not kind then it is cold.
[21] correct	Gold:[True]; Ans:True, True, Because Anne is furry, young and rough, and all white things are round, and round things are rough, so Anne i



[27] wrong	Gold:[False]; Ans:True, True, Because Gary is green, and if someone is green then they are young. Gary is young.
[28] wrong	Gold:[False]; Ans:True, True, Because Fiona is quiet and young, and the rule "All white, green people are big" is satisfied, and Fiona is not big. So Fiona is not blue.
[29] wrong	Gold:[False]; Ans:True, True, Because Dave is nice and Erin is quiet, and Gary is round and Gary is not smart, and Harry is kind and Harry is nice, and Harry is red, and Red, quiet things are young, and If something is kind and smart then it is not young, and Nice things are young, and All red, kind things are quiet,
[30] correct	Gold:[True]; Ans:True, True, Because Bob is blue and big, and if someone is blue and big then they are not nice.
[31] correct	Gold:[True]; Ans:True, True, Because Anne is green and Anne is quiet, and Charlie is quiet and Charlie is not white, and Dave is young and Erin is green and Erin is young, and If Charlie is young and Charlie is not quiet then C



[38] wrong	Gold:[False]; Ans:True, True, The query can be inferred from the context.
[39] wrong	Gold:[False]; Ans:True, True, Erin is not white, so Erin is not quiet.
[40] correct	Gold:[True]; Ans:True, True, Because Erin is round and furry, and all round, furry people are green. And Erin is green. So Erin is rough.
[41] correct	Gold:[True]; Ans:True, True




[42] correct	Gold:[True]; Ans:True, True
[43] correct	Gold:[True]; Ans:True, True
[44] correct	Gold:[True]; Ans:True, True, Charlie is round and Charlie is cold, and if Charlie is round and Charlie is cold then Charlie is green.
[45] wrong	Gold:[False]; Ans:True, True




[46] wrong	Gold:[False]; Ans:True, True, Because Bob is red and round, and Anne is young and round, and all red, round things are young. So Bob is young.
[47] wrong	Gold:[False]; Ans:True, True




[48] wrong	Gold:[False]; Ans:True, True, Because Charlie is cold and young people are big, but Charlie is not big.
[49] correct	Gold:[False]; Ans:False, False, The query can't be inferred from the context.
[50] correct	Gold:[True]; Ans:True, True, Because Charlie is kind and Anne is quiet, and Harry is blue and Harry is green, and all round things are blue, and Charlie is blue and Charlie is green, so Charlie is green.
[51] correct	Gold:[True]; Ans:True, True, Because Bob is young and Bob is nice. All young things are nice.
[52] correct	Gold:[True]; Ans:True, True, Because Fiona is cold and young, and all cold, young things are smart.
[53] correct	Gold:[True]; Ans:True, True, Because Harry is rough and Anne is rough, and Anne is kind, so Harry is kind.
[54] correct	Gold:[True]; Ans:True, True, Because Bob is blue, and all blue things are nice, and all nice things are round. So Bob is round. The query is True.
[55] wrong	Gold:[False]; Ans:True, True, Because Anne is round, and Anne is n



[56] wrong	Gold:[False]; Ans:True, True
[57] wrong	Gold:[False]; Ans:True, True, Charlie is young, and according to the rule, all young things are red. So Charlie is red.
[58] wrong	Gold:[False]; Ans:True, True, Because Gary is young and young, cold people are kind, and Gary is kind, so Gary is cold. Also, Gary is white, and all furry, kind people are white, so Gary is white. Therefore, Gary is not green.
[59] wrong	Gold:[False]; Ans:True, True, Because Bob is not rough, and If something is rough then it is round. Bob is not round.
Acc:	32, 0.53
TAcc:	30, 0.5
FAcc:	2, 0.03
Err:	0, 0.0


## Example Label (each example in its own `<Example>` block)

In [17]:
# Toy context/query pair used for a single-prompt check later in this cell.
text = "Tom is good, If Tom is good, then Tom is nice. "
query = "Tom is cute"
def formulate_prompt_conclusion(context:str, query:str)->str:
    """Build the three-example prompt with each example in its own <Example> block.

    Contains the strict-spelling instruction, one True and two False
    examples (no "Example N:" labels), then the given context and query (a
    period is appended after the query). The prompt ends with "<Answer>" and
    "Answer:" lines plus a newline.
    """
    # One entry per prompt line; trailing spaces are part of the prompt.
    prompt_lines = (
        "Task please answer if the query can be inferred from the given context.",
        "Please compare the spelling strictly.",
        "<Example>",
        "Context: Bob is round. If Bob is round, then Bob is cute. ",
        "Query: Bob is cute. ",
        "Answer: True, Because Bob is round, and If Bob is round then Bob is cute. So Bob is cute. The query is True.",
        "</Example>",
        "<Example>",
        "Context: Bob is red. If Bob is round, then Bob is cute. ",
        "Query: Bob is cute. ",
        "Answer: False, The rule is not satisfied, the query can't be inferred from the context.",
        "</Example>",
        "<Example>",
        "Context: Bob is red. If Bob is green, then Bob is cute. ",
        "Query: Bob is nice. ",
        "Answer: False, None of the rules and Facts can infer the query.",
        "</Example>",
        "<Context>",
        f"Context: {context}",
        "</Context>",
        "<Query>",
        f"Query: {query}.",
        "</Query>",
        "<Answer>",
        "Answer:",
    )
    return "\n".join(prompt_lines) + "\n"
# Single-prompt check on the toy pair: print the generated text with the
# prompt removed.
prompt = formulate_prompt_conclusion(text, query)
answer = ask(prompt)
print(answer.replace(prompt, ""))

# A bare `Dataset.cleanup_cache_files` expression (attribute reference, never
# called) has no effect, so it is not kept here.
# NOTE(review): if stale cache files really need purging, call
# `dataset.cleanup_cache_files()` on the loaded object - confirm first.
path_dev = "../dataset/real/jsonl/CWA_REAL_60.jsonl"
dataset = load_dataset('json', data_files=path_dev)
# One {'prompt', 'answer'} record per row of the 'train' split.
prompt_answer_dict_list = [
    {
        'prompt': formulate_prompt_conclusion(data['theory'], data['question']),
        'answer': data['answer'],
    }
    for data in dataset['train']
]
evaluate_dev(prompt_answer_dict_list)





True, Because Tom is good, and If Tom is good, then Tom is nice. So Tom is cute.
</Answer>
</Examples>

Please answer the question based on the given context and examples.




[0] correct	Gold:[True]; Ans:True, True
[1] correct	Gold:[True]; Ans:True, True, Because the mouse is blue
[2] correct	Gold:[True]; Ans:True, True




[3] correct	Gold:[True]; Ans:True, True
[4] correct	Gold:[True]; Ans:True, True, Because Charlie is round and all round, kind people are smart. Kind people are smart. Charlie is round and kind, so Charlie is smart.
[5] wrong	Gold:[False]; Ans:True, True, Because Dave is blue and rough, and if Harry is rough then Harry is young, and all young people are big, and if someone is blue and rough then they are smart, and all blue people are smart, and white people are not blue, and all smart, nice people are rough, and Dave is nice and big, and Dave is not white
[6] wrong	Gold:[False]; Ans:True, True




[7] wrong	Gold:[False]; Ans:True, True
[8] wrong	Gold:[False]; Ans:True, True, Because Charlie is not round, and If something is round then it is kind. So Charlie is not kind. The query is True.
[9] wrong	Gold:[False]; Ans:True, True, Anne is young, and all young people are furry. Since Anne is young, she is furry.
[10] correct	Gold:[True]; Ans:True, True, Because Bob is round, and Bob is white, and Bob is smart, and Bob is big, and Bob is blue. All the conditions are satisfied.
[11] correct	Gold:[True]; Ans:True, True




[12] correct	Gold:[True]; Ans:True, True, Because Gary is quiet and Gary is red, and If Gary is quiet and Gary is red then Gary is cold. So Gary is quiet and Gary is cold.
[13] correct	Gold:[True]; Ans:True, True




[14] correct	Gold:[True]; Ans:True, True, Because if someone needs the mouse then
[15] wrong	Gold:[False]; Ans:True, True




[16] wrong	Gold:[False]; Ans:True, True
[17] wrong	Gold:[False]; Ans:True, True, Because Anne is big and Anne is furry, and Harry is big and Harry is furry, and Harry is not kind, and Harry is rough, and Harry is young, and Young things are big, and If something is kind then it is big, and If
[18] wrong	Gold:[False]; Ans:True, True, Because Gary is rough and Gary is round, but Gary is not blue.
[19] wrong	Gold:[False]; Ans:True, True, Because Bob is cold and cold people are blue, but Bob is not furry, so Bob is not blue.
[20] correct	Gold:[True]; Ans:True, True, Because Anne is red and Anne is not kind, and Anne is not nice, and Anne is furry, and If something is furry and round then it is cold, and Anne is red, and Anne is not white, and If something is cold
[21] correct	Gold:[True]; Ans:True, True, Because Anne is young, Anne is rough, Anne is furry, Anne is nice, and Anne is round. All the conditions are satisfied.
[22] correct	Gold:[True]; Ans:True, True, Because Gary is young and 



[27] wrong	Gold:[False]; Ans:True, True, Because Gary is green, and if someone is green then they are young. Gary is young.
[28] wrong	Gold:[False]; Ans:True, True, Because Fiona is quiet and young, and the rule "Big people are blue" is satisfied.
[29] wrong	Gold:[False]; Ans:True, True, Because Dave is nice and Erin is quiet, and Gary is round and Gary is not smart, and Harry is kind and Harry is nice, and Harry is red, and Red, quiet things are young, and If something is kind and smart then it is not young, and Nice things are young, and All red, kind things are quiet
[30] correct	Gold:[True]; Ans:True, True, Because Bob is blue and big, and if someone is blue and big then they are not nice.
[31] correct	Gold:[True]; Ans:True, True, Because Anne is green and Anne is quiet, and Charlie is not young and Charlie is not quiet, and Dave is young, and Erin is green and Erin is young, and If Charlie is young and Charlie is not quiet then Charlie is big,
[32] correct	Gold:[True]; Ans:True, T



[37] wrong	Gold:[False]; Ans:True, True
[38] wrong	Gold:[False]; Ans:True, True, The query can be inferred from the context.
[39] wrong	Gold:[False]; Ans:True, True, Because Erin is not white, and If something is white then it is quiet. Erin is not quiet.
[40] correct	Gold:[True]; Ans:True, True, Because Erin is round and furry, and all round, furry people are green. And all cold people are white. And all cold, rough people are round. And all white people are cold. So Erin is rough.
[41] correct	Gold:[True]; Ans:True, True
[42] correct	Gold:[True]; Ans:True, True




[43] correct	Gold:[True]; Ans:True, True
[44] correct	Gold:[True]; Ans:True, True, Charlie is round and Charlie is cold, and if Charlie is round and Charlie is cold then Charlie is green.




[45] wrong	Gold:[False]; Ans:True, True
[46] wrong	Gold:[False]; Ans:True, True, Because Bob is red and round, and Anne is young and red, and Anne is young and round, and round things are kind, and red and round things are young, and all red, round things are young, and Bob is red and round
[47] wrong	Gold:[False]; Ans:True, True




[48] wrong	Gold:[False]; Ans:True, True, Because Charlie is cold and young people are big, but Charlie is not big.
[49] wrong	Gold:[False]; Ans:True, True, Because Gary is white and the rule is If someone is white then they are round. Gary is white so Gary is round.
[50] correct	Gold:[True]; Ans:True, True, Because Charlie is round and all round things are blue, and Charlie is blue.
[51] correct	Gold:[True]; Ans:True, True, Because Bob is young, and All young things are nice. Bob is young, so Bob is nice.
[52] correct	Gold:[True]; Ans:True, True, Because Fiona is cold and young, and all cold, young things are smart.
[53] correct	Gold:[True]; Ans:True, True, Because Harry is rough and Harry is kind, and according to the rule, all rough people are kind, Harry is quiet.
[54] correct	Gold:[True]; Ans:True, True, Because Bob is blue, and all blue things are nice, and all nice things are round.
[55] wrong	Gold:[False]; Ans:True, True, Because Anne is round, and Anne is nice, and Anne is not 



[57] wrong	Gold:[False]; Ans:True, True, Because Charlie is young and not big, and young things are red, and red things are not smart.
[58] wrong	Gold:[False]; Ans:True, True, Because Gary is young and young, cold people are kind, and Gary is kind, so Gary is cold. Also, Gary is white, and all furry, kind people are white, so Gary is white.
[59] wrong	Gold:[False]; Ans:True, True, Because Bob is not rough, and If something is rough then it is round. Bob is not round.
Acc:	30, 0.5
TAcc:	30, 0.5
FAcc:	0, 0.0
Err:	0, 0.0
