In [3]:
import numpy as np
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer, RobertaForQuestionAnswering
import torch, json, os

tokenizer = AutoTokenizer.from_pretrained("deepset/roberta-base-squad2")
model = RobertaForQuestionAnswering.from_pretrained("deepset/roberta-base-squad2")
sentence_model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

In [4]:
data = []
folder_name = "crisisfacts_008"
path = f"/Users/yash/Documents/GitHub/Code-Maroon-Global-Edition/Test_Data/data/{folder_name}"
doc_count = 0
for file in os.listdir(path):
    # if doc_count == 5:
    #     break
    # doc_count += 1
    with open(f"{path}/{file}", 'r') as f:
        json_str = f.read()

    # Split the string into individual JSON objects
    json_objs = json_str.strip().split('}{')

    # Add back the missing braces to each object
    for i in range(len(json_objs)):
        if i == 0:
            json_objs[i] += '}'
        elif i == len(json_objs) - 1:
            json_objs[i] = '{' + json_objs[i]
        else:
            json_objs[i] = '{' + json_objs[i] + '}'

    
    for json_obj in json_objs:
        try:
            text_val = json.loads(json_obj)
            if text_val["source_type"] == "News":
                data.append(text_val) 
        except:
            continue        


In [5]:
text_lines = []
for i in range(len(data)):
    if data[i]['source_type'] == "News" and "heading" in data[i]:
        text_lines.append(data[i]['heading']) 

# Embed each line in text_lines
text_lines_embeddings = []

# Encode the text_lines list
text_lines_embeddings = sentence_model.encode(text_lines)

embedding_dict = dict(zip(text_lines, text_lines_embeddings))

In [6]:

def normalize_text(s):
    """Removing articles and punctuation, and standardizing whitespace are all typical text processing steps."""
    import string, re

    def remove_articles(text):
        regex = re.compile(r"\b(a|an|the)\b", re.UNICODE)
        return re.sub(regex, " ", text)

    def white_space_fix(text):
        return " ".join(text.split())

    def remove_punc(text):
        exclude = set(string.punctuation)
        return "".join(ch for ch in text if ch not in exclude)

    def lower(text):
        return text.lower()

    return white_space_fix(remove_articles(remove_punc(lower(s))))

def compute_exact_match(prediction, truth):
    return int(normalize_text(prediction) == normalize_text(truth))

def get_gold_answers(example):
    """helper function that retrieves all possible true answers from a squad2.0 example"""
    
    # gold_answers = [answer["text"] for answer in example.answers if answer["text"]]
    gold_answers = [answer for answer in example['answers'] if answer]

    # if gold_answers doesn't exist it's because this is a negative example - 
    # the only correct answer is an empty string
    if not gold_answers:
        gold_answers = [""]
        
    return gold_answers

def compute_f1(prediction, truth):
    pred_tokens = normalize_text(prediction).split()
    truth_tokens = normalize_text(truth).split()
    
    # if either the prediction or the truth is no-answer then f1 = 1 if they agree, 0 otherwise
    if len(pred_tokens) == 0 or len(truth_tokens) == 0:
        return int(pred_tokens == truth_tokens), 0, 0
    
    common_tokens = set(pred_tokens) & set(truth_tokens)
    
    # if there are no common tokens then f1 = 0
    if len(common_tokens) == 0:
        return 0, 0, 0
    
    prec = len(common_tokens) / len(pred_tokens)
    rec = len(common_tokens) / len(truth_tokens)
    
    return 2 * (prec * rec) / (prec + rec) , prec, rec

def get_question_context(question_embeddings):
    cosine_values = []
    for text in embedding_dict:
        value = np.dot(question_embeddings, np.array(embedding_dict[text]))
        cosine_values.append({"text": text, "value": value})

    sorted_cosine_values = sorted(cosine_values, key=lambda x: x['value'], reverse=True)

    # Get context string
    context = ""
    counter = 0
    for val in sorted_cosine_values:
        context += val['text'] + "\n"
        counter += 1
        if counter == 15:
            break
    return context

def get_answers(question, context):
    inputs = tokenizer(question, context, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs)

    answer_start_index = outputs.start_logits.argmax()
    answer_end_index = outputs.end_logits.argmax()

    predict_answer_tokens = inputs.input_ids[0, answer_start_index : answer_end_index + 1]
    answer = tokenizer.decode(predict_answer_tokens, skip_special_tokens=True)
    return answer



In [7]:
test_queries =  [
        {
        "question": "What was the name of the hurricane that made landfall in Alabama and Mississippi on September 16, 2020?",
        "answers": ["Hurricane Sally"]
        },
        
        {
        "question": "What was the maximum sustained wind speed of Hurricane Sally when it made landfall?",
        "answers": ["105 mph", "100 mph", "40 mph", "90 mph"]
        },
        
        {
        "question": "Where did Hurricane Sally make landfall in Alabama?",
        "answers": ["Gulf Shores"]
        },
        
        {
        "question": "Where did Hurricane Sally make landfall in Mississippi?",
        "answers": ["Bay St. Louis"]
        },
        
        {
        "question": "What was the total damage caused by Hurricane Sally?",
        "answers": ["More than $3 billion"]
        },
        
        {
        "question": "How many deaths were caused by Hurricane Sally?",
        "answers": ["At least 3"]
        },
        
        {
        "question": "What were the main impacts of Hurricane Sally?",
        "answers": ["Flooding", "Power outages", "Structural damage"]
        },
        
        {
        "question": "What was the weather like in the days leading up to Hurricane Sally?",
        "answers": ["Warm and humid"]
        },
        
        {
        "question": "What was the weather like in the days after Hurricane Sally?",
        "answers": ["Cool and rainy"]
        },
        
        {
        "question": "What were the evacuation orders for Hurricane Sally?",
        "answers": ["Mandatory evacuations for some areas"]
        },
        
        {
        "question": "What were the shelter options for Hurricane Sally?",
        "answers": ["Public shelters", "Private shelters", "Staying with friends or family"]
        },
        
        {
        "question": "What were the transportation options for Hurricane Sally?",
        "answers": ["Closed roads", "Cancelled flights", "Limited public transportation"]
        },
        
        {
        "question": "What were the communication options for Hurricane Sally?",
        "answers": ["Cell phone service was spotty", "Power outages affected landlines", "Radios were a good source of information"]
        }
    ]


In [8]:
test_queries_facts =  [
        {
        "question": "How much rain is measured in Pensacola, Florida as of Tuesday?",
        "answers": ["12.71 inches"]
        },
        
        {
        "question": "Which areas are expecting heavy rain along I-10 highway from Louisiana to Florida,",
        "answers": [" Gautier and Pascagoula, Mississippi","Gautier","Pascagoula, Mississippi"]
        },
        
        {
        "question": "What is the contact number to report downed power lines?",
        "answers": ["1-800-GUPOWER","1-800-487-6937","1-800-GUPOWER (1-800-487-6937)"]
        },
        
        {
        "question": "How many meal red cross has served?",
        "answers": ["622,000"]
        },
        
        {
        "question": "What is the contact number of Alabama Power?",
        "answers": ["1-800-888-2726"]
        }
    ]


# RoBERTa: A Robustly Optimized BERT Pretraining Approach EVALUATION 

In [9]:
# Roberta Squad Evaluation
f1_scores = []
em_scores = []
recall_scores = []
precision_scores = []
for query in test_queries:
    question = query['question']
    question_embeddings = sentence_model.encode([question])[0]

    context = get_question_context(question_embeddings)
    predicted_answer = get_answers(question, context)

    # print("Context: ", context)
    print("Question: ", question)
    print("Answer: ", predicted_answer)

    gold_answers = get_gold_answers(query)

    em_score = max((compute_exact_match(answer, predicted_answer)) for answer in gold_answers)
    # f1_score , prec, rec  = max((compute_f1(answer, predicted_answer)) for answer in gold_answers)
    f1_list = []
    prec_list = []
    rec_list = []
    for answer in gold_answers:
        f1_score , prec, rec = compute_f1(answer, predicted_answer)
        f1_list.append(f1_score)
        prec_list.append(prec)
        rec_list.append(rec)

    # print(f"Question: {query['question']}")
    # print(f"Prediction: {predicted_answer}")
    print(f"True Answers: {gold_answers}")
    print(f"EM: {em_score} \t F1: {f1_score}")
    print(f"Precision: {max(prec_list)} \t Recall: {max(rec_list)}")

    em_scores.append(em_score)
    f1_scores.append(f1_score)
    recall_scores.append(max(rec_list))
    precision_scores.append(max(prec_list))

print(f"Average Exact Match: {(sum(em_scores) * 100 )/len(em_scores)} %")    
print(f"Average F1 score: {sum(f1_scores)/len(f1_scores)}")    
print(f"Average Recall: {(sum(recall_scores) * 100)/len(recall_scores)} %")    
print(f"Average Precision: {(sum(precision_scores) * 100) /len(precision_scores)} %")    

Question:  What was the name of the hurricane that made landfall in Alabama and Mississippi on September 16, 2020?
Answer:  What was the name of the hurricane that made landfall in Alabama and Mississippi on September 16, 2020?Landfall is forecast to occur Wednesday morning near the Mississippi/Alabama state line.
Landfall is likely along the Louisiana or Mississippi coast.
Hurricane Watch from Southeast Louisiana through Alabama
Hurricane Sally
True Answers: ['Hurricane Sally']
EM: 0 	 F1: 0.06666666666666667
Precision: 1.0 	 Recall: 0.034482758620689655
Question:  What was the maximum sustained wind speed of Hurricane Sally when it made landfall?
Answer:   90 mph
True Answers: ['105 mph', '100 mph', '40 mph', '90 mph']
EM: 1 	 F1: 1.0
Precision: 1.0 	 Recall: 1.0
Question:  Where did Hurricane Sally make landfall in Alabama?
Answer:   Gulf Shores, Alabama
True Answers: ['Gulf Shores']
EM: 0 	 F1: 0.8
Precision: 1.0 	 Recall: 0.6666666666666666
Question:  Where did Hurricane Sally mak

# OPEN AI TESTING 

In [10]:
# %pip install openai
import openai
openai.api_key = "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"
model_engine = "text-davinci-003"
# model_engine = "curie-002"

In [11]:
def get_openai_answers(question, context):
    prompt_template = "Question: {question}\nContext: {context}\nAnswer:"

    prompt = prompt_template.format(question=question, context=context)
    response = openai.Completion.create(
                engine=model_engine,
        #   prompt=prompt, max_tokens=1024, n=1,
          # prompt=prompt, max_tokens=20, n=1,
          prompt=prompt, max_tokens=100, n=1,
            stop=None,
            # stop=" ",
            # temperature=0.5,
            temperature=0.3,
        )
    answer = response.choices[0].text.strip()
    return answer

In [12]:
# Open AI Evaluation
f1_scores = []
em_scores = []
recall_scores = []
precision_scores = []
for query in test_queries:
    question = query['question']
    question_embeddings = sentence_model.encode([question])[0]

    context = get_question_context(question_embeddings)
    predicted_answer = get_openai_answers(question, context)

    # print("Context: ", context)
    print("Question: ", question)
    print("Answer: ", predicted_answer)

    gold_answers = get_gold_answers(query)

    em_score = max((compute_exact_match(answer, predicted_answer)) for answer in gold_answers)
    # f1_score , prec, rec  = max((compute_f1(answer, predicted_answer)) for answer in gold_answers)
    f1_list = []
    prec_list = []
    rec_list = []
    for answer in gold_answers:
        f1_score , prec, rec = compute_f1(answer, predicted_answer)
        f1_list.append(f1_score)
        prec_list.append(prec)
        rec_list.append(rec)

    # print(f"Question: {query['question']}")
    # print(f"Prediction: {predicted_answer}")
    print(f"True Answers: {gold_answers}")
    print(f"EM: {em_score} \t F1: {f1_score}")
    print(f"Precision: {max(prec_list)} \t Recall: {max(rec_list)}")

    em_scores.append(em_score)
    f1_scores.append(f1_score)
    recall_scores.append(max(rec_list))
    precision_scores.append(max(prec_list))

print(f"Average Exact Match: {(sum(em_scores) * 100 )/len(em_scores)} %")    
print(f"Average F1 score: {sum(f1_scores)/len(f1_scores)}")    
print(f"Average Recall: {(sum(recall_scores) * 100)/len(recall_scores)} %")    
print(f"Average Precision: {(sum(precision_scores) * 100) /len(precision_scores)} %")    

Question:  What was the name of the hurricane that made landfall in Alabama and Mississippi on September 16, 2020?
Answer:  Hurricane Sally
True Answers: ['Hurricane Sally']
EM: 1 	 F1: 1.0
Precision: 1.0 	 Recall: 1.0
Question:  What was the maximum sustained wind speed of Hurricane Sally when it made landfall?
Answer:  The maximum sustained wind speed of Hurricane Sally when it made landfall was 105 mph.
True Answers: ['105 mph', '100 mph', '40 mph', '90 mph']
EM: 0 	 F1: 0.125
Precision: 1.0 	 Recall: 0.14285714285714285
Question:  Where did Hurricane Sally make landfall in Alabama?
Answer:  Hurricane Sally made landfall near Gulf Shores, Alabama.
True Answers: ['Gulf Shores']
EM: 0 	 F1: 0.4
Precision: 1.0 	 Recall: 0.25
Question:  Where did Hurricane Sally make landfall in Mississippi?
Answer:  Sally made landfall near the Louisiana-Mississippi border.
True Answers: ['Bay St. Louis']
EM: 0 	 F1: 0
Precision: 0 	 Recall: 0
Question:  What was the total damage caused by Hurricane Sa

# FLAN T5 Evaluation 

In [13]:
# %pip install chromadb langchain
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma
from langchain.text_splitter import CharacterTextSplitter
from langchain.chains import RetrievalQA
from langchain import HuggingFaceHub
from langchain.document_loaders import TextLoader
from langchain.embeddings import HuggingFaceEmbeddings
import os
os.environ["HUGGINGFACEHUB_API_TOKEN"] = "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"

In [14]:
llm = HuggingFaceHub(repo_id="google/flan-t5-small", model_kwargs={"temperature":0, "max_length": 64})
embeddings = HuggingFaceEmbeddings()

In [15]:
# Load context file with heading
with open("context.txt", "w+") as fd:
    fd.write(".\n\n".join(text_lines))

In [17]:
# It takes all heading data and then store embeddings in chroma DB as initialize qa model for finding answer of query
loader = TextLoader("context.txt")
documents = loader.load()
text_splitter = CharacterTextSplitter(separator=".\n\n",chunk_size=10,chunk_overlap=0,
# text_splitter = CharacterTextSplitter(chunk_size=100, chunk_overlap=30,
    length_function = len)
texts = text_splitter.split_documents(documents)
print(texts)
print(len(texts))
docsearch = Chroma.from_documents(texts, embeddings)

Created a chunk of size 105, which is longer than the specified 10
Created a chunk of size 81, which is longer than the specified 10
Created a chunk of size 73, which is longer than the specified 10
Created a chunk of size 109, which is longer than the specified 10
Created a chunk of size 111, which is longer than the specified 10
Created a chunk of size 119, which is longer than the specified 10
Created a chunk of size 72, which is longer than the specified 10
Created a chunk of size 191, which is longer than the specified 10
Created a chunk of size 96, which is longer than the specified 10
Created a chunk of size 178, which is longer than the specified 10
Created a chunk of size 200, which is longer than the specified 10
Created a chunk of size 102, which is longer than the specified 10
Created a chunk of size 150, which is longer than the specified 10
Created a chunk of size 200, which is longer than the specified 10
Created a chunk of size 127, which is longer than the specified 10

11787


In [20]:
qa = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=docsearch.as_retriever(search_kwargs={"k": 50}))

In [22]:
# Loop through all queries and perform evaluation
f1_scores = []
em_scores = []
recall_scores = []
precision_scores = []
for query in test_queries:
    question = query['question']

    # Answer string (CHECK BY YASH) get output string
    predicted_answer = qa.run(question) 

    # Get gold answers
    gold_answers = get_gold_answers(query)

    # get EM score
    em_score = max((compute_exact_match(answer, predicted_answer)) for answer in gold_answers)
    # f1_score , prec, rec  = max((compute_f1(answer, predicted_answer)) for answer in gold_answers)
    
    # Get prec, rec and f1
    f1_list = []
    prec_list = []
    rec_list = []
    for answer in gold_answers:
        f1_score , prec, rec = compute_f1(answer, predicted_answer)
        f1_list.append(f1_score)
        prec_list.append(prec)
        rec_list.append(rec)

    print("Question: ", question)
    print("Answer: ", predicted_answer)
    print(f"True Answers: {gold_answers}")
    print(f"EM: {em_score} \t F1: {f1_score}")
    print(f"Precision: {max(prec_list)} \t Recall: {max(rec_list)}")

    em_scores.append(em_score)
    f1_scores.append(f1_score)
    recall_scores.append(max(rec_list))
    precision_scores.append(max(prec_list))

print(f"Average Exact Match: {(sum(em_scores) * 100 )/len(em_scores)} %")    
print(f"Average F1 score: {sum(f1_scores)/len(f1_scores)}")    
print(f"Average Recall: {(sum(recall_scores) * 100)/len(recall_scores)} %")    
print(f"Average Precision: {(sum(precision_scores) * 100) /len(precision_scores)} %") 

Question:  What was the name of the hurricane that made landfall in Alabama and Mississippi on September 16, 2020?
Answer:  Hurricane Sally
True Answers: ['Hurricane Sally']
EM: 1 	 F1: 1.0
Precision: 1.0 	 Recall: 1.0
Question:  What was the maximum sustained wind speed of Hurricane Sally when it made landfall?
Answer:  105 mph
True Answers: ['105 mph', '100 mph', '40 mph', '90 mph']
EM: 1 	 F1: 0.5
Precision: 1.0 	 Recall: 1.0
Question:  Where did Hurricane Sally make landfall in Alabama?
Answer:  Gulf Shores, Alabama
True Answers: ['Gulf Shores']
EM: 0 	 F1: 0.8
Precision: 1.0 	 Recall: 0.6666666666666666
Question:  Where did Hurricane Sally make landfall in Mississippi?
Answer:  Gulf Shores, Alabama
True Answers: ['Bay St. Louis']
EM: 0 	 F1: 0
Precision: 0 	 Recall: 0
Question:  What was the total damage caused by Hurricane Sally?
Answer:  a).
True Answers: ['More than $3 billion']
EM: 0 	 F1: 0
Precision: 0 	 Recall: 0
Question:  How many deaths were caused by Hurricane Sally?
An

In [117]:
# Loop through all queries and perform evaluation
f1_scores = []
em_scores = []
recall_scores = []
precision_scores = []
for query in test_queries_facts:
    question = query['question']

    # Answer string (CHECK BY YASH) get output string
    predicted_answer = qa.run(question) 

    # Get gold answers
    gold_answers = get_gold_answers(query)

    # get EM score
    em_score = max((compute_exact_match(answer, predicted_answer)) for answer in gold_answers)
    # f1_score , prec, rec  = max((compute_f1(answer, predicted_answer)) for answer in gold_answers)
    
    # Get prec, rec and f1
    f1_list = []
    prec_list = []
    rec_list = []
    for answer in gold_answers:
        f1_score , prec, rec = compute_f1(answer, predicted_answer)
        f1_list.append(f1_score)
        prec_list.append(prec)
        rec_list.append(rec)

    print("Question: ", question)
    print("Answer: ", predicted_answer)
    print(f"True Answers: {gold_answers}")
    print(f"EM: {em_score} \t F1: {f1_score}")
    print(f"Precision: {max(prec_list)} \t Recall: {max(rec_list)}")

    em_scores.append(em_score)
    f1_scores.append(f1_score)
    recall_scores.append(max(rec_list))
    precision_scores.append(max(prec_list))

print(f"Average Exact Match: {(sum(em_scores) * 100 )/len(em_scores)} %")    
print(f"Average F1 score: {sum(f1_scores)/len(f1_scores)}")    
print(f"Average Recall: {(sum(recall_scores) * 100)/len(recall_scores)} %")    
print(f"Average Precision: {(sum(precision_scores) * 100) /len(precision_scores)} %") 

Question:  How much rain is measured in Pensacola, Florida as of Tuesday?
Answer:  b).
True Answers: ['12.71 inches']
EM: 0 	 F1: 0
Precision: 0 	 Recall: 0
Question:  Which areas are expecting heavy rain along I-10 highway from Louisiana to Florida,
Answer:  Mobile Bay
True Answers: [' Gautier and Pascagoula, Mississippi', 'Gautier', 'Pascagoula, Mississippi']
EM: 0 	 F1: 0
Precision: 0 	 Recall: 0
Question:  What is the contact number to report downed power lines?
Answer:  311
True Answers: ['1-800-GUPOWER', '1-800-487-6937', '1-800-GUPOWER (1-800-487-6937)']
EM: 0 	 F1: 0
Precision: 0 	 Recall: 0
Question:  How many meal red cross has served?
Answer:  622,000
True Answers: ['622,000']
EM: 1 	 F1: 1.0
Precision: 1.0 	 Recall: 1.0
Question:  What is the contact number of Alabama Power?
Answer:  888-2726
True Answers: ['1-800-888-2726']
EM: 0 	 F1: 0
Precision: 0 	 Recall: 0
Average Exact Match: 20.0 %
Average F1 score: 0.2
Average Recall: 20.0 %
Average Precision: 20.0 %


In [112]:
def get_t5_answer(question,context):
    text_splitter = CharacterTextSplitter(chunk_size=20000, chunk_overlap=200,length_function = len)
    texts = text_splitter.split_text(context)
    embeddings = HuggingFaceEmbeddings()
    docsearch_new_t5 = Chroma.from_texts(texts, embeddings)
    qa_new_t5 = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=docsearch_new_t5.as_retriever(search_kwargs={"k": 1}))
    output_t5 = qa_new_t5.run(question)
    return output_t5

In [113]:
# Flan T5 Evaluation with Similarity Search same as Roberta and OpenAI.
f1_scores = []
em_scores = []
recall_scores = []
precision_scores = []
for query in test_queries:
    question = query['question']
    question_embeddings = sentence_model.encode([question])[0]

    context = get_question_context(question_embeddings)
    predicted_answer = get_t5_answer(question, context)

    # print("Context: ", context)
    print("Question: ", question)
    print("Answer: ", predicted_answer)

    gold_answers = get_gold_answers(query)

    em_score = max((compute_exact_match(answer, predicted_answer)) for answer in gold_answers)
    # f1_score , prec, rec  = max((compute_f1(answer, predicted_answer)) for answer in gold_answers)
    f1_list = []
    prec_list = []
    rec_list = []
    for answer in gold_answers:
        f1_score , prec, rec = compute_f1(answer, predicted_answer)
        f1_list.append(f1_score)
        prec_list.append(prec)
        rec_list.append(rec)

    # print(f"Question: {query['question']}")
    # print(f"Prediction: {predicted_answer}")
    print(f"True Answers: {gold_answers}")
    print(f"EM: {em_score} \t F1: {f1_score}")
    print(f"Precision: {max(prec_list)} \t Recall: {max(rec_list)}")

    em_scores.append(em_score)
    f1_scores.append(f1_score)
    recall_scores.append(max(rec_list))
    precision_scores.append(max(prec_list))

print(f"Average Exact Match: {(sum(em_scores) * 100 )/len(em_scores)} %")    
print(f"Average F1 score: {sum(f1_scores)/len(f1_scores)}")    
print(f"Average Recall: {(sum(recall_scores) * 100)/len(recall_scores)} %")    
print(f"Average Precision: {(sum(precision_scores) * 100) /len(precision_scores)} %")    

Using embedded DuckDB without persistence: data will be transient


Question:  What was the name of the hurricane that made landfall in Alabama and Mississippi on September 16, 2020?
Answer:  Hurricane Sally
True Answers: ['Hurricane Sally']
EM: 1 	 F1: 1.0
Precision: 1.0 	 Recall: 1.0


Using embedded DuckDB without persistence: data will be transient


Question:  What was the maximum sustained wind speed of Hurricane Sally when it made landfall?
Answer:  35 mph
True Answers: ['105 mph', '100 mph', '40 mph', '90 mph']
EM: 0 	 F1: 0.5
Precision: 0.5 	 Recall: 0.5


Using embedded DuckDB without persistence: data will be transient


Question:  Where did Hurricane Sally make landfall in Alabama?
Answer:  Gulf Shores
True Answers: ['Gulf Shores']
EM: 1 	 F1: 1.0
Precision: 1.0 	 Recall: 1.0


Using embedded DuckDB without persistence: data will be transient


Question:  Where did Hurricane Sally make landfall in Mississippi?
Answer:  Alabama
True Answers: ['Bay St. Louis']
EM: 0 	 F1: 0
Precision: 0 	 Recall: 0


Using embedded DuckDB without persistence: data will be transient


Question:  What was the total damage caused by Hurricane Sally?
Answer:  (iii)
True Answers: ['More than $3 billion']
EM: 0 	 F1: 0
Precision: 0 	 Recall: 0


Using embedded DuckDB without persistence: data will be transient


Question:  How many deaths were caused by Hurricane Sally?
Answer:  1).
True Answers: ['At least 3']
EM: 0 	 F1: 0
Precision: 0 	 Recall: 0


Using embedded DuckDB without persistence: data will be transient


Question:  What were the main impacts of Hurricane Sally?
Answer:  (iii)
True Answers: ['Flooding', 'Power outages', 'Structural damage']
EM: 0 	 F1: 0
Precision: 0 	 Recall: 0


Using embedded DuckDB without persistence: data will be transient


Question:  What was the weather like in the days leading up to Hurricane Sally?
Answer:  Clear skies and cooler temperatures
True Answers: ['Warm and humid']
EM: 0 	 F1: 0.25
Precision: 0.3333333333333333 	 Recall: 0.2


Using embedded DuckDB without persistence: data will be transient


Question:  What was the weather like in the days after Hurricane Sally?
Answer:  Clear skies and cooler temperatures
True Answers: ['Cool and rainy']
EM: 0 	 F1: 0.25
Precision: 0.3333333333333333 	 Recall: 0.2


Using embedded DuckDB without persistence: data will be transient


Question:  What were the evacuation orders for Hurricane Sally?
Answer:  Counties issue evacuation orders, open shelters ahead of Hurricane Sally
True Answers: ['Mandatory evacuations for some areas']
EM: 0 	 F1: 0
Precision: 0 	 Recall: 0


Using embedded DuckDB without persistence: data will be transient


Question:  What were the shelter options for Hurricane Sally?
Answer:  (iii)
True Answers: ['Public shelters', 'Private shelters', 'Staying with friends or family']
EM: 0 	 F1: 0
Precision: 0 	 Recall: 0


Using embedded DuckDB without persistence: data will be transient


Question:  What were the transportation options for Hurricane Sally?
Answer:  Destin's charter boats
True Answers: ['Closed roads', 'Cancelled flights', 'Limited public transportation']
EM: 0 	 F1: 0
Precision: 0 	 Recall: 0


Using embedded DuckDB without persistence: data will be transient


Question:  What were the communication options for Hurricane Sally?
Answer:  (iii)
True Answers: ['Cell phone service was spotty', 'Power outages affected landlines', 'Radios were a good source of information']
EM: 0 	 F1: 0
Precision: 0 	 Recall: 0
Average Exact Match: 15.384615384615385 %
Average F1 score: 0.23076923076923078
Average Recall: 22.307692307692314 %
Average Precision: 24.35897435897436 %
