In [None]:
# List the configurations (domains) that the SubjQA dataset offers
from datasets import get_dataset_config_names

domains = get_dataset_config_names("subjqa")
domains

In [None]:
# Only the electronics domain is needed for this experiment
from datasets import load_dataset

subjqa = load_dataset("subjqa", name="electronics")

# Peek at the answers recorded for the training example at index 1
print(subjqa["train"]["answers"][1])

In [None]:
import pandas as pd

# Flatten the dataset and convert every split into its own DataFrame
dfs = {split: dset.to_pandas() for split, dset in subjqa.flatten().items()}

# Report how many distinct question ids each split contains
for split, df in dfs.items():
    question_ids = df["id"]
    print(f"Number of questions in {split}: {question_ids.nunique()}")

In [None]:
# Restrict the view to the columns that matter for question answering
qa_cols = ["title", "question", "answers.text", "answers.answer_start", "context"]

# Fixed random_state so the same two rows are shown on every run
sample_df = dfs["train"].loc[:, qa_cols].sample(n=2, random_state=7)
sample_df

In [None]:
# Start from an already fine-tuned model. MiniLM is chosen because it is small and quick to iterate on.
# First, inspect what the tokenizer produces for a question/context pair.
from transformers import AutoTokenizer

model_ckpt = "deepset/minilm-uncased-squad2"
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

question = "How much music can this hold?"
context = (
    "an MP3 is about 1 MB/minute, so about 6000 hours depending on "
    "the file size."
)
inputs = tokenizer(question, context, return_tensors="pt")

# Decode the ids back into text to see how the pair was encoded
decoded_inputs = tokenizer.decode(inputs["input_ids"][0])
print(decoded_inputs)

In [None]:
import torch
from transformers import AutoModelForQuestionAnswering

model = AutoModelForQuestionAnswering.from_pretrained(model_ckpt)

# Pure inference: no gradients are needed
with torch.no_grad():
    outputs = model(**inputs)
print(outputs)

# To convert the outputs into an answer span, first grab the logits for the start and end tokens
start_logits = outputs.start_logits
end_logits = outputs.end_logits

print(f"Input IDs shape: {inputs.input_ids.size()}")
print(f"Start logits shape: {start_logits.size()}")
print(f"End logits shape: {end_logits.size()}")

# To get the answer, take the argmax over the start and end logits and slice that span out of the input ids.
# (`torch` is already imported at the top of this cell; the redundant second import was dropped.)
start_idx = torch.argmax(start_logits)
# +1 because the end of a Python slice is exclusive and the end token belongs to the answer
end_idx = torch.argmax(end_logits) + 1
answer_span = inputs["input_ids"][0][start_idx:end_idx]
answer = tokenizer.decode(answer_span)
print(f"Question: {question}")
print(f"Answer: {answer}")

In [None]:
# Well, this was a good dissecting experience. In transformers, all of these steps are wrapped nicely.
from transformers import pipeline

pipe = pipeline("question-answering", model=model, tokenizer=tokenizer)
# `top_k` is the documented keyword; `topk` is its deprecated spelling
pipe(question=question, context=context, top_k=3)

In [None]:
# For a question the context cannot answer, the pipeline can return an empty string as the answer,
# but only when `handle_impossible_answer=True` is passed. The keyword is singular: the previous
# spelling `handle_impossible_answers` is not the documented option, which is presumably why the
# empty answer never showed up.
pipe(question="Why is there no data?", context=context, handle_impossible_answer=True)

In [None]:
# Okay, enough tinkering. Now it is time to use Haystack. Haystack needs two things:
# 1. A document store to store documents and metadata
# 2. A pipeline that combines all the components of a QA system to enable custom query flows, merging documents from multiple retrievers, and more

# We are running Elasticsearch in a docker container (https://www.elastic.co/guide/en/elasticsearch/reference/current/docker.html)
import os
from getpass import getpass

# import the store
from haystack.document_stores import ElasticsearchDocumentStore

# Credentials must not live in the notebook: read the password from the environment and fall back
# to an interactive prompt. NOTE(review): the password that used to be hard-coded here has been
# exposed in the notebook history and should be rotated.
es_password = os.environ.get("ELASTIC_PASSWORD") or getpass("Elasticsearch password: ")
# The CA certificate location is machine specific; allow overriding it without editing the notebook.
es_ca_certs = os.environ.get("ELASTIC_CA_CERTS", "/home/tanvir/work/qa-experiment/http_ca.crt")

document_store = ElasticsearchDocumentStore(
    host="localhost", port=9200, username="elastic", password=es_password,
    index="document", scheme="https", verify_certs=True,
    ca_certs=es_ca_certs, return_embedding=True)

#hide
# It's a good idea to flush Elasticsearch with each notebook restart
if len(document_store.get_all_documents()) > 0 or len(document_store.get_all_labels()) > 0:
    document_store.delete_documents("document")
    document_store.delete_documents("label")

In [None]:
# The fields in `meta` can be used to apply filters during retrieval. We store the product title as
# `item_id` and the question id as `question_id`, together with the split the row came from, so that
# retrieval can later be restricted by product, question and split.
for split, df in dfs.items():
    # Exclude duplicate reviews
    unique_reviews = df.drop_duplicates(subset="context")
    docs = []
    for _, row in unique_reviews.iterrows():
        doc_meta = {"item_id": row["title"], "question_id": row["id"], "split": split}
        docs.append({"content": row["context"], "meta": doc_meta})
    document_store.write_documents(docs, index="document")

print(f"Loaded {document_store.get_document_count()} documents")

In [None]:
# Retrieve with BM25, backed by the Elasticsearch document store
from haystack.nodes import BM25Retriever

es_retriever = BM25Retriever(document_store=document_store)

item_id = "B0074BW614"
query = "Is it good for reading?"

# Restrict the search to training-split reviews of a single product
retrieval_filters = {"item_id": [item_id], "split": ["train"]}
retrieved_docs = es_retriever.retrieve(query=query, top_k=3, filters=retrieval_filters)

print(retrieved_docs)

In [None]:
# Let's initialize the reader
#hide_output
import os
# Import from `haystack.nodes`, like the retrievers in this notebook, instead of the legacy
# `haystack.reader.farm` module path.
from haystack.nodes import FARMReader

# Just a temporary workaround
os.environ["TOKENIZERS_PARALLELISM"] = "false"

model_ckpt = "deepset/minilm-uncased-squad2"
max_seq_length, doc_stride = 384, 128

reader = FARMReader(model_name_or_path=model_ckpt, progress_bar=False,
                    max_seq_len=max_seq_length, doc_stride=doc_stride,
                    return_no_answer=True, use_gpu=True)

# Sanity check on the same question/context pair used with the transformers pipeline earlier
print(reader.predict_on_texts(question=question, texts=[context], top_k=1))

In [None]:
# Now let's put everything together with a Haystack pipeline
from haystack.pipelines import ExtractiveQAPipeline

# NOTE(review): this rebinds the name `pipeline`, which an earlier cell imported from transformers.
# Later cells rely on `pipeline` being the Haystack pipeline, so the name is kept.
pipeline = ExtractiveQAPipeline(reader, es_retriever)

n_answers = 3
preds = pipeline.run(
    query=query,
    params={
        "Retriever": {"top_k": 3},
        "Reader": {"top_k": n_answers},
        "filters": {"item_id": [item_id], "split": ["train"]},
    },
)

print(f"Question: {preds['query']} \n")

# Iterate over the answers that actually came back instead of indexing range(n_answers),
# so the cell does not raise IndexError if fewer than n_answers answers are returned.
for rank, pred_answer in enumerate(preds["answers"][:n_answers], start=1):
    print(f"Answer {rank}: {pred_answer.answer}")
    print(f"Review snippet: ...{pred_answer.context}...")
    print("\n\n")

In [40]:
from haystack import Label, Answer, Document

labels = []
for row_idx, row in dfs["test"].iterrows():
    # Metadata used for filtering in the Retriever
    meta = {"item_id": row["title"], "question_id": row["id"]}
    answer_texts = row["answers.text"]
    has_answer = len(answer_texts) > 0
    # Answerable questions get one label per annotated answer; questions without
    # answers get a single label with an empty answer string and no_answer=True.
    for answer_text in (answer_texts if has_answer else [""]):
        labels.append(
            Label(
                query=row["question"],
                answer=Answer(answer=answer_text),
                document=Document(
                    id=row_idx, content_type="text", content=row["context"]
                ),
                is_correct_answer=True,
                is_correct_document=True,
                no_answer=not has_answer,
                origin="user-feedback",
                meta=meta,
            )
        )

document_store.write_labels(labels, index="label")
label_count = document_store.get_label_count(index="label")
print(f"Loaded {label_count} question-answer pairs")

Loaded 1268 question-answer pairs


In [None]:
# Fetch the stored labels in aggregated form, additionally aggregated by the `item_id` meta field
labels_agg = document_store.get_all_labels_aggregated(
    index="label", open_domain=True, aggregate_by_meta=["item_id"])

# Number of aggregated labels, followed by one arbitrary example for inspection
print(len(labels_agg))
print(labels_agg[109])

In [None]:
def eval_pipeline(pipeline, top_k_retriever=10, top_k_reader=4, labels=None):
    """Evaluate ``pipeline`` and return the retriever's single-hit recall.

    Args:
        pipeline: Object exposing ``eval(labels=..., params=...)`` whose result
            offers ``calculate_metrics()``.
        top_k_retriever: ``top_k`` forwarded to the "Retriever" node.
        top_k_reader: ``top_k`` forwarded to the "Reader" node.
        labels: Evaluation labels. Defaults to the notebook-level ``labels_agg``
            so existing calls keep working; pass it explicitly to avoid relying
            on hidden notebook state.

    Returns:
        The value stored under ``['Retriever']['recall_single_hit']`` in the
        computed metrics.
    """
    if labels is None:
        # Backward-compatible fallback to the global built in an earlier cell
        labels = labels_agg
    params = {
        "Retriever": {"top_k": top_k_retriever},
        "Reader": {"top_k": top_k_reader},
    }
    eval_result = pipeline.eval(labels=labels, params=params)
    metrics = eval_result.calculate_metrics()
    return metrics["Retriever"]["recall_single_hit"]

In [None]:
#hide_output
def evaluate_retriever(pipeline, topk_values=(1, 3, 5, 10, 20, 40)):
    """Measure retriever recall of ``pipeline`` for several ``top_k`` settings.

    Args:
        pipeline: Pipeline handed to ``eval_pipeline``.
        topk_values: Iterable of retriever ``top_k`` values to evaluate. The
            default is an immutable tuple rather than a shared mutable list.

    Returns:
        A DataFrame indexed by ``top_k`` with a single ``recall`` column.
    """
    topk_results = {
        topk: {"recall": eval_pipeline(pipeline, top_k_retriever=topk)}
        for topk in topk_values
    }
    return pd.DataFrame.from_dict(topk_results, orient="index")

es_topk_df = evaluate_retriever(pipeline)
print("Done")

Process ForkPoolWorker-53:
Process ForkPoolWorker-51:
Process ForkPoolWorker-52:
Process ForkPoolWorker-50:
Process ForkPoolWorker-49:
Process ForkPoolWorker-48:
Traceback (most recent call last):
Traceback (most recent call last):
  File "/home/tanvir/.pyenv/versions/3.8.6/lib/python3.8/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
  File "/home/tanvir/.pyenv/versions/3.8.6/lib/python3.8/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
  File "/home/tanvir/.pyenv/versions/3.8.6/lib/python3.8/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/home/tanvir/.pyenv/versions/3.8.6/lib/python3.8/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
  File "/home/tanvir/.pyenv/versions/3.8.6/lib/python3.8/multiprocessing/process.py", line 315, in _boot

In [None]:
# Check how the result improves as k increases
import matplotlib.pyplot as plt
def plot_retriever_eval(dfs, retriever_names):
    """Plot top-k recall curves for one or more retrievers on a shared axis.

    Args:
        dfs: Iterable of DataFrames, each indexed by ``k`` with a ``recall`` column.
        retriever_names: Legend labels, paired positionally with ``dfs``.
    """
    fig, ax = plt.subplots()
    tick_values = set()
    for df, retriever_name in zip(dfs, retriever_names):
        df.plot(y="recall", ax=ax, label=retriever_name)
        tick_values.update(df.index)
    # Build the ticks from every plotted frame instead of relying on the loop
    # variable leaking out of the loop (which failed when `dfs` was empty).
    ax.set_xticks(sorted(tick_values))
    ax.set_ylabel("Top-k Recall")
    ax.set_xlabel("k")
    plt.show()
    
plot_retriever_eval([es_topk_df], ["BM25"])

In [None]:
# As we can see, recall improves as we increase k. But can we do better? Let's use DPR.
from haystack.nodes import DensePassageRetriever

dpr_retriever = DensePassageRetriever(
    document_store=document_store,
    query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
    passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
    embed_title=False,
)

# Important:
# Once DPR is initialized, update_embeddings() has to be called to iterate over all previously
# indexed documents and update their embedding representation.
# This can be time consuming (depending on corpus size), but it only needs to be done once.
# At query time only the query is embedded and compared to the existing document embeddings,
# which is very fast.
document_store.update_embeddings(dpr_retriever)
print("Done")

In [None]:
from haystack.utils import print_answers

# The same reader as before can be reused because only the retriever is being evaluated
dpr_pipe = ExtractiveQAPipeline(reader, dpr_retriever)

# Try a question
prediction = dpr_pipe.run(
    query="How does the fan work?",
    params={"Retriever": {"top_k": 10}, "Reader": {"top_k": 5}},
)
print_answers(prediction, details="minimum")

In [None]:
#hide_output
# Run the same top-k recall evaluation as for BM25, this time with the DPR retriever
dpr_topk_df = evaluate_retriever(dpr_pipe)
print("Done")

In [None]:
# Compare the top-k recall curves of the BM25 and DPR retrievers in one figure
plot_retriever_eval([es_topk_df, dpr_topk_df], ["BM25", "DPR"])

In [None]:
# Let's take a little detour and try FAISS.
# FAISS is a library for efficient similarity search on a cluster of dense vectors. The FAISSDocumentStore uses a SQL (SQLite in-memory by default) database
# under the hood to store the document text and other metadata. The vector embeddings of the text are indexed on a FAISS index that is later queried when
# searching for answers. The default flavour of FAISSDocumentStore is "Flat", but it can also be set to "HNSW" for faster search at the expense of some accuracy.
# Just set the faiss_index_factory_str argument in the constructor. For more info on which one suits your use case:
# https://github.com/facebookresearch/faiss/wiki/Guidelines-to-choose-an-index

from haystack.document_stores import FAISSDocumentStore
from haystack.utils import clean_wiki_text, convert_files_to_docs, fetch_archive_from_http, print_answers

document_store_faiss = FAISSDocumentStore(faiss_index_factory_str="HNSW")

# Download the Game of Thrones wiki files used for this detour
doc_dir = "/mnt/sda/got_data"
s3_url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt6.zip"
fetch_archive_from_http(url=s3_url, output_dir=doc_dir)

# Convert the downloaded files into documents, cleaning the wiki text and splitting by paragraph
docs = convert_files_to_docs(
    dir_path=doc_dir,
    clean_func=clean_wiki_text,
    split_paragraphs=True,
)

# Write the converted documents to the FAISS-backed store
document_store_faiss.write_documents(docs)

print("Done")

In [None]:
from haystack.nodes import DensePassageRetriever
from haystack.pipelines import ExtractiveQAPipeline

faiss_retriever = DensePassageRetriever(
    document_store=document_store_faiss,
    query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
    passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
    max_seq_len_query=64,
    max_seq_len_passage=256,
    batch_size=16,
    use_gpu=True,
    embed_title=True,
    use_fast_tokenizers=True,
)

# Important:
# Once DPR is initialized, update_embeddings() has to be called to iterate over all previously
# indexed documents and update their embedding representation.
# This can be time consuming (depending on corpus size), but it only needs to be done once.
# At query time only the query is embedded and compared to the existing document embeddings,
# which is very fast.
document_store_faiss.update_embeddings(faiss_retriever)

# Build an extractive QA pipeline on top of the FAISS-backed retriever
faiss_reader = FARMReader(model_name_or_path="deepset/roberta-base-squad2", use_gpu=True)
faiss_pipe = ExtractiveQAPipeline(faiss_reader, faiss_retriever)

faiss_prediction = faiss_pipe.run(
    query="Who created the Dothraki vocabulary?",
    params={"Retriever": {"top_k": 10}, "Reader": {"top_k": 5}},
)
print_answers(faiss_prediction, details="minimum")

In [None]:
# NOTE(review): this section was marked as "not running yet". The `Pipeline` import below used the
# module path `haystack.pipeline`, which is inconsistent with the `haystack.pipelines` package used
# by the other cells of this notebook; it has been aligned. TODO confirm that `EvalAnswers` is still
# available in the installed Haystack version.
from haystack.modeling.evaluation.squad import compute_f1, compute_exact
from haystack.nodes.evaluator import EvalAnswers
from haystack.pipelines import Pipeline

def evaluate_reader(reader):
    """Run ``reader`` over every aggregated label and collect its top-1 EM/F1 scores.

    For each entry of the notebook-level ``labels_agg`` the matching documents are
    looked up in ``document_store`` by ``question_id`` and passed, together with the
    label, through a two-node pipeline (reader followed by an ``EvalAnswers`` node).

    Args:
        reader: Reader node to evaluate.

    Returns:
        Dict holding the ``top_1_em`` and ``top_1_f1`` attributes of the
        ``EvalAnswers`` node after all labels were processed.
    """
    score_keys = ['top_1_em', 'top_1_f1']
    eval_reader = EvalAnswers(skip_incorrect_retrieval=False)
    eval_pipe = Pipeline()
    eval_pipe.add_node(component=reader, name="QAReader", inputs=["Query"])
    eval_pipe.add_node(component=eval_reader, name="EvalReader", inputs=["QAReader"])

    for agg_label in labels_agg:
        # Restrict the documents to the ones stored for this exact question
        question_filter = {"question_id": [agg_label.labels[0].meta['question_id']]}
        docs_for_question = document_store.query(agg_label.query, filters=question_filter)
        # The run result is not needed; the EvalAnswers node accumulates the scores
        _ = eval_pipe.run(query=agg_label.query, documents=docs_for_question, labels=agg_label)

    return {k: v for k, v in eval_reader.__dict__.items() if k in score_keys}

print("Start eval")
reader_eval = {}
reader_eval["Fine-tune on SQuAD"] = evaluate_reader(reader)
print("Done")

In [None]:
def plot_reader_eval(reader_eval):
    """Draw a grouped bar chart of reader scores.

    Args:
        reader_eval: Mapping of model name to a dict of scores; it is turned
            into a DataFrame with one column per model and plotted as bars,
            with the two bar groups relabelled "EM" and "F1".
    """
    scores = pd.DataFrame.from_dict(reader_eval)
    fig, ax = plt.subplots()
    scores.plot(kind="bar", ylabel="Score", rot=0, ax=ax)
    ax.set_xticklabels(["EM", "F1"])
    plt.legend(loc='upper left')
    plt.show()

plot_reader_eval(reader_eval)


In [60]:
# Now it's time to train the model on the SubjQA dataset. The SQuAD format is fairly nested, so the first step is to
# create a paragraphs array associated with each product id
def create_paragraphs(df):
    """Build the SQuAD-style ``paragraphs`` list for the rows of ``df``.

    Rows are grouped by ``review_id`` (one paragraph per review, in order of
    first appearance) and, inside each review, by question ``id``. Grouping
    replaces the former per-id ``df.query(f"... == '{value}'")`` lookups, which
    rescanned the frame for every review and question and failed for ids
    containing a quote character.

    Args:
        df: DataFrame with the columns ``review_id``, ``context``, ``id``,
            ``question``, ``answers.text`` and ``answers.answer_start``.

    Returns:
        List of ``{"qas": [...], "context": str}`` dicts. Every ``qas`` entry
        holds ``question``, ``id``, ``is_impossible`` and ``answers`` (a list of
        ``{"text", "answer_start"}`` dicts, empty for unanswerable questions).
    """
    def _as_list(values):
        # Array-like cells expose .tolist(), which also converts the elements to
        # plain Python scalars so the result can be passed to json.dump.
        return values.tolist() if hasattr(values, "tolist") else list(values)

    paragraphs = []
    for _, review_df in df.groupby("review_id", sort=False):
        qas = []
        for qid, question_df in review_df.groupby("id", sort=False):
            ans_start_idxs = _as_list(question_df["answers.answer_start"].iloc[0])
            ans_text = _as_list(question_df["answers.text"].iloc[0])
            # Unanswerable questions have no start indices and hence no answers
            answers = [
                {"text": text, "answer_start": answer_start}
                for text, answer_start in zip(ans_text, ans_start_idxs)
            ]
            qas.append({
                "question": question_df["question"].iloc[-1],
                "id": qid,
                "is_impossible": not ans_start_idxs,
                "answers": answers,
            })
        # Add context and question-answer pairs to paragraphs
        paragraphs.append({"qas": qas, "context": review_df["context"].iloc[-1]})
    return paragraphs

# Now we just need to apply the function to each product id and store the results in a file

import json

def convert_to_squad(dfs):
    """Write every split in ``dfs`` to ``electronics-<split>.json`` in SQuAD layout.

    Each file holds ``{"data": [{"title": ..., "paragraphs": [...]}, ...]}`` with
    one record per product title; the paragraphs come from ``create_paragraphs``.

    Args:
        dfs: Mapping of split name to the DataFrame of that split.
    """
    for split, df in dfs.items():
        # Iterate over the product groups directly instead of groupby.apply, so
        # create_paragraphs is called exactly once per product and no intermediate
        # frame has to be built. Groups come out sorted by title, as before.
        records = [
            {"title": title, "paragraphs": create_paragraphs(product_df)}
            for title, product_df in df.groupby("title")
        ]
        subjqa_data = {"data": records}
        # Save the result to disk (write-only access is sufficient)
        with open(f"electronics-{split}.json", "w", encoding="utf-8") as f:
            json.dump(subjqa_data, f)
            
convert_to_squad(dfs)

In [61]:
# Now it is time to fine-tune the reader on the SQuAD-formatted SubjQA files written above
#hide_output
train_filename = "electronics-train.json"
dev_filename = "electronics-validation.json"

reader.train(
    data_dir=".",
    train_filename=train_filename,
    dev_filename=dev_filename,
    use_gpu=True,
    n_epochs=1,
    batch_size=16,
)
print("Done training")

INFO - haystack.modeling.utils -  Using devices: CUDA
INFO - haystack.modeling.utils -  Number of GPUs: 1
INFO - haystack.modeling.data_handler.data_silo -  
Loading data into the data silo ... 
              ______
               |o  |   !
   __          |:`_|---'-.
  |__|______.-/ _ \-----.|       
 (o)(o)------'\ _ /     ( )      
 
INFO - haystack.modeling.data_handler.data_silo -  LOADING TRAIN DATA
INFO - haystack.modeling.data_handler.data_silo -  Loading train set from: electronics-train.json 
INFO - haystack.modeling.data_handler.data_silo -  Got ya 47 parallel workers to convert 1265 dictionaries to pytorch datasets (chunksize = 6)...
INFO - haystack.modeling.data_handler.data_silo -   0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0     0  
INFO - haystack

Done training
