In [None]:
# Load a subset of the Amazon office product review dataset. For our example, we’ll be working with a subset of the Amazon Review Dataset
# that consists of around 50,000 reviews of items from Amazon’s office supplies category (http://jmcauley.ucsd.edu/data/amazon/links.html).
import pandas as pd
import random

# Location of the JSON file holding the office-product reviews.
file_path = "/mnt/sda/haystack/amazon_data/reviews_Office_Products_5.json"

# The file is read in line-delimited mode (one JSON record per line).
reviews = pd.read_json(file_path, lines=True)

# Pull the review text and the product ids out as two parallel arrays.
texts = reviews["reviewText"].values
ids = reviews["asin"].values

# One document dict per review. The 'meta' entry carries the item_id,
# which is what the pipelines filter on later.
dicts = [
    dict(content=text, meta=dict(item_id=id_))
    for text, id_ in zip(texts, ids)
]

# Display one randomly picked document as a quick sanity check.
random.choice(dicts)

In [None]:
# Because the dense DPR retrieval method is used below, the preprocessor splits
# the reviews into chunks of 100 words, with an overlap of five words.
from haystack.nodes import PreProcessor

processor = PreProcessor(
    split_by="word",
    split_length=100,
    split_overlap=5,
    split_respect_sentence_boundary=False,
)
flattened_docs = processor.process(dicts)

# Display one randomly picked chunk.
random.choice(flattened_docs)

In [None]:
# Load the data into Elasticsearch.
import getpass
import os

from haystack.document_stores import ElasticsearchDocumentStore

# Credentials must not live in the notebook: read the password from the
# ELASTIC_PASSWORD environment variable, or prompt for it when that is unset.
es_password = os.environ.get("ELASTIC_PASSWORD") or getpass.getpass("Elasticsearch password: ")

# The CA certificate location is machine-specific; ELASTIC_CA_CERTS overrides
# the default path.
es_ca_certs = os.environ.get("ELASTIC_CA_CERTS", "/home/tanvir/work/qa-experiment/http_ca.crt")

document_store = ElasticsearchDocumentStore(
    host="localhost",
    port=9200,
    username="elastic",
    password=es_password,
    index="document",
    scheme="https",
    verify_certs=True,
    ca_certs=es_ca_certs,
)

# Clear the index before writing the freshly split review chunks.
document_store.delete_documents()
document_store.write_documents(flattened_docs)
print(f"Loaded {document_store.get_document_count()} documents")

In [None]:
# Create the dense passage retriever (DPR) on top of the document store,
# then update the embeddings of the stored documents with it.
from haystack.nodes import DensePassageRetriever

retriever = DensePassageRetriever(
    document_store=document_store,
    # Separate encoder models for queries and for passages.
    query_embedding_model="facebook/dpr-question_encoder-single-nq-base",
    passage_embedding_model="facebook/dpr-ctx_encoder-single-nq-base",
    # Maximum sequence lengths for queries and passages.
    max_seq_len_query=64,
    max_seq_len_passage=256,
    batch_size=16,
    use_gpu=True,
    embed_title=True,
    use_fast_tokenizers=True,
)

document_store.update_embeddings(retriever)

In [None]:
import os

from haystack.nodes import FARMReader

# Suppress the warning by turning TOKENIZERS_PARALLELISM off.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Extractive reader; return_no_answer lets it report that no answer was found.
reader = FARMReader(
    model_name_or_path="deepset/roberta-base-squad2",
    use_gpu=True,
    return_no_answer=True,
)

In [None]:
# Combine the reader and the retriever into an extractive QA pipeline.
from haystack.pipelines import ExtractiveQAPipeline

pipeline = ExtractiveQAPipeline(reader=reader, retriever=retriever)

In [None]:
# Count reviews per item id (asin) and show the item with the most reviews.
(
    reviews
    .groupby("asin")
    .size()
    .sort_values(ascending=False)
    .iloc[:1]
)

In [None]:
# In total we have 311 reviews for item id 'B0010T3QT2' (some kind of envelope).
# Let's invoke the pipeline with a filter on that item id.
import time

from haystack.utils import print_answers

# NOTE: this name shadows the built-in filter(); it is kept because later
# cells reference it.
filter = {'item_id': ['B0010T3QT2']}
q = 'How well does this envelope stick?'

# Measure wall-clock latency with perf_counter(). process_time() only counts
# CPU time of this process and leaves out time spent waiting (for example on
# Elasticsearch), so it under-reports how long the query really takes.
start_time = time.perf_counter()
answers = pipeline.run(q, params={"Retriever": {"top_k": 30}, "Reader": {"top_k": 10}, "filters": filter})
elapsed_time = time.perf_counter() - start_time
print(f"Time taken with filter: {elapsed_time} seconds")
print_answers(answers, details="all")

In [None]:
# Now let's run the same query without filters. It should take slightly longer, but the answers won't be very relevant.
# perf_counter() measures wall-clock time; process_time() would only count CPU
# time of this process and leave out time spent waiting on Elasticsearch.
start_time = time.perf_counter()
answers = pipeline.run(q, params={"Retriever": {"top_k": 30}, "Reader": {"top_k": 10}})
elapsed_time = time.perf_counter() - start_time
print(f"Time taken without filter: {elapsed_time} seconds")
print_answers(answers, details="all")

In [None]:
# It's possible to filter across multiple products to find answers about a product category.
filter_mutiple = {'item_id': ['B00006IBLJ', 'B000GHJM9C', 'B000CS787S']}
q2 = 'Can things still break when they\'re wrapped in bubble wrap?'

# perf_counter() measures wall-clock time; process_time() would only count CPU
# time of this process and leave out time spent waiting on Elasticsearch.
start_time = time.perf_counter()
answers = pipeline.run(q2, params={"Retriever": {"top_k": 100}, "Reader": {"top_k": 3}, "filters": filter_mutiple})
elapsed_time = time.perf_counter() - start_time
print(f"Time taken with multi-filter: {elapsed_time} seconds")
print_answers(answers, details="all")

In [None]:
# QA is great, but Haystack also works as a retriever-only pipeline. Let's run the same set of experiments
# with the retriever pipeline.

In [None]:
# Filtering on one product id
from haystack.pipelines import DocumentSearchPipeline
from haystack.utils import print_documents

# Retriever-only pipeline: returns matching documents, no answer extraction.
search_pipeline = DocumentSearchPipeline(retriever)
search_query = "Easy to seal"

# perf_counter() measures wall-clock time; process_time() would only count CPU
# time of this process and leave out time spent waiting on Elasticsearch.
start_time = time.perf_counter()
search_result = search_pipeline.run(search_query, params={"Retriever": {"top_k": 10}, "filters": filter})
elapsed_time = time.perf_counter() - start_time
# This run uses the single-item filter, so the label says "filter", not "multi-filter".
print(f"Time taken with filter: {elapsed_time} seconds")
print_documents(search_result, max_text_len=100, print_name=True, print_meta=True)

In [None]:
# Retriever without filter
# perf_counter() measures wall-clock time; process_time() would only count CPU
# time of this process and leave out time spent waiting on Elasticsearch.
start_time = time.perf_counter()
search_result = search_pipeline.run(search_query, params={"Retriever": {"top_k": 10}})
elapsed_time = time.perf_counter() - start_time
print(f"Time taken without filter: {elapsed_time} seconds")
print_documents(search_result, max_text_len=100, print_name=True, print_meta=True)

In [None]:
# It's possible to filter across multiple products to search within a product category.
filter_mutiple = {'item_id': ['B00006IBLJ', 'B000GHJM9C', 'B000CS787S']}

# perf_counter() measures wall-clock time; process_time() would only count CPU
# time of this process and leave out time spent waiting on Elasticsearch.
start_time = time.perf_counter()
# Run the retriever-only search pipeline here, not the QA pipeline: this
# section times and prints plain document retrieval.
search_result = search_pipeline.run(search_query, params={"Retriever": {"top_k": 10}, "filters": filter_mutiple})
elapsed_time = time.perf_counter() - start_time
print(f"Time taken with multi-filter: {elapsed_time} seconds")
print_documents(search_result, max_text_len=100, print_name=True, print_meta=True)