In [4]:
# Import data
import os
import requests
import zipfile
from pathlib import Path

def download_and_extract(url, output_dir, timeout=60):
    """Download a zip archive from ``url`` and unpack it into ``output_dir``.

    The archive is streamed to ``<output_dir>/temp.zip``, extracted in place,
    and the temporary zip file is removed afterwards -- also when the download
    or the extraction fails, so a failed run leaves no partial archive behind.

    Args:
        url: HTTP(S) address of the zip archive.
        output_dir: Directory to extract into; created (with parents) if missing.
        timeout: Seconds passed to ``requests.get`` as its ``timeout`` argument.

    Raises:
        requests.exceptions.HTTPError: The server answered with an error status.
        zipfile.BadZipFile: The downloaded payload is not a valid zip archive.
    """
    # Create output directory
    Path(output_dir).mkdir(parents=True, exist_ok=True)
    zip_path = os.path.join(output_dir, "temp.zip")

    print(f"Downloading from {url}...")
    try:
        # Stream the body to disk instead of buffering the whole archive in memory.
        with requests.get(url, stream=True, timeout=timeout) as response:
            # Fail early on 4xx/5xx rather than writing an error page into temp.zip.
            response.raise_for_status()
            with open(zip_path, 'wb') as f:
                for chunk in response.iter_content(chunk_size=1024 * 1024):
                    f.write(chunk)

        # Extract
        print(f"Extracting to {output_dir}...")
        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            zip_ref.extractall(output_dir)
    finally:
        # Remove the temporary archive on success and on failure alike.
        if os.path.exists(zip_path):
            os.remove(zip_path)

    print("✓ Download complete!")

# Where the dataset is unpacked, and the archive it comes from
doc_dir = "../data/got_dataset"
url = "https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt1.zip"

download_and_extract(url, doc_dir)

# Collect the path of every .txt entry directly inside doc_dir
files_to_index = []
for entry in os.listdir(doc_dir):
    if entry.endswith(".txt"):
        files_to_index.append(os.path.join(doc_dir, entry))

print(f"{len(files_to_index)} files found for indexing.")

Downloading from https://s3.eu-central-1.amazonaws.com/deepset.ai-farm-qa/datasets/documents/wiki_gameofthrones_txt1.zip...
Extracting to ../data/got_dataset...
✓ Download complete!
183 files found for indexing.


In [6]:
from haystack import Document
from haystack.document_stores.in_memory import InMemoryDocumentStore

# Create the in-memory store that the documents are written to
document_store = InMemoryDocumentStore()

# Build one Document per file, holding that file's full text (decoded as UTF-8)
documents = [
    Document(content=Path(file_path).read_text(encoding='utf-8'))
    for file_path in files_to_index
]

# Write every document to the store, then report how many it now holds
document_store.write_documents(documents)

indexed_total = document_store.count_documents()
print(f"✓ {indexed_total} documents indexed")

✓ 183 documents indexed


In [8]:
from haystack import Pipeline
from haystack.components.retrievers.in_memory import InMemoryBM25Retriever
from haystack.components.readers import ExtractiveReader

# Components: a retriever over the document store and an extractive reader model
retriever = InMemoryBM25Retriever(document_store=document_store)
reader = ExtractiveReader(model="deepset/roberta-base-squad2")

# Pipeline: the retriever's documents are fed into the reader
qa_pipeline = Pipeline()
qa_pipeline.add_component("retriever", retriever)
qa_pipeline.add_component("reader", reader)
qa_pipeline.connect("retriever.documents", "reader.documents")

# The same question goes to both components; each gets its own top_k
question = "Who is the father of Arya Stark?"
pipeline_inputs = {
    "retriever": {"query": question, "top_k": 5},
    "reader": {"query": question, "top_k": 3},
}
result = qa_pipeline.run(pipeline_inputs)

# Report each returned answer with its score formatted as a percentage.
# NOTE(review): the recorded output lists four answers although the reader's
# top_k is 3, the last one with data None -- presumably the reader's
# "no answer" candidate; confirm against the ExtractiveReader documentation.
print(f"Question: {question}\n")
answers = result["reader"]["answers"]
for i, answer in enumerate(answers, 1):
    print(
        f"Answer {i}:",
        f"  {answer.data}",
        f"  Confidence: {answer.score:.2%}",
        "",
        sep="\n",
    )

Question: Who is the father of Arya Stark?

Answer 1:
  Lord Eddard Stark
  Confidence: 79.52%

Answer 2:
  Eddard
  Confidence: 79.07%

Answer 3:
  Robert Baratheon
  Confidence: 75.55%

Answer 4:
  None
  Confidence: 1.05%



README.md:   0%|          | 0.00/46.0 [00:00<?, ?B/s]

data/train-00000-of-00001-4077bd623d5510(…):   0%|          | 0.00/119k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/151 [00:00<?, ? examples/s]