## 1) Configure document store

In [1]:
from haystack import Pipeline
from haystack.document_stores import FAISSDocumentStore
from haystack.nodes import EmbeddingRetriever

CREATE_NEW_DOCUMENT_STORE = False

index_path = "db_backup/index.faiss"
config_path = "db_backup/config.json"

doc_store = None

if CREATE_NEW_DOCUMENT_STORE:
    doc_store = FAISSDocumentStore()
else:
    doc_store = FAISSDocumentStore(faiss_index_path=index_path, faiss_config_path=config_path)

retriever = EmbeddingRetriever(
    embedding_model='ipipan/silver-retriever-base-v1', document_store=doc_store
)

query_pipeline = Pipeline()
query_pipeline.add_node(component=retriever, name="Retriever", inputs=["Query"])


In [2]:
from datasets import load_dataset

ds = load_dataset("clarin-knext/fiqa-pl", "corpus")
ds2 = load_dataset("clarin-knext/fiqa-pl", "queries")

realations = load_dataset("clarin-knext/fiqa-pl-qrels")

In [3]:
realations

DatasetDict({
    train: Dataset({
        features: ['query-id', 'corpus-id', 'score'],
        num_rows: 14166
    })
    validation: Dataset({
        features: ['query-id', 'corpus-id', 'score'],
        num_rows: 1238
    })
    test: Dataset({
        features: ['query-id', 'corpus-id', 'score'],
        num_rows: 1706
    })
})

In [4]:
from datasets import concatenate_datasets

combined = concatenate_datasets([realations['train'], realations['validation'], realations['test']])
combined

Dataset({
    features: ['query-id', 'corpus-id', 'score'],
    num_rows: 17110
})

In [5]:
from haystack import Document




doc_corpus = []

for row in ds['corpus']:
    doc_corpus.append(
        Document(
            meta={'name': row['title'], 'id': row['_id']},
			content=row['text']
        )
    )

if CREATE_NEW_DOCUMENT_STORE:
    doc_store.write_documents(doc_corpus)
    doc_store.update_embeddings(retriever=retriever)
    doc_store.save(index_path, config_path)


In [6]:
ds2['queries'][0]

{'_id': '0',
 'title': '',
 'text': 'Co jest uważane za wydatek służbowy w podróży służbowej?'}

In [7]:
combined.filter(lambda e: e['query-id'] == 0)['corpus-id']

[18850]

In [8]:
combined_dict = {}

for row in combined:
    if int(row['query-id']) in combined_dict:
        combined_dict[int(row['query-id'])].append((int(row['corpus-id']), int(row['score'])))
    else:
        combined_dict[int(row['query-id'])] = [(int(row['corpus-id']), int(row['score']))]

In [16]:
from sklearn.metrics import ndcg_score
import numpy as np
from more_itertools import chunked
import time

ndcg5_list = []
batch_size = 1000

combined_time = 0

for row_batch in chunked(ds2['queries'], batch_size):
    start_time = time.time()
    ans_batch = query_pipeline.run_batch([row['text'] for row in row_batch])
    end_time = time.time()
    combined_time += end_time - start_time
    for ans, row in zip(ans_batch['documents'], row_batch):
        wyn = combined_dict[int(row['_id'])]
        c_ids = [id for id, _ in wyn]
        scores = [score for _, score in wyn]
        values = []
        relevance = []
        for doc in ans:
            q_id = int(doc.meta['id'])
            if q_id in c_ids:
                relevance.append(float(scores[c_ids.index(q_id)]))
                values.append(1)
            else:
                relevance.append(0)
                values.append(0)
        ndcg5_list.append(ndcg_score(np.asarray([values]), np.asarray([relevance]), k=5))



ndcg5_final = np.mean(ndcg5_list)
print(ndcg5_final)
n = len(ds2['queries'])
print(f'took {combined_time}s, so {combined_time / n} per query.')

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

Batches:   0%|          | 0/21 [00:00<?, ?it/s]

0.4052346570397112
took 382.18325114250183s, so 0.057488455346345044 per query.


6648

# Questions

#### Which of the methods: lexical match (e.g. ElasticSearch) or dense representation works better?

The Elastic search had a NDCG@5 score of 0.2657 in the best case, while the Neural network had 0.4052

#### Which of the methods is faster?

Neural Network took about 0.05s per query while elastic search had 648*4 queries which is rougly 2400 ant the whole cell took 16s so one query took 0.006s.

#### Try to determine the other pros and cons of using lexical search and dense document retrieval models.

Dense pros:
- The Dense Document retrival has better accuracy at ndcg score
- Can be fine tuned

Dense cons:
- Long time to create encoded documents
- Retrival of documents is 10 times longer

Elastic search pros:
- Indexing in elastic search took less than than dence retirval
- No model needed
- Faster retrival of data

Elastic search cons:
- Accuracy is 2 times worse 
- Needs exact keywords to find appropiate answer