# PolEval - Passage Retrieval

## Imports and consts

In [None]:
# Resets variables. Execute if needed
%reset

In [None]:
import os
import pandas as pd

from tqdm.auto import tqdm
from haystack_integrations.document_stores.elasticsearch import ElasticsearchDocumentStore
from haystack_integrations.components.retrievers.elasticsearch import ElasticsearchBM25Retriever
from haystack import Document
from haystack import Pipeline
from huggingface_hub import hf_hub_download
from datasets import load_dataset
from haystack.document_stores.types.policy import DuplicatePolicy

In [None]:
dataset_path = 'datasets/'

## Download datasets

In [None]:
# Every file below is fetched from the same Hugging Face dataset repository.
_poleval_repo_id = "piotr-rybak/poleval2022-passage-retrieval-dataset"


def _fetch_poleval_file(subfolder, filename):
    """Fetch one file from the PolEval dataset repo and return what `hf_hub_download` returns."""
    return hf_hub_download(
        repo_id=_poleval_repo_id,
        subfolder=subfolder,
        filename=filename,
        repo_type="dataset",
    )


# Passages and test questions, one pair per sub-dataset.
passages_allegro_source = _fetch_poleval_file("allegro-faq", "passages.jl")
questions_allegro_source = _fetch_poleval_file("allegro-faq", "questions-test.jl")
passages_legal_source = _fetch_poleval_file("legal-questions", "passages.jl")
questions_legal_source = _fetch_poleval_file("legal-questions", "questions-test.jl")
passages_wiki_source = _fetch_poleval_file("wiki-trivia", "passages.jl")
questions_wiki_source = _fetch_poleval_file("wiki-trivia", "questions-test.jl")

# Train / test splits of the same repository, loaded through `datasets`.
training_set = load_dataset(_poleval_repo_id, split="train")
test_set = load_dataset(_poleval_repo_id, split="test")

## Launch ElasticSearch in Docker and create stores

From a terminal in the project root directory, run:
- `docker-compose up`

In [30]:
# Count documents in store to verify
def count_documents(stores):
    """Print the document count of every store in `stores`, one number per line.

    `stores` is a mapping whose values expose a `count_documents()` method;
    counts are printed in the mapping's iteration order.
    """
    for store in stores.values():
        print(store.count_documents())

In [31]:
# Declare stores
import requests
import json

# One Elasticsearch index per sub-dataset, all served by the node at localhost:9200.
document_stores = {
    index_name: ElasticsearchDocumentStore(hosts='http://localhost:9200', index=index_name)
    for index_name in ('allegro', 'wiki', 'legal')
}

def change_document_store_window_size(key, size, host="http://localhost:9200", timeout=30):
    """Set `index.max_result_window` on one Elasticsearch index and print the raw reply.

    Args:
        key: Name of the index whose settings are updated (used as the URL path segment).
        size: New value for `max_result_window`.
        host: Base URL of the Elasticsearch node. Defaults to the address the
            document stores in this notebook connect to.
        timeout: Seconds to wait for Elasticsearch. Without a timeout `requests`
            waits indefinitely, which would hang the notebook when the node is down.

    Raises:
        requests.exceptions.RequestException: If the node cannot be reached or
            does not answer within `timeout`.

    The response body is printed as-is, whether or not the update was accepted.
    """
    url = f"{host.rstrip('/')}/{key}/_settings"
    settings = {"index": {"max_result_window": size}}

    # `json=` serialises the payload and sets the JSON content-type header for us.
    response = requests.put(url, json=settings, timeout=timeout)

    print(response.text)

# Raise max_result_window on the wiki and legal indexes only; the allegro index keeps its current setting.
change_document_store_window_size('wiki', 500000)
change_document_store_window_size('legal', 30000)

{"acknowledged":true}
{"acknowledged":true}


In [32]:
def _delete_all_documents(store, verbose=False):
    """Delete documents from `store` pass by pass until `filter_documents` returns nothing."""
    document_ids = [document.id for document in store.filter_documents(filters={})]
    while document_ids:
        if verbose:
            print("Deleting...")
        store.delete_documents(document_ids=document_ids)
        # One pass may not return every document, so query again until the store is empty.
        document_ids = [document.id for document in store.filter_documents(filters={})]


def delete_documents(store=None):
    """Remove every document from `store`, or from all `document_stores` when omitted.

    Args:
        store: A single document store to empty. When None, every store in the
            module-level `document_stores` mapping is emptied, and the document
            counts are printed before and after.

    Both modes repeat the filter/delete cycle until the store reports no
    documents. Previously the single-store mode issued only one delete, which
    could leave documents behind when one query did not return all of them.
    """
    if store is not None:
        _delete_all_documents(store)
        return

    count_documents(document_stores)

    for target in document_stores.values():
        _delete_all_documents(target, verbose=True)

    count_documents(document_stores)

    
delete_documents()

0
0
0
0
0
0


## Index Passages

In [28]:
# Chunked readers (100k rows per chunk) over each passages JSON Lines file.
passages = {
    name: pd.read_json(source, lines=True, chunksize=int(1e5))
    for name, source in (
        ('allegro', passages_allegro_source),
        ('wiki', passages_wiki_source),
        ('legal', passages_legal_source),
    )
}

In [11]:
delete_documents()

def write_to_document_store(key):
    """Index the passages of dataset `key` into its document store, chunk by chunk.

    For each chunk yielded by `passages[key]`:
    - when the chunk has a `title` column, missing titles become '' and the
      title is prepended to the text, separated by a single space;
    - every row becomes a `Document` whose content and id are the stringified
      `text` and `id` values;
    - the documents are written to `document_stores[key]`, skipping duplicates.
    """
    for chunk in tqdm(passages[key]):
        if 'title' in chunk:
            chunk['title'] = chunk['title'].fillna('')
            chunk['text'] = chunk.apply(lambda row: row['title'] + ' ' + row['text'], axis=1)

        records = chunk.rename(columns={'id': 'passage-id', 'text': 'content'}).to_dict(orient='records')

        documents = []
        for record in records:
            documents.append(Document(content=str(record['content']), id=str(record['passage-id'])))

        document_stores[key].write_documents(documents, policy=DuplicatePolicy.SKIP)

write_to_document_store('allegro')
write_to_document_store('legal')
# Takes ~4h
# write_to_document_store('wiki')

0
0
0
0
0
0


0it [00:00, ?it/s]

0it [00:00, ?it/s]

## Load questions

In [33]:
# Load the test questions of each dataset (JSON Lines files) into DataFrames
# and print each frame's shape as a quick sanity check.
questions_allegro = pd.read_json(questions_allegro_source, lines=True)
print(questions_allegro.shape)

questions_wiki = pd.read_json(questions_wiki_source, lines=True)
print(questions_wiki.shape)

questions_legal = pd.read_json(questions_legal_source, lines=True)
print(questions_legal.shape)

(900, 2)
(1291, 2)
(718, 2)


## Construct pipes and gather predictions

In [34]:
def run_pipe(pipeline, pipe_param_callback, questions):
    """Run `pipeline` once per question and collect the retrieved passages.

    Args:
        pipeline: Object with a `run(params)` method whose result contains the
            retrieved documents under `['retriever']['documents']`.
        pipe_param_callback: Callable mapping one question row to the
            parameters passed to `pipeline.run`.
        questions: DataFrame of questions; each row must provide an `id` value
            plus whatever `pipe_param_callback` reads.

    Returns:
        DataFrame with the columns `question-id`, `passage-id` and `score`,
        one row per retrieved passage, in retrieval order. The columns are
        present even when nothing was retrieved, so an exported file always
        has a header.
    """
    columns = ['question-id', 'passage-id', 'score']
    preds = []

    for _, row in questions.iterrows():
        result = pipeline.run(pipe_param_callback(row))

        for passage in result['retriever']['documents']:
            # Read the fields directly; to_dict() would copy the whole document
            # just to extract two values.
            preds.append({
                'question-id': row['id'],
                'passage-id': passage.id,
                'score': passage.score,
            })

    return pd.DataFrame(preds, columns=columns)

### Base BM25 algorithm

Uses a retriever based on the BM25 algorithm (a bag-of-words ranking function).

In [14]:
# One BM25 retriever per dataset, each bound to that dataset's document store.
bm25_retrievers = {
    name: ElasticsearchBM25Retriever(document_store=document_stores[name])
    for name in ('allegro', 'wiki', 'legal')
}

def construct_bm25_pipelines():
    """Build one single-component pipeline per dataset around its BM25 retriever.

    Returns:
        dict mapping 'allegro', 'wiki' and 'legal' to a `Pipeline` whose only
        component, registered as "retriever", is the matching entry of
        `bm25_retrievers`.
    """
    pipes = {}
    for name in ('allegro', 'wiki', 'legal'):
        pipes[name] = Pipeline()

    for name, pipe in pipes.items():
        pipe.add_component("retriever", bm25_retrievers[name])

    return pipes

# Build the BM25 pipelines; the callback turns a question row into the
# run parameters of the "retriever" component (the question text is the query).
bm25_pipes = construct_bm25_pipelines()
bm25_pipe_param_callback = lambda row: { 'retriever': {"query": row['text']}}

# Predictions per dataset. 'discriminator' names the method and is used as the
# output sub-folder when the submissions are written to disk.
bm25_predictions = {
    'discriminator': 'bm25',
    'allegro': run_pipe(bm25_pipes['allegro'], bm25_pipe_param_callback, questions_allegro),
    'legal': run_pipe(bm25_pipes['legal'], bm25_pipe_param_callback, questions_legal),
    # 'wiki': run_pipe(bm25_pipes['wiki'], bm25_pipe_param_callback, questions_wiki)
}

### Text embedding

In [35]:
### Definitions, functions
from haystack_integrations.components.retrievers.elasticsearch import ElasticsearchEmbeddingRetriever
from haystack.components.embedders import SentenceTransformersTextEmbedder
from haystack.components.embedders import SentenceTransformersDocumentEmbedder

# Hugging Face identifiers of the two sentence-embedding models compared below.
# NOTE(review): both `all-*` checkpoints appear to be English-focused — confirm
# they are suitable for the Polish passages and questions.
model_mpnet = "sentence-transformers/all-mpnet-base-v2"
model_minilm = "sentence-transformers/all-MiniLM-L12-v2"

# Document embedders are created and warmed up once, then reused for every dataset.
mppnet_document_embedder = SentenceTransformersDocumentEmbedder(model=model_mpnet)  
mppnet_document_embedder.warm_up()

minilm_document_embedder = SentenceTransformersDocumentEmbedder(model=model_minilm)  
minilm_document_embedder.warm_up()

def write_to_document_store_with_embeddings(key, document_embedder):
    """Embed and index the passages of dataset `key`, chunk by chunk.

    Args:
        key: Dataset name; selects both `passages[key]` and `document_stores[key]`.
        document_embedder: Component whose `run(documents)` returns a dict with
            the embedded documents under 'documents'.

    For each chunk, the title (when a `title` column exists; missing titles
    become '') is prepended to the text with a single space, each row becomes
    a `Document` with stringified content and id, the documents are passed
    through `document_embedder`, and the result is written to the store with
    duplicates skipped.
    """
    for chunk in tqdm(passages[key]):
        if 'title' in chunk:
            chunk['title'] = chunk['title'].fillna('')
            chunk['text'] = chunk.apply(lambda row: row['title'] + ' ' + row['text'], axis=1)

        records = chunk.rename(columns={'id': 'passage-id', 'text': 'content'}).to_dict(orient='records')

        documents = []
        for record in records:
            documents.append(Document(content=str(record['content']), id=str(record['passage-id'])))

        embedded = document_embedder.run(documents)
        document_stores[key].write_documents(embedded['documents'], policy=DuplicatePolicy.SKIP)
        
# One embedding retriever per dataset, each bound to that dataset's document store.
embedding_retrievers = {
    name: ElasticsearchEmbeddingRetriever(document_store=document_stores[name])
    for name in ('allegro', 'wiki', 'legal')
}
        
def construct_embedding_pipelines(text_embedders):
    """Build one query-embedding + retrieval pipeline per dataset.

    Args:
        text_embedders: Mapping with the keys 'allegro', 'wiki' and 'legal',
            each holding the text embedder used to embed the questions of
            that dataset.

    Returns:
        dict mapping each dataset name to a `Pipeline` in which the
        "text_embedder" component feeds its `embedding` output into the
        `query_embedding` input of the "retriever" component.

    A new `ElasticsearchEmbeddingRetriever` is created for every pipeline.
    Haystack rejects a component instance that already belongs to another
    pipeline, so reusing the module-level `embedding_retrievers` made the
    second call of this function (e.g. MiniLM after mpnet) fail.
    """
    pipes = {}

    for name in ('allegro', 'wiki', 'legal'):
        pipe = Pipeline()
        pipe.add_component("text_embedder", text_embedders[name])
        pipe.add_component(
            "retriever",
            ElasticsearchEmbeddingRetriever(document_store=document_stores[name]),
        )
        pipe.connect("text_embedder.embedding", "retriever.query_embedding")
        pipes[name] = pipe

    return pipes

#### mpnet-base-v2 model

- pretrained
- slow
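- general-purpose sentence-embedding model, trained mainly on English data (not multilingual; a multilingual variant such as `paraphrase-multilingual-mpnet-base-v2` may suit the Polish passages better)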

In [36]:
# Fresh chunked readers (100k rows per chunk) over each passages JSON Lines file.
passages = {
    name: pd.read_json(source, lines=True, chunksize=int(1e5))
    for name, source in (
        ('allegro', passages_allegro_source),
        ('wiki', passages_wiki_source),
        ('legal', passages_legal_source),
    )
}

In [17]:
# Empty every index first (presumably so only mpnet-embedded documents remain).
delete_documents()

# Embed and index the passages with the mpnet document embedder; wiki is left out.
write_to_document_store_with_embeddings('allegro', mppnet_document_embedder)
write_to_document_store_with_embeddings('legal', mppnet_document_embedder)
# Takes ~4h
# write_to_document_store_with_embeddings('wiki', mppnet_document_embedder)

# Questions are embedded with the same model as the passages, one embedder per pipeline.
mpnet_text_embedders = {
    'allegro': SentenceTransformersTextEmbedder(model=model_mpnet, progress_bar=False),
    'wiki': SentenceTransformersTextEmbedder(model=model_mpnet, progress_bar=False),
    'legal': SentenceTransformersTextEmbedder(model=model_mpnet, progress_bar=False),
}

# The callback feeds the question text to the "text_embedder" component.
mpnet_pipes = construct_embedding_pipelines(mpnet_text_embedders)
mpnet_pipe_param_callback = lambda row: {'text_embedder': {'text': row['text']}}

# Predictions per dataset; 'discriminator' names the method.
mpnet_predictions = {
    'discriminator': 'mpnet',
    'allegro': run_pipe(mpnet_pipes['allegro'], mpnet_pipe_param_callback, questions_allegro),
    'legal': run_pipe(mpnet_pipes['legal'], mpnet_pipe_param_callback, questions_legal),
    # 'wiki': run_pipe(mpnet_pipes['wiki'], mpnet_pipe_param_callback, questions_wiki)
}

921
0
25810
Deleting...
Deleting...
Deleting...
Deleting...
0
0
0


0it [00:00, ?it/s]

Batches:   0%|          | 0/29 [00:00<?, ?it/s]

0it [00:00, ?it/s]

Batches:   0%|          | 0/822 [00:00<?, ?it/s]

In [18]:
print(mpnet_predictions)

{'discriminator': 'mpnet', 'allegro':       question-id passage-id     score
0               0        875  0.914359
1               0        509  0.914310
2               0          6  0.913015
3               0        685  0.912535
4               0        138  0.912485
...           ...        ...       ...
8995          899         66  0.860436
8996          899        152  0.859809
8997          899        277  0.859363
8998          899        351  0.859104
8999          899        229  0.857185

[9000 rows x 3 columns], 'legal':       question-id     passage-id     score
0               0   1997_553_134  0.880867
1               0    2001_628_73  0.878017
2               0     1997_715_9  0.877843
3               0   1997_553_208  0.872784
4               0   1997_553_194  0.872661
...           ...            ...       ...
7175          717    2001_1381_5  0.836527
7176          717    2004_623_37  0.836340
7177          717    2004_623_61  0.835774
7178          717  2001_1381_

#### MiniLM-L12-v2

- faster
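- general-purpose sentence-embedding model, trained mainly on English data (not multilingual; a multilingual variant such as `paraphrase-multilingual-MiniLM-L12-v2` may suit the Polish passages better)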
- pretrained

In [19]:
# Fresh chunked readers (100k rows per chunk) over each passages JSON Lines file.
passages = {
    name: pd.read_json(source, lines=True, chunksize=int(1e5))
    for name, source in (
        ('allegro', passages_allegro_source),
        ('wiki', passages_wiki_source),
        ('legal', passages_legal_source),
    )
}

In [37]:
# Empty every index first (presumably so only MiniLM-embedded documents remain).
delete_documents()

# Embed and index the passages with the MiniLM document embedder; wiki is left out.
write_to_document_store_with_embeddings('allegro', minilm_document_embedder)
write_to_document_store_with_embeddings('legal', minilm_document_embedder)
# Takes ~4h
# write_to_document_store_with_embeddings('wiki', minilm_document_embedder)

# Questions are embedded with the same model as the passages, one embedder per pipeline.
minilm_text_embedders = {
    'allegro': SentenceTransformersTextEmbedder(model=model_minilm, progress_bar=False),
    'wiki': SentenceTransformersTextEmbedder(model=model_minilm, progress_bar=False),
    'legal': SentenceTransformersTextEmbedder(model=model_minilm, progress_bar=False),
}

# The callback feeds the question text to the "text_embedder" component.
minilm_pipes = construct_embedding_pipelines(minilm_text_embedders)
minilm_pipe_param_callback = lambda row: {'text_embedder': {'text': row['text']}}

# Predictions per dataset; 'discriminator' names the method.
minilm_predictions = {
    'discriminator': 'MiniLM',
    'allegro': run_pipe(minilm_pipes['allegro'], minilm_pipe_param_callback, questions_allegro),
    'legal': run_pipe(minilm_pipes['legal'], minilm_pipe_param_callback, questions_legal),
    # 'wiki': run_pipe(minilm_pipes['wiki'], minilm_pipe_param_callback, questions_wiki)
}

0
0
0
0
0
0


0it [00:00, ?it/s]

Batches:   0%|          | 0/29 [00:00<?, ?it/s]

0it [00:00, ?it/s]

Batches:   0%|          | 0/822 [00:00<?, ?it/s]

In [38]:
print(minilm_predictions)

{'discriminator': 'MiniLM', 'allegro':       question-id passage-id     score
0               0        821  0.932249
1               0        685  0.901324
2               0          6  0.892788
3               0        874  0.854139
4               0        509  0.845707
...           ...        ...       ...
8995          899        617  0.795094
8996          899        813  0.792105
8997          899        576  0.791660
8998          899        822  0.788819
8999          899        716  0.785611

[9000 rows x 3 columns], 'legal':       question-id    passage-id     score
0               0  1997_553_345  0.853504
1               0   2001_634_26  0.848969
2               0  1997_555_118  0.846674
3               0    1997_715_9  0.845255
4               0  1997_557_195  0.844825
...           ...           ...       ...
7175          717   2001_452_27  0.817105
7176          717   2004_623_41  0.811789
7177          717   2000_179_14  0.810213
7178          717   2004_623_43  0.809

### spaCy lemmatization - TODO

In [None]:
import spacy

# Load largest spaCy pipeline for Polish
nlp = spacy.load("pl_core_news_lg")


### spaCy lemmatization + averaged sentence embedding

### trained models

## Load into .tsv files

In [39]:
# Write every prediction frame to <dataset_path>/<dataset>/<discriminator>/submission.tsv.
predictions = [
    bm25_predictions,
    minilm_predictions,
    mpnet_predictions,
]

for prediction in predictions:
    for dataset_name in ('allegro', 'wiki', 'legal'):
        # A method may have skipped a dataset (e.g. wiki), so only export what exists.
        if dataset_name not in prediction:
            continue

        output_dir = os.path.join(dataset_path, dataset_name, prediction['discriminator'])
        # to_csv does not create missing directories; make sure the target folder exists.
        os.makedirs(output_dir, exist_ok=True)

        prediction[dataset_name].to_csv(
            os.path.join(output_dir, 'submission.tsv'),
            sep='\t',
            index=False,
        )
    

## Evaluate

To evaluate the results, run these commands in a terminal:

### allegro:
`python ./eval.py --true datasets/allegro/pairs-test.tsv --pred datasets/allegro/<name_of_algorithm>/submission.tsv`

### wiki
`python ./eval.py --true datasets/wiki/pairs-test.tsv --pred datasets/wiki/<name_of_algorithm>/submission.tsv`

### legal
`python ./eval.py --true datasets/legal/pairs-test.tsv --pred datasets/legal/<name_of_algorithm>/submission.tsv`