# PolEval - Passage Retrieval

## Imports and consts

In [None]:
import spacy
# Resets variables. Execute if needed
%reset

In [None]:
import os
import pandas as pd

from tqdm.auto import tqdm
from haystack_integrations.document_stores.elasticsearch import ElasticsearchDocumentStore
from haystack_integrations.components.retrievers.elasticsearch import ElasticsearchBM25Retriever
from haystack import Document
from haystack import Pipeline
from huggingface_hub import hf_hub_download
from datasets import load_dataset
from haystack.document_stores.types.policy import DuplicatePolicy

In [None]:
# Root folder of the locally stored PolEval files (dev-0, test-A, test-B, wiki, ...).
# The trailing slash matters: some cells build paths with plain string concatenation.
dataset_path = 'datasets/'

## Download datasets

In [None]:
# Hugging Face dataset repository that hosts all PolEval 2022 passage-retrieval files.
_POLEVAL_REPO = "piotr-rybak/poleval2022-passage-retrieval-dataset"


def _fetch_poleval_file(subfolder, filename):
    """Download one file from the PolEval dataset repo and return what hf_hub_download returns.

    The returned value is used as a file path by the cells that read the data.
    """
    return hf_hub_download(repo_id=_POLEVAL_REPO, subfolder=subfolder, filename=filename, repo_type="dataset")


# Passage collections, one per sub-dataset
passages_source = {
    name: _fetch_poleval_file(subfolder, "passages.jl")
    for name, subfolder in (('allegro', "allegro-faq"), ('legal', "legal-questions"), ('wiki', "wiki-trivia"))
}

# Test questions: three come from the dataset repo, three are local PolEval TSV inputs
questions_source = {
    name: _fetch_poleval_file(subfolder, "questions-test.jl")
    for name, subfolder in (('allegro', "allegro-faq"), ('wiki', "wiki-trivia"), ('legal', "legal-questions"))
}
questions_source.update({split: dataset_path + split + '/in.tsv' for split in ('dev-0', 'test-A', 'test-B')})

# Question/passage pairs used later for fine-tuning, plus the wiki training questions
training_set = load_dataset(_POLEVAL_REPO, split="train")
test_set = load_dataset(_POLEVAL_REPO, split="test")
wiki_questions_train = _fetch_poleval_file("wiki-trivia", "questions-train.jl")

## Launch ElasticSearch in Docker and create stores

In a terminal, from the repository root directory, run:
- `docker-compose up`

In [None]:
def count_documents(stores):
    """Print the document count of every store in ``stores``, one number per line.

    ``stores`` is a mapping whose values expose ``count_documents()``; the
    counts are printed in the mapping's iteration order and nothing is returned.
    """
    for store in stores.values():
        print(store.count_documents())

In [None]:
# Declare stores
import requests
import json

# One Elasticsearch index (wrapped in a Haystack document store) per sub-dataset;
# the index name doubles as the dictionary key.
document_stores = {
    index_name: ElasticsearchDocumentStore(hosts='http://localhost:9200', index=index_name)
    for index_name in ('allegro', 'wiki', 'legal')
}

def change_document_store_window_size(key, size, host="http://localhost:9200", timeout=30):
    """Set ``index.max_result_window`` of one Elasticsearch index and print the reply.

    Args:
        key: Name of the index whose settings are updated.
        size: New value for ``max_result_window``.
        host: Base URL of the Elasticsearch node (defaults to the local instance
            used by the document stores in this notebook).
        timeout: Seconds to wait for the HTTP reply, so that an unreachable
            node fails the cell instead of hanging it.

    The response body is printed as-is, including error replies (for example
    when the index does not exist); nothing is returned.
    """
    url = host.rstrip("/") + "/" + key + "/_settings"
    settings = {"index": {"max_result_window": size}}

    # `json=` serialises the payload and sets the JSON content-type header.
    response = requests.put(url, json=settings, timeout=timeout)

    print(response.text)

# Raise the result window of the wiki and legal indices to 300k documents.
for index_name in ('wiki', 'legal'):
    change_document_store_window_size(index_name, 300000)

In [None]:
def _delete_all_from_store(store, verbose=False):
    """Delete documents from ``store`` page by page until none are left.

    One ``filter_documents`` call may return only part of the index (the
    original note on this cell mentions 10,000 documents at a time), so the
    fetch-and-delete step is repeated until the store reports no documents.
    """
    while True:
        document_ids = [document.id for document in store.filter_documents(filters={})]
        if not document_ids:
            return
        if verbose:
            print("Deleting...")
        store.delete_documents(document_ids=document_ids)


def delete_documents(store=None):
    """Empty one document store, or every store in ``document_stores``.

    Args:
        store: Store to empty. When omitted, all stores in ``document_stores``
            are emptied and the document counts are printed before and after.

    Previously the single-store path removed only the first page of results,
    which left documents behind in larger indices; both paths now loop until
    the store is empty.
    """
    if store is not None:
        _delete_all_from_store(store)
        return

    count_documents(document_stores)

    for key in document_stores:
        _delete_all_from_store(document_stores[key], verbose=True)

    count_documents(document_stores)

In [None]:
# Print the current number of documents in each store.
count_documents(document_stores)

## Index Passages

In [None]:
# Chunked readers over the passage files, 100k passages per chunk.
# The indexing functions consume them, so this dict is rebuilt before every indexing run.
passages = {
    name: pd.read_json(passages_source[name], lines=True, chunksize=int(1e5))
    for name in ('allegro', 'wiki', 'legal')
}

### Trim wiki-trivia dataset

- use `sample()` function from `pandas` in order to get simple random sample (~40k passages in total)
- preserve positive matches contained in supplied files

In [None]:
import csv

# Gather positive matches
wiki_positives_1 = pd.read_csv(os.path.join(dataset_path + 'wiki/', 'pairs-test.tsv'), sep='\t')

print(wiki_positives_1)

# A dict keeps first-seen order and gives O(1) duplicate checks; the previous
# `not in list` test rescanned the whole list for every id.
_unique_passage_ids = dict.fromkeys(wiki_positives_1['passage-id'].tolist())

# expected.tsv holds one row per question with a tab-separated list of passage ids.
with open(os.path.join(dataset_path + 'dev-0/', 'expected.tsv'), newline='') as wiki_positives_2:
    reader = csv.reader(wiki_positives_2, delimiter='\t', quotechar='"')
    for row in reader:
        _unique_passage_ids.update(dict.fromkeys(row))

# NOTE(review): ids read through csv are always strings, while pandas may parse the
# 'passage-id' column as numbers; confirm both match the dtype of the passages' 'id'
# column, because this list is later used with `isin`.
passage_ids = list(_unique_passage_ids)

print(len(passage_ids))

In [None]:
# Write selected passages to file
trimmed_path = os.path.join(dataset_path + 'wiki/', 'passages-trimmed.jl')

# Guard against running this cell after the source was switched to the trimmed file:
# reading from and appending to the same file would corrupt it.
if os.path.abspath(passages_source['wiki']) == os.path.abspath(trimmed_path):
    raise RuntimeError("passages_source['wiki'] already points at the trimmed file; re-run the download cell first")

# Batches are appended to the output, so drop the result of an earlier run first;
# otherwise every re-run would duplicate the whole file.
if os.path.exists(trimmed_path):
    os.remove(trimmed_path)

for passages_batch in tqdm(pd.read_json(passages_source['wiki'], lines=True, chunksize=int(1e6))):
    positives = passages_batch[passages_batch["id"].isin(passage_ids)]
    # A positive passage can also be drawn by the random sample; keep one copy (the first).
    sample_batch = pd.concat(
        [positives, passages_batch.sample(random_state=1234, frac=0.005)],
        ignore_index=True,
    ).drop_duplicates(subset="id", ignore_index=True)
    sample_batch.to_json(trimmed_path, index=False, orient='records', mode='a', lines=True, force_ascii=False)

In [None]:
# Set new source file
# NOTE(review): readers already stored in `passages` were created from the previous
# value of passages_source['wiki']; re-create `passages` after this cell if the
# trimmed file is the one that should be indexed.
passages_source['wiki'] = os.path.join(dataset_path + 'wiki/', 'passages-trimmed.jl')
print(passages_source['wiki'])

In [None]:
# Empty every document store before indexing the raw passages.
delete_documents()

def write_to_document_store(key):
    """Index the raw passages of dataset ``key`` into its document store.

    Consumes the chunked reader ``passages[key]``. When a chunk has a 'title'
    column, missing titles become '' and the title is prepended to the text,
    separated by one space. Documents whose id already exists are skipped.
    """
    for chunk in tqdm(passages[key]):
        if 'title' in chunk.columns:
            chunk['title'] = chunk['title'].fillna('')
            chunk['text'] = [title + ' ' + text for title, text in zip(chunk['title'], chunk['text'])]

        documents = []
        for record in chunk.to_dict(orient='records'):
            documents.append(Document(content=str(record['text']), id=str(record['id'])))

        document_stores[key].write_documents(documents, policy=DuplicatePolicy.SKIP)

# Index the raw passages of every sub-dataset (each call consumes the matching reader in `passages`).
write_to_document_store('allegro')
write_to_document_store('legal')
# Takes ~4h
write_to_document_store('wiki')

## Load questions

In [None]:
import csv

# Questions from the dataset repo are JSON-lines files ...
questions = {name: pd.read_json(questions_source[name], lines=True) for name in ('allegro', 'wiki', 'legal')}

# ... while the local PolEval inputs are header-less TSVs: column 0 is the dataset name, column 1 the question.
for split_name in ('dev-0', 'test-A', 'test-B'):
    tsv_frame = pd.read_csv(questions_source[split_name], sep='\t', header=None)
    questions[split_name] = tsv_frame.rename(columns={0: 'dataset', 1: 'text'})

# Shapes are printed in insertion order: allegro, wiki, legal, dev-0, test-A, test-B
for question_frame in questions.values():
    print(question_frame.shape)

# Gold question/passage pairs for the test questions of each sub-dataset
pairs_test = {
    name: pd.read_csv(os.path.join(dataset_path, name + '/pairs-test.tsv'), sep='\t')
    for name in ('allegro', 'wiki', 'legal')
}

for name in ('wiki', 'legal', 'allegro'):
    print(pairs_test[name].shape)

def format_test_inputs(_question_key):
    """Split a mixed PolEval input set into per-dataset question and gold-pair files.

    Every question of ``questions[_question_key]`` is looked up by its
    whitespace-stripped text in the question set named by the prefix of its
    'dataset' column (the part before the first '-'). The matched question and
    its positive pairs from ``pairs_test`` are collected per dataset and written
    to ``<dataset_path>/<_question_key>/<dataset>_chunk/`` as ``pairs-test.tsv``
    and ``questions-test.jl``.

    Args:
        _question_key: Key of the mixed question set, e.g. 'test-A'.

    Returns:
        ``(pairs, questions_test)``: two dicts keyed by 'allegro', 'wiki' and
        'legal' holding the collected frames. If a question cannot be found, a
        message is printed and ``None`` is returned, so callers that unpack the
        result fail in that case.
    """
    dataset_keys = ('allegro', 'wiki', 'legal')
    df = {key: pd.DataFrame(columns=['question-id', 'passage-id', 'score']) for key in dataset_keys}
    df_questions_test = {key: pd.DataFrame(columns=['id', 'text']) for key in dataset_keys}

    # Stripped question texts per dataset, computed once instead of once per looked-up question.
    stripped_texts = {}

    for _, _row in questions[_question_key].iterrows():
        _key = _row['dataset'].split('-')[0].strip()
        if _key not in stripped_texts:
            stripped_texts[_key] = questions[_key]['text'].apply(str.strip)

        found_docs = questions[_key][stripped_texts[_key] == _row['text'].strip()]
        if len(found_docs) == 0:
            print('Could not find question with such content', _key, _row['text'])
            return

        # When several questions share the same text, the first match is used.
        matched = found_docs.iloc[0]
        df_questions_test[_key].loc[int(len(df_questions_test[_key]))] = {'id': matched['id'], 'text': matched['text']}
        found_positives_for_questions = pairs_test[_key][pairs_test[_key]['question-id'] == matched['id']]
        for _, __row in found_positives_for_questions.iterrows():
            df[_key].loc[int(len(df[_key]))] = {'question-id': __row['question-id'], 'passage-id': __row['passage-id'], 'score': 1}

    for _key in df:
        output_dir = os.path.join(dataset_path, _question_key + '/' + _key + '_chunk')
        # Create the folder on first use so a fresh checkout does not fail on the write.
        os.makedirs(output_dir, exist_ok=True)
        df[_key].to_csv(os.path.join(output_dir, 'pairs-test.tsv'), sep='\t', index=False)
        df_questions_test[_key].to_json(os.path.join(output_dir, 'questions-test.jl'), orient='records', lines=True, force_ascii=False)

    return df, df_questions_test

# Per-dataset gold pairs and questions for the two mixed test sets (also written to disk by the call).
df_pairs_test_testA, df_questions_testA = format_test_inputs('test-A')
df_pairs_test_testB, df_questions_testB = format_test_inputs('test-B')

def convert_dev_input_to_common_format():
    """Convert the dev-0 questions and expected passages to the common file layout.

    Row i of ``dev-0/expected.tsv`` (tab-separated passage ids) is paired with
    question i of ``questions['dev-0']``; the question's row index serves as
    its id. Writes ``dev-0/questions-test.jl`` and ``dev-0/pairs-test.tsv`` and
    returns ``(pairs, questions)`` as DataFrames. Pairing stops at the shorter
    of the two inputs.
    """
    pairs = pd.DataFrame(columns=['question-id', 'passage-id', 'score'])
    dev_questions = pd.DataFrame(columns=['id', 'text'])
    expected_path = os.path.join(dataset_path + 'dev-0/', 'expected.tsv')

    with open(expected_path, newline='') as expected_file:
        expected_rows = csv.reader(expected_file, delimiter='\t', quotechar='"')
        paired_rows = zip(questions['dev-0'].iterrows(), expected_rows)

        for (question_index, question), expected_passage_ids in paired_rows:
            for passage_id in expected_passage_ids:
                pairs.loc[len(pairs)] = {'question-id': question_index, 'passage-id': passage_id, 'score': 1}

            dev_questions.loc[len(dev_questions)] = {'id': question_index, 'text': question['text']}

    dev_questions.to_json(os.path.join(dataset_path, 'dev-0/questions-test.jl'), orient='records', lines=True, force_ascii=False)
    pairs.to_csv(os.path.join(dataset_path, 'dev-0/pairs-test.tsv'), sep='\t', index=False)

    return pairs, dev_questions

# Gold pairs and questions of dev-0 in the common format (also written to disk by the call).
df_pairs_test_dev0, df_questions_dev0 = convert_dev_input_to_common_format()

## Construct pipes and gather predictions

In [None]:
def run_pipe(pipeline, pipe_param_callback, _questions):
    """Run one retrieval pipeline for every question and collect the hits.

    Args:
        pipeline: Object with ``run(params)`` whose result exposes the retrieved
            documents under ``['retriever']['documents']``.
        pipe_param_callback: Maps a question row to the pipeline's input dict.
        _questions: DataFrame with at least an 'id' column, plus whatever the
            callback reads.

    Returns:
        DataFrame with one row per retrieved document and the columns
        'question-id', 'passage-id' and 'score' (empty when nothing was retrieved).
    """
    records = []

    for _, question in _questions.iterrows():
        result = pipeline.run(pipe_param_callback(question))
        hits = (document.to_dict() for document in result['retriever']['documents'])
        records.extend(
            {'question-id': question['id'], 'passage-id': hit['id'], 'score': hit['score']}
            for hit in hits
        )

    return pd.DataFrame(records)

def run_pipe_test(pipelines, pipe_param_callback, _questions, _questions_test=None):
    """Run a mixed question set through the pipeline of each question's dataset.

    Args:
        pipelines: Mapping dataset key ('wiki', 'legal', 'allegro') -> pipeline.
        pipe_param_callback: Maps a question row to the pipeline's input dict.
        _questions: DataFrame with 'dataset' and 'text' columns; the dataset key
            is the part of 'dataset' before the first '-'.
        _questions_test: Optional mapping dataset key -> DataFrame with an 'id'
            column. When given, the n-th question seen for a dataset gets the id
            in row n of that frame; otherwise the row index of ``_questions``
            is used as the question id.

    Returns:
        Dict keyed 'wiki', 'legal', 'allegro' with one prediction DataFrame each
        (columns 'question-id', 'passage-id', 'score'). Questions that yield at
        most one document are printed as a diagnostic.
    """
    dataset_keys = ('wiki', 'legal', 'allegro')
    collected = {key: [] for key in dataset_keys}
    seen_per_dataset = dict.fromkeys(dataset_keys, 0)

    for position, question in _questions.iterrows():
        params = pipe_param_callback(question)
        dataset_key = question['dataset'].split('-')[0].strip()
        result = pipelines[dataset_key].run(params)

        if _questions_test is None:
            question_id = position
        else:
            question_id = _questions_test[dataset_key].iloc[seen_per_dataset[dataset_key]]['id']

        hits = result['retriever']['documents']
        if not len(hits) > 1:
            print(question_id, params)

        for hit in (document.to_dict() for document in hits):
            collected[dataset_key].append({
                'question-id': question_id,
                'passage-id': hit['id'],
                'score': hit['score'],
            })

        seen_per_dataset[dataset_key] += 1

    return {key: pd.DataFrame(rows) for key, rows in collected.items()}


### Base BM25 algorithm

Uses a retriever that utilizes BM25 algorithm (bag-of-words based)

In [None]:
# One BM25 retriever per document store
bm25_retrievers = {
    store_key: ElasticsearchBM25Retriever(document_store=document_stores[store_key])
    for store_key in ('allegro', 'wiki', 'legal')
}

def construct_bm25_pipelines():
    """Return one pipeline per dataset whose only component is its BM25 retriever.

    The retrievers come from ``bm25_retrievers`` and are registered under the
    component name "retriever".
    """
    dataset_keys = ('allegro', 'wiki', 'legal')
    pipes = {dataset_key: Pipeline() for dataset_key in dataset_keys}

    for dataset_key in dataset_keys:
        pipes[dataset_key].add_component("retriever", bm25_retrievers[dataset_key])

    return pipes

bm25_pipes = construct_bm25_pipelines()


def bm25_pipe_param_callback(row):
    """Send the raw question text to the BM25 retriever."""
    return {'retriever': {"query": row['text']}}


# Filled in the original order: the three single-dataset sets, then dev-0, test-A, test-B.
bm25_predictions = {'discriminator': 'bm25'}
for dataset_name in ('allegro', 'legal', 'wiki'):
    bm25_predictions[dataset_name] = run_pipe(bm25_pipes[dataset_name], bm25_pipe_param_callback, questions[dataset_name])
bm25_predictions['dev-0'] = run_pipe_test(bm25_pipes, bm25_pipe_param_callback, questions['dev-0'])
for split_name, split_questions in (('test-A', df_questions_testA), ('test-B', df_questions_testB)):
    bm25_predictions[split_name] = run_pipe_test(bm25_pipes, bm25_pipe_param_callback, questions[split_name], split_questions)

### BM25 with spacy lemmatization

- use a large pretrained Polish model
- include common Polish stopwords

#### Lemmatization setup

In [None]:
import spacy
from constants import STOPWORDS_1

# Load largest spaCy pipeline for Polish
# Make sure to run: python -m spacy download pl_core_news_lg
spacy_model = "pl_core_news_lg"
nlp = spacy.load(spacy_model)

# Flag the project's extra stopwords in the model vocabulary
for stopword in STOPWORDS_1:
    nlp.vocab[stopword].is_stop = True

# Smoke test: print the lemmas of a sample sentence with stopwords removed
sample_lemmas = (token.lemma_ for token in nlp("Pojechałem do dziadków.") if not token.is_stop)
print(' '.join(sample_lemmas))

In [None]:
# Fresh chunked readers (100k passages per chunk) for the lemmatised indexing run
passages = {
    name: pd.read_json(passages_source[name], lines=True, chunksize=int(1e5))
    for name in ('allegro', 'wiki', 'legal')
}

In [None]:
# Empty every store, not only wiki: all three datasets are re-indexed with lemmatised
# text below using DuplicatePolicy.SKIP, so raw passages left in the allegro and
# legal stores under the same ids would silently be kept instead of being replaced.
delete_documents()

In [None]:
def write_to_document_store_l(key):
    """Index the passages of dataset ``key`` as lemmatised, stopword-free text.

    Consumes the chunked reader ``passages[key]``. Titles (missing ones become
    '') are prepended to the text when a 'title' column exists; the text is
    then run through ``nlp`` and only the lemmas of non-stopword tokens are
    stored. Documents whose id already exists are skipped.
    """
    def lemmatize(text):
        return ' '.join([token.lemma_ for token in nlp(text) if not token.is_stop])

    for chunk in tqdm(passages[key]):
        if 'title' in chunk:
            chunk['title'] = chunk['title'].fillna('')
            chunk['text'] = [title + ' ' + text for title, text in zip(chunk['title'], chunk['text'])]

        documents = [
            Document(content=lemmatize(record['text']), id=str(record['id']))
            for record in chunk.to_dict(orient='records')
        ]
        document_stores[key].write_documents(documents, policy=DuplicatePolicy.SKIP)

# Index the lemmatised passages of every sub-dataset
for store_key in ('allegro', 'legal', 'wiki'):
    write_to_document_store_l(store_key)

In [None]:
def bm25_l_pipe_param_callback(_row):
    """Query the BM25 retriever with the lemmatised, stopword-free question text."""
    lemmas = [token.lemma_ for token in nlp(_row['text']) if not token.is_stop]
    return {'retriever': {"query": ' '.join(lemmas)}}


# Filled in the original order: the three single-dataset sets, then dev-0, test-A, test-B.
bm25_l_predictions = {'discriminator': 'bm25-l'}
for dataset_name in ('allegro', 'legal', 'wiki'):
    bm25_l_predictions[dataset_name] = run_pipe(bm25_pipes[dataset_name], bm25_l_pipe_param_callback, questions[dataset_name])
bm25_l_predictions['dev-0'] = run_pipe_test(bm25_pipes, bm25_l_pipe_param_callback, questions['dev-0'])
for split_name, split_questions in (('test-A', df_questions_testA), ('test-B', df_questions_testB)):
    bm25_l_predictions[split_name] = run_pipe_test(bm25_pipes, bm25_l_pipe_param_callback, questions[split_name], split_questions)

### Text embedding

In [None]:
### Definitions, functions
from haystack_integrations.components.retrievers.elasticsearch import ElasticsearchEmbeddingRetriever
from haystack.components.embedders import SentenceTransformersTextEmbedder
from haystack.components.embedders import SentenceTransformersDocumentEmbedder

# Sentence-transformers checkpoints compared in the embedding experiments below
model_mpnet = "sentence-transformers/all-mpnet-base-v2"
model_minilm = "sentence-transformers/all-MiniLM-L12-v2"
model_roberta_large = "sdadas/mmlw-retrieval-roberta-large"

# One document embedder per checkpoint; warm_up() is called for each right away.
# ("mppnet" is a typo for "mpnet"; the name is kept because later cells use it.)
mppnet_document_embedder = SentenceTransformersDocumentEmbedder(model=model_mpnet)  
mppnet_document_embedder.warm_up()

minilm_document_embedder = SentenceTransformersDocumentEmbedder(model=model_minilm)  
minilm_document_embedder.warm_up()

roberta_document_embedder = SentenceTransformersDocumentEmbedder(model=model_roberta_large)
roberta_document_embedder.warm_up()

def write_to_document_store_e(key, document_embedder):
    """Embed the passages of dataset ``key`` and write them to its document store.

    Args:
        key: Dataset key; selects both ``passages[key]`` (consumed here) and
            ``document_stores[key]``.
        document_embedder: Component whose ``run(documents)`` returns a dict
            with the embedded documents under 'documents'.

    Titles (missing ones become '') are prepended to the text when a 'title'
    column exists. Documents whose id already exists are skipped.
    """
    for chunk in tqdm(passages[key]):
        if 'title' in chunk:
            chunk['title'] = chunk['title'].fillna('')
            chunk['text'] = [title + ' ' + text for title, text in zip(chunk['title'], chunk['text'])]

        documents = [
            Document(content=str(passage_text), id=str(passage_id))
            for passage_id, passage_text in zip(chunk['id'], chunk['text'])
        ]
        embedded = document_embedder.run(documents)
        document_stores[key].write_documents(embedded['documents'], policy=DuplicatePolicy.SKIP)
        
# One embedding retriever per document store
embedding_retrievers = {
    store_key: ElasticsearchEmbeddingRetriever(document_store=document_stores[store_key])
    for store_key in ('allegro', 'wiki', 'legal')
}
        
def construct_embedding_pipelines(text_embedders):
    """Build one "text_embedder -> retriever" pipeline per dataset.

    Args:
        text_embedders: Mapping with the keys 'allegro', 'wiki' and 'legal';
            each value is the query embedder for that dataset's pipeline.

    Returns:
        Dict with the same keys. In each pipeline the embedder's ``embedding``
        output is connected to the retriever's ``query_embedding`` input.

    A new retriever is created for every pipeline. This function is called once
    per embedding model, and Haystack refuses to add a component instance that
    already belongs to another pipeline, so sharing the module-level
    ``embedding_retrievers`` made every call after the first one fail.
    """
    pipes = {}

    for dataset_key in ('allegro', 'wiki', 'legal'):
        pipe = Pipeline()
        pipe.add_component("text_embedder", text_embedders[dataset_key])
        pipe.add_component("retriever", ElasticsearchEmbeddingRetriever(document_store=document_stores[dataset_key]))
        pipe.connect("text_embedder.embedding", "retriever.query_embedding")
        pipes[dataset_key] = pipe

    return pipes

#### mpnet-base-v2 model

- pretrained
- slow
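- general use case (note: `all-mpnet-base-v2` is trained mainly on English data; the multilingual variant is `paraphrase-multilingual-mpnet-base-v2`)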

In [None]:
# Fresh chunked readers for the mpnet run (up to 3M passages per chunk)
passages = {
    name: pd.read_json(passages_source[name], lines=True, chunksize=int(3e6))
    for name in ('allegro', 'wiki', 'legal')
}

In [None]:
# Replace the indexed documents with mpnet-embedded passages
delete_documents()

for store_key in ('allegro', 'legal', 'wiki'):
    write_to_document_store_e(store_key, mppnet_document_embedder)

# One query embedder per pipeline
mpnet_text_embedders = {
    store_key: SentenceTransformersTextEmbedder(model=model_mpnet, progress_bar=False)
    for store_key in ('allegro', 'wiki', 'legal')
}

mpnet_pipes = construct_embedding_pipelines(mpnet_text_embedders)


def mpnet_pipe_param_callback(row):
    """Pass the raw question text to the query embedder."""
    return {'text_embedder': {'text': row['text']}}


# Runs are kept in their original (interleaved) order.
mpnet_predictions = {'discriminator': 'mpnet'}
mpnet_predictions['allegro'] = run_pipe(mpnet_pipes['allegro'], mpnet_pipe_param_callback, questions['allegro'])
mpnet_predictions['dev-0'] = run_pipe_test(mpnet_pipes, mpnet_pipe_param_callback, questions['dev-0'])
mpnet_predictions['legal'] = run_pipe(mpnet_pipes['legal'], mpnet_pipe_param_callback, questions['legal'])
mpnet_predictions['test-A'] = run_pipe_test(mpnet_pipes, mpnet_pipe_param_callback, questions['test-A'], df_questions_testA)
mpnet_predictions['wiki'] = run_pipe(mpnet_pipes['wiki'], mpnet_pipe_param_callback, questions['wiki'])
mpnet_predictions['test-B'] = run_pipe_test(mpnet_pipes, mpnet_pipe_param_callback, questions['test-B'], df_questions_testB)

In [None]:
# Show the collected mpnet predictions (a dict holding one entry per question set).
print(mpnet_predictions)

#### MiniLM-L12-v2

- faster
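- general use case (note: `all-MiniLM-L12-v2` is trained mainly on English data; the multilingual variant is `paraphrase-multilingual-MiniLM-L12-v2`)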
- pretrained

In [None]:
# Fresh chunked readers for the MiniLM run (up to 2M passages per chunk)
passages = {
    name: pd.read_json(passages_source[name], lines=True, chunksize=int(2e6))
    for name in ('allegro', 'wiki', 'legal')
}

In [None]:
# Print the current number of documents in each store.
count_documents(document_stores)

In [None]:
# Replace the indexed documents with MiniLM-embedded passages
delete_documents()

for store_key in ('allegro', 'legal', 'wiki'):
    write_to_document_store_e(store_key, minilm_document_embedder)

# One query embedder per pipeline
minilm_text_embedders = {
    store_key: SentenceTransformersTextEmbedder(model=model_minilm, progress_bar=False)
    for store_key in ('allegro', 'wiki', 'legal')
}

minilm_pipes = construct_embedding_pipelines(minilm_text_embedders)


def minilm_pipe_param_callback(row):
    """Pass the raw question text to the query embedder."""
    return {'text_embedder': {'text': row['text']}}


# Filled in the original order: the three single-dataset sets, then dev-0, test-A, test-B.
minilm_predictions = {'discriminator': 'MiniLM'}
for dataset_name in ('allegro', 'legal', 'wiki'):
    minilm_predictions[dataset_name] = run_pipe(minilm_pipes[dataset_name], minilm_pipe_param_callback, questions[dataset_name])
minilm_predictions['dev-0'] = run_pipe_test(minilm_pipes, minilm_pipe_param_callback, questions['dev-0'])
for split_name, split_questions in (('test-A', df_questions_testA), ('test-B', df_questions_testB)):
    minilm_predictions[split_name] = run_pipe_test(minilm_pipes, minilm_pipe_param_callback, questions[split_name], split_questions)

In [None]:
# Show the collected MiniLM predictions (a dict holding one entry per question set).
print(minilm_predictions)

#### roberta-large

- trained specifically on Polish
- large size (~1gb)
- slowest
- author: Slawomir Dadas

In [None]:
# Fresh chunked readers for the roberta-large run (up to 2M passages per chunk)
passages = {
    name: pd.read_json(passages_source[name], lines=True, chunksize=int(2e6))
    for name in ('allegro', 'wiki', 'legal')
}

In [None]:
# Replace the indexed documents with roberta-large-embedded passages
delete_documents()

for store_key in ('allegro', 'legal', 'wiki'):
    write_to_document_store_e(store_key, roberta_document_embedder)

# One query embedder per pipeline
roberta_text_embedders = {
    store_key: SentenceTransformersTextEmbedder(model=model_roberta_large, progress_bar=False)
    for store_key in ('allegro', 'wiki', 'legal')
}

roberta_pipes = construct_embedding_pipelines(roberta_text_embedders)


def roberta_pipe_param_callback(row):
    """Pass the question text, prefixed with "zapytanie: ", to the query embedder."""
    return {'text_embedder': {'text': "zapytanie: " + row['text']}}


# Filled in the original order: the three single-dataset sets, then dev-0, test-A, test-B.
roberta_predictions = {'discriminator': 'roberta'}
for dataset_name in ('allegro', 'legal', 'wiki'):
    roberta_predictions[dataset_name] = run_pipe(roberta_pipes[dataset_name], roberta_pipe_param_callback, questions[dataset_name])
roberta_predictions['dev-0'] = run_pipe_test(roberta_pipes, roberta_pipe_param_callback, questions['dev-0'])
for split_name, split_questions in (('test-A', df_questions_testA), ('test-B', df_questions_testB)):
    roberta_predictions[split_name] = run_pipe_test(roberta_pipes, roberta_pipe_param_callback, questions[split_name], split_questions)

In [None]:
# Show the collected roberta-large predictions (a dict holding one entry per question set).
print(roberta_predictions)

### *spacy mean word embeddings

NOT CONSIDERED IN PROJECT. The results are very bad because each text is represented by the mean of the embedding vectors of all its words.

In [None]:
from custom_components import PolishSpacyTextEmbedder, PolishSpacyDocumentEmbedder

# Document embedder from the project's custom_components module, configured with the
# spaCy model name defined in the lemmatization setup.
spacy_document_embedder = PolishSpacyDocumentEmbedder(model=spacy_model)  
spacy_document_embedder.warm_up()

In [None]:
# Print the current number of documents in each store.
count_documents(stores=document_stores)

In [None]:
# NOTE(review): the stores are not emptied here (the call below is commented out) and
# writes use DuplicatePolicy.SKIP, so passages already indexed under the same ids are
# kept with whatever embeddings they had. Confirm this is intended.
# delete_documents()

# NOTE(review): `passages` is not re-created in this section, while earlier cells
# rebuild it before every indexing run; re-create it first if the readers are
# already consumed.
write_to_document_store_e('allegro', spacy_document_embedder)
write_to_document_store_e('legal', spacy_document_embedder)
# NOTE(review): wiki is embedded with the MiniLM embedder, not the spaCy one, and wiki
# predictions are disabled further down. Confirm this mix is deliberate.
write_to_document_store_e('wiki', minilm_document_embedder)

# One spaCy-based query embedder per pipeline
spacy_text_embedders = {
    'allegro': PolishSpacyTextEmbedder(model=spacy_model),
    'wiki': PolishSpacyTextEmbedder(model=spacy_model),
    'legal': PolishSpacyTextEmbedder(model=spacy_model),
}

spacy_text_embedders['allegro'].warm_up()
spacy_text_embedders['wiki'].warm_up()
spacy_text_embedders['legal'].warm_up()

spacy_pipes = construct_embedding_pipelines(spacy_text_embedders)
# Pass the raw question text to the query embedder
spacy_pipe_param_callback = lambda row: {'text_embedder': {'text': row['text']}}

# Only the allegro and legal question sets are evaluated for this approach
spacy_predictions = {
    'discriminator': 'spacy',
    'allegro': run_pipe(spacy_pipes['allegro'], spacy_pipe_param_callback, questions['allegro']),
    'legal': run_pipe(spacy_pipes['legal'], spacy_pipe_param_callback, questions['legal']),
    # 'wiki': run_pipe(minilm_pipes['wiki'], minilm_pipe_param_callback, questions['wiki']),
}


In [None]:
# Show the collected spaCy-embedding predictions.
print(spacy_predictions)

### *roberta-large with lemmatization

NOT INCLUDED IN PROJECT RESULTS. Sentence transformers capture the semantic meaning of whole paragraphs, and lemmatization removes part of that information, so the results are significantly worse.

In [None]:
def write_to_document_store_l_e(key, document_embedder):
    """Embed lemmatised passages of dataset ``key`` and write them to its store.

    Args:
        key: Dataset key; selects both ``passages[key]`` (consumed here) and
            ``document_stores[key]``.
        document_embedder: Component whose ``run(documents)`` returns a dict
            with the embedded documents under 'documents'.

    Titles (missing ones become '') are prepended to the text when a 'title'
    column exists; the text is reduced to the lemmas of its non-stopword tokens
    before embedding. Documents whose id already exists are skipped.
    """
    def lemmatize(text):
        return ' '.join([token.lemma_ for token in nlp(text) if not token.is_stop])

    for chunk in tqdm(passages[key]):
        if 'title' in chunk.columns:
            chunk['title'] = chunk['title'].fillna('')
            chunk['text'] = [title + ' ' + text for title, text in zip(chunk['title'], chunk['text'])]

        documents = []
        for passage_id, passage_text in zip(chunk['id'], chunk['text']):
            documents.append(Document(content=lemmatize(passage_text), id=str(passage_id)))

        embedded = document_embedder.run(documents)
        document_stores[key].write_documents(embedded['documents'], policy=DuplicatePolicy.SKIP)

In [None]:
# Fresh chunked readers for the lemmatised roberta-large run (up to 2M passages per chunk)
passages = {
    name: pd.read_json(passages_source[name], lines=True, chunksize=int(2e6))
    for name in ('allegro', 'wiki', 'legal')
}

In [None]:
# Replace the indexed documents with embeddings of the lemmatised passages
delete_documents()

for store_key in ('allegro', 'legal', 'wiki'):
    write_to_document_store_l_e(store_key, roberta_document_embedder)

# One query embedder per pipeline (rebinds the names used in the plain roberta-large section)
roberta_text_embedders = {
    store_key: SentenceTransformersTextEmbedder(model=model_roberta_large, progress_bar=False)
    for store_key in ('allegro', 'wiki', 'legal')
}

roberta_pipes = construct_embedding_pipelines(roberta_text_embedders)


def roberta_l_pipe_param_callback(row):
    """Pass the lemmatised, stopword-free question, prefixed with "zapytanie: ", to the embedder."""
    lemmas = [token.lemma_ for token in nlp(row['text']) if not token.is_stop]
    return {'text_embedder': {'text': "zapytanie: " + ' '.join(lemmas)}}


# Filled in the original order: the three single-dataset sets, then dev-0, test-A, test-B.
roberta_l_predictions = {'discriminator': 'roberta-l'}
for dataset_name in ('allegro', 'legal', 'wiki'):
    roberta_l_predictions[dataset_name] = run_pipe(roberta_pipes[dataset_name], roberta_l_pipe_param_callback, questions[dataset_name])
roberta_l_predictions['dev-0'] = run_pipe_test(roberta_pipes, roberta_l_pipe_param_callback, questions['dev-0'])
for split_name, split_questions in (('test-A', df_questions_testA), ('test-B', df_questions_testB)):
    roberta_l_predictions[split_name] = run_pipe_test(roberta_pipes, roberta_l_pipe_param_callback, questions[split_name], split_questions)

## Train roberta-large 

- best performing model
- train using provided training/test sets

In [None]:
# Show the summaries of the train and test splits loaded in the download section.
print(training_set)
print(test_set)

In [None]:
# Fresh chunked readers for indexing with the fine-tuned model (up to 3M passages per chunk)
passages = {
    name: pd.read_json(passages_source[name], lines=True, chunksize=int(3e6))
    for name in ('allegro', 'wiki', 'legal')
}

In [None]:
from sentence_transformers import SentenceTransformer, losses

# Same checkpoint name as in the embedding section, repeated so this cell can run on its own.
model_roberta_large = "sdadas/mmlw-retrieval-roberta-large"
# Model to fine-tune and the loss object built for it
roberta_train = SentenceTransformer(model_roberta_large)
roberta_train_loss = losses.MultipleNegativesRankingLoss(model=roberta_train)

In [None]:
# Inspect the training split and its shape before building the training examples.
print(training_set)
print(training_set.shape)


In [None]:
from sentence_transformers import InputExample

# Build (question, passage) training examples for the wiki training pairs.
train_examples = []
n_examples = training_set.num_rows

wiki_questions = pd.read_json(wiki_questions_train, lines=True)
inner_passages = pd.read_json(passages_source['wiki'], lines=True, chunksize=int(5e6))

# Indices of training pairs whose passage was found, in the order they were found
processed_indices = []
# Set mirror of processed_indices: O(1) membership tests instead of rescanning the list
processed_lookup = set()

for passage_batch in tqdm(inner_passages):
    if 'title' in passage_batch:
        passage_batch['title'] = passage_batch['title'].fillna('')
        passage_batch['text'] = passage_batch.apply(lambda r: r['title'] + ' ' + r['text'], axis=1)

    for index in tqdm(range(n_examples)):
        if index in processed_lookup:
            continue

        # Fetch the pair once; both lookups below need it.
        pair = training_set[index]
        found_passage = passage_batch[passage_batch['id'] == pair['passage-id']]

        # A pair counts as resolved only when exactly one passage of this batch matches.
        if len(found_passage) != 1:
            continue

        processed_indices.append(index)
        processed_lookup.add(index)
        found_question = wiki_questions[wiki_questions['id'] == pair['question-id']]

        if len(found_question) != 1:
            print("Error!!!")
            continue

        train_examples.append(InputExample(guid=index, texts=[found_question.iloc[0]['text'], found_passage.iloc[0]['text']]))

In [None]:
# Print the size and a small sample instead of dumping every example into the cell output.
print(len(train_examples))
print(train_examples[:4])

In [None]:
# Cache the training examples on disk. Despite the .jl extension the file holds a single
# JSON array, which is what the loading cell reads back with json.load.
# `with` closes the file even when serialisation fails.
with open(os.path.join(dataset_path, 'wiki/wiki-train-examples.jl'), mode='w', encoding='utf-8') as f:
    json.dump([{'question': item.texts[0], 'passage': item.texts[1]} for item in train_examples], f, ensure_ascii=False)

In [None]:
from sentence_transformers import InputExample
import json

# Reload the cached training examples; guids are renumbered from 0 in file order.
# `with` closes the file even when parsing fails.
with open(os.path.join(dataset_path, 'wiki/wiki-train-examples.jl'), mode='r', encoding='utf-8') as f:
    train_examples = [InputExample(guid=index, texts=[item['question'], item['passage']]) for index, item in enumerate(json.load(f))]

print(train_examples[:4])

In [None]:
from torch.utils.data import DataLoader

# Shuffled batches of 4 training examples
train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=4)

# Shown as the cell output
roberta_train.get_max_seq_length()

In [None]:
def main() -> None:
    """Fine-tune ``roberta_train`` for one epoch on ``train_dataloader``.

    A multi-process pool is started before training and is always stopped
    afterwards, including when ``fit`` raises or is interrupted.
    """
    # NOTE(review): the pool handle is never passed to `fit`; presumably it is only
    # relevant for multi-process encoding. Confirm and drop the pool if unused.
    pool = roberta_train.start_multi_process_pool()
    try:
        roberta_train.fit(train_objectives=[(train_dataloader, roberta_train_loss)], epochs=1)
    finally:
        # Without this the worker processes would be left running after a failed run.
        roberta_train.stop_multi_process_pool(pool)


if __name__ == "__main__":
    main()

In [None]:
# Upload the fine-tuned model to the Hugging Face Hub.
# SECURITY: the access token must never be committed to the notebook. It is read from
# the HF_TOKEN environment variable instead. The token that was previously hardcoded
# here has been exposed and should be revoked.
# When HF_TOKEN is unset, None is passed; presumably the library then falls back to the
# locally stored login (`huggingface-cli login`) — confirm for the installed version.
roberta_train.push_to_hub(
    repo_id="szmarkiewicz/mmlw-retrieval-roberta-large-poleval",
    token=os.environ.get("HF_TOKEN"),
    commit_message="roberta-large trained with poleval training set",
    train_datasets=["piotr-rybak/poleval2022-passage-retrieval-dataset"],
    exist_ok=True,
)

### roberta-large trained results

In [None]:
# Start from an empty document store before indexing with the fine-tuned model.
delete_documents()

model_roberta_large_trained = "szmarkiewicz/mmlw-retrieval-roberta-large-poleval"

# Document-side embedder, warmed up before the passages are written.
roberta_trained_document_embedder = SentenceTransformersDocumentEmbedder(model=model_roberta_large_trained)
roberta_trained_document_embedder.warm_up()

write_to_document_store_e('allegro', roberta_trained_document_embedder)
write_to_document_store_e('legal', roberta_trained_document_embedder)
write_to_document_store_e('wiki', roberta_trained_document_embedder)

# One query-side embedder instance per domain.
roberta_trained_text_embedders = {
    domain: SentenceTransformersTextEmbedder(model=model_roberta_large_trained, progress_bar=False)
    for domain in ('allegro', 'wiki', 'legal')
}

roberta_trained_pipes = construct_embedding_pipelines(roberta_trained_text_embedders)


def roberta_trained_pipe_param_callback(row):
    """Build the pipeline inputs for one question row, prefixing its text with "zapytanie: "."""
    return {'text_embedder': {'text': "zapytanie: " + row['text']}}


roberta_trained_predictions = {
    'discriminator': 'roberta-trained',
    # Per-domain runs, evaluated in the order allegro, legal, wiki.
    **{
        domain: run_pipe(roberta_trained_pipes[domain], roberta_trained_pipe_param_callback, questions[domain])
        for domain in ('allegro', 'legal', 'wiki')
    },
    # Mixed-domain inputs receive the full set of pipelines.
    'dev-0': run_pipe_test(roberta_trained_pipes, roberta_trained_pipe_param_callback, questions['dev-0']),
    'test-A': run_pipe_test(
        roberta_trained_pipes, roberta_trained_pipe_param_callback, questions['test-A'], df_questions_testA
    ),
    'test-B': run_pipe_test(
        roberta_trained_pipes, roberta_trained_pipe_param_callback, questions['test-B'], df_questions_testB
    ),
}

## Load into .tsv files

In [None]:
# Skeleton with the same keys as a real predictions dict; every value is just a
# placeholder string equal to its key (presumably a stand-in for dry runs — confirm).
empty_predictions = {
    'discriminator': 'bm25',
    **{name: name for name in ('allegro', 'legal', 'wiki', 'dev-0')},
    # The test inputs are keyed by chunk; each split gets its own dict object.
    'test-A': {chunk: chunk for chunk in ('wiki', 'legal', 'allegro')},
    'test-B': {chunk: chunk for chunk in ('wiki', 'legal', 'allegro')},
}

# Prediction sets to export; uncomment an entry to include it.
predictions = [
    # empty_predictions,
    # bm25_predictions,
    # bm25_l_predictions,
    # mpnet_predictions,
    # minilm_predictions,
    # spacy_predictions,
    # roberta_predictions,
    # roberta_l_predictions,
    roberta_trained_predictions,
]


def _write_submission(frame, *path_parts):
    """Write ``frame`` as a tab-separated ``submission.tsv`` below ``dataset_path``.

    Args:
        frame: object exposing ``to_csv`` (the prediction table to export).
        *path_parts: directory components appended to ``dataset_path``.

    The target directory is created when missing, so exporting a new
    discriminator does not fail on a non-existent folder.
    """
    target_dir = os.path.join(dataset_path, *path_parts)
    os.makedirs(target_dir, exist_ok=True)
    frame.to_csv(os.path.join(target_dir, 'submission.tsv'), sep='\t', index=False)


for prediction in predictions:
    discriminator = prediction['discriminator']

    # Single-domain test sets: datasets/<domain>/<discriminator>/submission.tsv
    for domain in ('allegro', 'wiki', 'legal'):
        if domain in prediction:
            _write_submission(prediction[domain], domain, discriminator)

    # dev-0: every non-empty chunk goes to the same file.
    # NOTE(review): if more than one chunk is non-empty, later chunks overwrite earlier
    # ones; presumably at most one chunk is populated for dev-0 — confirm.
    if 'dev-0' in prediction:
        for chunk_key in prediction['dev-0']:
            if len(prediction['dev-0'][chunk_key]) > 0:
                _write_submission(prediction['dev-0'][chunk_key], 'dev-0', discriminator)

    # test-A / test-B: one file per non-empty chunk,
    # datasets/<split>/<discriminator>/<chunk>/submission.tsv
    for split in ('test-A', 'test-B'):
        if split in prediction:
            for chunk_key in prediction[split]:
                if len(prediction[split][chunk_key]) > 0:
                    _write_submission(prediction[split][chunk_key], split, discriminator, chunk_key)


## Evaluate

To evaluate the results, run these commands in a terminal:

### allegro:
`python ./eval.py --true datasets/allegro/pairs-test.tsv --pred datasets/allegro/<name_of_algorithm>/submission.tsv`

### wiki
`python ./eval.py --true datasets/wiki/pairs-test.tsv --pred datasets/wiki/<name_of_algorithm>/submission.tsv`

### legal
`python ./eval.py --true datasets/legal/pairs-test.tsv --pred datasets/legal/<name_of_algorithm>/submission.tsv`

### Merge results for test inputs

In [None]:
def _merge_chunk_results(prediction, split):
    """Average the per-chunk scores of ``split`` and write them to one ``results.txt``.

    For every chunk key in ``prediction[split]`` the file
    ``<dataset_path>/<split>/<discriminator>/<chunk>/results.txt`` is read and the
    second space-separated token of its first line is parsed as a float.
    The mean is written to ``<dataset_path>/<split>/<discriminator>/results.txt``
    as ``NDCG@10: <mean>`` with three decimals.

    Raises:
        FileNotFoundError: if a chunk has no ``results.txt`` yet.
    """
    split_dir = os.path.join(dataset_path, split, prediction['discriminator'])
    chunk_keys = list(prediction[split])

    results_sum = 0.0
    for chunk_key in chunk_keys:
        with open(os.path.join(split_dir, chunk_key, 'results.txt')) as result:
            results_sum += float(result.readline().split(' ')[1])

    # Divide by the real number of chunks rather than a hardcoded 3;
    # with no chunks at all the merged score stays 0.
    mean_score = results_sum / len(chunk_keys) if chunk_keys else 0.0

    with open(os.path.join(split_dir, 'results.txt'), "w") as merged:
        merged.write(f'NDCG@10: {mean_score:.3f}')


for prediction in predictions:
    for split in ('test-A', 'test-B'):
        if split in prediction:
            _merge_chunk_results(prediction, split)

## Sources

https://huggingface.co/blog/how-to-train-sentence-transformers
