In [1]:
# Base URL of the Elasticsearch node this notebook talks to.
link = "http://localhost:9200/"

In [3]:
from elasticsearch import Elasticsearch, helpers
import pandas as pd
from sklearn.metrics import ndcg_score

# Client handle shared by every cell that creates, fills or queries the index.
es = Elasticsearch(hosts=link)

In [4]:
# Display the client's repr to confirm which host it is configured for.
es

<Elasticsearch(['http://localhost:9200'])>

In [5]:
# Index definition with two analyzers that are identical except for the
# synonym filter, plus one text field per analyzer. Storing the same text in
# both fields lets the effect of the synonym expansion be compared directly.
index_config = {
    "settings": {
        "analysis": {
            "analyzer": {
                "polish_with_synonyms": {
                    "tokenizer": "standard",
                    "filter": [
                        "lowercase",
                        "synonym_filter",
                        "morfologik_stem",
                        # Lowercased again after stemming — presumably because
                        # the stemmer can emit capitalised lemmas; TODO confirm.
                        "lowercase"
                    ]
                },
                "polish_without_synonyms": {
                    "tokenizer": "standard",
                    "filter": [
                        "lowercase",
                        "morfologik_stem",
                        "lowercase"
                    ]
                }
            },
            "filter": {
                # Treats the full name, abbreviation and Roman numeral of the
                # month as equivalent terms.
                "synonym_filter": {
                    "type": "synonym",
                    "synonyms": ["kwiecień, kwi, IV"]
                }
            }
        }
    },
    "mappings": {
        "properties": {
            "content_with_synonyms": {"type": "text", "analyzer": "polish_with_synonyms"},
            "content_without_synonyms": {"type": "text", "analyzer": "polish_without_synonyms"}
        }
    }
}

# The index lives in Elasticsearch, not in the kernel: after a restart the
# copy from the previous session still exists and a plain create is rejected.
# Drop any stale copy first so this cell can be re-run ("Restart & Run All")
# and always starts from an empty index with the current configuration.
# ignore_unavailable=True makes the delete a no-op when the index is absent.
es.indices.delete(index="fiqa_pl", ignore_unavailable=True)

# Create the index
es.indices.create(index="fiqa_pl", body=index_config)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'fiqa_pl'})

In [6]:
from datasets import load_dataset

# Load the "corpus" configuration of the clarin-knext/fiqa-pl dataset and
# display its summary (splits, columns, row count).
ds = load_dataset(path="clarin-knext/fiqa-pl", name="corpus")
ds

DatasetDict({
    corpus: Dataset({
        features: ['_id', 'title', 'text'],
        num_rows: 57638
    })
})

In [11]:
# Pick a single passage from the corpus split to use as the sample document.
sample_doc = ds["corpus"][2137]
text = sample_doc["text"]
text

'Zawsze jestem podejrzliwy, gdy artykuł wykorzystuje 2 punkty danych, aby zasugerować trend. Listopad 1997 i grudzień 2016. Ok? A co z tymi wszystkimi latami pomiędzy? Rok 1997 mógł mieć niezwykle wysoką liczbę z powodu boomu Dot Com, kiedy wszyscy i ich brat tworzyli strony internetowe o nic nie robili i upubliczniali.'

In [12]:
# Index the sample passage as document 1, storing the same text in both
# fields so each analyzer processes identical input.
# refresh="wait_for" makes the call return only once the document is visible
# to search; without it, a search issued immediately afterwards (e.g. during
# "Run All") can execute before the next index refresh and report zero hits.
es.index(
    index="fiqa_pl",
    id=1,
    document={
        "content_with_synonyms": text,
        "content_without_synonyms": text,
    },
    refresh="wait_for",
)

ObjectApiResponse({'_index': 'fiqa_pl', '_id': '1', '_version': 1, 'result': 'created', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 0, '_primary_term': 1})

In [25]:
# Probe both fields with the same inflected word. The indexed passage contains
# "powodu", so a hit presumably relies on the stemming filter mapping both
# forms to a common lemma — TODO confirm with the _analyze API.
search_term = "powodowi"

# One match query per field; they differ only in the analyzer behind the field.
query_with_synonyms = {"query": {"match": {"content_with_synonyms": search_term}}}
query_without_synonyms = {"query": {"match": {"content_without_synonyms": search_term}}}

# Run both searches against the same index.
res_with_synonyms = es.search(index="fiqa_pl", body=query_with_synonyms)
res_without_synonyms = es.search(index="fiqa_pl", body=query_without_synonyms)

# Total number of matching documents reported by each search.
num_docs_with_synonyms = res_with_synonyms["hits"]["total"]["value"]
num_docs_without_synonyms = res_without_synonyms["hits"]["total"]["value"]

print(
    f"Documents with synonyms: {num_docs_with_synonyms}",
    f"Documents without synonyms: {num_docs_without_synonyms}",
    sep="\n",
)


Documents with synonyms: 1
Documents without synonyms: 1
