In [None]:
from elasticsearch import Elasticsearch, helpers
from elasticsearch_dsl import Search, Q

import csv

In [None]:
# Connect to the local Elasticsearch node. The client takes the node address as a URL
# (or a `hosts` list); `HOST` / `PORT` are not keyword arguments it understands.
es = Elasticsearch("http://localhost:9200")

In [None]:
# Path of the FAQ CSV: bulk-loaded into the 'questions' index and later reused as DATA_FILE.
faq_csv = ".data/faq_with_splits_tokenized.csv"

In [None]:
# Start from a clean slate: drop any existing 'questions' index (a 404 response is ignored).
es.indices.delete(index='questions', ignore=[404])

# newline="" is what the csv module expects from the file object, and the encoding is
# given explicitly so the text is decoded as UTF-8 regardless of the platform default.
with open(faq_csv, newline="", encoding="utf-8") as f:
    reader = csv.DictReader(f)
    # Every CSV row (a dict keyed by column name) is sent as one document.
    helpers.bulk(es, reader, index='questions')

In [None]:
def generate_query(question):
    """Build a ``match`` query that searches the ``long_question`` field.

    Args:
        question: the text to match against ``long_question``.

    Returns:
        A dict of the form ``{"match": {"long_question": {"query": question}}}``.
    """
    field_clause = {"query": question}
    match_clause = {"long_question": field_clause}
    return {"match": match_clause}
    
def find_answer(query_param, generate_query, index="questions"):
    """Search ``index`` and return the source document of the first hit.

    Args:
        query_param: value handed to ``generate_query`` to build the query.
        generate_query: callable that turns ``query_param`` into a query dict.
        index: name of the index to search (default ``"questions"``).

    Returns:
        The ``_source`` dict of the first hit in the search response.

    Raises:
        IndexError: if the search returned no hits.
    """
    query = generate_query(query_param)
    # Uses the module-level `es` client.
    res = es.search(index=index, query=query)

    hits = res['hits']['hits']
    if not hits:
        # Same exception type that indexing the empty hit list would raise,
        # but with a message that says what actually went wrong.
        raise IndexError(f"no documents in index {index!r} matched the query")

    return hits[0]['_source']

In [None]:
# Read a question from the user; `question` is reused by the similarity search further down.
question = input()

# Best text match from the index, produced by the match query built in generate_query.
result = find_answer(question, generate_query)

# Each section is followed by one blank line, except the last.
print(f"Short question:\n{result['short_question']}", end="\n\n")
print(f"Long question:\n{result['long_question']}", end="\n\n")
print(f"Answer:\n{result['answer']}")

In [None]:
# Settings used by index_data() / index_batch() and the similarity search.
# Same name as the index bulk-loaded from the CSV earlier; index_data() deletes and recreates it.
INDEX_NAME = "questions"
# File whose contents are passed as the body when the index is created.
INDEX_FILE = "index.json"
# The FAQ CSV, read into a DataFrame with pandas.
DATA_FILE = faq_csv
# Number of rows collected before each index_batch() call.
BATCH_SIZE = 1000

In [None]:
import hu_core_ud_lg
import numpy as np

import pandas as pd

In [None]:
# Load the FAQ CSV into a DataFrame; its rows are what index_data() indexes.
data_frame = pd.read_csv(DATA_FILE)

In [None]:
# Load the hu_core_ud_lg pipeline (presumably a Hungarian spaCy model - confirm);
# embed() calls it to obtain token lemmas and token vectors.
nlp = hu_core_ud_lg.load()

In [None]:
import nltk
from nltk.corpus import stopwords

# Download the NLTK stop-word corpus without progress output, then keep the
# Hungarian list as a set for the membership test in stop_word_filter().
nltk.download('stopwords', quiet=True)
stop_words = set(stopwords.words('hungarian'))

def stop_word_filter(text):
    """Remove stop words from ``text`` and join what is left with single spaces.

    Args:
        text: a string, which is split on whitespace into words, or an
            iterable of token strings, which is used as given.

    Returns:
        A string containing the tokens that are not in the module-level
        ``stop_words`` set, separated by single spaces. The comparison is an
        exact (case-sensitive) membership test.
    """
    # Iterating a str yields single characters, which would turn "az alma"
    # into "z   l m" style output; split strings into words first.
    tokens = text.split() if isinstance(text, str) else text
    return " ".join(w for w in tokens if w not in stop_words)

In [None]:
def embed_docs(docs):
    """Embed every document in ``docs`` after stop-word filtering.

    Args:
        docs: iterable of documents; each one is passed through
            ``stop_word_filter`` and the result is passed to ``embed``.

    Returns:
        A list with one ``embed`` result per document, in input order.
    """
    vectors = []
    for text in docs:
        filtered = stop_word_filter(text)
        vectors.append(embed(filtered))
    return vectors

In [None]:
def embed(sentence):
    """Return the mean token vector of the lemmatized, lower-cased ``sentence``.

    The sentence is run through ``nlp`` once to obtain lemmas; the lower-cased
    lemmas are joined with spaces and run through ``nlp`` again, and the
    vectors of the resulting tokens are summed and divided by the token count.

    Args:
        sentence: text to embed.

    Returns:
        The element-wise sum of the token vectors divided by the number of tokens.
    """
    # NOTE(review): text that yields no tokens divides by a length of zero -
    # confirm callers never pass empty input.
    parsed = nlp(sentence)
    lemma_text = " ".join(token.lemma_.lower() for token in parsed)

    # Second pass: the vectors are looked up for the lemmas, not the surface forms.
    lemma_doc = nlp(lemma_text)
    token_vectors = [token.vector for token in lemma_doc]

    return np.add.reduce(token_vectors) / len(lemma_doc)

In [None]:
def index_data(df):
    """Recreate the ``INDEX_NAME`` index and index every row of ``df`` in batches.

    The index is deleted (a 404 response is ignored), created again with the
    stripped contents of ``INDEX_FILE`` as the body, filled through
    ``index_batch`` in groups of ``BATCH_SIZE`` rows, and finally refreshed.
    Progress is written to stdout on a single, carriage-return-updated line.

    Args:
        df: DataFrame whose rows are converted to dicts and indexed.
    """
    es.indices.delete(index=INDEX_NAME, ignore=[404])

    with open(INDEX_FILE) as index_file:
        source = index_file.read().strip()
        es.indices.create(index=INDEX_NAME, body=source)

    def flush(batch, total):
        # Send one batch and overwrite the progress line with the running total.
        index_batch(batch)
        print(f"\rIndexed {total} documents.", end="")

    pending = []
    total = 0

    for total, (_, row) in enumerate(df.iterrows(), start=1):
        pending.append(row.to_dict())

        if total % BATCH_SIZE == 0:
            flush(pending, total)
            pending = []

    # Rows left over when the row count is not a multiple of BATCH_SIZE.
    if pending:
        flush(pending, total)

    print()
    es.indices.refresh(index=INDEX_NAME)
    print("Done indexing.")

In [None]:
def index_batch(docs):
    """Embed the ``short_question`` of each document and bulk-index the batch.

    Args:
        docs: list of dicts, each with at least a ``"short_question"`` key.
            The dicts are not modified; the bulk actions are built as new dicts.

    Every action sent to ``helpers.bulk`` carries the document's own fields plus
    ``_op_type`` ("index"), ``_index`` (``INDEX_NAME``) and
    ``short_question_vector`` (the matching result of ``embed_docs``).
    """
    short_questions = [doc["short_question"] for doc in docs]
    short_question_vectors = embed_docs(short_questions)

    # Build fresh dicts so the bulk metadata and the vector do not leak back
    # into the caller's documents.
    requests = [
        {
            **doc,
            "_op_type": "index",
            "_index": INDEX_NAME,
            "short_question_vector": vector,
        }
        for doc, vector in zip(docs, short_question_vectors)
    ]
    helpers.bulk(es, requests)

In [None]:
# Rebuild the index from the CSV rows; this deletes any existing index named INDEX_NAME first.
index_data(data_frame)

In [None]:
# Score every document (match_all) by the cosine similarity between its stored
# short_question_vector and the embedding of the user's question. 1.0 is added
# to the similarity to form the score.
script_query = {
    "script_score": {
        "query": {"match_all": {}},
        "script": {
            "source": "cosineSimilarity(params.query_vector, doc['short_question_vector']) + 1.0",
            "params": {"query_vector": embed(stop_word_filter(question))}
        }
    }
}

# Same keyword-argument call style as the search in find_answer(); keep the 10 best hits.
response = es.search(
    index=INDEX_NAME,
    size=10,
    query=script_query,
)

In [None]:
# Print id and score of each hit, then its short question and answer, then a blank line.
for hit in response["hits"]["hits"]:
    print("id: {}, score: {}".format(hit["_id"], hit["_score"]))
    for field in ("short_question", "answer"):
        print(hit["_source"][field])
    print()