In [1]:
import minisearch

In [2]:
import json

In [3]:
# Load the raw FAQ dump. The encoding is pinned to UTF-8 because the documents
# contain non-ASCII characters (e.g. typographic apostrophes) and the default
# text encoding of open() depends on the platform/locale.
with open('documents.json', 'r', encoding='utf-8') as file:
    texts = json.load(file)
    

In [4]:
# Flatten the per-course structure into a single list of FAQ records, tagging
# every record with the course it belongs to.
# Each record is copied ({**record, ...}) rather than updated in place, so the
# loaded `texts` structure stays untouched and the cell can be re-run safely.
docs_raw = [
    {**record, 'course': course_entry['course']}
    for course_entry in texts
    for record in course_entry['documents']
]

In [5]:
# Peek at the second flattened record to confirm it carries the `course` tag
# next to its text / section / question fields.
docs_raw[1]

{'text': 'GitHub - DataTalksClub data-engineering-zoomcamp#prerequisites',
 'section': 'General course-related questions',
 'question': 'Course - What are the prerequisites for this course?',
 'course': 'data-engineering-zoomcamp'}

In [6]:
# In-memory search index over the FAQ records.
# `question`, `text` and `section` are registered as text fields; `course` is
# registered as a keyword field (it is the field used in `filter_dict` when
# searching further down).
index = minisearch.Index(
    keyword_fields=['course'],
    text_fields=['question', 'text', 'section'],
)

In [7]:
# Feed every flattened record into the index. fit() hands back an Index
# object, which is why the cell echoes its repr.
index.fit(docs_raw)

<minisearch.Index at 0x13e522350>

In [8]:
# Example user question used to try out the in-memory index.
q = 'I am a beginner in programming. Can I take this course?'

In [9]:
# Per-field weights handed to the search: `question` is weighted 3 and
# `section` 0.5; `text` is not listed here.
boost = dict(question=3, section=0.5)

# Query the in-memory index, restricted to a single course, asking for at
# most 10 results.
results = index.search(
    query=q,
    filter_dict={'course': 'data-engineering-zoomcamp'},
    boost_dict=boost,
    num_results=10,
)

In [10]:
# Show the first (presumably best-scoring) result for the question.
results[0]

{'text': 'Yes, the slack channel remains open and you can ask questions there. But always sDocker containers exit code w search the channel first and second, check the FAQ (this document), most likely all your questions are already answered here.\nYou can also tag the bot @ZoomcampQABot to help you conduct the search, but don’t rely on its answers 100%, it is pretty good though.',
 'section': 'General course-related questions',
 'question': 'Course - Can I get support if I take the course in the self-paced mode?',
 'course': 'data-engineering-zoomcamp'}

### Elasticsearch

In [11]:
from elasticsearch import Elasticsearch

In [12]:
# Client for the Elasticsearch instance; the cells below expect one to be
# reachable at this local URL.
es_client = Elasticsearch('http://localhost:9200')

In [None]:
# Index definition: a single shard without replicas, with the three free-text
# fields mapped as `text` and `course` mapped as `keyword` (it is used in a
# `term` filter by the search query later on).
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"},
        }
    }
}

index_name = "course-questions"

# Make the cell re-runnable: creating an index that already exists raises an
# error, and re-indexing into a leftover index would store the documents a
# second time. NOTE: this drops any existing "course-questions" index and
# everything stored in it.
if es_client.indices.exists(index=index_name):
    es_client.indices.delete(index=index_name)

es_client.indices.create(index=index_name, body=index_settings)

#### Indexing documents

In [14]:
from tqdm.auto import tqdm

In [15]:
# Send the records to Elasticsearch one request per document; tqdm renders a
# progress bar. No explicit id is passed, so re-running this cell presumably
# stores the same documents again under new ids (verify before re-running).
for doc in tqdm(docs_raw):
    es_client.index(index=index_name, document=doc)

  0%|          | 0/948 [00:00<?, ?it/s]

In [16]:
# Example user question for the Elasticsearch query built in the next cell.
query = "I just discovered the course. Can I still join it?"

In [19]:
# Search request body:
#   * return at most 5 hits,
#   * match the question text against `question` (boosted via "^3"), `text`
#     and `section` using a `best_fields` multi_match,
#   * keep only documents whose `course` equals "data-engineering-zoomcamp".
search_query = {
    "size": 5,
    "query": {
        "bool": {
            "must": {
                "multi_match": {
                    "query": query,
                    "fields": ["question^3", "text", "section"],
                    "type": "best_fields",
                },
            },
            "filter": {"term": {"course": "data-engineering-zoomcamp"}},
        },
    },
}

In [20]:
# Run the search request against the index; `body` carries the whole query
# dictionary built above.
response = es_client.search(index=index_name, body=search_query)

In [25]:
# Keep only the `_source` payload of every hit in the response.
result_docs = [hit['_source'] for hit in response['hits']['hits']]

In [26]:
# Show the first document returned for the query.
result_docs[0]

{'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.",
 'section': 'General course-related questions',
 'question': 'Course - Can I still join the course after the start date?',
 'course': 'data-engineering-zoomcamp'}