In [1]:
import minsearch

In [2]:
import json

In [3]:
# Load the raw FAQ dump (a list of per-course dicts, each holding a
# 'documents' list — see the flattening step that follows).
# The encoding is pinned to UTF-8: the FAQ text contains non-ASCII characters
# (curly quotes), and a bare open() falls back to the platform's locale
# encoding, which is not UTF-8 everywhere (e.g. Windows).
with open('documents.json', 'rt', encoding='utf-8') as f_in:
    docs_raw = json.load(f_in)

In [4]:
# Flatten the per-course structure into a single list of FAQ records, tagging
# every record with the course it belongs to.
# Each record is copied ({**doc, ...}) instead of being mutated in place, so
# `docs_raw` stays exactly as it was loaded and this cell has no side effects
# on its input.
documents = [
    {**doc, 'course': course_dict['course']}
    for course_dict in docs_raw
    for doc in course_dict['documents']
]

In [5]:
# Sanity check: show the first flattened record and confirm the 'course' tag is present.
documents[0]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp'}

In [6]:
# Build the in-memory minsearch index.
# "question", "text" and "section" are registered as text fields and "course"
# as a keyword field; the search() helper defined later filters on "course"
# through its filter_dict argument.
index = minsearch.Index(
    text_fields=["question", "text", "section"],
    keyword_fields=["course"]
)

Filtering on a keyword field is analogous to the SQL query `SELECT * FROM documents WHERE course = 'data-engineering-zoomcamp';`

In [7]:
# Sample user question; it is sent to the LLM without any FAQ context further down.
q = 'the course has already started, can I still enroll?'

In [8]:
# Fit the minsearch index on the flattened FAQ records so index.search() can query them.
index.fit(documents)

<minsearch.Index at 0x7f30d0b65df0>

In [10]:
from dotenv import load_dotenv
# Load environment variables from the .env file one directory above the notebook.
# NOTE(review): presumably this is where the Groq API key comes from, since
# Groq() is constructed without arguments further down — confirm.
load_dotenv(dotenv_path='../.env')

True

In [12]:
from groq import Groq

In [13]:
# Create the Groq API client with no explicit arguments.
# NOTE(review): presumably it reads its API key from the process environment — confirm.
client = Groq()

In [14]:
# Baseline: send the raw question `q` to the model as a single user message,
# with no FAQ context attached. The reply can be compared with the answers the
# RAG pipeline gives for the same question later in the notebook.
response = client.chat.completions.create(
    model='llama3-8b-8192',
    messages=[{"role": "user", "content": q}]
)

# Display only the text of the first returned choice.
response.choices[0].message.content

"It's definitely possible to enroll in a course after it has already started, but the availability and feasibility of doing so depend on several factors. Here are a few things to consider:\n\n1. **Check the course policy**: First, review the course's enrollment policy to see if it allows late enrollment. Some courses may not allow enrollment once the course has started, while others might have a specific deadline for adding students.\n2. **Talk to the instructor or course admin**: Reach out to the instructor or course administrator to inquire about the possibility of enrolling late. They can help you determine if it's feasible and what the process would entail. Be prepared to provide your reasons for enrolling late and offer a plan for catching up on any missed material.\n3. **Assess your readiness**: If you do get permission to enroll late, make sure you're prepared to keep up with the course's pace. You might need to commit to a significant amount of additional studying or assignment

In [15]:
def search(query, course='data-engineering-zoomcamp', num_results=5):
    """Retrieve the FAQ records most relevant to ``query`` from the minsearch index.

    Args:
        query: Free-text user question.
        course: Value the ``course`` keyword field must match. Defaults to
            ``'data-engineering-zoomcamp'``, the value that used to be
            hard-coded, so existing ``search(query)`` calls are unaffected.
        num_results: Maximum number of records requested from the index.
            Defaults to ``5``, the value that used to be hard-coded.

    Returns:
        Whatever ``index.search`` returns for these arguments (the records
        that ``build_prompt`` iterates over).
    """
    # Weight matches in the question field up (3.0) and matches in the
    # section field down (0.5); fields not listed keep the index's default weight.
    boost = {'question': 3.0, 'section': 0.5}

    return index.search(
        query=query,
        filter_dict={'course': course},
        boost_dict=boost,
        num_results=num_results,
    )

In [16]:
def build_prompt(query, search_results):
    """Render the LLM prompt for ``query`` from the retrieved FAQ records.

    Args:
        query: The user's question; substituted into the QUESTION slot.
        search_results: Iterable of mappings, each providing the keys
            ``'section'``, ``'question'`` and ``'text'``.

    Returns:
        The filled-in prompt with leading/trailing whitespace stripped.
    """
    template = (
        "You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.\n"
        "Use only the facts from the CONTEXT when answering the QUESTION.\n"
        "\n"
        "QUESTION: {question}\n"
        "\n"
        "CONTEXT: \n"
        "{context}"
    )

    # One "section / question / answer" paragraph per record, each followed by a blank line.
    context = "".join(
        f"section: {hit['section']}\nquestion: {hit['question']}\nanswer: {hit['text']}\n\n"
        for hit in search_results
    )

    # The final strip() removes the blank line left after the last record
    # (or the trailing "CONTEXT: " whitespace when there are no records).
    return template.format(question=query, context=context).strip()

In [21]:
def llm(prompt, model='llama3-8b-8192'):
    """Send ``prompt`` to the Groq chat API as a single user message and return the reply text.

    Args:
        prompt: Full prompt text, sent as the content of one ``user`` message.
        model: Model identifier passed to the API. Defaults to
            ``'llama3-8b-8192'``, the value that used to be hard-coded, so
            existing ``llm(prompt)`` calls are unaffected; pass another id to
            switch models without editing this function.

    Returns:
        The ``content`` of the first choice in the API response.
    """
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}]
    )

    return response.choices[0].message.content

In [22]:
query = 'how do I run kafka?'

def rag(query):
    """Answer ``query`` with retrieval-augmented generation.

    Chains the three helpers: ``search`` retrieves FAQ records for the query,
    ``build_prompt`` turns the query plus those records into a prompt, and
    ``llm`` returns the model's answer to that prompt.
    """
    return llm(build_prompt(query, search(query)))

In [23]:
# Run the RAG pipeline on the Kafka question assigned to `query` in the previous cell.
rag(query)

'Based on the context, I would answer the question "how do I run Kafka?" as follows:\n\nIf you are trying to run a Java Kafka producer/consumer/KStreams, you can do so by running:\n```\njava -cp build/libs/<jar_name>-1.0-SNAPSHOT.jar:out src/main/java/org/example/JsonProducer.java\n```\nin the project directory.\n\nIf you are trying to run a Python Kafka producersumer, you will need to create a virtual environment and run the requirements file. The command to create the virtual environment is:\n```\npython -m venv env\n```\nYou can then install the necessary packages by running:\n```\npip install -r ../requirements.txt\n```\nTo activate the virtual environment, run:\n```\nsource env/bin/activate\n```\nNote that you will need to run these commands in a specific directory, and you will need to ensure that the necessary dependencies are installed.'

In [25]:
# Same question that was sent to the model without context earlier (`q`),
# now answered through the RAG pipeline.
rag('the course has already started, can I still enroll?')

'Based on the context, I can answer your question:\n\nThe course has already started, can I still enroll?\n\nAccording to the FAQ answer: Course - Can I still join the course after the start date?, the answer is:\n\n"Yes, even if you don\'t register, you\'re still eligible to submit the homeworks."\n\nSo, the answer is YES, you can still enroll, but keep in mind that there will be deadlines for turning in the final projects, so don\'t leave everything for the last minute.'

In [24]:
# Re-inspect the first flattened record (same record as displayed earlier).
documents[0]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp'}

In [26]:
from elasticsearch import Elasticsearch

In [27]:
# Client for the Elasticsearch node at http://localhost:9200.
# NOTE(review): assumes a local Elasticsearch instance is already running on that port — confirm.
es_client = Elasticsearch('http://localhost:9200') 

In [29]:
# Index definition: one shard, no replicas. The three free-text fields are
# mapped as "text"; "course" is mapped as "keyword" and is used in a `term`
# filter by elastic_search().
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"}
        }
    }
}

index_name = "course-questions"

# Elasticsearch indices outlive the kernel, so a bare create() fails with a
# "resource already exists" error the second time this cell runs. Drop any
# stale copy first so Restart & Run All works and the indexing loop below
# starts from an empty index instead of piling onto old documents.
# WARNING: this deletes every document currently stored under `index_name`.
if es_client.indices.exists(index=index_name):
    es_client.indices.delete(index=index_name)

es_client.indices.create(index=index_name, body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'course-questions'})

In [30]:
# Reminder of the record shape: its keys (text, section, question, course)
# are the fields declared in the index mapping.
documents[0]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp'}

In [31]:
from tqdm.auto import tqdm

In [32]:
# Send every FAQ record to Elasticsearch, one index request per record,
# with a tqdm progress bar.
# NOTE(review): no explicit document id is passed, so re-running this cell
# presumably stores a second copy of every record — confirm before re-running.
for doc in tqdm(documents):
    es_client.index(index=index_name, document=doc)

  0%|          | 0/948 [00:00<?, ?it/s]

In [33]:
# Question used to exercise the Elasticsearch-backed pipeline below.
# NOTE(review): 'disovered' looks like a typo for 'discovered'; left unchanged
# because the string is the query text actually sent to search and the LLM.
query = 'I just disovered the course. Can I still join it?'

In [34]:
def elastic_search(query, course="data-engineering-zoomcamp", num_results=5):
    """Retrieve the FAQ records most relevant to ``query`` from Elasticsearch.

    Args:
        query: Free-text user question.
        course: Exact value the ``course`` field must have (``term`` filter).
            Defaults to ``"data-engineering-zoomcamp"``, the value that used
            to be hard-coded, so existing ``elastic_search(query)`` calls are
            unaffected.
        num_results: Value sent as the request's ``size``. Defaults to ``5``,
            the value that used to be hard-coded.

    Returns:
        A list with the ``_source`` document of every hit, in the order the
        hits appear in the response.
    """
    search_query = {
        "size": num_results,
        "query": {
            "bool": {
                # Scored part: match the question text against three fields,
                # with the "question" field boosted 3x.
                "must": {
                    "multi_match": {
                        "query": query,
                        "fields": ["question^3", "text", "section"],
                        "type": "best_fields"
                    }
                },
                # Restrict the results to a single course.
                "filter": {
                    "term": {
                        "course": course
                    }
                }
            }
        }
    }

    response = es_client.search(index=index_name, body=search_query)

    return [hit['_source'] for hit in response['hits']['hits']]

In [36]:
def rag(query, search_fn=None):
    """Answer ``query`` with retrieval-augmented generation.

    This definition replaces the earlier ``rag`` in this notebook, which was
    wired to the minsearch-based ``search``. Instead of hard-wiring a single
    backend, the retrieval step is pluggable so both backends stay reachable
    through the one name.

    Args:
        query: The user's question.
        search_fn: Callable taking the query and returning the records to put
            in the prompt. When omitted (``None``), ``elastic_search`` is
            used, so plain ``rag(query)`` calls behave as before. Pass
            ``search`` to use the minsearch index instead.

    Returns:
        The answer text returned by ``llm`` for the built prompt.
    """
    # Resolved at call time (not as a default argument) so the function can be
    # defined even before the retrieval helper exists in the namespace.
    if search_fn is None:
        search_fn = elastic_search

    search_results = search_fn(query)
    prompt = build_prompt(query, search_results)
    return llm(prompt)

In [38]:
# Run the Elasticsearch-backed RAG pipeline on the question currently assigned to `query`.
rag(query)

'Based on the provided context, the answer to your question is:\n\nYes, you can still join the course after the start date. The FAQ suggests that you can still submit homeworks, but be aware of the deadlines for final projects.'