In [52]:
# %%script false --no-raise-error
from pathlib import Path

file_path = "documents.json"
path = Path(file_path)

if not path.is_file():
    !wget https://raw.githubusercontent.com/DataTalksClub/llm-zoomcamp/main/01-intro/documents.json

file_path = "minsearch.py"
path = Path(file_path)

if not path.is_file():
    !wget https://raw.githubusercontent.com/DataTalksClub/llm-zoomcamp/main/01-intro/minsearch.py

In [53]:
from minsearch import Index

In [54]:
import json

In [55]:
with open('documents.json', 'rt') as f_in:
    docs_raw = json.load(f_in)

In [56]:
documents = []

for course_dict in docs_raw:
    for doc in course_dict['documents']:
        doc['course'] = course_dict['course']
        documents.append(doc)


In [57]:
documents[0]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp'}

In [58]:
index = Index(
    text_fields=["question", "text", "section"],
    keyword_fields=["course"]
)

index.fit(documents)

<minsearch.Index at 0x79edb27eee60>

In [59]:
query = "Can I join the course if it has already started?"

filter_dict = {"course": "data-engineering-zoomcamp"}
boost_dict = {"question": 3, "section": 0.5}

results = index.search(query, 
                       filter_dict, 
                       boost_dict, 
                       num_results=3)

for result in results:
    print(result)

{'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.", 'section': 'General course-related questions', 'question': 'Course - Can I still join the course after the start date?', 'course': 'data-engineering-zoomcamp'}
{'text': 'Yes, we will keep all the materials after the course finishes, so you can follow the course at your own pace after it finishes.\nYou can also continue looking at the homeworks and continue preparing for the next cohort. I guess you can also start working on your final capstone project.', 'section': 'General course-related questions', 'question': 'Course - Can I follow the course after it finishes?', 'course': 'data-engineering-zoomcamp'}
{'text': 'Yes, the slack channel remains open and you can ask questions there. But always sDocker containers exit code w search the channel first and second, check t

In [60]:
from openai import OpenAI
from dotenv import load_dotenv
load_dotenv();

In [61]:
client = OpenAI()

In [23]:
system_prompt= """
You are a course teaching assistant. Answer the QUESTION with the information included in CONTEXT.
If there is nothing in the QUESTION to match the data in CONTEXT answer with NONE.
Use the same idiom as the user used in the QUESTION. It is not necessary to repeat the QUESTION in the answer.
""".strip()

In [77]:
system_prompt= """
You are a course teaching assistant.
Your task is to provide specific guidance based on the CONTEXT provided.
Provide the appropriate answer if the QUESTION aligns with the information in CONTEXT.
Ensure clear responses address the user's query or provide appropriate instructions based on the provided course materials.
""".strip()

In [38]:
system_prompt_gp4= """
You are a course teaching assistant. Your role is to answer questions using the information provided in the CONTEXT.
Guidelines:
Contextual Matching: Use the CONTEXT to find relevant information that matches the QUESTION. If there is no matching information in the CONTEXT, respond with "NONE."
Language Consistency: Use the same language and tone as the user used in the QUESTION.
Brevity and Clarity: Provide clear and concise answers without repeating the QUESTION.
Example:
If the QUESTION asks about assignment deadlines and the CONTEXT includes the assignment schedule, provide the specific deadlines.
If the QUESTION is about a topic not covered in the CONTEXT, respond with "NONE."
""".strip()

In [36]:
system_prompt_gpt3= """
You are a course teaching assistant. Your role is to answer questions using the information provided in the CONTEXT.

Guidelines:

Contextual Matching: Look for relevant information in the CONTEXT that matches the QUESTION. If the CONTEXT does not contain information related to the QUESTION, respond with "NONE."

Language Consistency: Use the same language and tone as the user used in the QUESTION.

Brevity and Clarity: Provide clear and concise answers. Do not repeat the QUESTION in your answer.

Instructions for Answering:

Search the CONTEXT for any relevant information that can answer the QUESTION.
If you find relevant information, provide a direct and concise answer.
If there is no relevant information in the CONTEXT, respond with "NONE."
Example:

QUESTION: "What are the deadlines for assignments?"
CONTEXT: "The deadline for Assignment 1 is June 20. The deadline for Assignment 2 is July 15."
Answer: "The deadline for Assignment 1 is June 20. The deadline for Assignment 2 is July 15."

QUESTION: "What is the exam format?"
CONTEXT: "NONE"
Answer: "NONE"
""".strip()

In [30]:
user_prompt_template = """
QUESTION:
{question}

CONTEXT:
{context}
""".strip()

context= ""

for doc in results:
    context += f"section: {doc['section']}\nquestion: {doc['question']}\nanswer: {doc['text']}\n\n"

user_prompt = user_prompt_template.format(question=query, context=context).strip()

In [37]:
response = client.chat.completions.create(
    model="gpt-3.5-turbo",
    # model="gpt-4o",
    messages= [
        {"role": "system", "content": system_prompt_gpt3},
        {"role": "user", "content": user_prompt,
        }
    ]
)

response.choices[0].message.content

'Yes, you can join the course even if it has already started.'

In [62]:
def search(query: str, filter_dict: dict={}, bosst_dict: dict={"question": 3, "section": 0.5}, num_results: int=3):
    results = index.search(query, filter_dict, boost_dict, num_results)
    return results

In [63]:
def build_user_prompt(query: str, search_result: str):
    user_prompt_template = """
QUESTION: {question}

CONTEXT:
{context}
""".strip()

    context= ""
    
    for doc in search_result:
        context += f"section: {doc['section']}\nquestion: {doc['question']}\nanswer: {doc['text']}\n\n"
    
    user_prompt = user_prompt_template.format(question=query, context=context).strip()
    return user_prompt

In [64]:
def llm(system_prompt: str, user_prompt: str, model: str="gpt-3.5-turbo"):
    response = client.chat.completions.create(
        model=model,
        messages= [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ]
    )
    return response.choices[0].message.content

In [65]:
def rag(query: str, filter_dict: dict= {"course": "data-engineering-zoomcamp"}):
    system_prompt= """
You are a course teaching assistant.
Your task is to provide specific guidance based on the CONTEXT provided.
Provide the appropriate answer if the QUESTION aligns with the information in CONTEXT.
Ensure clear responses address the user's query or provide appropriate instructions based on the provided course materials.
""".strip()
    search_result = search(query, filter_dict)
    user_prompt = build_user_prompt(query, search_result)
    answer = llm(system_prompt, user_prompt)
    return answer

In [81]:
query="how do I run kafka?"
# query = "Can I join the course if it has already started?"
rag(query)

'Since you are specifically looking to run Kafka, you can follow the instructions provided in the context for running Java Kafka under "Module 6: streaming with kafka." Use the java command provided to run your producer, consumer, or kstreams in the terminal. If you encounter issues with running a Python module, refer to Alexey\'s solution on creating a virtual environment and running the Python files from within that environment. Lastly, if you need to install dependencies for running code related to dlthub, ensure the \'dlt[duckdb]\' package is installed by executing the command: `!pip install dlt[duckdb]`.'

In [84]:
rag("the course has already started, can I still enroll?")

"Based on the provided context, if the course has already started and you are considering enrolling, you can still participate and submit the homeworks, even if you don't officially register. However, there will be deadlines for submitting final projects, so it is advisable not to procrastinate and to manage your time efficiently."

### Run ElasticSearch with Docker

```bash
docker run -it \
    --name elasticsearch \
    -p 9200:9200 \
    -p 9300:9300 \
    -e "discovery.type=single-node" \
    -e "xpack.security.enabled=false" \
    docker.elastic.co/elasticsearch/elasticsearch:8.4.3
```

We can verify if it is running with:

```bash
curl http://localhost:9200
```

In [10]:
from elasticsearch import Elasticsearch

In [12]:
es_client = Elasticsearch("http://localhost:9200")

In [13]:
es_client.info()

ObjectApiResponse({'name': '29f8a9768e63', 'cluster_name': 'docker-cluster', 'cluster_uuid': 'RAJAxf6KTi2zTH1SesO4jA', 'version': {'number': '8.4.3', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '42f05b9372a9a4a470db3b52817899b99a76ee73', 'build_date': '2022-10-04T07:17:24.662462378Z', 'build_snapshot': False, 'lucene_version': '9.3.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'})

### Index setting

```python
{
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"} 
        }
    }
}
```


### Query

```python
{
    "size": 5,
    "query": {
        "bool": {
            "must": {
                "multi_match": {
                    "query": query,
                    "fields": ["question^3", "text", "section"],
                    "type": "best_fields"
                }
            },
            "filter": {
                "term": {
                    "course": "data-engineering-zoomcamp"
                }
            }
        }
    }
}
```


In [15]:
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"} 
        }
    }
}

index_name = "course-questions"
es_client.indices.create(index=index_name, body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'course-questions'})

In [9]:
documents[0]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp'}

In [16]:
from tqdm.auto import tqdm

In [17]:
for doc in tqdm(documents):
    es_client.index(index=index_name, document=doc)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████| 948/948 [01:43<00:00,  9.20it/s]


In [38]:
query = "I just discovered the course. Can I still join it?"
query="how do I run kafka?"

In [66]:
def elastic_search(index_name: str, query: str, filter_course: str, num_results: int=5):
    search_query = {
        "size": num_results,
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                        "fields": ["question^3", "text", "section"],
                        "type": "best_fields"
                    }
                },
                "filter": {
                    "term": {
                        "course": filter_course
                    }
                }
            }
        }
    }
    response = es_client.search(index=index_name, body=search_query)
    result_docs = [hit['_source'] for hit in response['hits']['hits']]
    return result_docs

In [49]:
query = "I just discovered the course. Can I still join it?"
index_name = "course-questions"
elastic_search(index_name, query, filter_course="data-engineering-zoomcamp")

[{'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.",
  'section': 'General course-related questions',
  'question': 'Course - Can I still join the course after the start date?',
  'course': 'data-engineering-zoomcamp'},
 {'text': 'Yes, we will keep all the materials after the course finishes, so you can follow the course at your own pace after it finishes.\nYou can also continue looking at the homeworks and continue preparing for the next cohort. I guess you can also start working on your final capstone project.',
  'section': 'General course-related questions',
  'question': 'Course - Can I follow the course after it finishes?',
  'course': 'data-engineering-zoomcamp'},
 {'text': 'You can start by installing and setting up all the dependencies and requirements:\nGoogle cloud account\nGoogle Cloud SDK\nPython 3 (insta

In [67]:
def rag_elastic(query: str, index_name:str = "course-questions",  filter_course="data-engineering-zoomcamp"):
    system_prompt= """
You are a course teaching assistant.
Your task is to provide specific guidance based on the CONTEXT provided.
Provide the appropriate answer if the QUESTION aligns with the information in CONTEXT.
Ensure clear responses address the user's query or provide appropriate instructions based on the provided course materials.
""".strip()
    search_result = elastic_search(index_name, query, filter_course)
    user_prompt = build_user_prompt(query, search_result)
    answer = llm(system_prompt, user_prompt)
    return answer


In [68]:
query = "I just discovered the course. Can I still join it?"
rag_elastic(query)

"Yes, you can still join the course even if you just discovered it. You are eligible to submit homework even if you don't register. Just be mindful of the deadlines for turning in the final projects. Don't procrastinate and try to stay on track with the course material."