# Homework: Introduction

## Getting the data

In [1]:
import requests 

docs_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main/01-intro/documents.json?raw=1'
docs_response = requests.get(docs_url)
documents_raw = docs_response.json()

documents = []

for course in documents_raw:
    course_name = course['course']

    for doc in course['documents']:
        doc['course'] = course_name
        documents.append(doc)

In [2]:
documents[0]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp'}

## Q2. Indexing the data

In [3]:
from elasticsearch import Elasticsearch

In [4]:
es_client = Elasticsearch('http://127.0.0.1:9200')

In [5]:
es_client.info()

ObjectApiResponse({'name': '5da8cde45b38', 'cluster_name': 'docker-cluster', 'cluster_uuid': 'pwu4MGjNQcO9QGRQt_pAXA', 'version': {'number': '9.0.1', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '73f7594ea00db50aa7e941e151a5b3985f01e364', 'build_date': '2025-04-30T10:07:41.393025990Z', 'build_snapshot': False, 'lucene_version': '10.1.0', 'minimum_wire_compatibility_version': '8.18.0', 'minimum_index_compatibility_version': '8.0.0'}, 'tagline': 'You Know, for Search'})

In [6]:
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"} 
        }
    }
}

index_name = "course-questions"

if not es_client.indices.exists(index=index_name):
    es_client.indices.create(index=index_name, body=index_settings)
else:
    print(f"Index '{index_name}' already exists.")


Index 'course-questions' already exists.


In [7]:
from tqdm.auto import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [8]:
for doc in tqdm(documents):
    es_client.index(index = index_name, document = doc)

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████| 948/948 [00:02<00:00, 433.74it/s]


## Q3. Searching

In [9]:
def elastic_search(query):

    search_query = {
        "size": 5,
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                        "fields": ["question^4", "text"], #question is 4 times more important than text 
                        "type": "best_fields"
                    }
                }
            }
        }
    }

    response = es_client.search(index = index_name, body = search_query)

    result_docs = []
    score_docs = []

    for hit in response['hits']['hits']:
        result_docs.append(hit['_source'])
        score_docs.append(hit['_score'])
    
    return result_docs, score_docs

In [10]:
query = "How do execute a command on a Kubernetes pod?"

In [11]:
result_docs, score_docs = elastic_search(query)

In [12]:
score_docs

[44.587067, 44.587067, 44.587067, 44.587067, 44.587067]

## Q4. Filtering

In [13]:
def elastic_search_filter(query):

    search_query = {
        "size": 5,
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                        "fields": ["question^4", "text"], #question is 4 times more important than text 
                        "type": "best_fields"
                    }
                },
                "filter": {
                    "term": {
                        "course": "machine-learning-zoomcamp"
                    }
                }
            }
        }
    }

    response = es_client.search(index = index_name, body = search_query)

    result_docs = []
    score_docs = []

    for hit in response['hits']['hits']:
        result_docs.append(hit['_source'])
        score_docs.append(hit['_score'])
    
    return result_docs, score_docs

In [14]:
query = "How do copy a file to a Docker container?"

In [15]:
search_results, scores = elastic_search_filter(query)
search_results

[{'text': 'Launch the container image in interactive mode and overriding the entrypoint, so that it starts a bash command.\ndocker run -it --entrypoint bash <image>\nIf the container is already running, execute a command in the specific container:\ndocker ps (find the container-id)\ndocker exec -it <container-id> bash\n(Marcos MJD)',
  'section': '5. Deploying Machine Learning Models',
  'question': 'How do I debug a docker container?',
  'course': 'machine-learning-zoomcamp'},
 {'text': 'Launch the container image in interactive mode and overriding the entrypoint, so that it starts a bash command.\ndocker run -it --entrypoint bash <image>\nIf the container is already running, execute a command in the specific container:\ndocker ps (find the container-id)\ndocker exec -it <container-id> bash\n(Marcos MJD)',
  'section': '5. Deploying Machine Learning Models',
  'question': 'How do I debug a docker container?',
  'course': 'machine-learning-zoomcamp'},
 {'text': 'Launch the container im

## Q5. Building a prompt

In [16]:
context_template = """
Q: {question}
A: {text}
""".strip()

In [17]:
def build_prompt(query, search_results):
    prompt_template = """
                        You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.
                        Use only the facts from the CONTEXT when answering the QUESTION.
                        
                        QUESTION: {question}
                        
                        CONTEXT: 
                        {context}
                        """.strip()

    context = ""
    
    for doc in search_results:
        context = context + f"""Q: {doc['question']}\nA: {doc['text']}\n\n""".strip()
    
    prompt = prompt_template.format(question=query, context=context)
    return prompt

In [18]:
prompt = build_prompt(query, search_results)

In [19]:
prompt

"You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.\n                        Use only the facts from the CONTEXT when answering the QUESTION.\n\n                        QUESTION: How do copy a file to a Docker container?\n\n                        CONTEXT: \n                        Q: How do I debug a docker container?\nA: Launch the container image in interactive mode and overriding the entrypoint, so that it starts a bash command.\ndocker run -it --entrypoint bash <image>\nIf the container is already running, execute a command in the specific container:\ndocker ps (find the container-id)\ndocker exec -it <container-id> bash\n(Marcos MJD)Q: How do I debug a docker container?\nA: Launch the container image in interactive mode and overriding the entrypoint, so that it starts a bash command.\ndocker run -it --entrypoint bash <image>\nIf the container is already running, execute a command in the specific container:\ndocker ps (find the cont

In [20]:
print(f'The length of the resulting prompt is: {len(prompt)}')

The length of the resulting prompt is: 2115


## Q6. Tokens

In [21]:
from openai import OpenAI

In [22]:
client = OpenAI()

In [23]:
def llm(prompt):
    response = client.chat.completions.create(
                model = 'gpt-4o',
                messages = [{"role": "user", "content":prompt}]
                 )

    return response.choices[0].message.content

In [24]:
def rag(query):
    search_results, scores = elastic_search(query)
    prompt = build_prompt(query, search_results)
    answer = llm(prompt)
    return answer

In [25]:
llm_output = rag(query)

In [26]:
llm_output

"I'm sorry, but the context provided does not contain information about copying a file to a Docker container. If you have access to additional resources or documentation, I recommend checking there for guidance on copying files to Docker containers."

In [27]:
import tiktoken

In [28]:
encoding = tiktoken.encoding_for_model("gpt-4o")

In [29]:
print(f'Our prompt has {len(encoding.encode(prompt))} tokens.')

Our prompt has 467 tokens.


In [30]:
encoding.encode(prompt)[0]

63842

In [31]:
encoding.decode_single_token_bytes(63842)

b"You're"

In [32]:
prompt

"You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.\n                        Use only the facts from the CONTEXT when answering the QUESTION.\n\n                        QUESTION: How do copy a file to a Docker container?\n\n                        CONTEXT: \n                        Q: How do I debug a docker container?\nA: Launch the container image in interactive mode and overriding the entrypoint, so that it starts a bash command.\ndocker run -it --entrypoint bash <image>\nIf the container is already running, execute a command in the specific container:\ndocker ps (find the container-id)\ndocker exec -it <container-id> bash\n(Marcos MJD)Q: How do I debug a docker container?\nA: Launch the container image in interactive mode and overriding the entrypoint, so that it starts a bash command.\ndocker run -it --entrypoint bash <image>\nIf the container is already running, execute a command in the specific container:\ndocker ps (find the cont