In [1]:
import openai

In [None]:
from openai import OpenAI

import yaml
# Open the file
with open('api_keys.yml', 'r') as file:
    # Load the data from the file
    data = yaml.safe_load(file)
    
# Get the API key
openai_api_key = data['OPENAI_API_KEY']

In [None]:
client = OpenAI(api_key=openai_api_key)

In [4]:
response = client.chat.completions.create(
    model='gpt-4o-mini',
    messages=[{"role": "user", "content": "is it too late to join the course?"}]
)

In [5]:
response.choices[0].message.content

"Whether it's too late to join a course depends on the specific course and its enrollment policies. Many courses have different deadlines for enrollment, and some may allow late registration. It's best to check with the course organizer or institution directly for accurate information about their enrollment process and deadlines."

In [6]:
from elasticsearch import Elasticsearch

In [7]:
es_client = Elasticsearch('http://localhost:9200')

In [8]:
es_client.info()

ObjectApiResponse({'name': 'cfb860c720ec', 'cluster_name': 'docker-cluster', 'cluster_uuid': 'UzPj39baSoucc-qED6kPYA', 'version': {'number': '9.0.2', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '0a58bc1dc7a4ae5412db66624aab968370bd44ce', 'build_date': '2025-05-28T10:06:37.834829258Z', 'build_snapshot': False, 'lucene_version': '10.1.0', 'minimum_wire_compatibility_version': '8.18.0', 'minimum_index_compatibility_version': '8.0.0'}, 'tagline': 'You Know, for Search'})

In [9]:
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"} 
        }
    }
}

index_name = "course-questions"

es_client.indices.create(index=index_name, body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'course-questions'})

In [10]:
import requests 

docs_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main/01-intro/documents.json?raw=1'
docs_response = requests.get(docs_url)
documents_raw = docs_response.json()

documents = []

for course in documents_raw:
    course_name = course['course']

    for doc in course['documents']:
        doc['course'] = course_name
        documents.append(doc)

In [11]:
documents[0]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp'}

In [12]:
from tqdm.auto import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [13]:
for doc in tqdm(documents):
    es_client.index(index=index_name, document=doc)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 948/948 [00:04<00:00, 224.79it/s]


In [14]:
query = 'How do copy a file to a Docker container?'

In [15]:
search_query = {
    "size": 5,
    "query": {
        "bool": {
            "must": {
                "multi_match": {
                    "query": query,
                    "fields": ["question^4", "text"],
                    "type": "best_fields"
                }
            },
            "filter": {
                "term": {
                    "course": "machine-learning-zoomcamp"
                }
            }
        }
    }
}

In [16]:
results = es_client.search(index=index_name, body=search_query)

In [42]:
result_docs = []

for hit in results['hits']['hits']:
    result_docs.append(hit['_source'])

In [43]:
context_template = """
Q: {question}

A: {text}
""".strip()

In [49]:
context = ""

for doc in result_docs:
    context = context+context_template.format(question={doc['question']},text={doc['text']}).strip()+"\n\n"

In [50]:
prompt_template = """
You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.
Use only the facts from the CONTEXT when answering the QUESTION.

QUESTION: {question}

CONTEXT:
{context}
""".strip()

In [51]:
prompt = prompt_template.format(question=query, context=context).strip()

In [52]:
len(prompt)

2230

In [54]:
import tiktoken

In [55]:
encoding = tiktoken.encoding_for_model("gpt-4o")

In [56]:
encode

NameError: name 'encode' is not defined

In [59]:
encoding.encode(prompt)

[63842,
 261,
 4165,
 14029,
 29186,
 13,
 30985,
 290,
 150339,
 4122,
 402,
 290,
 31810,
 8099,
 591,
 290,
 40251,
 7862,
 558,
 8470,
 1606,
 290,
 19719,
 591,
 290,
 31810,
 8099,
 1261,
 55959,
 290,
 150339,
 364,
 107036,
 25,
 3253,
 621,
 5150,
 261,
 1974,
 316,
 261,
 91238,
 9282,
 1715,
 10637,
 50738,
 734,
 48,
 25,
 11881,
 5299,
 621,
 357,
 15199,
 261,
 62275,
 9282,
 48511,
 943,
 32,
 25,
 11881,
 35423,
 290,
 9282,
 3621,
 306,
 25383,
 6766,
 326,
 151187,
 290,
 7251,
 4859,
 11,
 813,
 484,
 480,
 13217,
 261,
 38615,
 6348,
 15043,
 301,
 19940,
 2461,
 533,
 278,
 2230,
 7962,
 4859,
 38615,
 464,
 3365,
 16656,
 77,
 3335,
 290,
 9282,
 382,
 4279,
 6788,
 11,
 15792,
 261,
 6348,
 306,
 290,
 4857,
 9282,
 16008,
 301,
 19940,
 10942,
 350,
 6555,
 290,
 9282,
 26240,
 23428,
 301,
 19940,
 25398,
 533,
 278,
 464,
 6896,
 26240,
 29,
 38615,
 3392,
 6103,
 277,
 10732,
 391,
 79771,
 45517,
 943,
 48,
 25,
 11881,
 5299,
 621,
 357,
 5150,
 6291,
 591,