# video 1.2 - prep environment

In [3]:
import os

import openai
from openai import OpenAI

In [5]:
# Read the API key from the environment so it is never written into the notebook.
# The value is None when the variable is not set.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")

In [6]:
# Build the OpenAI client, passing the key read from the environment explicitly.
client = OpenAI(api_key=OPENAI_API_KEY)

In [12]:
# Baseline request: the question is sent on its own, with no retrieved context.
res = client.chat.completions.create(
    messages=[dict(role="user", content="is it too late to join the course?")],
    model='gpt-4o',
)

In [20]:
res.choices[0].message

ChatCompletionMessage(content="Whether it's too late to join a course depends on several factors, including the specific enrollment deadlines set by the institution or platform offering the course, the course structure, and whether or not the instructor allows late enrollments. Here are a few steps you can take to find out:\n\n1. **Check the Course Website or Portal**: Often, the deadlines and enrollment information are posted there.\n\n2. **Contact the Course Instructor or Administration**: If the information isn't readily available online, reaching out directly can give you a definitive answer. Explain your situation; some instructors or administrators may make exceptions.\n\n3. **Look for Rolling Admissions or Late Enrollment Policies**: Some institutions offer rolling admissions or have policies in place for late registrants.\n\n4. **Self-Paced vs. Scheduled Courses**: If the course is self-paced, you might be able to join at any time. For scheduled courses with fixed dates, joinin

# video 1.3 - retrieval

In [21]:
# -nc (no-clobber): skip the download when minsearch.py is already present.
# Without it, re-running this cell saves a second copy as minsearch.py.1 and
# the notebook keeps importing the old file.
!wget -nc https://raw.githubusercontent.com/alexeygrigorev/minsearch/main/minsearch.py

--2024-07-01 18:54:44--  https://raw.githubusercontent.com/alexeygrigorev/minsearch/main/minsearch.py
Risoluzione di raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.108.133, 185.199.111.133, ...
Connessione a raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connesso.
Richiesta HTTP inviata, in attesa di risposta... 200 OK
Lunghezza: 3832 (3,7K) [text/plain]
Salvataggio in: «minsearch.py»


2024-07-01 18:54:44 (11,8 MB/s) - «minsearch.py» salvato [3832/3832]



In [28]:
import minsearch
import json

In [29]:
# Decode explicitly as UTF-8: the FAQ text contains non-ASCII punctuation
# (curly quotes), which the platform default encoding is not guaranteed to
# decode correctly.
with open('../../llm-zoomcamp/01-intro/documents.json', 'r', encoding='utf-8') as file_input:
    docs_raw = json.load(file_input)

In [30]:
# Flatten the per-course groups into one list of FAQ entries.
# Each entry is tagged in place with the name of the course it belongs to.
documents = []

for course_dict in docs_raw:
    for doc in course_dict['documents']:
        doc.update(course=course_dict['course'])
        documents.append(doc)

In [31]:
documents[0]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp'}

In [34]:
# `question`, `text` and `section` are registered as text fields;
# `course` is registered as a keyword field (used as a filter in later cells).
index = minsearch.Index(
    keyword_fields=["course"],
    text_fields=["question", "text", "section"],
)

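Text fields are the fields the **search** is performed on (here: `question`, `text` and `section`).

Keyword fields (here: `course`) are used for exact-match filtering, similar to a `WHERE` clause in SQL:

```sql
SELECT * FROM documents WHERE course = 'data-engineering-zoomcamp';
```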

In [35]:
q = "the course has already started, can I still enroll?"

In [36]:
index.fit(documents)

<minsearch.Index at 0x11483e5c0>

In [39]:
# Per-field boost weights handed to the index together with the query.
boost = dict(question=3.0, section=0.5)

# Restrict the search to one course and keep the five best hits.
results = index.search(
    query=q,
    boost_dict=boost,
    filter_dict=dict(course='data-engineering-zoomcamp'),
    num_results=5,
)

# video 1.4 generation with OpenAI

In [73]:
q

'the course has already started, can I still enroll?'

In [74]:
# Send the bare question again, still without any FAQ context in the message.
res = client.chat.completions.create(
    messages=[dict(role='user', content=q)],
    model='gpt-4o',
)

In [75]:
res.choices[0].message.content

"Yes, you can often still enroll in a course that has already started, depending on the institution or platform offering the course. Here are some steps you can take:\n\n1. **Check the Enrollment Deadline:** Review the course details to see if there is a specific deadline for enrollment. Some courses may have a cutoff date after which new students cannot join.\n\n2. **Contact the Instructor or Administration:** Reach out to the course instructor or the academic department. Explain your situation and express your interest in joining the course. They may have the flexibility to allow late enrollment.\n\n3. **Review Course Content:** If you are allowed to enroll late, make sure to catch up on any missed lectures, assignments, or readings as soon as possible. This will help you stay on track with the rest of the class.\n\n4. **Online Platforms:** If the course is offered through an online platform (e.g., Coursera, edX, Udacity), there might be more flexibility. Some platforms allow self-pa

In [76]:
# It is useful to give an LLM a 'role': the template below names one and tells
# the model to answer only from the supplied CONTEXT.
# {question} and {context} are placeholders filled in later with str.format.

prompt_template = """
You're a course teaching assistant.

Answer the QUESTION based on the CONTEXT from the FAQ database.
Use only the facts from the CONTEXT when answering the question.
If the CONTEXT doesn't contain the answer, output NONE

QUESTION: {question}

CONTEXT: {context}
""".strip()

In [79]:
# Render every retrieved FAQ entry as a "section / question / answer" block
# and glue the blocks together into one context string.
context_parts = []

for doc in results:
    context_parts.append(
        f"section: {doc['section']}\nquestion: {doc['question']}\nanswer: {doc['text']}\n\n"
    )

context = "".join(context_parts)

In [81]:
prompt = prompt_template.format(question=q, context=context).strip()

In [83]:
# Send the assembled prompt (instructions + question + FAQ context) as a single user message.
response = client.chat.completions.create(
    messages=[dict(role='user', content=prompt)],
    model='gpt-4o',
)

In [84]:
print(response.choices[0].message.content)

Yes, even if you don't register, you're still eligible to submit the homeworks. Be aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.


# video 1.5 cleaned RAG flow

In [113]:
def search(query, course='data-engineering-zoomcamp', num_results=10):
    """Retrieve FAQ entries matching ``query`` from the global minsearch ``index``.

    Args:
        query: Free-text question to search for.
        course: Value the ``course`` keyword field must equal. Defaults to
            ``'data-engineering-zoomcamp'``, which used to be hard-coded.
        num_results: Maximum number of hits requested from the index.
            Defaults to 10, the previously hard-coded value.

    Returns:
        Whatever ``index.search`` returns; the rest of the notebook iterates
        over it as a sequence of document dicts.
    """
    # Per-field boost weights handed to the index together with the query.
    boost = {'question': 3.0, "section": 0.5}

    return index.search(
        query=query,
        filter_dict={'course': course},
        boost_dict=boost,
        num_results=num_results,
    )

In [114]:
def build_prompt(query, search_results):
    """Assemble the LLM prompt from the user's question and the retrieved FAQ entries.

    Args:
        query: The user's question; substituted for ``{question}``.
        search_results: Iterable of dicts, each providing the keys
            ``'section'``, ``'question'`` and ``'text'``.

    Returns:
        str: The instructions, the question and one
        "section / question / answer" block per search result, with leading
        and trailing whitespace stripped.

    Raises:
        KeyError: If a search result lacks one of the three required keys.
    """
    # The template is built from flush-left literals on purpose: a triple-quoted
    # string indented with the function body would carry that indentation into
    # every prompt line after the first.
    prompt_template = (
        "You're a course teaching assistant.\n"
        "\n"
        "Answer the QUESTION based on the CONTEXT from the FAQ database.\n"
        "Use only the facts from the CONTEXT when answering the question.\n"
        "If the CONTEXT doesn't contain the answer, output NONE\n"
        "\n"
        "QUESTION: {question}\n"
        "\n"
        "CONTEXT: {context}"
    )

    # Join once instead of growing the string entry by entry.
    context = "".join(
        f"section: {doc['section']}\nquestion: {doc['question']}\nanswer: {doc['text']}\n\n"
        for doc in search_results
    )

    # strip() removes the blank lines left after the last context entry.
    return prompt_template.format(question=query, context=context).strip()

In [115]:
def llm(prompt, model='gpt-4o'):
    """Send ``prompt`` to the chat-completions API and return the reply text.

    Args:
        prompt: Full prompt text, sent as a single ``user`` message.
        model: Name of the chat model to call. Defaults to ``'gpt-4o'``,
            which used to be hard-coded.

    Returns:
        The ``content`` of the first choice's message in the response.
    """
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}]
    )
    return response.choices[0].message.content

In [118]:
def rag(query):
    """Answer ``query`` end to end using the minsearch-backed ``search``.

    The question is used to retrieve FAQ entries, the entries and the question
    are turned into a prompt by ``build_prompt``, and the prompt is sent to
    ``llm``; the value returned by ``llm`` is returned unchanged.
    """
    return llm(build_prompt(query, search(query)))

In [119]:
rag('the course has started, can I still enroll?')

"Yes, even if you don't register, you're still eligible to submit the homeworks. Be aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute."

# video 1.6 searching with Elasticsearch

First, we need to run Elasticsearch:

```sh
docker run -it \
    --rm \
    --name elasticsearch \
    -p 9200:9200 \
    -p 9300:9300 \
    -e "discovery.type=single-node" \
    -e "xpack.security.enabled=false" \
    docker.elastic.co/elasticsearch/elasticsearch:8.4.3
```

In [120]:
documents[0]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp'}

In [123]:
from elasticsearch import Elasticsearch

In [124]:
es_client = Elasticsearch('http://localhost:9200')

In [126]:
# Single-node setup: one shard, no replicas.
# `course` is a keyword field (exact-match filtering); the rest are full-text fields.
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"}
        }
    }
}

index_name = "course-questions"

# Creating an index that already exists is rejected by Elasticsearch, which
# would make this cell fail on every re-run while the container is still up.
# Dropping a leftover index first keeps the cell re-runnable and ensures the
# indexing cell below starts from an empty index instead of adding duplicates.
if es_client.indices.exists(index=index_name):
    es_client.indices.delete(index=index_name)

es_client.indices.create(index=index_name, body=index_settings)

{'acknowledged': True,
 'shards_acknowledged': True,
 'index': 'course-questions'}

In [128]:
from tqdm.auto import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [130]:
# Index the FAQ entries one request at a time; tqdm shows progress.
for doc in tqdm(documents):
    es_client.index(body=doc, index=index_name)

100%|██████████████████████████████████████████████████████| 948/948 [00:04<00:00, 194.49it/s]


In [143]:
def elastic_search(query, course="data-engineering-zoomcamp", num_results=5):
    """Search the Elasticsearch index ``index_name`` for FAQ entries matching ``query``.

    Args:
        query: Free-text question, matched against the ``question`` (boosted
            via ``question^3``), ``text`` and ``section`` fields.
        course: Exact value the ``course`` field must have. Defaults to
            ``"data-engineering-zoomcamp"``, which used to be hard-coded.
        num_results: Sent as the request's ``size``. Defaults to 5, the
            previously hard-coded value.

    Returns:
        list: The ``_source`` dict of every hit, in the order the response
        lists them.
    """
    search_query = {
        "size": num_results,
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                        "fields": ["question^3", "text", "section"],
                        "type": "best_fields"
                    }
                },
                "filter": {
                    "term": {
                        "course": course
                    }
                }
            }
        }
    }
    res = es_client.search(index=index_name, body=search_query)

    # Keep only the stored documents; scores and other hit metadata are dropped.
    return [hit['_source'] for hit in res['hits']['hits']]

In [145]:
def rag(query, search_fn=None):
    """Answer ``query`` end to end: retrieve, build the prompt, ask the LLM.

    This definition replaces the earlier minsearch-backed ``rag`` in the
    notebook namespace. The retriever is injectable so that the in-memory
    backend stays usable, e.g. ``rag(query, search_fn=search)``.

    Args:
        query: The user's question.
        search_fn: Callable taking the query and returning the documents to
            put into the prompt. When omitted, ``elastic_search`` is used.

    Returns:
        The value returned by ``llm`` for the assembled prompt.
    """
    # Resolved at call time, so defining this function does not require the
    # retriever to exist yet.
    if search_fn is None:
        search_fn = elastic_search

    search_results = search_fn(query)
    prompt = build_prompt(query, search_results)
    return llm(prompt)

In [131]:
query = 'I just discovered the course. Can I still join it?'

In [146]:
rag(query)

'Yes, you can still join the course even after the start date. You are eligible to submit the homework, but be mindful of deadlines for turning in the final projects.'