# Intro

In [1]:
import os 
from openai import OpenAI

# Read the OpenAI API key from the LLM_ZOOMCAMP_OPEN_AI environment variable.
# Indexing os.environ directly raises KeyError right here if it is not set.
api_key = os.environ['LLM_ZOOMCAMP_OPEN_AI']

# Client used for the chat completion requests in the cells below.
client = OpenAI(api_key=api_key)

In [2]:
# Ask the model the question directly, as a single user message with no FAQ context.
response = client.chat.completions.create(
    model='gpt-3.5-turbo',
    messages=[{
        'role': 'user', 
        'content': 'is it too late to join the course?',
    }]
)

In [3]:
# Text of the first choice in the response.
response.choices[0].message.content

"It depends on the course and the policies of the institution offering it. Some courses may have specific enrollment deadlines, while others may allow for late enrollment with instructor permission. It's best to contact the institution offering the course to inquire about their enrollment policies and see if it is possible to join after the official start date."

# 1.3 Retrieval and Search

In [4]:
import minsearch
import json

In [5]:
# Load the raw FAQ data. The encoding is named explicitly because the FAQ text
# contains non-ASCII characters (curly quotes, for instance); without it, open()
# falls back to the platform's default encoding, which is not UTF-8 everywhere.
with open('documents.json', 'rt', encoding='utf-8') as f_in:
    docs_raw = json.load(f_in)

In [6]:
# Flatten the per-course structure into one list of FAQ entries, tagging each
# entry with the course it belongs to.
documents = []

for course_dict in docs_raw:
    for doc in course_dict['documents']:
        # Append a copy carrying the extra 'course' key instead of writing into
        # the loaded dict, so docs_raw stays exactly as it was read from disk.
        documents.append({**doc, 'course': course_dict['course']})

In [7]:
# Peek at the first flattened entry to check which fields it carries.
documents[0]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp'}

In [8]:
# minsearch index over the FAQ entries: 'question', 'text' and 'section' are
# registered as text fields, 'course' as a keyword field (the searches below
# pass it in filter_dict).
index = minsearch.Index(
    text_fields=['question', 'text', 'section'],
    keyword_fields=['course']
)

In [9]:
# Fit the index on the flattened FAQ entries.
index.fit(documents)

<minsearch.Index at 0x76c4f90ea1d0>

In [10]:
# Sample user question reused by the search, baseline and prompt cells that follow.
q = 'the course has already started, can i still enroll?'

In [11]:
# Per-field boosts handed to the index: 3.0 for 'question', 0.5 for 'section';
# 'text' is not listed here.
boost = {'question': 3.0, 'section': 0.5}

# Retrieve up to five entries for q, restricted to the data-engineering course.
results = index.search(
    query=q,
    filter_dict={'course': 'data-engineering-zoomcamp'},
    boost_dict=boost,
    num_results=5
)

In [12]:
# First entry of the search results.
results[0]

{'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.",
 'section': 'General course-related questions',
 'question': 'Course - Can I still join the course after the start date?',
 'course': 'data-engineering-zoomcamp'}

# 1.4 Generating answers with OpenAI GPT-4o

In [13]:
# Same construction as the client created in the first cell (same api_key);
# rebinding `client` here is redundant in a top-to-bottom run.
client = OpenAI(api_key=api_key)

In [14]:
# Baseline: send the bare question q to gpt-4o with no retrieved context.
response = client.chat.completions.create(
    model='gpt-4o',
    messages=[{'role': 'user', 'content': q}]
)

In [15]:
# Text of the first choice in the baseline response.
response.choices[0].message.content

'Whether you can still enroll in a course that has already started depends on the specific policies of the educational institution offering the course. Here are some steps you can take to find out if late enrollment is possible:\n\n1. **Check the Course Enrollment Policy**: Look up the course or institution’s official website and review their policies regarding late enrollment.\n\n2. **Contact the Instructor**: Reach out to the course instructor or professor to ask if it’s possible to join the class late. They might make exceptions based on the situation.\n\n3. **Contact Administration**: Speak with the admissions office, registrar, or academic advisor. They may have procedures in place for late enrollment and can guide you through the process.\n\n4. **Assess the Impact**: Consider how much material has already been covered and whether you can realistically catch up. Ask the instructor if they can provide you with resources or additional support to help you get up to speed.\n\n5. **Fin

In [16]:
# Instruction prompt with two placeholders, {question} and {context}.
# .strip() removes the newlines that sit right after the opening and right
# before the closing triple quote.
prompt_template = """
You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database. 
Use only the facts from the CONTEXT when answering.
If the CONTEXT doesn't contain the answer, output NONE.

QUESTION: {question}

CONTEXT:
{context}
""".strip()

# Context text: one "section / question / answer" entry per retrieved document,
# each followed by a blank line.
context = ""

for doc in results:
    context = context + f"section: {doc['section']}\nquestion: {doc['question']}\nanswer: {doc['text']}\n\n"

# The final .strip() drops the blank line left after the last entry.
prompt = prompt_template.format(question=q, context=context).strip()

In [17]:
# Same model as the baseline, but the message is now the prompt that embeds the
# retrieved FAQ entries.
response = client.chat.completions.create(
    model='gpt-4o',
    messages=[{'role': 'user', 'content': prompt}],
)

In [18]:
# Text of the first choice in the context-grounded response.
response.choices[0].message.content

"Yes, even if you don't register, you're still eligible to submit the homeworks. Be aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute."

# 1.5 RAG flow: cleaning and modularizing

In [19]:
def search(query, course='data-engineering-zoomcamp', num_results=20):
    """Search the module-level minsearch `index` for FAQ entries matching `query`.

    Args:
        query: Free-text question to search for.
        course: Value the 'course' keyword field is filtered on. Defaults to
            the data-engineering course, the value that used to be hard-coded.
        num_results: Maximum number of entries requested from the index.
            Defaults to 20, the value that used to be hard-coded.

    Returns:
        Whatever `index.search` returns for these arguments.
    """
    # Per-field boosts: 3.0 for 'question', 0.5 for 'section'.
    boost = {'question': 3.0, 'section': 0.5}

    return index.search(
        query=query,
        filter_dict={'course': course},
        boost_dict=boost,
        num_results=num_results,
    )

In [20]:
def build_prompt(query, search_results):
    """Render the teaching-assistant prompt for `query`.

    Args:
        query: The user's question; substituted for {question}.
        search_results: Iterable of dicts, each with 'section', 'question'
            and 'text' keys; rendered one after another into {context}.

    Returns:
        The filled-in prompt with surrounding whitespace stripped.
    """
    template = """
You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database. 
Use only the facts from the CONTEXT when answering.
If the CONTEXT doesn't contain the answer, output NONE.

QUESTION: {question}

CONTEXT:
{context}
    """.strip()

    # One block per retrieved entry, each terminated by a blank line.
    entries = [
        f"section: {doc['section']}\nquestion: {doc['question']}\nanswer: {doc['text']}\n\n"
        for doc in search_results
    ]
    context = "".join(entries)

    # strip() removes the blank line left behind by the last entry.
    return template.format(question=query, context=context).strip()

In [21]:
def llm(prompt):
    """Send `prompt` to gpt-4o as a single user message and return the reply text.

    Uses the module-level `client`; the text returned is the message content of
    the first choice in the response.
    """
    user_message = {'role': 'user', 'content': prompt}
    completion = client.chat.completions.create(
        model='gpt-4o',
        messages=[user_message],
    )
    first_choice = completion.choices[0]
    return first_choice.message.content

In [22]:
def rag(query):
    """Answer `query` in three steps: search the index, build the prompt, call the model."""
    return llm(build_prompt(query, search(query)))

In [23]:
# Run the whole search -> prompt -> model pipeline on a new question.
rag('how is grading done?')

'Grading for the course is done through peer evaluation of the capstone project. Each submitted project will be evaluated by three randomly assigned students who have also submitted their projects. As part of this system, you are also responsible for grading the projects of three fellow students. Non-compliance with this rule will result in failing to achieve the Certificate at the end of the course. Your final grade will be the median score of the grades given by the peer reviewers. The peer review criteria must follow the guidelines provided in the course materials.\n\n'

# 1.6 Search with Elasticsearch

In [24]:
from elasticsearch import Elasticsearch

In [25]:
# Elasticsearch client pointed at localhost:9200 over plain HTTP.
es_client = Elasticsearch('http://localhost:9200')

In [26]:
# Ask the server for its basic info; also shows that the client can reach it.
es_client.info()

ObjectApiResponse({'name': '58fddc2cbafa', 'cluster_name': 'docker-cluster', 'cluster_uuid': 'aTYD9xtgTGK1LyeoxDcbfg', 'version': {'number': '8.4.3', 'build_flavor': 'default', 'build_type': 'docker', 'build_hash': '42f05b9372a9a4a470db3b52817899b99a76ee73', 'build_date': '2022-10-04T07:17:24.662462378Z', 'build_snapshot': False, 'lucene_version': '9.3.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'})

In [28]:
# Settings and field mappings for the FAQ index: one shard and no replicas;
# 'text', 'section' and 'question' are mapped as text fields, while 'course' is
# mapped as keyword (elastic_search filters on it with a term query).
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"}
        }
    }
}

In [29]:
index_name = "course-questions"

# Elasticsearch rejects a create request for an index name that is already
# taken, which would stop a second run of this notebook at this cell. Only
# create the index when it is missing; index_creation is None when nothing
# was created.
# NOTE: an existing index keeps its documents, so re-running the indexing loop
# afterwards adds the same documents again.
if es_client.indices.exists(index=index_name):
    index_creation = None
else:
    index_creation = es_client.indices.create(index=index_name, body=index_settings)

In [31]:
from tqdm.auto import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [33]:
# Index the FAQ entries one request at a time. The loop is wrapped in tqdm
# (imported in the cell above) so this slow step shows a progress bar.
for doc in tqdm(documents):
    es_client.index(index=index_name, document=doc)

In [41]:
def elastic_search(query, course="data-engineering-zoomcamp", size=5):
    """Search the Elasticsearch FAQ index and return the matching documents.

    Uses the module-level `es_client` and `index_name`.

    Args:
        query: Free-text question to search for.
        course: Value the 'course' field must equal (term filter). Defaults to
            the data-engineering course, the value that used to be hard-coded.
        size: Maximum number of hits to request. Defaults to 5, the value that
            used to be hard-coded.

    Returns:
        A list with the `_source` of every hit, in the order the response
        lists them.
    """
    search_query = {
        "size": size,
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                        # "^3" boosts the question field by a factor of three.
                        "fields": ["question^3", "text", "section"],
                        "type": "best_fields"
                    }
                },
                "filter": {
                    "term": {
                        "course": course
                    }
                }
            }
        }
    }

    response = es_client.search(index=index_name, body=search_query)

    # Keep only the stored documents; scores and other hit metadata are dropped.
    return [hit['_source'] for hit in response["hits"]["hits"]]

In [45]:
# Sample question for the Elasticsearch-backed search.
# Note: this rebinds `results`, which previously held the minsearch results.
query="I just discovered the course, can I still join it?"
results = elastic_search(query=query)

In [46]:
def rag(query):
    """Answer `query` using Elasticsearch for retrieval.

    Same search -> prompt -> model pipeline as the `rag` defined earlier in
    the notebook, whose name this definition rebinds; only the retrieval step
    differs (elastic_search instead of search).
    """
    return llm(build_prompt(query, elastic_search(query)))

In [47]:
# Run the Elasticsearch-backed pipeline on the sample question.
rag(query)

"Yes, even if you don't register, you're still eligible to submit the homeworks. Be aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute."