In [None]:
# set up and test simple commands with LLM 
# swapped OpenAI for Mistral due to expired free tier quota

In [None]:
# import mistral 
from mistralai.client import MistralClient
from mistralai.models.chat_completion import ChatMessage

In [None]:
import os

In [None]:
# read the Mistral API key from the MISTRAL_KEY environment variable
# (the lookup yields None when the variable is not set)
api_key = os.environ.get("MISTRAL_KEY")

In [None]:
client = MistralClient(api_key = api_key)

In [None]:
# ask it a question: build the single user message first, then send it
user_message = ChatMessage(role = "user", content = "Is it too late to join the course?")
response = client.chat(model = 'mistral-tiny', messages = [user_message])

In [None]:
print(response.choices[0].message.content)

In [None]:
# set up a search using minsearch from Alexey Grigorev
!wget https://raw.githubusercontent.com/alexeygrigorev/minsearch/main/minsearch.py

In [None]:
import minsearch

In [None]:
import json

In [None]:
# get the document
!wget https://raw.githubusercontent.com/DataTalksClub/llm-zoomcamp/main/01-intro/documents.json

In [None]:
# Load the downloaded FAQ file. The encoding is given explicitly so the read
# does not depend on the platform's default locale encoding.
with open('documents.json', 'rt', encoding='utf-8') as f_in:
    docs_raw = json.load(f_in)

In [None]:
# Flatten the per-course groups into one list of FAQ entries, tagging each
# entry with the name of the course it belongs to. Every entry is copied, so
# docs_raw is left unmodified and the cell gives the same result on re-run.
documents = [
    {**doc, 'course': course_dict['course']}
    for course_dict in docs_raw
    for doc in course_dict['documents']
]

In [None]:
index = minsearch.Index(
    text_fields = ["question", "text", "section"],
    keyword_fields = ["course"]
)

In [None]:
# index (fit the index to) the documents
index.fit(documents)

In [None]:
# some query (question) 
query = 'the course has already started, can I still enroll?'

In [None]:
# search with per-field weights: raise the weight of `question`, lower `section`
boost = {'question': 3.0, 'section': 0.5}
# to search only one specific course, also pass
#   filter_dict = {'course': 'data-engineering-zoomcamp'}
results = index.search(query, boost_dict = boost, num_results = 5)

In [None]:
print(results)

In [None]:
# combine LLM and search

In [None]:
# create the prompt template
prompt_template = """
You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database. 
Use only the facts from the CONTEXT when answering the QUESTION.
If the CONTEXT doesn't contain the answer, output NONE

QUESTION: {question}

CONTEXT: {context}
""".strip()

In [None]:
# define the context: one "section / question / answer" stanza per search result
context_chunks = []
for doc in results:
    context_chunks.append(
        f"section: {doc['section']}\nquestion: {doc['question']}\nanswer: {doc['text']}\n\n"
    )
context = "".join(context_chunks)

In [None]:
prompt = prompt_template.format(question = query, context = context).strip()

In [None]:
print(prompt)

In [None]:
# feed the assembled prompt to the LLM and show the text of the first choice
prompt_message = ChatMessage(role = "user", content = prompt)
response = client.chat(model = "mistral-large-latest", messages = [prompt_message])
answer_text = response.choices[0].message.content
print(answer_text)

In [None]:
# clean up the flow

In [None]:
# define a function for search
def search(query, filter_dict=None, num_results=10):
    """Query the minsearch ``index`` for documents matching ``query``.

    Args:
        query: Free-text question to search for.
        filter_dict: Optional filter forwarded to ``index.search``, e.g.
            ``{'course': 'data-engineering-zoomcamp'}`` to ask about one
            specific course. When omitted or empty, no filter is passed.
        num_results: Value forwarded as ``num_results`` (defaults to 10).

    Returns:
        The result of ``index.search``.
    """
    # Raise the weight of the `question` field and lower `section`.
    boost = {'question': 3.0, 'section': 0.5}

    search_kwargs = {'boost_dict': boost, 'num_results': num_results}
    if filter_dict:
        # Only forwarded on request, so the default call is unchanged.
        search_kwargs['filter_dict'] = filter_dict

    return index.search(query, **search_kwargs)
    

In [None]:
# define a function for building a prompt
def build_prompt(query, search_results):
    """Assemble the LLM prompt for a question and its retrieved FAQ entries.

    Args:
        query: The user's question, inserted after ``QUESTION:``.
        search_results: Iterable of dicts; each one is read for the keys
            ``section``, ``question`` and ``text``.

    Returns:
        The formatted prompt with surrounding whitespace stripped.
    """
    # The template is built from separate literals so that this function's
    # indentation does not leak into the text sent to the model (an indented
    # triple-quoted string would prefix every line after the first with spaces).
    prompt_template = (
        "You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database. \n"
        "Use only the facts from the CONTEXT when answering the QUESTION.\n"
        "If the CONTEXT doesn't contain the answer, output NONE\n"
        "\n"
        "QUESTION: {question}\n"
        "\n"
        "CONTEXT: {context}"
    )

    # One "section / question / answer" stanza per retrieved document.
    context = "".join(
        f"section: {doc['section']}\nquestion: {doc['question']}\nanswer: {doc['text']}\n\n"
        for doc in search_results
    )

    # strip() drops the blank lines left after the last stanza.
    return prompt_template.format(question=query, context=context).strip()
    

In [None]:
# define a function to talk to LLM
def llm(prompt, model="mistral-large-latest"):
    """Send ``prompt`` to the Mistral chat endpoint and return the reply text.

    Args:
        prompt: Text sent as the content of a single ``user`` message.
        model: Model name passed to ``client.chat``; defaults to the model
            this notebook used before the parameter existed.

    Returns:
        The message content of the first choice in the response.
    """
    response = client.chat(
        model=model,
        messages=[ChatMessage(role="user", content=prompt)],
    )
    return response.choices[0].message.content

In [None]:
# call these functions from one wrapper function
query = 'how do I run kafka?'

def rag(query):
    """Answer ``query``: search the index, build the prompt, ask the LLM."""
    retrieved_docs = search(query)
    return llm(build_prompt(query, retrieved_docs))

In [None]:
print(rag('the course has already started, can I enroll now?'))

In [None]:
# replace Alexey's search with elasticsearch

In [None]:
from elasticsearch import Elasticsearch

In [None]:
# create elastic search client
es_client = Elasticsearch('http://localhost:9200')

In [None]:
# check that it worked and retrieve info for hw1 Q1
es_client.info()['version']['build_hash']

In [None]:
# create an index
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"} # we use course names for filtering
        }
    }
}

index_name = "course-questions"

# Drop any index left over from an earlier run so this cell can be re-run:
# creating an index that already exists raises an error, and indexing the
# documents again into a surviving index would store every one of them twice.
# ignore_unavailable makes the delete a no-op when the index does not exist.
es_client.indices.delete(index = index_name, ignore_unavailable = True)
es_client.indices.create(index = index_name, body = index_settings)

In [None]:
from tqdm.auto import tqdm

In [None]:
# index the docs, one request per document; tqdm reports progress
for doc in tqdm(documents):
    es_client.index(index = index_name, document = doc)

In [None]:
query = 'I just discovered the course. Can I still join it?'

In [None]:
def elastic_search(query, course="data-engineering-zoomcamp", size=5):
    """Search the Elasticsearch index ``index_name`` for FAQ entries.

    Args:
        query: Free-text question matched against the ``question``, ``text``
            and ``section`` fields.
        course: Value the ``course`` keyword field must equal. Defaults to
            the course this notebook originally hard-coded; pass ``None`` to
            search across all courses.
        size: Value sent as the ``size`` of the search request (default 5).

    Returns:
        A list with the ``_source`` dict of every hit in the response.
    """
    bool_query = {
        "must": {
            "multi_match": {
                "query": query,
                # "^3" boosts the `question` field by a factor of 3
                "fields": ["question^3", "text", "section"],
                "type": "best_fields"
            }
        }
    }
    if course is not None:
        # Restrict the hits to a single course.
        bool_query["filter"] = {"term": {"course": course}}

    search_query = {"size": size, "query": {"bool": bool_query}}

    response = es_client.search(index=index_name, body=search_query)
    return [hit['_source'] for hit in response['hits']['hits']]

In [None]:
search_results = elastic_search(query)
print(search_results)

In [None]:
# adjust rag function
def rag_es(query):
    """Same pipeline as ``rag``, with retrieval done by ``elastic_search``."""
    es_hits = elastic_search(query)
    return llm(build_prompt(query, es_hits))

In [None]:
rag_es("The course has already started, can I still join?")

In [None]:
# TODO: HW