In [14]:
import minsearch
import json

In [15]:
# Load the raw FAQ dump. JSON text is UTF-8, so pin the encoding instead of
# relying on the platform's locale default (which differs across systems).
with open('documents.json', 'rt', encoding='utf-8') as f_in:
    docs_raw = json.load(f_in)

In [16]:
# Flatten the per-course groups into one list of documents and tag each
# document with the course it came from. Every document is copied, so the
# dicts inside docs_raw are not modified and the cell can be re-run safely.
documents = [
    {**doc, 'course': course_dict['course']}  # copy of doc plus the 'course' key
    for course_dict in docs_raw
    for doc in course_dict['documents']
]


In [17]:
# Spot-check one flattened entry to confirm the 'course' key was attached.
documents[1]

{'text': 'GitHub - DataTalksClub data-engineering-zoomcamp#prerequisites',
 'section': 'General course-related questions',
 'question': 'Course - What are the prerequisites for this course?',
 'course': 'data-engineering-zoomcamp'}

In [18]:
# search engine

index = minsearch.Index(
    # Field used for filtering (see the filter on "course" in search()).
    keyword_fields=["course"],
    # Free-text fields handed to the index for matching.
    text_fields=["question", "text", "section"],
)

In [20]:
# keyword_fields work like a SQL filter, e.g. SELECT * WHERE course = "data-engineering-zoomcamp" (the equivalent of a filter in Elasticsearch)

In [21]:
# Feed the flattened documents to the index. fit() returns the Index object,
# which is why the cell echoes its repr.
index.fit(documents) # Now we can use the search engine

<minsearch.Index at 0x794b2a66f1f0>

In [22]:
# from openai import OpenAI

# client = OpenAI()

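# NOTE: this OpenAI call did not work here, so it is left commented out; the Mistral client below is used instead.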

# resp = client.chat.completions.create(
#     model='gpt-3.5-turbo-16k',
#     messages=[{"role": "user", "content":q}]
# )

In [23]:
# %pip install "mistralai<1.0"  # run once; the MistralClient / ChatMessage imports below need the pre-1.0 SDK

In [24]:
import os
from mistralai.client import MistralClient
from mistralai.models.chat_completion import ChatMessage

# Client for the Mistral API, created with default arguments.
# NOTE(review): no API key is passed here - presumably it is read from the environment; confirm.
# NOTE(review): the original note said "Available until August 2" - presumably when the API access expires; confirm.
client = MistralClient()

# Model identifier passed along with each chat request in llm().
model = 'mistral-medium-latest'

# chat_response = client.chat(
#     model=model,
#     messages=[ChatMessage(role="user", content=q)]
# )

# print(chat_response.choices[0].message.content)

In [25]:
def search(query, course="data-engineering-zoomcamp", num_results=5):
    """Query the FAQ index and return the best-matching documents.

    Args:
        query: Free-text question to search for.
        course: Value the documents' ``course`` field must match. Defaults to
            the course that used to be hard-coded here; pass ``None`` to
            search across all courses.
        num_results: Maximum number of documents to request from the index.

    Returns:
        Whatever ``index.search`` returns for these arguments.
    """
    # Boost factors per text field: "question" gets 3.0, "section" gets 0.5.
    boost = {"question": 3.0, "section": 0.5}
    # An empty filter dict means no keyword filtering is applied.
    filter_dict = {} if course is None else {"course": course}

    return index.search(
        query=query,
        boost_dict=boost,
        num_results=num_results,
        filter_dict=filter_dict,
    )

In [32]:
# Turn the document to the prompt
def build_prompt(query, search_results):
    """Render the LLM prompt for a question and its retrieved FAQ entries.

    Args:
        query: The user's question; substituted for ``{question}``.
        search_results: Iterable of dicts with ``section``, ``question`` and
            ``text`` keys; each one becomes an entry of the CONTEXT part.

    Returns:
        The filled-in template with surrounding whitespace stripped.
    """
    # Only the outer whitespace of the literal is stripped; the inner lines
    # keep the indentation they have here.
    template = """
    You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.
    Use only the facts from the CONTEXT when answering the QUESTION.
    
    QUESTION: {question}
    
    CONTEXT: 
    {context}
    """.strip()

    # One "section / question / answer" entry per hit, each followed by a blank line.
    entries = [
        f"section: {hit['section']}\nquestion: {hit['question']}\nanswer: {hit['text']}\n\n"
        for hit in search_results
    ]

    filled = template.format(question=query, context="".join(entries))
    # The final strip removes the blank line left after the last entry.
    return filled.strip()


In [35]:
def llm(prompt):
    """Send ``prompt`` as a single user message and return the reply text.

    Uses the module-level ``client`` and ``model``.

    Returns:
        The ``content`` of the first choice in the chat response.
    """
    user_message = ChatMessage(role="user", content=prompt)
    response = client.chat(model=model, messages=[user_message])
    first_choice = response.choices[0]
    return first_choice.message.content

In [49]:
# RAG flow

def rag(query):
    """Answer ``query`` end to end: search, build the prompt, ask the LLM.

    Returns:
        The value ``llm`` returns for the prompt built from the search
        results of ``query``.
    """
    hits = search(query)
    return llm(build_prompt(query, hits))

In [50]:
# Example question passed to rag() in the next cell.
query = "What is this course about?"

In [51]:
# Run the full pipeline on the example question; the returned answer is shown as the cell output.
rag(query)

'According to the provided context, the course is about data engineering and will involve the use of various tools such as Google Cloud, Google Cloud SDK, Python 3 with Anaconda, Terraform, and Git. It is expected to require about 5-15 hours per week, depending on the student\'s background and previous experience. The exact start date of the course is January 15, 2024 at 17h00. Prerequisites for the course can be found in the GitHub repository for the course. Students are expected to create a GitHub account and clone the course repository to their local machine. They may also create their own repositories for their notes and versions of their files.\n\nTo answer the question "What is this course about?" based on the provided context, this course is about data engineering and will cover topics such as Google Cloud, Google Cloud SDK, Python 3 with Anaconda, Terraform, and Git. It is expected to require a time commitment of 5-15 hours per week. Students will need to have a GitHub account 