In [1]:
# Fetch the single-file minsearch module into the working directory so the
# `import minsearch` in the next cell can find it. Any existing copy is removed
# first so the download lands under the expected file name.
# NOTE(review): the URL tracks the `main` branch, so the downloaded code may
# change between runs — consider pinning to a commit.
!rm -f minsearch.py
!wget https://raw.githubusercontent.com/alexeygrigorev/minsearch/main/minsearch.py

--2024-06-23 19:36:56--  https://raw.githubusercontent.com/alexeygrigorev/minsearch/main/minsearch.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.108.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 3832 (3.7K) [text/plain]
Saving to: ‘minsearch.py’


2024-06-23 19:36:57 (9.15 MB/s) - ‘minsearch.py’ saved [3832/3832]



In [2]:
import requests 
import minsearch

docs_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main/01-intro/documents.json?raw=1'
# The timeout (seconds) keeps this cell from hanging indefinitely on a stalled connection.
docs_response = requests.get(docs_url, timeout=30)
# Fail loudly on a 4xx/5xx reply instead of trying to JSON-decode an error page.
docs_response.raise_for_status()
documents_raw = docs_response.json()

# Flatten the per-course grouping into one list of FAQ entries, tagging each
# entry with the name of the course it came from. Each entry is copied so that
# documents_raw is left exactly as downloaded.
documents = [
    {**doc, 'course': course['course']}
    for course in documents_raw
    for doc in course['documents']
]

# "question", "text" and "section" are searched as free text; "course" is an
# exact-match field used for filtering in search().
index = minsearch.Index(
    text_fields=["question", "text", "section"],
    keyword_fields=["course"]
)

index.fit(documents)

<minsearch.Index at 0x783971bbe470>

In [3]:
def search(query, course='data-engineering-zoomcamp', num_results=5):
    """Look up the FAQ entries in the notebook-level ``index`` that best match ``query``.

    Args:
        query: Free-text question to search for.
        course: Value the ``course`` keyword field must equal. Defaults to
            'data-engineering-zoomcamp', so ``search(query)`` behaves as before.
        num_results: Maximum number of entries requested from the index.

    Returns:
        Whatever ``index.search`` returns for these arguments.
    """
    # Matches in the question field are weighted 3.0, matches in the section field 0.5.
    boost = {'question': 3.0, 'section': 0.5}

    return index.search(
        query=query,
        filter_dict={'course': course},
        boost_dict=boost,
        num_results=num_results,
    )

In [4]:
def build_prompt(query, search_results):
    """Render the prompt sent to the model for ``query``.

    Each item of ``search_results`` must provide 'section', 'question' and
    'text' keys; the items are rendered in order beneath the CONTEXT header.
    Leading and trailing whitespace is stripped from the finished prompt.
    """
    template = (
        "You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.\n"
        "Use only the facts from the CONTEXT when answering the QUESTION.\n"
        "\n"
        "QUESTION: {question}\n"
        "\n"
        "CONTEXT: \n"
        "{context}"
    )

    # One block per retrieved entry, each terminated by a blank line.
    entries = [
        f"section: {hit['section']}\nquestion: {hit['question']}\nanswer: {hit['text']}\n\n"
        for hit in search_results
    ]

    filled = template.format(question=query, context="".join(entries))
    return filled.strip()

def llm(prompt, model='phi3'):
    """Send ``prompt`` to the chat model as a single user message and return the reply text.

    Args:
        prompt: Full prompt text to send.
        model: Name of the model to request. Defaults to 'phi3', so
            ``llm(prompt)`` behaves as before.

    Returns:
        The message content of the first choice in the response.

    Uses the notebook-level ``client``, which only needs to exist by the time
    this function is called.
    """
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}]
    )

    return response.choices[0].message.content

In [6]:
def rag(query):
    """Answer ``query``: retrieve matching documents, build a prompt from them, ask the model."""
    retrieved = search(query)
    return llm(build_prompt(query, retrieved))

In [9]:
from openai import OpenAI

# OpenAI-compatible client pointed at a server on this machine rather than the
# hosted OpenAI API; llm() sends its chat requests through this object.
# NOTE(review): port 11434 and the 'ollama' key suggest a local Ollama server —
# confirm it is running before executing the cells below.
client = OpenAI(
    base_url='http://localhost:11434/v1/',
    api_key='ollama',  # presumably a placeholder rather than a real secret — TODO confirm
)

In [10]:
# Smoke test: send a trivial prompt straight to the model, bypassing retrieval.
llm('write that this is a test')

' This is a test message indicating the intention to create or demonstrate a simple output, typically used as an example or placeholder. Here\'s how one might construct such a statement:\n\n"This message serves as a demonstration of basic text communication. It exemplifies the straightforward conveyance of information without any specific context or purpose."'

In [11]:
# NOTE(review): `_` is IPython's "most recent cell output" variable, so this
# prints the model reply only when run right after the cell that produced it.
# Capturing that reply in a named variable would make this cell re-runnable.
print(_)

 This is a test message indicating the intention to create or demonstrate a simple output, typically used as an example or placeholder. Here's how one might construct such a statement:

"This message serves as a demonstration of basic text communication. It exemplifies the straightforward conveyance of information without any specific context or purpose."


In [12]:
# End-to-end run of the pipeline: search the FAQ index, build the prompt, ask the model.
rag("I just find the course. Can I still join it?")

" Yes, according to our FAQs, even after the course starts on January 15th, 2024 at 17:00, you can still follow the course materials at your own pace as they will be kept after the course finishes. Additionally, there are resources such as Google Calendar for live sessions and Telegram channels for announcements to stay informed about the course. However, it's important to note that specific deadlines exist for submitting final projects, so you shouldn't wait until the last minute.\n\nBefore the course starts, make sure to have a Google Cloud account, install and set up Google Cloud SDK, Python 3 (installed with Anaconda), Terraform, and Git as they are mentioned in the prerequisites and requirements for this course. It would also be beneficial if you review the syllabus beforehand to assess your familiarity with these subjects."