## Set up minsearch

In [1]:
# Remove any previously downloaded copy first, then fetch minsearch.py from the
# `main` branch of the alexeygrigorev/minsearch repository into the working directory.
!rm -f minsearch.py
!wget https://raw.githubusercontent.com/alexeygrigorev/minsearch/main/minsearch.py

--2024-07-09 13:30:24--  https://raw.githubusercontent.com/alexeygrigorev/minsearch/main/minsearch.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.111.133, 185.199.108.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.111.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 3832 (3.7K) [text/plain]
Saving to: ‘minsearch.py’


2024-07-09 13:30:24 (33.3 MB/s) - ‘minsearch.py’ saved [3832/3832]



In [2]:
import requests 
import minsearch

# Download the course FAQ documents as JSON.
docs_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main/01-intro/documents.json?raw=1'
docs_response = requests.get(docs_url)
# Fail fast on an HTTP error instead of trying to parse an error page as JSON.
docs_response.raise_for_status()
documents_raw = docs_response.json()

# Flatten the per-course groups into a single list and tag every document with
# the name of the course it belongs to. Each document is copied, so the
# downloaded payload in `documents_raw` is left unmodified.
documents = [
    {**doc, 'course': course['course']}
    for course in documents_raw
    for doc in course['documents']
]

# `question`, `text` and `section` are indexed as text fields; `course` is a
# keyword field, which is what `search` filters on through `filter_dict`.
index = minsearch.Index(
    text_fields=["question", "text", "section"],
    keyword_fields=["course"]
)

index.fit(documents)

<minsearch.Index at 0x76c77cd9d520>

## Pipeline using Ollama 

In [8]:
def search(query, course='data-engineering-zoomcamp', num_results=5):
    """Look up `query` in the global `index`, restricted to one course.

    Args:
        query: Free-text question to search for.
        course: Value the `course` keyword field must match. Defaults to
            'data-engineering-zoomcamp', which was previously hard-coded.
        num_results: Passed to `index.search` as `num_results`. Defaults to
            5, which was previously hard-coded.

    Returns:
        The result of `index.search`, unchanged.
    """
    # Relative field weights handed to the index: `question` is boosted,
    # `section` is down-weighted.
    boost = {'question': 3.0, 'section': 0.5}

    results = index.search(
        query=query,
        filter_dict={'course': course},
        boost_dict=boost,
        num_results=num_results
    )

    return results

In [9]:
def build_prompt(query, search_results):
    """Render the LLM prompt for `query` from the retrieved FAQ entries.

    Every entry in `search_results` must provide the keys 'section',
    'question' and 'text'. Entries are written under CONTEXT in the order
    given, and leading/trailing whitespace is stripped from the final prompt.
    """
    # Spelled out line by line so the trailing space after "CONTEXT:" is explicit.
    template = (
        "You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.\n"
        "Use only the facts from the CONTEXT when answering the QUESTION.\n"
        "\n"
        "QUESTION: {question}\n"
        "\n"
        "CONTEXT: \n"
        "{context}"
    )

    entries = "".join(
        f"section: {hit['section']}\nquestion: {hit['question']}\nanswer: {hit['text']}\n\n"
        for hit in search_results
    )

    return template.format(question=query, context=entries).strip()

In [10]:
def llm(prompt):
    """Send `prompt` as a single user message to the 'phi3' model and return the reply text.

    Uses the global `client`; the text comes from the first choice of the
    chat completion response.
    """
    user_message = {"role": "user", "content": prompt}
    completion = client.chat.completions.create(model='phi3', messages=[user_message])

    first_choice = completion.choices[0]
    return first_choice.message.content

In [11]:
def rag(query):
    """Answer `query`: retrieve FAQ entries, build a prompt from them, and return the LLM's reply."""
    return llm(build_prompt(query, search(query)))

In [12]:
# Connect to a local Ollama server through its OpenAI-compatible API

from openai import OpenAI 

# Point the OpenAI client at the local endpoint instead of the default OpenAI service.
# NOTE(review): 'ollama' looks like a placeholder key rather than a real secret — confirm.
client = OpenAI(base_url='http://localhost:11434/v1/', api_key='ollama')

In [15]:
# Run the full pipeline on a sample question; the returned answer is shown as the cell output.
query = "The course has already started. Can I still join?"
rag(query)

" Based on the CONTEXT provided in FAQs for a general course question:\n\nYes, even if the course has already started and as per your registration status, assuming that no one mentioned otherwise about an early sign-up requirement or restricted late registrations to certain enrollment policies only applicable after specific dates. But since you've not registered beforehand yet but can submit homeworks according to guidelines provided in FAQs which clearly state deadlines for final projects submission, so it seems like course registration is still possible even if the course has already started and will be taking part from 15th Jan at a specified time. It would help to confirm with your instructor as policies can change or become more specific once enrollment begins after starting date of the class; but based on this FAQ information, you are eligible for registration even if it has already started and will have guidelines about project submission deadlines not being left until last minu

In [16]:
# `_` is IPython's reference to the most recent cell output, here the answer
# returned by rag(query) in the previous cell. Printing it renders the "\n"
# escapes as real line breaks. This only works when run right after that cell.
print(_)

 Based on the CONTEXT provided in FAQs for a general course question:

Yes, even if the course has already started and as per your registration status, assuming that no one mentioned otherwise about an early sign-up requirement or restricted late registrations to certain enrollment policies only applicable after specific dates. But since you've not registered beforehand yet but can submit homeworks according to guidelines provided in FAQs which clearly state deadlines for final projects submission, so it seems like course registration is still possible even if the course has already started and will be taking part from 15th Jan at a specified time. It would help to confirm with your instructor as policies can change or become more specific once enrollment begins after starting date of the class; but based on this FAQ information, you are eligible for registration even if it has already started and will have guidelines about project submission deadlines not being left until last minute.