In [1]:
!wget https://raw.githubusercontent.com/alexeygrigorev/minsearch/main/minsearch.py

zsh:1: command not found: wget


In [2]:
import minsearch

In [3]:
import json

In [4]:
with open('documents.json', 'rt') as f_in:
    docs_raw = json.load(f_in)

In [5]:
len(docs_raw), docs_raw[1].keys()

(3, dict_keys(['course', 'documents']))

In [6]:
documents = []


for course_dict in docs_raw:
    c = 0
    for doc in course_dict['documents']:
        doc['course'] = course_dict['course']
        documents.append(doc)
        c += 1
    print(c)

435
375
138


In [7]:
documents[435]

{'text': 'Machine Learning Zoomcamp FAQ\nThe purpose of this document is to capture frequently asked technical questions.\nWe did this for our data engineering course and it worked quite well. Check this document for inspiration on how to structure your questions and answers:\nData Engineering Zoomcamp FAQ\nIn the course GitHub repository there’s a link. Here it is: https://airtable.com/shryxwLd0COOEaqXo\nwork',
 'section': 'General course-related questions',
 'question': 'How do I sign up?',
 'course': 'machine-learning-zoomcamp'}

In [8]:
index = minsearch.Index(
    text_fields=["question", "text", "section"],
    keyword_fields=["course"]
)

In [9]:
index.vectorizers

{'question': TfidfVectorizer(),
 'text': TfidfVectorizer(),
 'section': TfidfVectorizer()}

SELECT * WHERE course = 'data-engineering-zoomcamp';

In [10]:
q = 'the course has already started, can I still enroll?'

In [11]:
index.fit(documents)

<minsearch.Index at 0x123c6a800>

In [12]:
#from openai import OpenAI
#client = OpenAI()

# response = client.chat.completions.create(
#     model='gpt-4o',
#     messages=[{"role": "user", "content": q}]
# )

# response.choices[0].message.content

In [13]:
from groq import Groq
from dotenv import load_dotenv
import os

print(os.getcwd())

# load environment variables
load_dotenv('../.env')

/Users/lemnos/Documents/gits/llm-zoomcamp/01-intro


True

In [14]:
# create client calling Groq class
client = Groq(api_key=os.getenv('GROQ_API_KEY'))

# create a query
response = client.chat.completions.create(
    messages=[
        {
            "role": "user",
            "content": q,
        }
    ],
    model="llama3-8b-8192",
)

# print the response
print(response.choices[0].message.content)

While I'd love to provide you with a definitive answer, the availability of enrollment for a course that has already started depends on various factors. Let me break it down for you:

**In-person courses:**

* If the course is an in-person, on-campus program, it's unlikely you'll be able to enroll in the middle of the semester. Most institutions have a clear cutoff date for adding or dropping courses, usually within the first few weeks of the semester. Once the course has started, it's generally not feasible to enroll in the same course.
* However, you might be able to enroll in a future section of the course, if one is offered, or waitlist for the current course in case a spot becomes available.

**Online courses:**

* For online courses, it's generally easier to enroll later in the semester. Many online courses are self-paced, and you can start at any time. However, it's essential to check with the course provider or institution to confirm their enrollment policies and any late-start

In [15]:
def search(query):
    boost = {'question': 3.0, 'section': 0.5}

    results = index.search(
        query=query,
        filter_dict={'course': 'data-engineering-zoomcamp'},
        boost_dict=boost,
        num_results=5
    )

    return results

In [16]:
def build_prompt(query, search_results):
    prompt_template = """
You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.
Use only the facts from the CONTEXT when answering the QUESTION.

QUESTION: {question}

CONTEXT: 
{context}
""".strip()

    context = ""
    
    for doc in search_results:
        context = context + f"section: {doc['section']}\nquestion: {doc['question']}\nanswer: {doc['text']}\n\n"
    
    prompt = prompt_template.format(question=query, context=context).strip()
    return prompt

In [17]:
def llm(prompt):
    response = client.chat.completions.create(
        model="llama3-8b-8192",
        messages=[{"role": "user", "content": prompt}]
    )
    
    return response.choices[0].message.content

In [18]:
query = 'how do I run kafka?'

def rag(query):
    search_results = search(query)
    prompt = build_prompt(query, search_results)
    answer = llm(prompt)
    return answer

In [19]:
rag(query)

[np.float64(1.385188873778814), np.float64(1.384084985627199), np.float64(1.1887225863814017), np.float64(1.0394697695919861), np.float64(1.0062012015640445)]


'Based on the context from Module 6: Streaming with Kafka, here\'s the answer to the QUESTION "how do I run Kafka":\n\nTo run Kafka, it seems that the Kafka Streams application is being run. The command to run the Kafka Streams application is not explicitly mentioned in the context. However, based on the other answers provided in the context, it appears that the Kafka Streams application is being run using the Java command `java`.\n\nIn the context, it is mentioned that the Java Kafka application is being run using the command:\n```\njava -cp build/libs/<jar_name>-1.0-SNAPSHOT.jar:out src/main/java/org/example/JsonProducer.java\n```\nThis command seems to be specific to the project being used, so you may need to replace `<jar_name>` with the actual name of the JAR file being referenced in the project.'

In [20]:
rag('the course has already started, can I still enroll?')

[np.float64(1.5827932857773956), np.float64(1.2000308369294905), np.float64(1.1359344551180688), np.float64(1.1255099239111823), np.float64(1.0286273449946706)]


"Based on the CONTEXT, I'll answer the QUESTION: the course has already started, can I still enroll?\n\nAnswer: Yes, even if you don't register, you're still eligible to submit the homeworks (from the first answer)."

In [21]:
documents[0]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp'}

In [22]:
import sys
sys.exit('24')

SystemExit: 24

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)


In [23]:
from elasticsearch import Elasticsearch

In [24]:
es_client = Elasticsearch('http://localhost:9200') 

In [25]:
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"} 
        }
    }
}

index_name = "course-questions"

es_client.indices.create(index=index_name, body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'course-questions'})

In [26]:
documents[0]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp'}

In [27]:
from tqdm.auto import tqdm

  from .autonotebook import tqdm as notebook_tqdm


In [28]:
for doc in tqdm(documents):
    es_client.index(index=index_name, document=doc)

100%|██████████| 948/948 [00:01<00:00, 516.72it/s]


In [29]:
query = 'I just disovered the course. Can I still join it?'

In [30]:
def elastic_search(query):
    search_query = {
        "size": 5,
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                        "fields": ["question^3", "text", "section"],
                        "type": "best_fields"
                    }
                },
                "filter": {
                    "term": {
                        "course": "data-engineering-zoomcamp"
                    }
                }
            }
        }
    }

    response = es_client.search(index=index_name, body=search_query)
    
    result_docs = []
    
    for hit in response['hits']['hits']:
        result_docs.append(hit['_source'])
    
    return result_docs

In [31]:
def rag(query):
    search_results = elastic_search(query)
    prompt = build_prompt(query, search_results)
    answer = llm(prompt)
    return answer

In [32]:
rag(query)

'Based on the provided context, according to the FAQ, yes, you can still join the course.'