In [110]:
import minsearch
import json

In [111]:
with open('/week-1documents.json', 'rt') as f_in:
    docs_raw = json.load(f_in)

In [112]:
documents = []

for course_dict in docs_raw:
    for doc in course_dict['documents']:
        doc['course'] = course_dict['course']
        documents.append(doc)

In [113]:
index = minsearch.Index(
    text_fields=["question", "text", "section"],
    keyword_fields=["course"]
)

In [114]:
q = 'the course has already started, can I still enroll?'

In [115]:
index.fit(documents)

<minsearch.Index at 0x159ce0590>

In [116]:
def search(query):
    boost = {'question': 3.0, 'section': 0.5}

    results = index.search(
        query=query,
        filter_dict={'course': 'data-engineering-zoomcamp'},
        boost_dict=boost,
        num_results=5
    )

    return results

In [166]:
def build_prompt(query, search_results):
    prompt_template = """
    You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.
    Use only the facts from the CONTEXT when answering the QUESTION.

    QUESTION: {question}

    CONTEXT:
    {context}
    """.strip()

    context = ""
    
    for doc in search_results:
        context = context + f"section: {doc['section']}\nquestion: {doc['question']}\nanswer: {doc['text']}\n\n"
    
    prompt = prompt_template.format(question=query, context=context).strip()
    return prompt

In [167]:
# Set the API key directly in the code
import google.generativeai as genai
api_key = "AIzaSyCisI_4EMFtpsin************"
genai.configure(api_key=api_key)
model = genai.GenerativeModel('gemini-1.5-flash')

In [168]:

def llm(prompt):
    response = model.generate_content(prompt)
    return response


In [169]:
query = 'how do I run kafka?'

def rag(query):
    search_results = search(query)
    prompt = build_prompt(query, search_results)
    answer = llm(prompt)
    return answer.text

In [170]:
rag(query)

'The provided context does not contain the answer to the question, "how do I run kafka?".  Therefore, I cannot provide an answer. \n'

In [171]:
rag('the course has already started, can I still enroll?')

"The FAQ states that you can submit homeworks even if you don't register for the course. However, it does not mention whether you can enroll after the start date. \n"

In [172]:
from elasticsearch import Elasticsearch

In [173]:
es_client = Elasticsearch('http://localhost:9200') 
es_client

<Elasticsearch(['http://localhost:9200'])>

In [186]:
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"} 
        }
    }
}

index_name = "course-questions"

es_client.indices.create(index=index_name, body=index_settings)

In [175]:
from tqdm.auto import tqdm

In [176]:
for doc in tqdm(documents):
    es_client.index(index=index_name, document=doc)

  0%|          | 0/948 [00:00<?, ?it/s]

In [177]:
query = 'How do I execute a command in a running docker container?'

In [178]:
def elastic_search(query):
    search_query = {
        "size": 5,
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                        "fields": ["question^4", "text"],
                        "type": "best_fields"
                    }
                },
                "filter": {
                    "term": {
                        "course": "machine-learning-zoomcamp"
                    }
                }
            }
        }
    }
    # print(search_query)

    response = es_client.search(index=index_name, body=search_query)
    # print(response)
    
    result_docs = []
    
    for hit in response['hits']['hits']:
        result_docs.append(hit['_source'])
    
    
    return result_docs

In [183]:
from transformers import GPT2Tokenizer
def get_token(prompt):
    # Load the GPT-2 tokenizer
    tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

    # Input string
    input_string = prompt

    # Tokenize the input string
    tokens = tokenizer.tokenize(input_string)

    # Get the number of tokens
    num_tokens = len(tokens)

    print(f"Number of tokens: {num_tokens}")


In [184]:
def rag(query):
    search_results = elastic_search(query)
    prompt = build_prompt(query, search_results)
    # print(prompt)
    answer = llm(prompt)
    return answer.text, get_token(prompt)

In [185]:
rag(query)

Number of tokens: 643


('To execute a command in a running docker container, first use the `docker ps` command to find the container ID. Then, execute the command using `docker exec -it <container-id> bash`. \n',
 None)