In [4]:
import minsearch
import json

In [6]:
with open('documents.json', 'rt') as f_in:
    docs_raw = json.load(f_in)

In [7]:
documents = []

for course_dict in docs_raw:
    for doc in course_dict['documents']:
        doc['course'] = course_dict['course']
        documents.append(doc)

In [8]:
index = minsearch.Index(
    text_fields=["question", "text", "section"],
    keyword_fields=["course"]
)

In [9]:
q = 'the course has already started, can I still enroll?'

In [10]:
index.fit(documents)

<minsearch.Index at 0x185d323d550>

In [11]:
def search(query):
    boost = {'question': 3.0, 'section': 0.5}

    results = index.search(
        query=query,
        filter_dict={'course': 'data-engineering-zoomcamp'},
        boost_dict=boost,
        num_results=5
    )

    return results

In [12]:
def build_prompt(query, search_results):
    prompt_template = """
    You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.
    Use only the facts from the CONTEXT when answering the QUESTION.

    QUESTION: {question}

    CONTEXT:
    {context}
    """.strip()

    context = ""
    
    for doc in search_results:
        context = context + f"section: {doc['section']}\nquestion: {doc['question']}\nanswer: {doc['text']}\n\n"
    
    prompt = prompt_template.format(question=query, context=context).strip()
    return prompt

In [19]:
# Set the API key directly in the code
import google.generativeai as genai
api_key = "AIzaSyBENmu-sXIDvdm2_wE5kptzBi5wzlIw56A"
genai.configure(api_key=api_key)
model = genai.GenerativeModel('gemini-1.5-flash')

In [20]:

def llm(prompt):
    response = model.generate_content(prompt)
    return response


In [21]:
query = 'how do I run kafka?'

def rag(query):
    search_results = search(query)
    prompt = build_prompt(query, search_results)
    answer = llm(prompt)
    return answer.text

In [22]:
rag(query)

'This FAQ database does not contain the answer to your question. The database only provides instructions for running Kafka producers and consumers in Java and Python, as well as how to fix specific errors encountered while running these programs. It does not mention how to run Kafka itself. \n'

In [23]:
rag('the course has already started, can I still enroll?')

"The FAQ database does not have information about whether a student can enroll after the course starts. The only related information is that students can submit homework even if they don't register, but there are deadlines for final projects. \n"

In [36]:
from elasticsearch import Elasticsearch

In [None]:
es_client = Elasticsearch('http://localhost:9200') 
es_client

In [None]:
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"} 
        }
    }
}

index_name = "course-questions"

es_client.indices.create(index=index_name, body=index_settings)

In [37]:
from tqdm.auto import tqdm

In [42]:
for doc in tqdm(documents):
    es_client.index(index=index_name, document=doc)

In [39]:
query = 'How do I execute a command in a running docker container?'

In [40]:
def elastic_search(query):
    search_query = {
        "size": 5,
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                        "fields": ["question^4", "text"],
                        "type": "best_fields"
                    }
                },
                "filter": {
                    "term": {
                        "course": "machine-learning-zoomcamp"
                    }
                }
            }
        }
    }
    # print(search_query)

    response = es_client.search(index=index_name, body=search_query)
    # print(response)
    
    result_docs = []
    
    for hit in response['hits']['hits']:
        result_docs.append(hit['_source'])
    
    
    return result_docs

In [41]:
from transformers import GPT2Tokenizer
def get_token(prompt):
    # Load the GPT-2 tokenizer
    tokenizer = GPT2Tokenizer.from_pretrained("gpt2")

    # Input string
    input_string = prompt

    # Tokenize the input string
    tokens = tokenizer.tokenize(input_string)

    # Get the number of tokens
    num_tokens = len(tokens)

    print(f"Number of tokens: {num_tokens}")


In [184]:
def rag(query):
    search_results = elastic_search(query)
    prompt = build_prompt(query, search_results)
    # print(prompt)
    answer = llm(prompt)
    return answer.text, get_token(prompt)

In [185]:
rag(query)

Number of tokens: 643


('To execute a command in a running docker container, first use the `docker ps` command to find the container ID. Then, execute the command using `docker exec -it <container-id> bash`. \n',
 None)