In [1]:
import requests
import tiktoken

ModuleNotFoundError: No module named 'tiktoken'

In [2]:
docs_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main/01-intro/documents.json?raw=1'
docs_response = requests.get(docs_url)
documents_raw = docs_response.json()

In [4]:
documents = []

for course in documents_raw:
    course_name = course['course']

    for doc in course['documents']:
        doc['course'] = course_name
        documents.append(doc)

In [4]:
from elasticsearch import Elasticsearch
es_client = Elasticsearch('http://localhost:9200')

In [7]:
index_name = "course-questions"

In [10]:
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"} 
        }
    }
}

es_client.indices.create(index=index_name, body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'course-questions'})

In [12]:
from tqdm.auto import tqdm

for doc in tqdm(documents):
    es_client.index(index=index_name, document=doc)

100%|████████████████████████████████████████████████████████████████████████████████████████████| 948/948 [00:04<00:00, 203.34it/s]


In [8]:
def es_query(query, limit=5, course='data-engineering-zoomcamp'):
    search_query = {
        "size": limit,
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                        "fields": ["question^4", "text"],
                        "type": "best_fields"
                    }
                },
                "filter": {
                    "term": {
                        "course": course
                    }
                }
            }
        }
    }
    
    return es_client.search(index=index_name, body=search_query)

def elastic_search(query):
    response = es_query(query)

    result_docs = []
    
    for hit in response['hits']['hits']:
        result_docs.append(hit['_source'])
    
    return result_docs

In [9]:
query = 'How do execute a command on a Kubernetes pod?'

response = es_query(query)
response['hits']['hits'][0]['_score']

31.973522

In [10]:
query = 'How do copy a file to a Docker container?'

response = es_query(query, 3, 'machine-learning-zoomcamp')

# question = response['hits']['hits'][2]['_source']['question']
# text = response['hits']['hits'][2]['_source']['text']

search_results = elastic_search(query)
search_results

[{'text': 'In case running pgcli  locally causes issues or you do not want to install it locally you can use it running in a Docker container instead.\nBelow the usage with values used in the videos of the course for:\nnetwork name (docker network)\npostgres related variables for pgcli\nHostname\nUsername\nPort\nDatabase name\n$ docker run -it --rm --network pg-network ai2ys/dockerized-pgcli:4.0.1\n175dd47cda07:/# pgcli -h pg-database -U root -p 5432 -d ny_taxi\nPassword for root:\nServer: PostgreSQL 16.1 (Debian 16.1-1.pgdg120+1)\nVersion: 4.0.1\nHome: http://pgcli.com\nroot@pg-database:ny_taxi> \\dt\n+--------+------------------+-------+-------+\n| Schema | Name             | Type  | Owner |\n|--------+------------------+-------+-------|\n| public | yellow_taxi_data | table | root  |\n+--------+------------------+-------+-------+\nSELECT 1\nTime: 0.009s\nroot@pg-database:ny_taxi>',
  'section': 'Module 1: Docker and Terraform',
  'question': 'PGCLI - running in a Docker container',
 

In [11]:
def build_prompt(query, search_results):
    prompt_template = """
You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.
Use only the facts from the CONTEXT when answering the QUESTION.

QUESTION: {question}

CONTEXT:
{context}
"""
    context_template = """
Q: {question}
A: {text}\n\n
"""
    context = ""
        
    for doc in search_results:
        context = context + context_template.format(question=doc['question'], text=doc['text']).strip()

    prompt = prompt_template.format(question=query, context=context).strip()
    return prompt

In [12]:
prompt = build_prompt(query, search_results)
len(prompt)

3113

In [13]:
import tiktoken
encoding = tiktoken.encoding_for_model("gpt-4o")
result = encoding.encode(prompt)

ModuleNotFoundError: No module named 'tiktoken'