In [None]:
# 1. Standard Python Libraries
import os

# 2. Third-party libraries (alphabetical order)
import requests
from dotenv import load_dotenv
from elasticsearch import Elasticsearch
from openai import OpenAI
from tqdm.auto import tqdm
from transformers import AutoTokenizer

# 3 Local imports (alphabetical order)


In [None]:
# Load environment variables
load_dotenv(override=True)

# Set environment variables for LLM configuration
api_key = os.getenv('DEEPSEEK_API_KEY')
url_base = os.getenv('DEEPSEEK_URL_BASE')
model = os.getenv('DEEPSEEK_MODEL')

# Convert the temperature to float, since a number is expected
temperature = float(os.getenv('DEEPSEEK_TEMP', 0.7))

client = OpenAI(
    base_url= f"{url_base}/v1",
    api_key= api_key               
)

```bash
curl localhost:9200

{
  "name" : "8ecda94a8498",
  "cluster_name" : "docker-cluster",
  "cluster_uuid" : "YVhGXINSTB6mxh7Yd8ycEw",
  "version" : {
    "number" : "8.17.6",
    "build_flavor" : "default",
    "build_type" : "docker",
    "build_hash" : "dbcbbbd0bc4924cfeb28929dc05d82d662c527b7",
    "build_date" : "2025-04-30T14:07:12.231372970Z",
    "build_snapshot" : false,
    "lucene_version" : "9.12.0",
    "minimum_wire_compatibility_version" : "7.17.0",
    "minimum_index_compatibility_version" : "7.0.0"
  },
  "tagline" : "You Know, for Search"
}
```
`"build_hash" : "dbcbbbd0bc4924cfeb28929dc05d82d662c527b7"`

In [None]:
# Download and initial document process
docs_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main/01-intro/documents.json?raw=1'
docs_response = requests.get(docs_url)
documents_raw = docs_response.json()

documents = []

for course in documents_raw:
    course_name = course['course']

    for doc in course['documents']:
        doc['course'] = course_name
        documents.append(doc)

In [5]:
documents[0]

{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp'}

In [None]:
#1. Connect to Elasticsearch
es_client = Elasticsearch(
    hosts=['htt://localhost:9200'],
    request_timeout=30
    )

try:
    if es_client.ping():
        print("¡Conexión exitosa a Elasticsearch!")
    else:
        print("No se pudo conectar a Elasticsearch.")
        exit()
except Exception as e:
    print(f"Error al conectar a Elasticsearch: {e}")
    exit()

¡Conexión exitosa a Elasticsearch!


In [None]:
# 2. Define the index name and configuration
actual_index_name = 'course-questions'

# 3 Check if the index exists, if so, delete it
if es_client.indices.exists(index=actual_index_name):
    print(f"El índice '{actual_index_name}' ya existe. Procediendo a borrarlo...")
    try:
        es_client.indices.delete(index=actual_index_name)
        print(f"Índice '{actual_index_name}' borrado exitosamente.")
    except Exception as e:
        print(f"Error al borrar el índice '{actual_index_name}': {e}")
        # Considera si quieres salir o continuar si el borrado falla
else:
    print(f"El índice '{actual_index_name}' no existe, se creará.")

El índice 'course-questions' no existe, se creará.


In [None]:
# 4 Create the index with the settings and mappings:
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"} 
        }
    }
}

index_name = 'course-questions'
es_client.indices.create(index=index_name, body=index_settings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'course-questions'})

In [None]:
# 5 Index the document
for doc in tqdm(documents):
    es_client.index(index=index_name, document=doc)

  0%|          | 0/948 [00:00<?, ?it/s]

In [None]:
query = 'I just discovered the course. Can I still join it?'

In [None]:
# Function to build prompt
def build_prompt(query, search_results):
    prompt_template = """
    You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.
    Use only the facts from the CONTEXT when answering the QUESTION.

    QUESTION: {question}

    CONTEXT: 
    {context}
    """.strip()

    context = ""
    
    for doc in search_results:
        context = context + f"section: {doc['section']}\nquestion: {doc['question']}\nanswer: {doc['text']}\n\n"
    
    prompt = prompt_template.format(question=query, context=context).strip()
    return prompt

In [None]:
# Function to consult llm
def llm(prompt):
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}]
    )
    
    return response.choices[0].message.content

In [None]:
# Funtion use elastic_search
def elastic_search(query):
    
    search_query = {
        "size": 5,
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                        "fields": ["question^3", "text", "section"],
                        "type": "best_fields"
                    }
                },
                "filter": {
                    "term": {
                        "course": "data-engineering-zoomcamp"
                    }
                }
            }
        }
    }

    response = es_client.search(index=index_name, body=search_query)
    result_docs = []
    for hit in response ['hits']['hits']:
        result_docs.append(hit['_source'])
    
    return result_docs


In [None]:
# Funtion RAG 
def rag(query):
    search_results = elastic_search(query)
    prompt = build_prompt(query, search_results)
    answer = llm(prompt)

    return answer

In [None]:
rag(query)

## Modificaciones de código para respuestas a Homework


In [None]:
# Modified so that the context expresses the requested form
def build_prompt_mod(query, search_results):
    prompt_template = """
    You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.
    Use only the facts from the CONTEXT when answering the QUESTION.

    QUESTION: {question}

    CONTEXT: 
    {context}
    """.strip()

    context = ""
    
    for doc in search_results:
        context = context + f"Q: {doc['question']}\nA: {doc['text']}\n\n"
    
    prompt = prompt_template.format(question=query, context=context).strip()
    return prompt

In [None]:
# It was modified so that the fields and courses would adapt to the requested data
def elastic_search_mod(query):
    
    search_query = {
        "size": 3,
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                        "fields": ["question^4", "text"],
                        "type": "best_fields"
                    }
                },
                "filter": {
                    "term": {
                        "course": "machine-learning-zoomcamp"
                    }
                }
            }
        }
    }

    response = es_client.search(index=index_name, body=search_query)
    result_docs = []
    for hit in response ['hits']['hits']:
        result_docs.append(hit['_source'])
    
    return result_docs

In [None]:
elastic_search_mod("How do copy a file to a Docker container?")

In [13]:
query ="How do copy a file to a Docker container?"
sear = elastic_search_mod(query)
prompt_ = build_prompt_mod(query, sear)
len(prompt_)

1463

In [None]:
response_llm = rag(query)

In [16]:
tokenizer = AutoTokenizer.from_pretrained("deepseek-ai/deepseek-v3")

tokenizer_config.json:   0%|          | 0.00/3.13k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.85M [00:00<?, ?B/s]

In [17]:
def count_tokens_deepseek(text: str) -> int:
    tokens = tokenizer.encode(text, add_special_tokens=False)
    return len(tokens)

In [21]:
tokens_llm = count_tokens_deepseek(response_llm)
tokens_llm

123

In [23]:
tokens_prompt = count_tokens_deepseek(prompt_)
tokens_prompt

333

In [24]:
response_llm

'To copy a file to a Docker container, you can use the `docker cp` command with the following syntax:  \n\n```bash\ndocker cp /path/to/local/file_or_directory container_id:/path/in/container\n```\n\nFor example, if you want to copy a file named `example.txt` from your local machine to a container with ID `abc123` into the `/app` directory inside the container, you would run:  \n\n```bash\ndocker cp example.txt abc123:/app/\n```  \n\nThis command works for both files and directories.  \n\n(Source: Hrithik Kumar Advani)'