## Importar Modulos y Bibliotecas

In [26]:
import requests 
import hashlib
import json

## 1. Descargar documento y extraer en una lista

In [None]:
docs_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main/01-intro/documents.json?raw=1'
docs_response = requests.get(docs_url)
documents_raw = docs_response.json()

documents = []

for course in documents_raw:
    course_name = course['course']

    for doc in course['documents']:
        doc['course'] = course_name
        documents.append(doc)

## Función para Gererar Id utilizando hash md5

In [28]:
def generate_document_id(doc):
    # combined = f"{doc['course']}-{doc['question']}"
    combined = f"{doc['course']}-{doc['question']}-{doc['text'][:10]}"
    hash_object = hashlib.md5(combined.encode())
    hash_hex = hash_object.hexdigest()
    document_id = hash_hex[:8]
    return document_id

## Generar Id unico para cada documento en la lista

In [29]:
for doc in documents:
    doc['id'] = generate_document_id(doc)

In [30]:
documents[3]

{'text': "You don't need it. You're accepted. You can also just start learning and submitting homework without registering. It is not checked against any registered list. Registration is just to gauge interest before the start date.",
 'section': 'General course-related questions',
 'question': 'Course - I have registered for the Data Engineering Bootcamp. When can I expect to receive the confirmation email?',
 'course': 'data-engineering-zoomcamp',
 'id': '0bbf41ec'}

## Veririfcar si algún Id se repite

In [31]:
from collections import defaultdict

hashes = defaultdict(list)

for doc in documents:
    hashes[doc['id']].append(doc)

print(f"cantidad de Id: {len(hashes)}, Cantidad de documentos {len(documents)}")

if len(hashes) != len(documents):
    print("Hay Ids duplicados ......")
    duplicates = [k for k, v in hashes.items() if len(v) > 1]
    for k in duplicates:
        print(f"Duplicate ID: {k} Count: {len(hashes[k])}")
        print("-"*80)
        print(json.dumps(hashes[k], indent=2, ensure_ascii=False))
        print("-"*80)
        print("\n")
else:  
    print("No hay Ids duplicados")


cantidad de Id: 947, Cantidad de documentos 948
Hay Ids duplicados ......
Duplicate ID: 593f7569 Count: 2
--------------------------------------------------------------------------------
[
  {
    "text": "They both do the same, it's just less typing from the script.\nAsked by Andrew Katoch, Added by Edidiong Esu",
    "section": "6. Decision Trees and Ensemble Learning",
    "question": "Does it matter if we let the Python file create the server or if we run gunicorn directly?",
    "course": "machine-learning-zoomcamp",
    "id": "593f7569"
  },
  {
    "text": "They both do the same, it's just less typing from the script.",
    "section": "6. Decision Trees and Ensemble Learning",
    "question": "Does it matter if we let the Python file create the server or if we run gunicorn directly?",
    "course": "machine-learning-zoomcamp",
    "id": "593f7569"
  }
]
--------------------------------------------------------------------------------




In [32]:
with open('documents-with-ids.json', 'wt') as f_out:
    json.dump(documents, f_out, indent=2)

In [34]:
!head documents-with-ids.json

[
  {
    "text": "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  \u201cOffice Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon\u2019t forget to register in DataTalks.Club's Slack and join the channel.",
    "section": "General course-related questions",
    "question": "Course - When will the course start?",
    "course": "data-engineering-zoomcamp",
    "id": "c02e79ef"
  },
  {
    "text": "GitHub - DataTalksClub data-engineering-zoomcamp#prerequisites",


## Creación de Prompt Template

In [35]:
prompt_template = """
You emulate a student who's taking our course.
Formulate 5 questions this student might ask based on a FAQ record. The record
should contain the answer to the questions, and the questions should be complete and not too short.
If possible, use as fewer words as possible from the record. 

The record:

section: {section}
question: {question}
answer: {text}

Provide the output in parsable JSON without using code blocks:

["question1", "question2", ..., "question5"]
""".strip()

In [36]:
from deepseek_client import MODEL, client


In [37]:
def generate_questions(doc):
    prompt = prompt_template.format(**doc)

    response = client.chat.completions.create(
        model=MODEL,
        messages=[{"role": "user", "content": prompt}]
    )

    json_response = response.choices[0].message.content
    return json_response

In [38]:
from tqdm.auto import tqdm

In [39]:
results = {}

In [40]:
for doc in tqdm(documents): 
    doc_id = doc['id']
    if doc_id in results:
        continue

    questions = generate_questions(doc)
    results[doc_id] = questions

  0%|          | 0/948 [00:00<?, ?it/s]

In [41]:
import pickle

In [62]:
import json
import re

def limpiar_json_llm(texto):
    # Eliminar bloques de código JSON con triple backtick
    texto = re.sub(r'```json\s*|\s*```', '', texto, flags=re.DOTALL)

    # Eliminar comentarios de una línea (// ...) y de bloque (/* ... */), si existen
    texto = re.sub(r'//.*?$|/\*.*?\*/', '', texto, flags=re.DOTALL | re.MULTILINE)

    # Eliminar espacios extras y limpiar la cadena
    texto = texto.strip()

    # Reemplazar tildes o caracteres mal codificados (opcional, si sabes que pueden estar mal)
    # Esto es solo un ejemplo, ajusta según sea necesario
    reemplazos = {
        'á': 'a', 'é': 'e', 'í': 'i', 'ó': 'o', 'ú': 'u',
        'Á': 'A', 'É': 'E', 'Í': 'I', 'Ó': 'O', 'Ú': 'U',
        'ñ': 'n', 'Ñ': 'N',
        # puedes añadir más reemplazos según tu caso
    }
    for antiguo, nuevo in reemplazos.items():
        texto = texto.replace(antiguo, nuevo)

    return texto

# Ejemplo de uso:



In [81]:
# Función para limpiar cada valor
def limpiar_valor(valor):
    # Eliminar bloques de código JSON con triple backtick
    valor = re.sub(r'```json\s*|\s*```', '', valor, flags=re.DOTALL)

    # Eliminar comentarios de una línea y de bloque
    valor = re.sub(r'//.*?$|/\*.*?\*/', '', valor, flags=re.DOTALL | re.MULTILINE)

    # Reemplazar comillas simples por dobles
    valor = valor.replace("'", '"')

    # Corregir escapes mal formados
    valor = re.sub(r'\\([^\"]|$)', r'\\"', valor)

    # Corregir casos como "don"t" -> "don't"
    valor = re.sub(r'(\w+)"(\w+)', r"\1'\2", valor)

    # Eliminar espacios extra y limpiar
    valor = re.sub(r'\s+', ' ', valor)

    # Limpiar espacios iniciales y finales
    valor = valor.strip()

    return valor

# Función para convertir el diccionario completo
def convertir_dict_a_json_valido(diccionario):
    nuevo_dict = {}
    for clave, valor in diccionario.items():
        try:
            valor_limpio = limpiar_valor(valor)
            nuevo_valor = json.loads(valor_limpio)
            nuevo_dict[clave] = nuevo_valor
        except json.JSONDecodeError as e:
            valor_fallado = valor
            print(f"❌ Error decodificando JSON para clave {clave}: {e}")
            nuevo_dict[clave] = None  # o puedes usar [] si prefieres una lista vacía
    return nuevo_dict

In [79]:
results['f43f5fe7']

'```json\n["How can I open the Run command window on a Windows machine to access the registry editor?", "Where exactly in the Registry Editor do I need to modify the \'Autorun\' value for GCP VM connection issues?", "What specific change should I make to the \'Autorun\' registry value to resolve the VSCode-GCP VM connection problem?", "Is there an alternative solution to modifying the registry, such as dealing with the known_hosts file?", "Where is the known_hosts file located on a Windows system if I want to delete the fingerprint manually?"]\n```'

In [82]:
# Aplicar la limpieza a todo el diccionario
dict_json = convertir_dict_a_json_valido(results)

# Mostrar el resultado
from pprint import pprint
pprint(dict_json)

❌ Error decodificando JSON para clave f43f5fe7: Expecting ',' delimiter: line 1 column 158 (char 157)
❌ Error decodificando JSON para clave eb9d376f: Expecting ',' delimiter: line 1 column 26 (char 25)
❌ Error decodificando JSON para clave a1e59afc: Expecting ',' delimiter: line 1 column 328 (char 327)
❌ Error decodificando JSON para clave 7ec0f9b0: Expecting ',' delimiter: line 1 column 22 (char 21)
❌ Error decodificando JSON para clave bb1ba786: Expecting ',' delimiter: line 1 column 27 (char 26)
❌ Error decodificando JSON para clave d407d65b: Unterminated string starting at: line 1 column 2 (char 1)
❌ Error decodificando JSON para clave c9375c56: Expecting ',' delimiter: line 1 column 384 (char 383)
❌ Error decodificando JSON para clave e866156b: Expecting ',' delimiter: line 1 column 20 (char 19)
❌ Error decodificando JSON para clave fb930700: Expecting ',' delimiter: line 1 column 28 (char 27)
❌ Error decodificando JSON para clave aa187680: Expecting ',' delimiter: line 1 column 2

In [76]:
# Tu ejemplo
valor = '[\n    "What is the exact start date and time for the course?",\n    "How can I stay updated with the course schedule and important announcements?",\n    "Where do I need to register before the course begins?",\n    "Is there a specific platform or channel for course-related communication?",\n    "What should I do to ensure I don\'t miss the beginning of the course?"\n]'

valor_limpio = limpiar_valor(valor)
print("Valor limpio:", valor_limpio)

try:
    parsed = json.loads(valor_limpio)
    print("✅ JSON válido:", parsed)
except json.JSONDecodeError as e:
    print("❌ Error:", e)

Valor limpio: [ "What is the exact start date and time for the course?", "How can I stay updated with the course schedule and important announcements?", "Where do I need to register before the course begins?", "Is there a specific platform or channel for course-related communication?", "What should I do to ensure I don't miss the beginning of the course?" ]
✅ JSON válido: ['What is the exact start date and time for the course?', 'How can I stay updated with the course schedule and important announcements?', 'Where do I need to register before the course begins?', 'Is there a specific platform or channel for course-related communication?', "What should I do to ensure I don't miss the beginning of the course?"]


In [73]:
valor = results['c02e79ef']
valor_limpio = limpiar_valor(valor)
print("Valor original:", valor)
print("Valor limpio:", valor_limpio)

Valor original: [
    "What is the exact start date and time for the course?",
    "How can I stay updated with the course schedule and important announcements?",
    "Where do I need to register before the course begins?",
    "Is there a specific platform or channel for course-related communication?",
    "What should I do to ensure I don't miss the beginning of the course?"
]
Valor limpio: [ "What is the exact start date and time for the course?", "How can I stay updated with the course schedule and important announcements?", "Where do I need to register before the course begins?", "Is there a specific platform or channel for course-related communication?", "What should I do to ensure I don"t miss the beginning of the course?" ]


In [None]:
for item in results.items():
    limpiar_json_llm(item)[2]

In [63]:
results

{'c02e79ef': '[\n    "What is the exact start date and time for the course?",\n    "How can I stay updated with the course schedule and important announcements?",\n    "Where do I need to register before the course begins?",\n    "Is there a specific platform or channel for course-related communication?",\n    "What should I do to ensure I don\'t miss the beginning of the course?"\n]',
 '1f6520ca': '[\n    "What background knowledge or skills do I need before enrolling in this course?",\n    "Are there any specific technical requirements I should meet to take this course?",\n    "Do I need prior experience with data engineering tools to join this course?",\n    "What are the recommended prerequisites to ensure I can follow the course material effectively?",\n    "Is there a list of required skills or tools I should be familiar with before starting the course?"\n]',
 '7842b56a': '```json\n[\n  "Is it possible to enroll in the course after it has already begun?",\n  "What happens if I do

In [49]:
with open('results.bin', 'wb') as archivo:
        pickle.dump(results, archivo)

In [50]:
with open('results.bin', 'rb') as f_in:
    results = pickle.load(f_in)

In [83]:
results['1f6520ca']

'[\n    "What background knowledge or skills do I need before enrolling in this course?",\n    "Are there any specific technical requirements I should meet to take this course?",\n    "Do I need prior experience with data engineering tools to join this course?",\n    "What are the recommended prerequisites to ensure I can follow the course material effectively?",\n    "Is there a list of required skills or tools I should be familiar with before starting the course?"\n]'

In [84]:
for doc_id, json_questions in results.items():
    print(f'questions: {json_questions}')

questions: [
    "What is the exact start date and time for the course?",
    "How can I stay updated with the course schedule and important announcements?",
    "Where do I need to register before the course begins?",
    "Is there a specific platform or channel for course-related communication?",
    "What should I do to ensure I don't miss the beginning of the course?"
]
questions: [
    "What background knowledge or skills do I need before enrolling in this course?",
    "Are there any specific technical requirements I should meet to take this course?",
    "Do I need prior experience with data engineering tools to join this course?",
    "What are the recommended prerequisites to ensure I can follow the course material effectively?",
    "Is there a list of required skills or tools I should be familiar with before starting the course?"
]
questions: ```json
[
  "Is it possible to enroll in the course after it has already begun?",
  "What happens if I don't formally register but sti

In [85]:
parsed_resulst = {}

for doc_id, json_questions in results.items():
    
    parsed_resulst[doc_id] = json.loads(json_questions)

JSONDecodeError: Expecting value: line 1 column 1 (char 0)

In [86]:
print(json_questions)

```json
[
  "Is it possible to enroll in the course after it has already begun?",
  "What happens if I don't formally register but still want to participate?",
  "Are there any deadlines I should be aware of if I join late?",
  "Can I submit assignments if I join the course after the start date?",
  "Will there be penalties for submitting the final project close to the deadline?"
]
```


In [87]:
doc_index = {d['id']: d for d in documents}

In [88]:
final_results = []

for doc_id, questions in parsed_resulst.items():
    course = doc_index[doc_id]['course']
    for q in questions:
        final_results.append((q, course, doc_id))

In [1]:
import pandas as pd

In [90]:
df = pd.DataFrame(final_results, columns=['question', 'course', 'document'])

In [91]:
df

Unnamed: 0,question,course,document
0,What is the exact start date and time for the ...,data-engineering-zoomcamp,c02e79ef
1,How can I stay updated with the course schedul...,data-engineering-zoomcamp,c02e79ef
2,Where do I need to register before the course ...,data-engineering-zoomcamp,c02e79ef
3,Is there a specific platform or channel for co...,data-engineering-zoomcamp,c02e79ef
4,What should I do to ensure I don't miss the be...,data-engineering-zoomcamp,c02e79ef
5,What background knowledge or skills do I need ...,data-engineering-zoomcamp,1f6520ca
6,Are there any specific technical requirements ...,data-engineering-zoomcamp,1f6520ca
7,Do I need prior experience with data engineeri...,data-engineering-zoomcamp,1f6520ca
8,What are the recommended prerequisites to ensu...,data-engineering-zoomcamp,1f6520ca
9,Is there a list of required skills or tools I ...,data-engineering-zoomcamp,1f6520ca


In [92]:
df.to_csv('ground-truth-data.csv', index=False)

In [93]:
!head ground-truth-data.csv

question,course,document
What is the exact start date and time for the course?,data-engineering-zoomcamp,c02e79ef
How can I stay updated with the course schedule and important announcements?,data-engineering-zoomcamp,c02e79ef
Where do I need to register before the course begins?,data-engineering-zoomcamp,c02e79ef
Is there a specific platform or channel for course-related communication?,data-engineering-zoomcamp,c02e79ef
What should I do to ensure I don't miss the beginning of the course?,data-engineering-zoomcamp,c02e79ef
What background knowledge or skills do I need before enrolling in this course?,data-engineering-zoomcamp,1f6520ca
Are there any specific technical requirements I should meet to take this course?,data-engineering-zoomcamp,1f6520ca
Do I need prior experience with data engineering tools to join this course?,data-engineering-zoomcamp,1f6520ca
What are the recommended prerequisites to ensure I can follow the course material effectively?,data-engineering-zoomcamp,1f6520ca


************************************************************************************************************

In [47]:
import csv

# Nombre del archivo
nombre_archivo_csv_fila = 'results_fila.csv'

try:
    with open(nombre_archivo_csv_fila, 'w', newline='', encoding='utf-8') as csvfile:
        fieldnames = results.keys() # Las claves del diccionario serán los encabezados
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

        writer.writeheader() # Escribe los encabezados
        writer.writerow(results) # Escribe el diccionario como una fila

    print(f"Diccionario guardado exitosamente en '{nombre_archivo_csv_fila}' como una fila CSV.")
except Exception as e:
    print(f"Error al guardar el archivo CSV como una fila: {e}")

Diccionario guardado exitosamente en 'results_fila.csv' como una fila CSV.


In [96]:
prueba = results.items()

In [100]:
dato = results['c02e79ef']
dato_limpio = limpiar_valor(dato)
print(f"Dato limpio {dato_limpio}")

Dato limpio [ "What is the exact start date and time for the course?", "How can I stay updated with the course schedule and important announcements?", "Where do I need to register before the course begins?", "Is there a specific platform or channel for course-related communication?", "What should I do to ensure I don't miss the beginning of the course?" ]


In [103]:
for doc_id, json_questions in results.items():
    dato = json_questions
    dato_limpio = limpiar_valor(dato)
    results[doc_id] = dato_limpio

print(len(results))

947


In [104]:
results['a3b9af04']

'["Why am I getting a Permission denied error when running a script in GitHub Actions?", "How can I resolve the error code 126 in my GitHub Actions workflow?", "What steps should I take to fix execution permission issues for a script in GitHub Actions?", "How do I grant execute permissions to a script file before running it in a GitHub Actions workflow?", "What is the correct command to add execute permissions to a script in a GitHub repository?"]'

In [98]:
for row in prueba:
    print(row)

('c02e79ef', '[\n    "What is the exact start date and time for the course?",\n    "How can I stay updated with the course schedule and important announcements?",\n    "Where do I need to register before the course begins?",\n    "Is there a specific platform or channel for course-related communication?",\n    "What should I do to ensure I don\'t miss the beginning of the course?"\n]')
('1f6520ca', '[\n    "What background knowledge or skills do I need before enrolling in this course?",\n    "Are there any specific technical requirements I should meet to take this course?",\n    "Do I need prior experience with data engineering tools to join this course?",\n    "What are the recommended prerequisites to ensure I can follow the course material effectively?",\n    "Is there a list of required skills or tools I should be familiar with before starting the course?"\n]')
('7842b56a', '```json\n[\n  "Is it possible to enroll in the course after it has already begun?",\n  "What happens if I do

*******************************************************************************


## Video 3.3 --> Evaluation of Text Retrieval Techniques for RAG out

In [1]:
import json

with open('documents-with-ids.json', 'rt') as f_in:
    documents = json.load(f_in)

In [3]:
from elasticsearch import Elasticsearch

es_client = Elasticsearch('http://localhost:9200', request_timeout=60) 

index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"},
            "id": {"type": "keyword"},
        }
    }
}

index_name = "course-questions"

# es_client.indices.delete(index=index_name, ignore_unavailable=True)
# es_client.indices.create(index=index_name, body=index_settings)

try:
    # Intenta borrar el índice (ignora si no existe)
    print(f"Intentando borrar el índice '{index_name}' si existe...")
    es_client.indices.delete(index=index_name, ignore_unavailable=True)
    print(f"Índice '{index_name}' borrado o no existía.")

    # Intenta crear el índice
    print(f"Intentando crear el índice '{index_name}'...")
    es_client.indices.create(index=index_name, body=index_settings)
    print(f"¡Índice '{index_name}' creado exitosamente!")
except Exception as e:
    print(f"ERROR: Ocurrió un problema al interactuar con Elasticsearch: {e}")

Intentando borrar el índice 'course-questions' si existe...
Índice 'course-questions' borrado o no existía.
Intentando crear el índice 'course-questions'...
¡Índice 'course-questions' creado exitosamente!


In [4]:
from tqdm.auto import tqdm

for doc in tqdm(documents):
    es_client.index(index=index_name, document=doc)

  0%|          | 0/948 [00:00<?, ?it/s]

ApiError: ApiError(503, 'unavailable_shards_exception', '[course-questions][0] primary shard is not active Timeout: [1m], request: [BulkShardRequest [[course-questions][0]] containing [index {[course-questions][OuOBOZgBgnecT-cPMzh2], source[{"text":"The purpose of this document is to capture frequently asked technical questions\\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours\'\' live.1\\nSubscribe to course public Google Calendar (it works from Desktop only).\\nRegister before the course starts using this link.\\nJoin the course Telegram channel with announcements.\\nDon’t forget to register in DataTalks.Club\'s Slack and join the channel.","section":"General course-related questions","question":"Course - When will the course start?","course":"data-engineering-zoomcamp","id":"c02e79ef"}]}]]')

In [7]:
print(type(documents))
documents[0]

<class 'list'>


{'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
 'section': 'General course-related questions',
 'question': 'Course - When will the course start?',
 'course': 'data-engineering-zoomcamp',
 'id': 'c02e79ef'}

In [11]:
index_settings = {
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0,
        "index": {
            "refresh_interval": "30s",  # Reduce la frecuencia de refresh
            "number_of_routing_shards": 1
        }
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"},
            "id": {"type": "keyword"},
        }
    }
}

In [8]:
from elasticsearch import Elasticsearch
from tqdm.auto import tqdm

# Configuración del cliente
es_client = Elasticsearch(
    'http://localhost:9200',
    request_timeout=60
)

# Nombre del índice
index_name = "course-questions"

# Función para indexar documentos con manejo de errores
def index_documents(documents):
    successes = 0
    errors = 0
    
    for doc in tqdm(documents, desc="Indexando documentos"):
        try:
            # Asegúrate que el documento tenga todos los campos requeridos
            required_fields = ['text', 'section', 'question', 'course', 'id']
            if not all(field in doc for field in required_fields):
                print(f"Documento falta campos requeridos: {doc}")
                errors += 1
                continue
                
            # Intenta indexar el documento
            response = es_client.index(
                index=index_name,
                document=doc,
                id=doc['id']  # Usamos el campo 'id' como ID del documento
            )
            
            if response['result'] in ['created', 'updated']:
                successes += 1
            else:
                errors += 1
                print(f"Respuesta inesperada: {response}")
                
        except Exception as e:
            errors += 1
            print(f"Error indexando documento {doc.get('id', 'sin ID')}: {str(e)}")
    
    print(f"\nResumen: {successes} documentos indexados, {errors} errores")

# Ejemplo de uso:
# documents = [
#     {
#         "id": "1",
#         "text": "Texto de ejemplo",
#         "section": "Sección 1",
#         "question": "Pregunta de ejemplo",
#         "course": "matematicas"
#     },
#     # ... más documentos
# ]
# index_documents(documents)

In [9]:
index_documents(documents)

Indexando documentos:   0%|          | 0/948 [00:00<?, ?it/s]

Error indexando documento c02e79ef: ApiError(503, 'unavailable_shards_exception', '[course-questions][0] primary shard is not active Timeout: [1m], request: [BulkShardRequest [[course-questions][0]] containing [index {[course-questions][c02e79ef], source[{"text":"The purpose of this document is to capture frequently asked technical questions\\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours\'\' live.1\\nSubscribe to course public Google Calendar (it works from Desktop only).\\nRegister before the course starts using this link.\\nJoin the course Telegram channel with announcements.\\nDon’t forget to register in DataTalks.Club\'s Slack and join the channel.","section":"General course-related questions","question":"Course - When will the course start?","course":"data-engineering-zoomcamp","id":"c02e79ef"}]}]]')
Error indexando documento 1f6520ca: ApiError(503, 'unavailable_shards_exception', '[course-questions][0] pr

KeyboardInterrupt: 

In [9]:
def elastic_search(query, course):
    search_query = {
        "size": 5,
        "query": {
            "bool": {
                "must": {
                    "multi_match": {
                        "query": query,
                        "fields": ["question^3", "text", "section"],
                        "type": "best_fields"
                    }
                },
                "filter": {
                    "term": {
                        "course": course
                    }
                }
            }
        }
    }

    response = es_client.search(index=index_name, body=search_query)
    
    result_docs = []
    
    for hit in response['hits']['hits']:
        result_docs.append(hit['_source'])
    
    return result_docs

In [10]:
elastic_search(
    query="I just discovered the course. Can I still join?",
    course="data-engineering-zoomcamp"
)

ApiError: ApiError(503, 'search_phase_execution_exception', None)

In [2]:
df = pd.read_csv('results_fila.csv')
df.head()

Unnamed: 0,c02e79ef,1f6520ca,7842b56a,0bbf41ec,63394d91,2ed9b986,93e2c8ed,a482086d,eb56ae98,4292531b,...,dc55657f,f6979915,1076a121,aa203ca7,8b04605d,a3b9af04,b16aae74,66326a87,fb3c4150,886d1617
0,"[\n ""What is the exact start date and time ...","[\n ""What background knowledge or skills do...","```json\n[\n ""Is it possible to enroll in the...","[\n ""If I haven't received a confirmation e...","[\n ""What are the recommended preparations ...","[\n ""How many Zoomcamp sessions are offered...","[\n ""How does the current cohort differ fro...","[\n ""Will the course materials remain acces...","[\n ""Is there any support available for stu...","[\n ""What is the main YouTube playlist I sh...",...,"```json\n[\n ""Why does the get-records comm...","```json\n{\n ""questions"": [\n ""Why am I ge...","[\n ""Why does my Git commit fail with the e...","[\n ""Why do I get a 'module not found' erro...","[\n ""Why am I getting a 'module not found' ...","[""Why am I getting a Permission denied error w...","[""How can I efficiently manage and run only sp...","```json\n[\n ""Why are my integration tests ...","[\n ""Why is the pre-commit command failing ...","[\n ""What steps should I follow to tear dow..."
