# Base functions
> Functions to complete the RAG process.

## Verify that all the necessary files exist

In [1]:
# %%script false --no-raise-error
from pathlib import Path
import json
import subprocess

def _download_if_missing(url: str, destination: Path) -> None:
    """Download *url* to *destination* with ``wget`` unless the file is already there.

    Args:
        url: Address of the file to fetch.
        destination: Local path where the file must end up.

    Raises:
        RuntimeError: If ``wget`` exits with a non-zero return code.
    """
    if destination.is_file():
        return

    result = subprocess.run(
        ['wget', url, '-O', str(destination)], capture_output=True, text=True
    )
    if result.returncode != 0:
        # `wget -O` opens the output file before the transfer, so a failed run can
        # leave an empty or partial file behind. Remove it so the next run retries
        # instead of treating the leftover as a finished download.
        destination.unlink(missing_ok=True)
        raise RuntimeError(
            f"Error occurred downloading {destination.name}: {result.stderr}"
        )
    print(f"{destination.name} download successful")


# Define the relative path to the 'data' directory and the document name
data_dir = Path('..', 'data')
# Create the 'data' directory if it doesn't exist
data_dir.mkdir(parents=True, exist_ok=True)

document_url = "https://raw.githubusercontent.com/DataTalksClub/llm-zoomcamp/main/01-intro/documents.json"
file_name = document_url.split('/')[-1]
document_path = Path(data_dir, file_name)

_download_if_missing(document_url, document_path)

# JSON is UTF-8 by specification; do not depend on the platform's default encoding.
with open(document_path, 'rt', encoding='utf-8') as f_in:
    documents_raw = json.load(f_in)

# Flatten the per-course structure into one list, tagging every entry with its course.
documents = []
for course in documents_raw:
    course_name = course['course']

    for doc in course['documents']:
        doc['course'] = course_name
        documents.append(doc)

minsearch_url = "https://raw.githubusercontent.com/DataTalksClub/llm-zoomcamp/main/01-intro/minsearch.py"
file_name = minsearch_url.split('/')[-1]
path = Path('.', file_name)

# minsearch.py is saved to the current directory, where the import in a later cell looks for it.
_download_if_missing(minsearch_url, path)

## Alternative: download the base document file instead of using the copy saved on disk

In [2]:
%%script false --no-raise-error

import requests 

docs_url = 'https://github.com/DataTalksClub/llm-zoomcamp/blob/main/01-intro/documents.json?raw=1'
# Without a timeout, requests can wait indefinitely on an unresponsive server.
docs_response = requests.get(docs_url, timeout=30)
# Fail here with the HTTP status instead of a JSON decoding error on an error page.
docs_response.raise_for_status()
documents_raw = docs_response.json()

# Flatten the per-course structure into one list, tagging every entry with its course.
documents = []

for course in documents_raw:
    course_name = course['course']

    for doc in course['documents']:
        doc['course'] = course_name
        documents.append(doc)

## Search index using TF-IDF and cosine similarity for text fields and exact matching for keyword fields.


In [3]:
from minsearch import Index

# Text fields are searched by content; "course" is a keyword field used for
# exact-match filtering.
index = Index(
    text_fields=["question", "text", "section"],
    keyword_fields=["course"]
)

# Index every entry collected in `documents`.
index.fit(documents)

<minsearch.Index at 0x70bafc14f400>

In [4]:
# Example question used to try out the minsearch index directly.
query = "Can I join the course if it has already started?"

# Keep only entries whose "course" keyword equals this value.
filter_dict = {"course": "data-engineering-zoomcamp"}
# Per-field weights handed to the search; presumably "question" matches count
# more and "section" matches less than the default -- confirm against minsearch.
boost_dict = {"question": 3, "section": 0.5}
minsearch_results = index.search(query, 
                       filter_dict, 
                       boost_dict, 
                       num_results=3)

# Show each returned document.
for result in minsearch_results:
    print(result)

{'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.", 'section': 'General course-related questions', 'question': 'Course - Can I still join the course after the start date?', 'course': 'data-engineering-zoomcamp'}
{'text': 'Yes, we will keep all the materials after the course finishes, so you can follow the course at your own pace after it finishes.\nYou can also continue looking at the homeworks and continue preparing for the next cohort. I guess you can also start working on your final capstone project.', 'section': 'General course-related questions', 'question': 'Course - Can I follow the course after it finishes?', 'course': 'data-engineering-zoomcamp'}
{'text': 'Yes, the slack channel remains open and you can ask questions there. But always sDocker containers exit code w search the channel first and second, check t

In [5]:
def minsearch_index_search(query: str, filter_dict: "dict | None" = None, boost_dict: "dict | None" = None, num_results: int = 3):
    """Search the module-level minsearch ``index``.

    Args:
        query: Free-text question to search for.
        filter_dict: Keyword-field values to filter on; ``None`` means no filter.
        boost_dict: Per-field weights; ``None`` means no boosting.
        num_results: Maximum number of documents to return.

    Returns:
        Whatever ``index.search`` returns for these arguments (the matching documents).
    """
    # Use a fresh dict per call: a `{}` default would be one shared object that
    # any mutation by the search code would leak into later calls.
    filter_dict = {} if filter_dict is None else filter_dict
    boost_dict = {} if boost_dict is None else boost_dict
    return index.search(query, filter_dict, boost_dict, num_results)

In [6]:
# Same example as before, this time through the minsearch_index_search wrapper.
query = "Can I join the course if it has already started?"
filter_dict = {"course": "data-engineering-zoomcamp"}
boost_dict = {"question": 3, "section": 0.5}

# num_results is left at the wrapper's default of 3.
minsearch_results = minsearch_index_search(query, filter_dict, boost_dict)

# Show each returned document.
for result in minsearch_results:
    print(result)

{'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.", 'section': 'General course-related questions', 'question': 'Course - Can I still join the course after the start date?', 'course': 'data-engineering-zoomcamp'}
{'text': 'Yes, we will keep all the materials after the course finishes, so you can follow the course at your own pace after it finishes.\nYou can also continue looking at the homeworks and continue preparing for the next cohort. I guess you can also start working on your final capstone project.', 'section': 'General course-related questions', 'question': 'Course - Can I follow the course after it finishes?', 'course': 'data-engineering-zoomcamp'}
{'text': 'Yes, the slack channel remains open and you can ask questions there. But always sDocker containers exit code w search the channel first and second, check t

## Create Vector storage (ElasticSearch)

- How to create a docker container for ElasticSearch:

```bash
docker run -d \
    --name elasticsearch \
    -p 9200:9200 \
    -p 9300:9300 \
    -e "discovery.type=single-node" \
    -e "xpack.security.enabled=false" \
    docker.elastic.co/elasticsearch/elasticsearch:8.4.3
```

- To verify that it is running, execute this in a terminal:

```bash
curl http://localhost:9200
```

- This is the index setting:  

```json
{
    "settings": {
        "number_of_shards": 1,
        "number_of_replicas": 0
    },
    "mappings": {
        "properties": {
            "text": {"type": "text"},
            "section": {"type": "text"},
            "question": {"type": "text"},
            "course": {"type": "keyword"} 
        }
    }
}
```

- This is a query example:  

```json
{
    "size": 5,
    "query": {
        "bool": {
            "must": {
                "multi_match": {
                    "query": query,
                    "fields": ["question^3", "text", "section"],
                    "type": "best_fields"
                }
            },
            "filter": {
                "term": {
                    "course": "data-engineering-zoomcamp"
                }
            }
        }
    }
}
```

In [13]:
from elasticsearch import Elasticsearch
from tqdm.auto import tqdm

es_client = Elasticsearch("http://localhost:9200")

index_name = "homework-course"

# Single-shard, no-replica index: free-text fields are analysed as "text",
# while "course" is a "keyword" so it can be used in exact term filters.
index_settings = {
    "settings": {"number_of_shards": 1, "number_of_replicas": 0},
    "mappings": {
        "properties": {
            **{field: {"type": "text"} for field in ("text", "section", "question")},
            "course": {"type": "keyword"},
        }
    },
}

# Create and populate the index only on the first run; afterwards just report it.
if not es_client.indices.exists(index=index_name):
    es_client.indices.create(index=index_name, body=index_settings)
    # One indexing request per document, with a progress bar.
    for doc in tqdm(documents):
        es_client.index(index=index_name, document=doc)

    print(f"The index '{index_name}' has been created.")
else:
    print(f"The index '{index_name}' already exists.")


  0%|          | 0/948 [00:00<?, ?it/s]

The index 'homework-course' has been created.


In [14]:
def elastic_search(index_name: str, query: str, fields: list[str], filter_course: str, num_results: int=5):
    """Query an Elasticsearch index for documents of one course that match a question.

    Args:
        index_name: Name of the index to search.
        query: Free-text question matched against ``fields``.
        fields: Fields for the ``multi_match`` clause (entries may carry a
            ``^boost`` suffix, e.g. ``"question^3"``).
        filter_course: Exact value required in the ``course`` field.
        num_results: Maximum number of hits requested.

    Returns:
        A list with the ``_source`` of every hit, in the order returned.
    """
    text_match = {
        "multi_match": {"query": query, "fields": fields, "type": "best_fields"}
    }
    course_filter = {"term": {"course": filter_course}}
    request_body = {
        "size": num_results,
        "query": {"bool": {"must": text_match, "filter": course_filter}},
    }

    hits = es_client.search(index=index_name, body=request_body)['hits']['hits']
    return [hit['_source'] for hit in hits]

In [16]:
# %%script false --no-raise-error

# Example call to elastic_search; as the last expression of the cell, its
# return value is what the notebook displays.
index_name = "homework-course"
query = "I just discovered the course. Can I still join it?"
# "question^3" is the boosted field entry passed to the multi_match clause.
fields = ["question^3", "text", "section"]
filter_course="data-engineering-zoomcamp"
elastic_search(index_name, query, fields, filter_course)

[{'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.",
  'section': 'General course-related questions',
  'question': 'Course - Can I still join the course after the start date?',
  'course': 'data-engineering-zoomcamp'},
 {'text': 'Yes, we will keep all the materials after the course finishes, so you can follow the course at your own pace after it finishes.\nYou can also continue looking at the homeworks and continue preparing for the next cohort. I guess you can also start working on your final capstone project.',
  'section': 'General course-related questions',
  'question': 'Course - Can I follow the course after it finishes?',
  'course': 'data-engineering-zoomcamp'},
 {'text': 'You can start by installing and setting up all the dependencies and requirements:\nGoogle cloud account\nGoogle Cloud SDK\nPython 3 (insta

## Create the prompt, LLM and RAG functions

In [17]:
def build_user_prompt(query: str, search_result: list[dict]) -> str:
    """Assemble the LLM prompt from the user's question and the retrieved documents.

    Args:
        query: The user's question, inserted after ``QUESTION:``.
        search_result: Retrieved documents; each must provide the keys
            ``section``, ``question`` and ``text``.

    Returns:
        The prompt text with surrounding whitespace stripped. Context entries
        are separated by a blank line; with an empty ``search_result`` the
        prompt ends right after ``CONTEXT:``.

    Raises:
        KeyError: If a document lacks one of the required keys.
    """
    user_prompt_template = """
You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.
Use only the facts from the CONTEXT when answering the QUESTION.

QUESTION: {question}

CONTEXT:
{context}
""".strip()

    context_template = """
S: {section}
Q: {question}
A: {text}
""".strip()

    # Join once instead of growing a string with += inside the loop.
    context = "\n\n".join(
        context_template.format(
            section=doc['section'], question=doc['question'], text=doc['text']
        )
        for doc in search_result
    )

    return user_prompt_template.format(question=query, context=context).strip()

In [18]:
# Example: retrieve the top 3 matches for a question and build the prompt from them.
index_name = "homework-course"
query="How do I execute a command in a running docker container?"
fields = ["question^4", "text", "section"]
filter_course="machine-learning-zoomcamp"
search_result = elastic_search(index_name, query, fields, filter_course, num_results=3)
prompt = build_user_prompt(query, search_result)

In [19]:
print(prompt)

You're a course teaching assistant. Answer the QUESTION based on the CONTEXT from the FAQ database.
Use only the facts from the CONTEXT when answering the QUESTION.

QUESTION: How do I execute a command in a running docker container?

CONTEXT:
S: 5. Deploying Machine Learning Models
Q: How do I debug a docker container?
A: Launch the container image in interactive mode and overriding the entrypoint, so that it starts a bash command.
docker run -it --entrypoint bash <image>
If the container is already running, execute a command in the specific container:
docker ps (find the container-id)
docker exec -it <container-id> bash
(Marcos MJD)

S: 5. Deploying Machine Learning Models
Q: How do I copy files from my local machine to docker container?
A: You can copy files from your local machine into a Docker container using the docker cp command. Here's how to do it:
To copy a file or directory from your local machine into a running Docker container, you can use the `docker cp command`. The basi

In [27]:
from openai import OpenAI
from dotenv import load_dotenv

# Load variables from a .env file (if any) before the client is constructed.
load_dotenv()

# True  -> OpenAI-compatible server on localhost:11434 (placeholder API key).
# False -> OpenAI client built with its default configuration.
open_source = True

client = (
    OpenAI(base_url='http://localhost:11434/v1/', api_key='ollama')
    if open_source
    else OpenAI()
)


In [28]:
def llm(user_prompt: str, model: str="gpt-3.5-turbo"):
    """Send one user message to the chat-completions API and return the reply text.

    Args:
        user_prompt: Content of the single ``user`` message.
        model: Name of the model to request from the module-level ``client``.

    Returns:
        The message content of the first choice in the response.
    """
    conversation = [{"role": "user", "content": user_prompt}]
    completion = client.chat.completions.create(model=model, messages=conversation)
    first_choice = completion.choices[0]
    return first_choice.message.content

In [29]:
def rag_elastic(index_name: str, query: str, fields: list[str], filter_course: str, num_results: int, model):
    """Answer a question with retrieval-augmented generation over Elasticsearch.

    Retrieves documents with ``elastic_search``, turns them into a prompt with
    ``build_user_prompt`` and returns what ``llm`` produces for that prompt.

    Args:
        index_name: Elasticsearch index to query.
        query: The user's question.
        fields: Fields (optionally boosted) to match the question against.
        filter_course: Course whose documents are eligible.
        num_results: Number of documents to retrieve as context.
        model: Model name forwarded to ``llm``.
    """
    retrieved_docs = elastic_search(index_name, query, fields, filter_course, num_results)
    return llm(build_user_prompt(query, retrieved_docs), model)


In [30]:
# End-to-end example: retrieve 3 documents and ask the "phi3" model to answer.
index_name = "homework-course"
query="How do I execute a command in a running docker container?"
fields = ["question^4", "text", "section"]
filter_course="machine-learning-zoomcamp"
num_results = 3
model = "phi3"
answer = rag_elastic(index_name, query, fields, filter_course, num_results, model)
# Last expression of the cell, so the notebook displays the answer.
answer

' To execute a command in a running docker container, you can use the `docker exec` command. If the container is already running, you can find the container ID using `docker ps`, and then execute a command by appending the container ID followed by `-it bash`. The syntax would look like:\n\n```\ndocker exec -it <container-id> bash\n```'

## Notes on backing up and restoring an Elasticsearch index

- Open a bash session in the elasticsearch container
```bash
docker exec -it elasticsearch bash
```

- Add path.repo configuration
```bash
echo path.repo: ["/usr/share/elasticsearch/backup"] >> /usr/share/elasticsearch/config/elasticsearch.yml
exit
```

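- Restart the container, register the snapshot repository (the same `PUT _snapshot/my_backup` request shown further down for the new container), and verify that it was created correctly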
```bash
docker restart elasticsearch
curl -X GET "localhost:9200/_snapshot/my_backup?pretty"
```

- Create the snapshot
```bash
curl -X PUT "localhost:9200/_snapshot/my_backup/snapshot_1?wait_for_completion=true" -H 'Content-Type: application/json' -d'
{
  "indices": "your_index_name",
  "ignore_unavailable": true,
  "include_global_state": false
}
'
```

- Copy the backup to my machine
```bash
docker cp elasticsearch:/usr/share/elasticsearch/backup /path/to/local
```

- Now create the new one with docker-compose:
```bash
docker compose up -d
```

- Add the path.repo configuration to the new container, same as before.
```bash
docker exec -it new_elasticsearch bash
echo path.repo: ["/usr/share/elasticsearch/backup"] >> /usr/share/elasticsearch/config/elasticsearch.yml
```

- Restart the docker container and copy the snapshot in it
```bash
docker restart new_elasticsearch
docker cp /path/to/local/backup new_elasticsearch:/usr/share/elasticsearch
```

- Register the Snapshot Repository in the New Container.
```bash
curl -X PUT "localhost:9200/_snapshot/my_backup" -H 'Content-Type: application/json' -d'
{
  "type": "fs",
  "settings": {
    "location": "/usr/share/elasticsearch/backup"
  }
}
'
```

- Verify if it exists
```bash
curl -X GET "localhost:9200/_snapshot/my_backup/snapshot_1?pretty"
```

- Restore the snapshot
```bash
curl -X POST "localhost:9200/_snapshot/my_backup/snapshot_1/_restore" -H 'Content-Type: application/json' -d'
{
  "indices": "your_index_name",
  "ignore_unavailable": true,
  "include_global_state": false
}
'
```

- Extra: to restore the snapshot under a different index name, use a rename pattern:
```bash
curl -X POST "localhost:9200/_snapshot/my_backup/snapshot_1/_restore?pretty" -H 'Content-Type: application/json' -d'
{
  "indices": "old_index",
  "ignore_unavailable": true,
  "include_global_state": false,
  "rename_pattern": "old_index",
  "rename_replacement": "new_index"
}
'
```

- Show your indexes:
```bash
curl -X GET "localhost:9200/_cat/indices?v"
```