In [1]:
from fastembed import TextEmbedding


In [2]:
# Inicializa el modelo de embedding específico
# Esto descargará el modelo si es la primera vez que lo usas.
embedding_model = TextEmbedding(model_name="jinaai/jina-embeddings-v2-small-en")

Fetching 5 files:   0%|          | 0/5 [00:00<?, ?it/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

tokenizer_config.json:   0%|          | 0.00/367 [00:00<?, ?B/s]

onnx/model.onnx:   0%|          | 0.00/130M [00:00<?, ?B/s]

In [3]:
query = 'I just discovered the course. Can I join now?'

# El método embed devuelve un generador de embeddings.
# Como es una sola consulta, tomamos el primer elemento.
query_embedding = next(embedding_model.embed(query))

In [5]:
import numpy as np

# Convertir el resultado a un array NumPy explícitamente si no lo fuera ya (TextEmbedding ya devuelve np.ndarray)
# Aunque 'query_embedding' ya será un array de NumPy
query_embedding_array = np.array(query_embedding)

# Calcular el valor mínimo del array
min_value = query_embedding_array.min()

print(f"La consulta: '{query}'")
print(f"El embedding es un array NumPy con {query_embedding_array.shape[0]} dimensiones.")
print(f"El valor mínimo en el array del embedding es: {min_value}")

La consulta: 'I just discovered the course. Can I join now?'
El embedding es un array NumPy con 512 dimensiones.
El valor mínimo en el array del embedding es: -0.11726374368207196


#### Cosine Similarity

In [6]:
np.linalg.norm(query_embedding)

np.float64(1.0)

In [7]:
query_embedding.dot(query_embedding)

np.float64(1.0)

In [8]:
doc = 'Can I still join the course after the start date?'

query_embedding_doc = next(embedding_model.embed(doc))

In [9]:
query_embedding.dot(query_embedding_doc)

np.float64(0.9008529058287051)

In [10]:
import numpy as np
from fastembed import TextEmbedding

# Paso 1: Inicializar el modelo de embedding
embedding_model = TextEmbedding(model_name="jinaai/jina-embeddings-v2-small-en")

# Paso 2: Generar el embedding para la Consulta 1
query1 = 'I just discovered the course. Can I join now?'
q1_embedding = next(embedding_model.embed(query1))

# Paso 3: Generar el embedding para la Consulta 2
query2 = 'Can I still join the course after the start date?'
q2_embedding = next(embedding_model.embed(query2))

# Paso 4: Calcular la similitud de coseno usando el producto punto
# Asumimos que los embeddings están normalizados (longitud 1.0)
cosine_similarity = q1_embedding.dot(q2_embedding)

print(f"Embedding de Consulta 1 (primeros 5 elementos): {q1_embedding[:5]}")
print(f"Longitud de Consulta 1: {np.linalg.norm(q1_embedding):.4f}")
print(f"Embedding de Consulta 2 (primeros 5 elementos): {q2_embedding[:5]}")
print(f"Longitud de Consulta 2: {np.linalg.norm(q2_embedding):.4f}")
print(f"---")
print(f"La similitud de coseno entre la Consulta 1 y la Consulta 2 es: {cosine_similarity:.4f}")

Embedding de Consulta 1 (primeros 5 elementos): [-0.07639464 -0.07305554  0.05865016  0.03926705 -0.0141797 ]
Longitud de Consulta 1: 1.0000
Embedding de Consulta 2 (primeros 5 elementos): [-0.05453042 -0.07834519  0.03136102  0.02342347 -0.03063215]
Longitud de Consulta 2: 1.0000
---
La similitud de coseno entre la Consulta 1 y la Consulta 2 es: 0.9009


In [None]:
import numpy as np
from fastembed import TextEmbedding

# --- 1. Datos de entrada ---
documents = [
    {'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.",
     'section': 'General course-related questions',
     'question': 'Course - Can I still join the course after the start date?',
     'course': 'data-engineering-zoomcamp'},
    {'text': 'Yes, we will keep all the materials after the course finishes, so you can follow the course at your own pace after it finishes.\nYou can also continue looking at the homeworks and continue preparing for the next cohort. I guess you can also start working on your final capstone project.',
     'section': 'General course-related questions',
     'question': 'Course - Can I follow the course after it finishes?',
     'course': 'data-engineering-zoomcamp'},
    {'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
     'section': 'General course-related questions',
     'question': 'Course - When will the course start?',
     'course': 'data-engineering-zoomcamp'},
    {'text': 'You can start by installing and setting up all the dependencies and requirements:\nGoogle cloud account\nGoogle Cloud SDK\nPython 3 (installed with Anaconda)\nTerraform\nGit\nLook over the prerequisites and syllabus to see if you are comfortable with these subjects.',
     'section': 'General course-related questions',
     'question': 'Course - What can I do before the course starts?',
     'course': 'data-engineering-zoomcamp'},
    {'text': 'Star the repo! Share it with friends if you find it useful ❣️\nCreate a PR if you see you can improve the text or the structure of the repository.',
     'section': 'General course-related questions',
     'question': 'How can we contribute to the course?',
     'course': 'data-engineering-zoomcamp'}
]

query = 'Can I still join the course after the start date?'

# --- 2. Inicializar el modelo de embedding ---
print("Inicializando el modelo de embedding...")
embedding_model = TextEmbedding(model_name="jinaai/jina-embeddings-v2-small-en")

# --- 3. Generar el embedding de la consulta ---
print(f"Generando embedding para la consulta: '{query}'")
query_embedding = next(embedding_model.embed(query))

# --- 4. Generar embeddings para cada documento ---
print("Generando embeddings para los documentos...")
document_texts = [doc['text'] for doc in documents]
# El método embed puede tomar una lista de textos y devolverá un generador
document_embeddings = list(embedding_model.embed(document_texts))

# --- 5. Calcular la similitud de coseno para cada documento ---
print("\nCalculando similitud de coseno entre la consulta y cada documento:")
results = []
for i, doc_embedding in enumerate(document_embeddings):
    # Ya que los embeddings de jinaai/jina-embeddings-v2-small-en están normalizados,
    # el producto punto es directamente la similitud de coseno.
    cosine_similarity = query_embedding.dot(doc_embedding)
    results.append({
        'document_index': i,
        'text': documents[i]['text'][:100] + '...', # Mostrar solo un fragmento del texto
        'similitud_coseno': cosine_similarity
    })
    print(f"Documento {i+1}:")
    print(f"  Texto (fragmento): {documents[i]['text'][:100]}...")
    print(f"  Similitud de Coseno: {cosine_similarity:.4f}\n")

# Opcional: Ordenar los resultados por similitud para ver los más relevantes primero
results_sorted = sorted(results, key=lambda x: x['similitud_coseno'], reverse=True)

print("--- Resultados ordenados por similitud de coseno ---")
for res in results_sorted:
    print(f"Documento {res['document_index']+1} (Similitud: {res['similitud_coseno']:.4f}):")
    print(f"  Texto: {res['text']}\n")

Inicializando el modelo de embedding...
Generando embedding para la consulta: 'Can I still join the course after the start date?'
Generando embeddings para los documentos...

Calculando similitud de coseno entre la consulta y cada documento:
Documento 1:
  Texto (fragmento): Yes, even if you don't register, you're still eligible to submit the homeworks.
Be aware, however, t...
  Similitud de Coseno: 0.8106

Documento 2:
  Texto (fragmento): Yes, we will keep all the materials after the course finishes, so you can follow the course at your ...
  Similitud de Coseno: 0.8499

Documento 3:
  Texto (fragmento): The purpose of this document is to capture frequently asked technical questions
The exact day and ho...
  Similitud de Coseno: 0.7991

Documento 4:
  Texto (fragmento): You can start by installing and setting up all the dependencies and requirements:
Google cloud accou...
  Similitud de Coseno: 0.6813

Documento 5:
  Texto (fragmento): Star the repo! Share it with friends if you find

In [12]:
# Suponiendo que `results_sorted` es la lista de diccionarios que obtuvimos antes
# y que ya está ordenada en orden descendente por similitud.
if results_sorted:
    documento_mas_similar = results_sorted[0]
    indice_original = documento_mas_similar['document_index']
    similitud = documento_mas_similar['similitud_coseno']
    print(f"El documento con mayor similitud es el Documento {indice_original + 1} (índice original {indice_original}).")
    print(f"Su similitud de coseno es: {similitud:.4f}")
else:
    print("No se encontraron resultados.")

El documento con mayor similitud es el Documento 2 (índice original 1).
Su similitud de coseno es: 0.8499


In [13]:
import numpy as np
from fastembed import TextEmbedding

# Recreando el paso de obtención de embeddings (usando el modelo real)
documents = [
    {'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.",
     'section': 'General course-related questions',
     'question': 'Course - Can I still join the course after the start date?',
     'course': 'data-engineering-zoomcamp'},
    {'text': 'Yes, we will keep all the materials after the course finishes, so you can follow the course at your own pace after it finishes.\nYou can also continue looking at the homeworks and continue preparing for the next cohort. I guess you can also start working on your final capstone project.',
     'section': 'General course-related questions',
     'question': 'Course - Can I follow the course after it finishes?',
     'course': 'data-engineering-zoomcamp'},
    {'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
     'section': 'General course-related questions',
     'question': 'Course - When will the course start?',
     'course': 'data-engineering-zoomcamp'},
    {'text': 'You can start by installing and setting up all the dependencies and requirements:\nGoogle cloud account\nGoogle Cloud SDK\nPython 3 (installed with Anaconda)\nTerraform\nGit\nLook over the prerequisites and syllabus to see if you are comfortable with these subjects.',
     'section': 'General course-related questions',
     'question': 'Course - What can I do before the course starts?',
     'course': 'data-engineering-zoomcamp'},
    {'text': 'Star the repo! Share it with friends if you find it useful ❣️\nCreate a PR if you see you can improve the text or the structure of the repository.',
     'section': 'General course-related questions',
     'question': 'How can we contribute to the course?',
     'course': 'data-engineering-zoomcamp'}
]

query = 'Can I still join the course after the start date?'

embedding_model = TextEmbedding(model_name="jinaai/jina-embeddings-v2-small-en")
document_texts = [doc['text'] for doc in documents]
document_embeddings = list(embedding_model.embed(document_texts))

print(f"Tipo de document_embeddings: {type(document_embeddings)}")
print(f"Número de embeddings individuales: {len(document_embeddings)}")
print(f"Forma del primer embedding: {document_embeddings[0].shape}\n")

# --- Métodos para crear una única matriz bidimensional ---

# Método 1: Usando np.stack()
# Ideal cuando quieres apilar arrays a lo largo de un nuevo eje (dimensión).
# Asegura que cada embedding individual se mantenga como una "fila" en la nueva matriz.
all_embeddings_stacked = np.stack(document_embeddings)

print(f"Usando np.stack():")
print(f"Tipo de la matriz resultante: {type(all_embeddings_stacked)}")
print(f"Forma de la matriz resultante: {all_embeddings_stacked.shape}")
# La forma será (número_de_documentos, dimensionalidad_del_embedding)
# Por ejemplo, (5, 512) si tienes 5 documentos y embeddings de 512 dimensiones.
print(f"Primeros 5 valores del primer embedding en la matriz apilada:\n{all_embeddings_stacked[0, :5]}\n")


# Método 2: Usando np.array() directamente con la lista
# Si todos los elementos de la lista son arrays NumPy de la misma forma,
# np.array() los convertirá automáticamente en una matriz multidimensional.
all_embeddings_from_list = np.array(document_embeddings)

print(f"Usando np.array() directamente:")
print(f"Tipo de la matriz resultante: {type(all_embeddings_from_list)}")
print(f"Forma de la matriz resultante: {all_embeddings_from_list.shape}")
# La forma será idéntica a np.stack(): (número_de_documentos, dimensionalidad_del_embedding)
print(f"Primeros 5 valores del primer embedding en la matriz desde lista:\n{all_embeddings_from_list[0, :5]}\n")

print(f"Generando embedding para la consulta: '{query}'")
query_embedding = next(embedding_model.embed(query))





Tipo de document_embeddings: <class 'list'>
Número de embeddings individuales: 5
Forma del primer embedding: (512,)

Usando np.stack():
Tipo de la matriz resultante: <class 'numpy.ndarray'>
Forma de la matriz resultante: (5, 512)
Primeros 5 valores del primer embedding en la matriz apilada:
[-0.02495248 -0.0396454  -0.00437673  0.02958302 -0.01203007]

Usando np.array() directamente:
Tipo de la matriz resultante: <class 'numpy.ndarray'>
Forma de la matriz resultante: (5, 512)
Primeros 5 valores del primer embedding en la matriz desde lista:
[-0.02495248 -0.0396454  -0.00437673  0.02958302 -0.01203007]

Generando embedding para la consulta: 'Can I still join the course after the start date?'


In [19]:
all_embeddings_from_list.dot(query_embedding).max()

# V.dot(q)

np.float64(0.8498906718659862)

In [14]:
import numpy as np
from fastembed import TextEmbedding

# --- 1. Datos de entrada ---
documents = [
    {'text': "Yes, even if you don't register, you're still eligible to submit the homeworks.\nBe aware, however, that there will be deadlines for turning in the final projects. So don't leave everything for the last minute.",
     'section': 'General course-related questions',
     'question': 'Course - Can I still join the course after the start date?',
     'course': 'data-engineering-zoomcamp'},
    {'text': 'Yes, we will keep all the materials after the course finishes, so you can follow the course at your own pace after it finishes.\nYou can also continue looking at the homeworks and continue preparing for the next cohort. I guess you can also start working on your final capstone project.',
     'section': 'General course-related questions',
     'question': 'Course - Can I follow the course after it finishes?',
     'course': 'data-engineering-zoomcamp'},
    {'text': "The purpose of this document is to capture frequently asked technical questions\nThe exact day and hour of the course will be 15th Jan 2024 at 17h00. The course will start with the first  “Office Hours'' live.1\nSubscribe to course public Google Calendar (it works from Desktop only).\nRegister before the course starts using this link.\nJoin the course Telegram channel with announcements.\nDon’t forget to register in DataTalks.Club's Slack and join the channel.",
     'section': 'General course-related questions',
     'question': 'Course - When will the course start?',
     'course': 'data-engineering-zoomcamp'},
    {'text': 'You can start by installing and setting up all the dependencies and requirements:\nGoogle cloud account\nGoogle Cloud SDK\nPython 3 (installed with Anaconda)\nTerraform\nGit\nLook over the prerequisites and syllabus to see if you are comfortable with these subjects.',
     'section': 'General course-related questions',
     'question': 'Course - What can I do before the course starts?',
     'course': 'data-engineering-zoomcamp'},
    {'text': 'Star the repo! Share it with friends if you find it useful ❣️\nCreate a PR if you see you can improve the text or the structure of the repository.',
     'section': 'General course-related questions',
     'question': 'How can we contribute to the course?',
     'course': 'data-engineering-zoomcamp'}
]

query = 'Can I still join the course after the start date?'

# --- 2. Inicializar el modelo de embedding ---
print("Inicializando el modelo de embedding...")
embedding_model = TextEmbedding(model_name="jinaai/jina-embeddings-v2-small-en")

# --- 3. Generar el embedding de la consulta ---
print(f"Generando embedding para la consulta: '{query}'")
query_embedding = next(embedding_model.embed(query))

# --- 4. Generar embeddings para cada documento ---
print("Generando embeddings para los documentos...")
document_texts = [doc['question'] + ' ' + doc['text'] for doc in documents]
# El método embed puede tomar una lista de textos y devolverá un generador
document_embeddings = list(embedding_model.embed(document_texts))

# --- 5. Calcular la similitud de coseno para cada documento ---
print("\nCalculando similitud de coseno entre la consulta y cada documento:")
results = []
for i, doc_embedding in enumerate(document_embeddings):
    # Ya que los embeddings de jinaai/jina-embeddings-v2-small-en están normalizados,
    # el producto punto es directamente la similitud de coseno.
    cosine_similarity = query_embedding.dot(doc_embedding)
    results.append({
        'document_index': i,
        'text': documents[i]['text'][:100] + '...', # Mostrar solo un fragmento del texto
        'similitud_coseno': cosine_similarity
    })
    print(f"Documento {i+1}:")
    print(f"  Texto (fragmento): {documents[i]['text'][:100]}...")
    print(f"  Similitud de Coseno: {cosine_similarity:.4f}\n")

# Opcional: Ordenar los resultados por similitud para ver los más relevantes primero
results_sorted = sorted(results, key=lambda x: x['similitud_coseno'], reverse=True)

print("--- Resultados ordenados por similitud de coseno ---")
for res in results_sorted:
    print(f"Documento {res['document_index']+1} (Similitud: {res['similitud_coseno']:.4f}):")
    print(f"  Texto: {res['text']}\n")

Inicializando el modelo de embedding...
Generando embedding para la consulta: 'Can I still join the course after the start date?'
Generando embeddings para los documentos...

Calculando similitud de coseno entre la consulta y cada documento:
Documento 1:
  Texto (fragmento): Yes, even if you don't register, you're still eligible to submit the homeworks.
Be aware, however, t...
  Similitud de Coseno: 0.9273

Documento 2:
  Texto (fragmento): Yes, we will keep all the materials after the course finishes, so you can follow the course at your ...
  Similitud de Coseno: 0.8595

Documento 3:
  Texto (fragmento): The purpose of this document is to capture frequently asked technical questions
The exact day and ho...
  Similitud de Coseno: 0.8397

Documento 4:
  Texto (fragmento): You can start by installing and setting up all the dependencies and requirements:
Google cloud accou...
  Similitud de Coseno: 0.7632

Documento 5:
  Texto (fragmento): Star the repo! Share it with friends if you find

In [15]:
if results_sorted:
    documento_mas_similar = results_sorted[0]
    indice_original = documento_mas_similar['document_index']
    similitud = documento_mas_similar['similitud_coseno']
    print(f"El documento con mayor similitud es el Documento {indice_original + 1} (índice original {indice_original}).")
    print(f"Su similitud de coseno es: {similitud:.4f}")
else:
    print("No se encontraron resultados.")

El documento con mayor similitud es el Documento 1 (índice original 0).
Su similitud de coseno es: 0.9273


In [16]:
import requests 

docs_url = 'https://github.com/alexeygrigorev/llm-rag-workshop/raw/main/notebooks/documents.json'
docs_response = requests.get(docs_url)
documents_raw = docs_response.json()


documents = []

for course in documents_raw:
    course_name = course['course']
    if course_name != 'machine-learning-zoomcamp':
        continue

    for doc in course['documents']:
        doc['course'] = course_name
        documents.append(doc)

In [20]:
len(documents)

375

In [None]:
from fastembed import TextEmbedding
import numpy as np
from qdrant_client import QdrantClient
from qdrant_client.http.models import Distance, VectorParams

# --- 1. Datos  ---


query = "I just discovered the course. Can I join now?"

# --- 2. Inicializar modelo de embedding ---
model = TextEmbedding(model_name="BAAI/bge-small-en")

# --- 3. Preparar texto para embedding ---
texts = [doc['question'] + ' ' + doc['text'] for doc in documents]

# --- 4. Generar embeddings ---
print("Generando embeddings...")
embeddings = list(model.embed(texts))
query_embedding = next(model.embed(query))

# Convertir a listas normales (list) desde tipos internos de fastembed
embeddings_list = [emb.tolist() for emb in embeddings]
query_vector = query_embedding.tolist()

# --- 5. Conectar con Qdrant ---
client = QdrantClient(url="http://localhost:6333")

collection_name = "course_qa"

# Crear colección (si no existe)
if not client.collection_exists(collection_name):
    print(f"Creando colección '{collection_name}'...")
    client.create_collection(
        collection_name=collection_name,
        vectors_config=VectorParams(size=len(embeddings_list[0]), distance=Distance.COSINE)
    )

# Subir vectores a Qdrant
print("Indexando documentos...")
client.upsert(
    collection_name=collection_name,
    points=[
        {
            "id": idx,
            "vector": embedding,
            "payload": {
                "text": doc["text"],
                "question": doc["question"],
                "course": doc["course"]
            }
        }
        for idx, (embedding, doc) in enumerate(zip(embeddings_list, documents))
    ]
)

# --- 6. Buscar en Qdrant ---
print("\nBuscando resultados...")
search_result = client.search(
    collection_name=collection_name,
    query_vector=query_vector,
    limit=1  # Solo queremos el primer resultado
)

# --- 7. Mostrar el primer resultado y su score ---
first_result = search_result[0]
print("\n--- Primer resultado devuelto ---")
print("ID:", first_result.id)
print("Score:", first_result.score)
print("Payload:", first_result.payload)

Generando embeddings...
Indexando documentos...

Buscando resultados...


  search_result = client.search(



--- Primer resultado devuelto ---
ID: 14
Score: 0.87031716
Payload: {'text': 'Yes, you can. You won’t be able to submit some of the homeworks, but you can still take part in the course.\nIn order to get a certificate, you need to submit 2 out of 3 course projects and review 3 peers’ Projects by the deadline. It means that if you join the course at the end of November and manage to work on two projects, you will still be eligible for a certificate.', 'question': 'The course has already started. Can I still join it?', 'course': 'machine-learning-zoomcamp'}
