# Create vectors and store them

In [None]:
# Install requirements

!pip install -q llama-index==0.12.21 openai==1.59.8 tiktoken==0.8.0  huggingface-hub==0.30 chromadb==0.6.0 llama-index-vector-stores-chroma==0.4.1 llama-index-llms-gemini==0.4.1 llama_index.embeddings.huggingface==0.5.4 llama-index-embeddings-adapter==0.3.0 llama-index-finetuning==0.3.2

In [None]:
# Set variables: read API credentials from the Colab secret store.
import os
from google.colab import userdata

# OpenAI key, exported as an environment variable.
os.environ["OPENAI_API_KEY"] = userdata.get('openai_api_key')

# Hugging Face token: looked up once, then both exported as an environment
# variable and kept in HF_TOKEN, which the upload cells pass explicitly.
HF_TOKEN = userdata.get('HF_TOKEN2')
os.environ["HF_TOKEN"] = HF_TOKEN


In [None]:
# Patch asyncio so it can be used where an event loop is already running,
# as is the case inside Jupyter notebooks.
import nest_asyncio
nest_asyncio.apply()

# Create a VectorStore


In [None]:
import chromadb
from llama_index.vector_stores.chroma import ChromaVectorStore

# Open (or create) the on-disk Chroma database and the collection that will
# hold the embedded nodes. chromadb.EphemeralClient() is the in-memory
# alternative when nothing needs to be persisted.
chroma_path = './azure-architect'
chroma_client = chromadb.PersistentClient(path=chroma_path)
chroma_collection = chroma_client.get_or_create_collection(name="azure-architect")

# Wrap the collection in the LlamaIndex vector-store adapter that the
# ingestion pipeline writes to.
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)

# Transforming


In [None]:
import multiprocessing

# Number of CPU cores reported for this machine. In the cells shown here it is
# only referenced by the commented-out `num_workers=cores` option of processNodes.
cores = multiprocessing.cpu_count()
cores

8

In [None]:
# Download the node pickles from the Hugging Face dataset repository.

from huggingface_hub import snapshot_download

snapshot_download(
    repo_id="vicpada/AzureResources",
    repo_type="dataset",
    allow_patterns=["*.pkl"],  # restrict the download to pickle files
    local_dir="/content",
)

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

enhanced_nodes_microsoft-learn.pkl:   0%|          | 0.00/324M [00:00<?, ?B/s]

enhanced_nodes_github-samples.pkl:   0%|          | 0.00/212M [00:00<?, ?B/s]

enhanced_nodes_azure-architecture.pkl:   0%|          | 0.00/18.3M [00:00<?, ?B/s]

enhanced_nodes_azure-updates.pkl:   0%|          | 0.00/22.2M [00:00<?, ?B/s]

'/content'

In [None]:
# Merge and load nodes

import pickle
import os

# Directory the node pickles were downloaded into.
nodes_dir = '/content'

# Pickles that this notebook itself writes. When the working directory is
# nodes_dir they end up next to the downloaded files, so they must be skipped:
# otherwise they inflate the reported file count and, on a re-run of this cell,
# are loaded again and duplicate every node.
generated_files = {'combined_data.pkl', 'all_data_embedded.pkl'}

# Sorted so the merge order does not depend on directory listing order.
source_files = sorted(
    name for name in os.listdir(nodes_dir)
    if name.endswith('.pkl') and name not in generated_files
)

all_data = []
for filename in source_files:
    file_path = os.path.join(nodes_dir, filename)
    # NOTE: pickle.load can execute arbitrary code; only load files from a
    # source you trust.
    with open(file_path, 'rb') as f:
        data = pickle.load(f)
    all_data.extend(data)  # Assumes each pickle file contains a list

# Optional: Save the combined data to a new pickle file
with open('combined_data.pkl', 'wb') as f:
    pickle.dump(all_data, f)

print(f"Combined data from {len(source_files)} files into a single list with {len(all_data)} items.")

Combined data from 5 files into a single list with 195707 items.


In [None]:
# Download the fine-tuned embedding adapter from Hugging Face
# (repo vicpada/finetuned_embed_model_full) into a local folder.

from huggingface_hub import snapshot_download

snapshot_download(
    repo_id="vicpada/finetuned_embed_model_full",
    repo_type="model",
    local_dir="./finetuned_embed_model_full",
)

Fetching 3 files:   0%|          | 0/3 [00:00<?, ?it/s]

pytorch_model.bin:   0%|          | 0.00/593k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/55.0 [00:00<?, ?B/s]

.gitattributes:   0%|          | 0.00/1.52k [00:00<?, ?B/s]

'/content/finetuned_embed_model_full'

In [None]:
from llama_index.core.embeddings import resolve_embed_model
from llama_index.embeddings.adapter import AdapterEmbeddingModel

# Base embedding model, without any fine-tuning applied.
base_embed_model = resolve_embed_model("local:BAAI/bge-small-en-v1.5")

# Fine-tuned model: the base model combined with the adapter found in the
# "finetuned_embed_model_full" folder (a relative path, matching the
# local_dir used when the adapter repo was downloaded).
embed_model = AdapterEmbeddingModel(base_embed_model, "finetuned_embed_model_full")

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/94.8k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/743 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/133M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.core.ingestion import IngestionPipeline


# Ingestion pipeline with a single transformation: embedding with `embed_model`.
# No splitter is configured here, so the nodes are embedded as they are given.
# The pipeline is attached to the Chroma-backed `vector_store`.
pipeline = IngestionPipeline(
    transformations=[
        embed_model,
    ],
    vector_store=vector_store,
)



In [None]:
def processNodes(pipeline, nodes):
  """Run `nodes` through `pipeline` with a progress bar.

  Args:
    pipeline: object exposing `run(nodes=..., show_progress=...)`.
    nodes: the nodes handed to `pipeline.run`.

  Returns:
    Whatever `pipeline.run` returns.
  """
  # Parallel execution (num_workers=cores) is not enabled here.
  return pipeline.run(nodes=nodes, show_progress=True)

In [None]:
import pprint
# Embed and store all nodes in fixed-size batches.

# Number of nodes sent through the pipeline per call.
batch_size = 1000
nodes_to_process = len(all_data)
# Ceiling division: a final, partially filled batch still counts as a batch.
total_batches = (nodes_to_process + batch_size - 1) // batch_size

# Iterate through the nodes in batches; batch_number is 1-based for display.
for batch_number, i in enumerate(range(0, nodes_to_process, batch_size), start=1):
    print(f"Batch {batch_number} of {total_batches}")
    batch_documents = all_data[i:i + batch_size]
    # Process this slice of nodes using the pipeline
    b = processNodes(pipeline, batch_documents)

print("Finished processing documents in batches.")


Batch 1.0 of 195.707


Generating embeddings:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 2.0 of 195.707


Generating embeddings:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 3.0 of 195.707


Generating embeddings:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 4.0 of 195.707


Generating embeddings:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 5.0 of 195.707


Generating embeddings:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 6.0 of 195.707


Generating embeddings:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 7.0 of 195.707


Generating embeddings:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 8.0 of 195.707


Generating embeddings:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 9.0 of 195.707


Generating embeddings:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 10.0 of 195.707


Generating embeddings:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 11.0 of 195.707


Generating embeddings:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 12.0 of 195.707


Generating embeddings:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 13.0 of 195.707


Generating embeddings:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 14.0 of 195.707


Generating embeddings:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 15.0 of 195.707


Generating embeddings:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 16.0 of 195.707


Generating embeddings:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 17.0 of 195.707


Generating embeddings:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 18.0 of 195.707


Generating embeddings:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 19.0 of 195.707


Generating embeddings:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 20.0 of 195.707


Generating embeddings:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 21.0 of 195.707


Generating embeddings:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 22.0 of 195.707


Generating embeddings:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 23.0 of 195.707


Generating embeddings:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 24.0 of 195.707


Generating embeddings:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 25.0 of 195.707


Generating embeddings:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 26.0 of 195.707


Generating embeddings:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 27.0 of 195.707


Generating embeddings:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 28.0 of 195.707


Generating embeddings:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 29.0 of 195.707


Generating embeddings:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 30.0 of 195.707


Generating embeddings:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 31.0 of 195.707


Generating embeddings:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 32.0 of 195.707


Generating embeddings:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 33.0 of 195.707


Generating embeddings:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 34.0 of 195.707


Generating embeddings:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 35.0 of 195.707


Generating embeddings:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 36.0 of 195.707


Generating embeddings:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 37.0 of 195.707


Generating embeddings:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 38.0 of 195.707


Generating embeddings:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 39.0 of 195.707


Generating embeddings:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 40.0 of 195.707


Generating embeddings:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 41.0 of 195.707


Generating embeddings:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 42.0 of 195.707


Generating embeddings:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 43.0 of 195.707


Generating embeddings:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 44.0 of 195.707


Generating embeddings:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 45.0 of 195.707


Generating embeddings:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 46.0 of 195.707


Generating embeddings:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 47.0 of 195.707


Generating embeddings:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 48.0 of 195.707


Generating embeddings:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 49.0 of 195.707


Generating embeddings:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 50.0 of 195.707


Generating embeddings:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 51.0 of 195.707


Generating embeddings:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 52.0 of 195.707


Generating embeddings:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 53.0 of 195.707


Generating embeddings:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 54.0 of 195.707


Generating embeddings:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 55.0 of 195.707


Generating embeddings:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 56.0 of 195.707


Generating embeddings:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 57.0 of 195.707


Generating embeddings:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 58.0 of 195.707


Generating embeddings:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 59.0 of 195.707


Generating embeddings:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 60.0 of 195.707


Generating embeddings:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 61.0 of 195.707


Generating embeddings:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 62.0 of 195.707


Generating embeddings:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 63.0 of 195.707


Generating embeddings:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 64.0 of 195.707


Generating embeddings:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 65.0 of 195.707


Generating embeddings:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 66.0 of 195.707


Generating embeddings:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 67.0 of 195.707


Generating embeddings:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 68.0 of 195.707


Generating embeddings:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 69.0 of 195.707


Generating embeddings:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 70.0 of 195.707


Generating embeddings:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 71.0 of 195.707


Generating embeddings:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 72.0 of 195.707


Generating embeddings:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 73.0 of 195.707


Generating embeddings:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 74.0 of 195.707


Generating embeddings:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 75.0 of 195.707


Generating embeddings:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 76.0 of 195.707


Generating embeddings:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 77.0 of 195.707


Generating embeddings:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 78.0 of 195.707


Generating embeddings:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 79.0 of 195.707


Generating embeddings:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 80.0 of 195.707


Generating embeddings:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 81.0 of 195.707


Generating embeddings:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 82.0 of 195.707


Generating embeddings:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 83.0 of 195.707


Generating embeddings:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 84.0 of 195.707


Generating embeddings:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 85.0 of 195.707


Generating embeddings:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 86.0 of 195.707


Generating embeddings:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 87.0 of 195.707


Generating embeddings:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 88.0 of 195.707


Generating embeddings:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 89.0 of 195.707


Generating embeddings:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 90.0 of 195.707


Generating embeddings:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 91.0 of 195.707


Generating embeddings:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 92.0 of 195.707


Generating embeddings:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 93.0 of 195.707


Generating embeddings:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 94.0 of 195.707


Generating embeddings:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 95.0 of 195.707


Generating embeddings:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 96.0 of 195.707


Generating embeddings:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 97.0 of 195.707


Generating embeddings:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 98.0 of 195.707


Generating embeddings:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 99.0 of 195.707


Generating embeddings:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 100.0 of 195.707


Generating embeddings:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 101.0 of 195.707


Generating embeddings:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 102.0 of 195.707


Generating embeddings:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 103.0 of 195.707


Generating embeddings:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 104.0 of 195.707


Generating embeddings:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 105.0 of 195.707


Generating embeddings:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 106.0 of 195.707


Generating embeddings:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 107.0 of 195.707


Generating embeddings:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 108.0 of 195.707


Generating embeddings:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 109.0 of 195.707


Generating embeddings:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 110.0 of 195.707


Generating embeddings:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 111.0 of 195.707


Generating embeddings:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 112.0 of 195.707


Generating embeddings:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 113.0 of 195.707


Generating embeddings:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 114.0 of 195.707


Generating embeddings:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 115.0 of 195.707


Generating embeddings:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 116.0 of 195.707


Generating embeddings:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 117.0 of 195.707


Generating embeddings:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 118.0 of 195.707


Generating embeddings:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 119.0 of 195.707


Generating embeddings:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 120.0 of 195.707


Generating embeddings:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 121.0 of 195.707


Generating embeddings:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 122.0 of 195.707


Generating embeddings:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 123.0 of 195.707


Generating embeddings:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 124.0 of 195.707


Generating embeddings:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 125.0 of 195.707


Generating embeddings:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 126.0 of 195.707


Generating embeddings:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 127.0 of 195.707


Generating embeddings:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 128.0 of 195.707


Generating embeddings:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 129.0 of 195.707


Generating embeddings:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 130.0 of 195.707


Generating embeddings:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 131.0 of 195.707


Generating embeddings:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 132.0 of 195.707


Generating embeddings:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 133.0 of 195.707


Generating embeddings:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 134.0 of 195.707


Generating embeddings:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 135.0 of 195.707


Generating embeddings:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 136.0 of 195.707


Generating embeddings:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 137.0 of 195.707


Generating embeddings:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 138.0 of 195.707


Generating embeddings:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 139.0 of 195.707


Generating embeddings:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 140.0 of 195.707


Generating embeddings:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 141.0 of 195.707


Generating embeddings:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 142.0 of 195.707


Generating embeddings:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 143.0 of 195.707


Generating embeddings:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 144.0 of 195.707


Generating embeddings:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 145.0 of 195.707


Generating embeddings:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 146.0 of 195.707


Generating embeddings:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 147.0 of 195.707


Generating embeddings:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 148.0 of 195.707


Generating embeddings:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 149.0 of 195.707


Generating embeddings:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 150.0 of 195.707


Generating embeddings:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 151.0 of 195.707


Generating embeddings:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 152.0 of 195.707


Generating embeddings:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 153.0 of 195.707


Generating embeddings:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 154.0 of 195.707


Generating embeddings:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 155.0 of 195.707


Generating embeddings:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 156.0 of 195.707


Generating embeddings:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 157.0 of 195.707


Generating embeddings:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 158.0 of 195.707


Generating embeddings:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 159.0 of 195.707


Generating embeddings:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 160.0 of 195.707


Generating embeddings:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 161.0 of 195.707


Generating embeddings:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 162.0 of 195.707


Generating embeddings:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 163.0 of 195.707


Generating embeddings:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 164.0 of 195.707


Generating embeddings:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 165.0 of 195.707


Generating embeddings:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 166.0 of 195.707


Generating embeddings:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 167.0 of 195.707


Generating embeddings:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 168.0 of 195.707


Generating embeddings:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 169.0 of 195.707


Generating embeddings:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 170.0 of 195.707


Generating embeddings:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 171.0 of 195.707


Generating embeddings:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 172.0 of 195.707


Generating embeddings:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 173.0 of 195.707


Generating embeddings:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 174.0 of 195.707


Generating embeddings:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 175.0 of 195.707


Generating embeddings:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 176.0 of 195.707


Generating embeddings:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 177.0 of 195.707


Generating embeddings:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 178.0 of 195.707


Generating embeddings:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 179.0 of 195.707


Generating embeddings:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 180.0 of 195.707


Generating embeddings:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 181.0 of 195.707


Generating embeddings:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 182.0 of 195.707


Generating embeddings:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 183.0 of 195.707


Generating embeddings:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 184.0 of 195.707


Generating embeddings:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 185.0 of 195.707


Generating embeddings:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 186.0 of 195.707


Generating embeddings:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 187.0 of 195.707


Generating embeddings:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 188.0 of 195.707


Generating embeddings:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 189.0 of 195.707


Generating embeddings:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 190.0 of 195.707


Generating embeddings:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 191.0 of 195.707


Generating embeddings:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 192.0 of 195.707


Generating embeddings:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 193.0 of 195.707


Generating embeddings:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 194.0 of 195.707


Generating embeddings:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 195.0 of 195.707


Generating embeddings:   0%|          | 0/1000 [00:00<?, ?it/s]

Batch 196.0 of 195.707


Generating embeddings:   0%|          | 0/707 [00:00<?, ?it/s]

Finished processing documents in batches.


In [None]:
# Optional: write the node list to its own pickle file after the ingestion run.
# NOTE(review): presumably the nodes now carry their embeddings, since the
# pipeline ran over these same objects — confirm before relying on it.
with open('all_data_embedded.pkl', 'wb') as embedded_file:
  pickle.dump(all_data, embedded_file)


In [None]:
# Upload the pickled node list to the Hugging Face dataset repository.

from huggingface_hub import HfApi

# Local .pkl file to upload.
pkl_file_path = 'all_data_embedded.pkl'

# Target repository. It must already exist on Hugging Face.
repo_id = "vicpada/AzureResources"  # Replace with your username and desired repo name

api = HfApi()

# The file is stored under "embedded/" in the repository, keeping its base name.
try:
    api.upload_file(
        path_or_fileobj=pkl_file_path,
        path_in_repo='embedded/' + os.path.basename(pkl_file_path),
        repo_id=repo_id,
        repo_type="dataset",  # Or "model" or "space" depending on the repository type
        token=HF_TOKEN,  # Token defined in the variables cell
    )
except Exception as e:
    print(f"Error uploading file: {e}")
else:
    print(f"Successfully uploaded {pkl_file_path} to {repo_id}")



all_data_embedded.pkl:   0%|          | 0.00/1.26G [00:00<?, ?B/s]

Successfully uploaded all_data_embedded.pkl to vicpada/AzureResources


In [None]:
# Upload the persisted Chroma database folder to Hugging Face.
# The folder is uploaded as-is; nothing is zipped.

from huggingface_hub import HfApi, HfFolder  # HfFolder is not used by this cell

# Client authenticated with the token defined in the variables cell.
api = HfApi(token=HF_TOKEN)

# Specify your Hugging Face repository details
repo_id = "vicpada/AzureArchitectKnowledgeFull" # Replace with your username and repository name

# Create the dataset repository; exist_ok=True tolerates a repository that
# already exists.
api.create_repo(repo_id, repo_type="dataset", exist_ok=True)

# Local folder holding the Chroma database.
folder_path = "./azure-architect"

# NOTE(review): delete_patterns="*" is expected to remove files already in the
# repository that match the pattern, so the repo ends up mirroring the local
# folder — confirm this is intended before pointing at a shared repo.
api.upload_folder(
    folder_path=folder_path,
    repo_id=repo_id,
    repo_type="dataset",
    delete_patterns="*"
)

print(f"Successfully uploaded {folder_path} to Hugging Face repository: {repo_id}")


data_level0.bin:   0%|          | 0.00/327M [00:00<?, ?B/s]

index_metadata.pickle:   0%|          | 0.00/12.1M [00:00<?, ?B/s]

length.bin:   0%|          | 0.00/780k [00:00<?, ?B/s]

header.bin:   0%|          | 0.00/100 [00:00<?, ?B/s]

Upload 6 LFS files:   0%|          | 0/6 [00:00<?, ?it/s]

link_lists.bin:   0%|          | 0.00/1.66M [00:00<?, ?B/s]

chroma.sqlite3:   0%|          | 0.00/5.20G [00:00<?, ?B/s]

Successfully uploaded vectorDB.zip to Hugging Face repository: vicpada/AzureArchitectKnowledgeFull


# Load Indexes


In [None]:
import chromadb
from llama_index.vector_stores.chroma import ChromaVectorStore


# Load the vector store from the local storage.
# The client is opened with the same (default) settings as the PersistentClient
# created on this path earlier in the notebook. Chroma rejects a second client
# on the same path with different settings within one process, so passing
# Settings(allow_reset=True) only here breaks a top-to-bottom run. If reset()
# is ever needed, pass the same Settings wherever this path is first opened.
chroma_path = './azure-architect'
db = chromadb.PersistentClient(path=chroma_path)

# get_collection expects the collection to exist already (it is not created here).
chroma_collection = db.get_collection("azure-architect")
vector_store = ChromaVectorStore(chroma_collection=chroma_collection)

In [None]:
# Sanity check: display a sample of the records stored in the collection.
chroma_collection.peek()

{'ids': ['d866ff40-8d27-4447-a142-9641865b6aa9',
  '27db2386-05ff-4415-b3c8-fc30f40ef8e2',
  '05f145dd-9e85-4432-b36b-25768c9f54d1',
  '56497153-33fe-4b46-904d-75bb33efe1f8',
  '6b2f2b94-3210-4ab0-b829-80205656fc31',
  '1cb11ef9-f4d9-44f7-8fb3-d5d71664a5e9',
  'd2d88404-ad80-44e5-8299-35c6373b3892',
  'f7ac8d77-51fe-4b75-b660-7d37d26580ef',
  '07803f55-a6fc-4b28-9e6f-fc532a7864ca',
  'e7f631bb-8ace-46cf-9a6f-92fb405438fa'],
 'embeddings': array([[-0.04770017,  0.01702834, -0.01900982, ...,  0.06583185,
         -0.02026628, -0.01509645],
        [-0.04897779, -0.01541488, -0.06792049, ...,  0.05747938,
          0.03297177, -0.04528217],
        [-0.03880099, -0.02372612, -0.02887052, ...,  0.03350438,
          0.00538404, -0.05245819],
        ...,
        [-0.03857255, -0.026109  , -0.01282983, ...,  0.01373301,
          0.01062632, -0.01129852],
        [-0.05812408, -0.03337533, -0.01429535, ...,  0.05321993,
         -0.01180594, -0.01416142],
        [-0.07606846, -0.00972674, 

In [None]:
from llama_index.core import VectorStoreIndex

# Create the index based on the vector store.
# The fine-tuned embed_model is passed explicitly so the index embeds queries
# with the same model that produced the stored vectors, instead of falling
# back to the library's globally configured default embedding model.
index = VectorStoreIndex.from_vector_store(vector_store, embed_model=embed_model)

In [None]:
from IPython.display import Markdown, display
def display_response(response):
    """Render `response` in the cell output as bold Markdown."""
    bold_markup = f"<b>{response}</b>"
    display(Markdown(bold_markup))

In [None]:
from llama_index.llms.gemini import Gemini

# LLM used to formulate the final answer.
# NOTE(review): no Google API key is set in the cells shown here — confirm how
# the Gemini client is authenticated in this environment.
llm = Gemini(
    model="models/gemini-2.0-flash",
    temperature=1,
    max_tokens=512,
)

# Query engine: retrieves the 5 most similar pieces of text (the query is
# embedded with embed_model) and hands them to the LLM to write the answer.
query_engine = index.as_query_engine(
    llm=llm,
    similarity_top_k=5,
    embed_model=embed_model,
)

In [None]:
query = "When to use Azure Functions instead of App Service? Please explain with details. Use only the context provided" # Enter your query here, it should be relevant to the crawled websites
res = query_engine.query(query)
display_response(res)

print("-----------------")
# Show the retrieved nodes.
# All metadata fields are read with .get so that a node lacking one of them is
# listed with None instead of aborting the whole listing with a KeyError.
for src in res.source_nodes:
  meta = src.metadata
  print("Node ID\t", src.node_id)
  print("Title\t", meta.get('title'))
  print("URL\t", meta.get('url'))
  print("Score\t", src.score)
  print("Content Type\t", meta.get("contentType"))
  print("-_"*20)

<b>Azure Functions provides various hosting options for different business needs and application workloads. One can opt for fully serverless options, where you only pay for execution time, or choose to have always-warm instances for faster response times. You can also host functions in an existing App Service plan when you have excess App Service hosting resources. If you desire complete control over your runtime environment and dependencies, you have the option to deploy your functions in customizable containers. These containers can be hosted by Functions, deployed as part of a microservices architecture in Azure Container Apps, or self-hosted in Kubernetes. Running Functions on Container Apps is a good fit for a multitype microservices architecture in a centralized environment, providing consistent network, observability, and billing configurations.
</b>

-----------------
Node ID	 7bb7cadb-c606-46fe-acf4-eab83041d23d
Title	 Guidance for developing Azure Functions | Microsoft Learn
URL	 https://learn.microsoft.com/en-us/azure/azure-functions/functions-reference?tabs=blob&toc=%2Fazure%2Fdeveloper%2Fjavascript%2Ftoc.json&bc=%2Fazure%2Fdeveloper%2Fjavascript%2Fbreadcrumb%2Ftoc.json&pivots=programming-language-csharp
Score	 0.5963049277608097
Content Type	 None
-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_
Node ID	 2befe6e7-6f60-40b5-a303-7ff17bad7076
Title	 Azure Functions – Serverless Functions in Computing | Microsoft Azure
URL	 https://azure.microsoft.com/en-gb/products/functions/
Score	 0.5875640306714637
Content Type	 None
-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_
Node ID	 0508adbd-1e4d-4532-b133-9b4e3dde0aca
Title	 Azure Functions – Serverless Functions in Computing | Microsoft Azure
URL	 https://azure.microsoft.com/en-us/products/functions/
Score	 0.5844802046476505
Content Type	 None
-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_-_
Node ID	 5e51

# Evaluate

In [None]:
from huggingface_hub import hf_hub_download
from llama_index.finetuning.embeddings.common import (
    EmbeddingQAFinetuneDataset,
)

# Fetch the evaluation dataset file into the working directory ...
hf_hub_download(
    repo_id="vicpada/AzureRAGEval",
    filename="rag_eval_dataset2.json",
    repo_type="dataset",
    local_dir=".",
)

# ... and load it from there.
rag_eval_dataset = EmbeddingQAFinetuneDataset.from_json("./rag_eval_dataset2.json")

In [None]:
import pandas as pd


#  A simple function to show the evaluation result.
def display_results_retriever(name, eval_results):
    """Summarise retriever evaluation results as a one-row DataFrame.

    Args:
        name: label placed in the "Retriever" column.
        eval_results: iterable of objects exposing a `metric_vals_dict`
            mapping with the keys hit_rate, mrr, precision, recall, ap, ndcg.

    Returns:
        A pandas DataFrame with a single row holding `name` and the mean of
        each metric across `eval_results`.
    """
    # One row per evaluated query, one column per metric.
    per_query = pd.DataFrame([result.metric_vals_dict for result in eval_results])

    # Metric key -> column title in the summary, in output order.
    column_titles = {
        "hit_rate": "Hit Rate",
        "mrr": "MRR",
        "precision": "Precision",
        "recall": "Recall",
        "ap": "AP",
        "ndcg": "NDCG",
    }

    summary = {"Retriever": [name]}
    for metric_key, title in column_titles.items():
        summary[title] = [per_query[metric_key].mean()]

    return pd.DataFrame(summary)

In [None]:
from llama_index.core.evaluation import RetrieverEvaluator

metrics = ["hit_rate", "mrr", "precision", "recall", "ap", "ndcg"]


# We can evaluate the retievers with different top_k values.
for i in [2,10,20,50,100,200,400]:
    retriever = index.as_retriever(
        similarity_top_k=i,
        embed_model=embed_model
    )
    retriever_evaluator = RetrieverEvaluator.from_metric_names(
        metrics, retriever=retriever
    )
    eval_results = await retriever_evaluator.aevaluate_dataset(rag_eval_dataset)
    print(display_results_retriever(f"top_{i}", eval_results))