In [None]:
!pip install -qU rich
!pip install -qU wandb
!pip install -qU git+https://github.com/wandb/weave.git@feat/groq
!pip install -qU llama-index
!pip install -qU llama-index-embeddings-huggingface llama-index-llms-groq

## Data Ingestion

In [None]:
import os
from typing import List

import rich
import wandb
import weave
from llama_index.core import SimpleDirectoryReader

In [None]:
# Start the W&B run for this index build and initialise Weave on the same project.
project_name = "groq-rag"
wandb.init(project=project_name, job_type="build-vector-index")
weave.init(project_name=project_name)

# Keep every tunable on the run config; the cells further down read them back
# from `config` instead of repeating the literals.
config = wandb.config
config.groq_llm = "mixtral-8x7b-32768"
config.embed_model_name = "BAAI/bge-small-en-v1.5"
config.node_parser_buffer_size = 1
config.node_parser_breakpoint_percentile_threshold = 95

# Reference the source dataset artifact from this run and download it;
# `artifact_dir` is the directory the reader cell builds its input path from.
dataset_artifact_name = "geekyrakshit/groq-rag/ncert-flamingoes:latest"
artifact = wandb.use_artifact(dataset_artifact_name, type="dataset")
artifact_dir = artifact.download()

In [None]:
# Point the directory reader at the prose/chapters folder of the downloaded artifact.
chapters_dir = os.path.join(artifact_dir, "prose", "chapters")
reader = SimpleDirectoryReader(input_dir=chapters_dir)

# Wrap the reader's load_data method in a Weave op, then call the wrapped
# version with 4 workers and a progress bar.
load_documents = weave.op()(reader.load_data)
documents = load_documents(num_workers=4, show_progress=True)

In [None]:
# Sanity check: show how many documents the reader returned. The `=` specifier
# makes the f-string print the expression text alongside its value.
rich.print(f"{len(documents)=}")

In [None]:
# Eyeball the text of the first loaded document to confirm the load looks sane.
first_document_text = documents[0].text
rich.print(f"{first_document_text}")

## Vector Embeddings

In [None]:
from llama_index.core.node_parser import SemanticSplitterNodeParser
from llama_index.embeddings.huggingface import HuggingFaceEmbedding

In [None]:
# Instantiate the Hugging Face embedding model whose name is stored on the run config.
embedding_model_name = config.embed_model_name
embed_model = HuggingFaceEmbedding(model_name=embedding_model_name)

In [None]:
# Collect the splitter settings (taken from the run config) together with the
# embedding model the splitter is given.
splitter_options = dict(
    embed_model=embed_model,
    buffer_size=config.node_parser_buffer_size,
    breakpoint_percentile_threshold=config.node_parser_breakpoint_percentile_threshold,
)
splitter = SemanticSplitterNodeParser(**splitter_options)

# Turn the loaded documents into nodes, showing a progress bar while it runs.
nodes = splitter.get_nodes_from_documents(documents, show_progress=True)

## Vector Index

In [None]:
from google.colab import userdata
from llama_index.core import ServiceContext, VectorStoreIndex
from llama_index.llms.groq import Groq

In [None]:
# Groq LLM client for the model named on the run config. The API key is looked
# up through Colab's `userdata` at call time and passed straight in, so it is
# neither hardcoded nor bound to a notebook variable.
llm = Groq(api_key=userdata.get("GROQ_API_KEY"), model=config.groq_llm)

In [None]:
# Bundle the embedding model and the LLM for index construction.
# NOTE(review): ServiceContext is deprecated in llama-index 0.10+ in favour of
# `Settings` / passing components directly, and the install cell does not pin a
# version -- confirm this call still works on the installed release.
service_context = ServiceContext.from_defaults(
    embed_model=embed_model, llm=llm
)

# Build the index from the semantically split `nodes` produced by the splitter
# cell. The previous call used `VectorStoreIndex.from_documents(documents, ...,
# node_parser=nodes)`: `from_documents` chunks the raw documents itself and has
# no `node_parser` parameter that accepts a list of nodes, so the semantic
# chunks were computed but never indexed. Passing `nodes` to the constructor
# indexes exactly those chunks and avoids splitting the documents a second time.
vector_index = VectorStoreIndex(
    nodes=nodes,
    show_progress=True,
    service_context=service_context,
)

# Persist the index's storage context to disk; the directory is uploaded as a
# W&B artifact in the final cell.
vector_index.storage_context.persist(
    persist_dir="./vector_embedding_storage"
)

In [None]:
# Wrap the persisted index directory in a new W&B artifact of type "vector_index".
artifact = wandb.Artifact(
    type="vector_index",
    name="ncert-flamingoes-prose-embeddings",
)
artifact.add_dir(local_path="./vector_embedding_storage")

# Log the artifact on the active run, then finish the run.
wandb.log_artifact(artifact)
wandb.finish()