In [27]:
from llama_index.core import (
    VectorStoreIndex, 
    SimpleDirectoryReader, 
    StorageContext, 
    ServiceContext, 
    load_index_from_storage
)
from llama_index.core.node_parser import SemanticSplitterNodeParser
from llama_index.embeddings.gemini import GeminiEmbedding
from llama_index.llms.groq import Groq
from llama_index.postprocessor.cohere_rerank import CohereRerank
import os
from dotenv import load_dotenv
load_dotenv()

True

In [28]:
GOOGLE_API_KEY = os.getenv("GEMINI_API_KEY")
GROQ_API_KEY = os.getenv("GROQ_API_KEY")
COHERE_API_KEY = os.getenv("COHERE_API_KEY")

In [29]:
reader = SimpleDirectoryReader(input_dir="/Users/kiwitech/Desktop/untitled folder")
documents = reader.load_data()

Generating embeddings:  82%|████████▏ | 9/11 [50:02<11:07, 333.59s/it]
Generating embeddings:  82%|████████▏ | 9/11 [48:38<10:48, 324.29s/it]
Generating embeddings:  82%|████████▏ | 9/11 [39:46<08:50, 265.12s/it]


In [30]:
embed_model = GeminiEmbedding(
    model_name="models/embedding-001", api_key=GOOGLE_API_KEY
)
splitter = SemanticSplitterNodeParser(
    buffer_size=1, 
    breakpoint_percentile_threshold=95, 
    embed_model=embed_model
)
nodes = splitter.get_nodes_from_documents(documents, show_progress=True)

Generating embeddings: 100%|██████████| 11/11 [00:09<00:00,  1.19it/s]
Generating embeddings: 100%|██████████| 30/30 [00:21<00:00,  1.37it/s]
Generating embeddings: 100%|██████████| 22/22 [00:10<00:00,  2.07it/s]
Generating embeddings: 100%|██████████| 26/26 [00:12<00:00,  2.12it/s]
Generating embeddings: 100%|██████████| 24/24 [00:11<00:00,  2.05it/s]
Generating embeddings: 100%|██████████| 37/37 [00:18<00:00,  2.05it/s]
Generating embeddings: 100%|██████████| 47/47 [00:23<00:00,  2.03it/s]
Generating embeddings: 100%|██████████| 17/17 [00:08<00:00,  2.01it/s]
Generating embeddings: 100%|██████████| 7/7 [00:03<00:00,  1.90it/s]
Parsing nodes: 100%|██████████| 9/9 [01:59<00:00, 13.23s/it]


In [31]:
llm = Groq(model="mixtral-8x7b-32768", api_key=GROQ_API_KEY)

In [32]:
service_context = ServiceContext.from_defaults(embed_model=embed_model, llm=llm)

  service_context = ServiceContext.from_defaults(embed_model=embed_model, llm=llm)


In [33]:
vector_index = VectorStoreIndex.from_documents(documents, show_progress=True, 
               service_context=service_context, node_parser=nodes)

Parsing nodes: 100%|██████████| 9/9 [00:00<00:00, 664.89it/s]
Generating embeddings: 100%|██████████| 11/11 [00:05<00:00,  2.08it/s]


In [34]:
vector_index.storage_context.persist(persist_dir="./storage")

In [35]:
storage_context = StorageContext.from_defaults(persist_dir="./storage")

In [36]:
index = load_index_from_storage(storage_context, service_context=service_context)

In [37]:
cohere_rerank = CohereRerank(api_key=COHERE_API_KEY, top_n=2)

In [38]:
query_engine = index.as_query_engine(service_context=service_context,
                similarity_top_k=10,
                node_postprocessors=[cohere_rerank],)

In [39]:
query = "Summarize all the documents"
resp = query_engine.query(query)

In [40]:
resp.response

'The documents present a study on a 1-bit Large Language Model (LLM) called BitNet b1.58. The model is based on the BitNet architecture, which uses a Transformer that replaces nn.Linear with BitLinear. BitNet b1.58 has 1.58-bit weights and 8-bit activations and is trained from scratch. The model introduces a new computation paradigm that requires almost no multiplication operations for matrix multiplication, resulting in energy savings and faster computation. BitNet b1.58 has a much lower memory footprint than full-precision models, reducing the cost and time of loading weights from DRAM.\n\nThe study compares BitNet b1.58 with a full-precision baseline, LLaMA LLM, and shows that BitNet b1.58 can match the performance of the full precision baseline starting from a 3B size. The memory and latency cost of BitNet b1.58 is lower than LLaMA LLM, demonstrating that BitNet b1.58 is a Pareto improvement over the state-of-the-art LLM models.\n\nThe study further scales up the model size to 7B, 