In [2]:
from llama_index.core import (
    VectorStoreIndex, 
    SimpleDirectoryReader, 
    StorageContext, 
    ServiceContext, 
    load_index_from_storage
)
from llama_index.core.node_parser import SemanticSplitterNodeParser
from llama_index.embeddings.gemini import GeminiEmbedding
from llama_index.llms.groq import Groq
from llama_index.postprocessor.cohere_rerank import CohereRerank
import os
from dotenv import load_dotenv
load_dotenv('.local-env')

True

In [3]:
GOOGLE_API_KEY = os.getenv("GEMINI_API_KEY")
GROQ_API_KEY = os.getenv("GROQ_API_KEY")
COHERE_API_KEY = os.getenv("COHERE_API_KEY")

In [4]:
reader = SimpleDirectoryReader(input_dir="/files")
documents = reader.load_data()

In [5]:
embed_model = GeminiEmbedding(
    model_name="models/embedding-001", api_key=GOOGLE_API_KEY
)
splitter = SemanticSplitterNodeParser(
    buffer_size=1, 
    breakpoint_percentile_threshold=95, 
    embed_model=embed_model
)
nodes = splitter.get_nodes_from_documents(documents, show_progress=True)

Generating embeddings: 0it [00:00, ?it/s]:00<?, ?it/s]
Generating embeddings: 0it [00:00, ?it/s]
Generating embeddings: 0it [00:00, ?it/s]
Generating embeddings: 0it [00:00, ?it/s]
Generating embeddings: 0it [00:00, ?it/s]
Generating embeddings: 100%|██████████| 7/7 [00:06<00:00,  1.16it/s]
Generating embeddings: 100%|██████████| 12/12 [00:08<00:00,  1.42it/s]
Generating embeddings: 100%|██████████| 13/13 [00:08<00:00,  1.46it/s]
Generating embeddings: 100%|██████████| 11/11 [00:07<00:00,  1.45it/s]
Generating embeddings: 100%|██████████| 19/19 [00:08<00:00,  2.18it/s]
Generating embeddings: 100%|██████████| 5/5 [00:02<00:00,  2.37it/s]
Generating embeddings: 100%|██████████| 1/1 [00:00<00:00,  1.86it/s]
Generating embeddings: 100%|██████████| 12/12 [00:06<00:00,  1.92it/s]
Generating embeddings: 100%|██████████| 16/16 [00:07<00:00,  2.14it/s]
Generating embeddings: 100%|██████████| 12/12 [00:05<00:00,  2.16it/s]
Generating embeddings: 100%|██████████| 19/19 [00:08<00:00,  2.17it/s]
Ge

In [6]:
llm = Groq(model="mixtral-8x7b-32768", api_key=GROQ_API_KEY)

In [7]:
service_context = ServiceContext.from_defaults(embed_model=embed_model, llm=llm)

  service_context = ServiceContext.from_defaults(embed_model=embed_model, llm=llm)


In [8]:
vector_index = VectorStoreIndex.from_documents(documents, show_progress=True, 
               service_context=service_context, node_parser=nodes)

Parsing nodes:   0%|          | 0/263 [00:00<?, ?it/s]

Parsing nodes: 100%|██████████| 263/263 [00:00<00:00, 561.54it/s]
Generating embeddings: 100%|██████████| 268/268 [02:25<00:00,  1.84it/s]


In [9]:
vector_index.storage_context.persist(persist_dir="./storage")

In [10]:
storage_context = StorageContext.from_defaults(persist_dir="./storage")

In [11]:
index = load_index_from_storage(storage_context, service_context=service_context)

In [12]:
cohere_rerank = CohereRerank(api_key=COHERE_API_KEY, top_n=2)

In [13]:
query_engine = index.as_query_engine(service_context=service_context,
                similarity_top_k=10,
                node_postprocessors=[cohere_rerank],)

In [14]:
query = "Summarize all the documents"
resp = query_engine.query(query)

CohereAPIError: invalid request: list of documents must not contain only empty strings or only whitespace

In [40]:
resp.response

'The documents present a study on a 1-bit Large Language Model (LLM) called BitNet b1.58. The model is based on the BitNet architecture, which uses a Transformer that replaces nn.Linear with BitLinear. BitNet b1.58 has 1.58-bit weights and 8-bit activations and is trained from scratch. The model introduces a new computation paradigm that requires almost no multiplication operations for matrix multiplication, resulting in energy savings and faster computation. BitNet b1.58 has a much lower memory footprint than full-precision models, reducing the cost and time of loading weights from DRAM.\n\nThe study compares BitNet b1.58 with a full-precision baseline, LLaMA LLM, and shows that BitNet b1.58 can match the performance of the full precision baseline starting from a 3B size. The memory and latency cost of BitNet b1.58 is lower than LLaMA LLM, demonstrating that BitNet b1.58 is a Pareto improvement over the state-of-the-art LLM models.\n\nThe study further scales up the model size to 7B, 