## Load a LLM

In [1]:
# ! pip -q install llama-index llama-cpp-python pypdf qdrant_client fastembed

In [2]:
from llama_index.llms import LlamaCPP
from llama_index.llms.llama_utils import (
    messages_to_prompt,
    completion_to_prompt,
)

In [3]:
# Innitiate our LLM using Llama CPP

llm = LlamaCPP(model_path = "./models/openhermes-2.5-neural-chat-7b-v3-1-7b.Q5_K_M.gguf", 
                temperature= 0.001, 
                max_new_tokens= 3900, 
                context_window= 3900,
                messages_to_prompt=messages_to_prompt,
               completion_to_prompt=completion_to_prompt,
               verbose=False) 

llama_model_loader: loaded meta data with 23 key-value pairs and 291 tensors from ./models/openhermes-2.5-neural-chat-7b-v3-1-7b.Q5_K_M.gguf (version GGUF V3 (latest))
llama_model_loader: - tensor    0:                token_embd.weight q5_K     [  4096, 32000,     1,     1 ]
llama_model_loader: - tensor    1:           blk.0.attn_norm.weight f32      [  4096,     1,     1,     1 ]
llama_model_loader: - tensor    2:            blk.0.ffn_down.weight q6_K     [ 14336,  4096,     1,     1 ]
llama_model_loader: - tensor    3:            blk.0.ffn_gate.weight q5_K     [  4096, 14336,     1,     1 ]
llama_model_loader: - tensor    4:              blk.0.ffn_up.weight q5_K     [  4096, 14336,     1,     1 ]
llama_model_loader: - tensor    5:            blk.0.ffn_norm.weight f32      [  4096,     1,     1,     1 ]
llama_model_loader: - tensor    6:              blk.0.attn_k.weight q5_K     [  4096,  1024,     1,     1 ]
llama_model_loader: - tensor    7:         blk.0.attn_output.weight q5_K    

In [4]:
# infering from the database

ans = llm.complete("what is most popular thing about Bangalore in india ? give shot and direct answer in 10 words")
print( ans.text )



Bangalore's popularity mainly lies in its thriving IT industry, vibrant culture, and pleasant climate.


### We can use the stream_complete endpoint to stream the response as it’s being generated rather than waiting for the entire response to be generated.

In [5]:
response_iter = llm.stream_complete("what is most popular thing about Bangalore in india ? give shot and direct answer in 10 words")

for response in response_iter:
    print(response.delta, end="", flush=True)



Bangalore's popularity mainly lies in its thriving IT industry, vibrant culture, and pleasant climate.

## Saving Data to Storage

#### As we are using Qdrant, A production ready Vector DB, we will use Embedding model "sentence-transformers/all-MiniLM-L6-v2" and use the library 'fastembed'

#### 1. Load Embedding Models

In [6]:
from llama_index.embeddings import FastEmbedEmbedding

# load our embedding model
embeddings = FastEmbedEmbedding(model_name="BAAI/bge-small-en-v1.5")

100%|█████████████████████████████████████| 76.7M/76.7M [00:06<00:00, 12.2MiB/s]


#### 2. Create a service context

In [7]:
from llama_index import ServiceContext, OpenAIEmbedding, PromptHelper 
from llama_index.text_splitter import SentenceSplitter
from llama_index import set_global_service_context



# Text Splitter
text_splitter = SentenceSplitter(chunk_size=500, chunk_overlap=20)

# prompt helper
prompt_helper = PromptHelper(
    context_window=4000,
    num_output=4000,
    chunk_size_limit=500,
)


# Service Context
service_context = ServiceContext.from_defaults(
    llm=llm,
    embed_model=embeddings,
    text_splitter=text_splitter,
    prompt_helper=prompt_helper)

set_global_service_context(service_context)

#### 3. Load Data from Directory

In [8]:
# load pdf document 

from llama_index import SimpleDirectoryReader, VectorStoreIndex

document = SimpleDirectoryReader(input_dir= "./documents/").load_data()

#### 4. Create a storage context

In [9]:
import qdrant_client

client = qdrant_client.QdrantClient( location=":memory:" ) # you can use :memory: mode for fast and light-weight experiments,

In [10]:
from llama_index.vector_stores.qdrant import QdrantVectorStore
from llama_index.storage.storage_context import StorageContext

vector_store = QdrantVectorStore(client=client, collection_name="paul_graham")

storage_context = StorageContext.from_defaults(vector_store=vector_store)

index = VectorStoreIndex.from_documents( document, storage_context=storage_context, service_context=service_context)

In [11]:
query_engine = index.as_query_engine()

response = query_engine.query( "What is IIM policy on leave ?")

ValueError: Calculated available context size -78 was not non-negative.