In [None]:
import torch

from llama_index.core import SimpleDirectoryReader, VectorStoreIndex
from llama_index.llms.llama_cpp import LlamaCPP
from llama_index.llms.llama_cpp.llama_utils import (
    messages_to_prompt,
    completion_to_prompt,
)
import os
from llama_index.core.response.notebook_utils import display_response


llm = LlamaCPP(
    model_url="https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGUF/resolve/main/llama-2-7b-chat.Q2_K.gguf",
    temperature=0.1,
    max_new_tokens=100,
    # llama2 has a context window of 4096 tokens, but we set it lower to allow for some wiggle room
    context_window=4096,
    # kwargs to pass to __call__()
    generate_kwargs={},
    # kwargs to pass to __init__()
    # set to at least 1 to use GPU
    model_kwargs={"n_gpu_layers": -1},
    # transform inputs into Llama2 format
    messages_to_prompt=messages_to_prompt,
    completion_to_prompt=completion_to_prompt,
    verbose=True,
)

Downloading url https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGUF/resolve/main/llama-2-7b-chat.Q2_K.gguf to path /tmp/llama_index/models/llama-2-7b-chat.Q2_K.gguf
total size (MB): 2825.94


2696it [15:09,  2.96it/s]                                                                                                                                      
llama_model_loader: loaded meta data with 19 key-value pairs and 291 tensors from /tmp/llama_index/models/llama-2-7b-chat.Q2_K.gguf (version GGUF V2)
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = LLaMA v2
llama_model_loader: - kv   2:                       llama.context_length u32              = 4096
llama_model_loader: - kv   3:                     llama.embedding_length u32              = 4096
llama_model_loader: - kv   4:                          llama.block_count u32              = 32
llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 11008
llama_

In [28]:
response_iter = llm.stream_complete("Hello")
for response in response_iter:
    print(response.delta, end="", flush=True)

 I can provide you with detailed information about the Great Wall of China, including its historical significance, construction methods, and current state. The Great Wall was built over several dynasties and served as a defensive structure against invasions. It stretentreasures from 250e6 BC to 1644 AD.

The wall is made up of various materials such as tamped earth, bricks, wood, and even reused building fragments. The Great Wall was built using three primary construction methods: through arches, battlements, and sectors. Through arches are the open sections between towers or fortresses along the wall. Battlements were used in areas where there were no gates or watchtowers, serving as a defensive barrier against invaders. Sectors were the individual parts of the wall that had their own unique features and purposes.

Today, much of the Great Wall is well-preserved and can be visited by tourists in certain areas. However, some sections are less accessible or even lost to time. The most f


llama_print_timings:        load time =   29370.69 ms
llama_print_timings:      sample time =      90.95 ms /   256 runs   (    0.36 ms per token,  2814.83 tokens per second)
llama_print_timings: prompt eval time =   29362.59 ms /    66 tokens (  444.89 ms per token,     2.25 tokens per second)
llama_print_timings:        eval time =   56673.59 ms /   255 runs   (  222.25 ms per token,     4.50 tokens per second)
llama_print_timings:       total time =   88232.08 ms /   321 tokens


In [10]:
from llama_index.readers.json import JSONReader

# Initialize JSONReader
reader = JSONReader()

# Load data from JSON file
documents = reader.load_data(input_file="data.json", extra_info={})

In [6]:
!pip3 install llama-index-readers-json

Defaulting to user installation because normal site-packages is not writeable
Collecting llama-index-readers-json
  Using cached llama_index_readers_json-0.1.5-py3-none-any.whl (3.3 kB)
Installing collected packages: llama-index-readers-json
Successfully installed llama-index-readers-json-0.1.5


In [7]:
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core import Settings, SimpleDirectoryReader, VectorStoreIndex
from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.postprocessor import SimilarityPostprocessor

In [29]:
Settings.embed_model = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")
Settings.llm = llm



In [11]:
index = VectorStoreIndex.from_documents(documents)

In [17]:
top_k = 3

retriever = VectorIndexRetriever(
    index=index,
    similarity_top_k=1,
)

In [23]:
from llama_index.core import get_response_synthesizer

# assemble query engine
query_engine = RetrieverQueryEngine(
    retriever=retriever,
    node_postprocessors=[SimilarityPostprocessor(similarity_cutoff=0.5)],
)


In [31]:
response = query_engine.query("Hello")
display_response(response)

Llama.generate: prefix-match hit

llama_print_timings:        load time =    3835.20 ms
llama_print_timings:      sample time =      64.11 ms /   256 runs   (    0.25 ms per token,  3992.89 tokens per second)
llama_print_timings: prompt eval time =       0.00 ms /     0 tokens (    -nan ms per token,     -nan tokens per second)
llama_print_timings:        eval time =   23074.40 ms /   256 runs   (   90.13 ms per token,    11.09 tokens per second)
llama_print_timings:       total time =   23884.91 ms /   256 tokens


**`Final Response:`** ttitedtthe ttinst,m
"
t
t
tmin,
tment,
tformered
tinstedin,mind as as as thema,mand as as the the them,minstsminst (tinstsinst,sent,the
tiinst,inst,the,the:M:M,msh for,forment,instmminminst.instmingmentminst,inst,minst,mind,minstedinst,inst,inst,inst,they,inst,inst,inst,inst,inst, and as,inst,all,inst,as as,the,inst,inst as theinst,instinstinstinstinstinst,inst,inst,inst,Minstinstinst
inst,inst,m,inst,inst,inst,inst,min,inst,instmentm.instmind,inst,inst.inst,inst,inst,inst,inst,inst as as,inst as as as as the,the as as as as,inst, as as as the insts and the theinst,inst,inst,inst,instinstinstinstininstinstinstinstinstinstinstinstinst