In [None]:
!pip3 install sentence-transformers
!pip3 install llama-index
!pip3 install llama-index-readers-json

In [None]:
!pip3 install llama-index-embeddings-huggingface
!pip3 install llama-index-llms-llama-cpp

In [33]:
#download data.json

!curl -f -o data.json https://devapi.beyondchats.com/api/get_message_with_sources

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 46685    0 46685    0     0  37402      0 --:--:--  0:00:01 --:--:-- 37407


In [23]:
from llama_index.core import SimpleDirectoryReader, VectorStoreIndex
from llama_index.llms.llama_cpp import LlamaCPP
from llama_index.llms.llama_cpp.llama_utils import (
    messages_to_prompt,
    completion_to_prompt,
)
import os
from llama_index.core.response.notebook_utils import display_response


# Location of the GGUF weights for the Llama-2-7B chat model (Q2_K variant,
# per the file name in the URL).
MODEL_URL = "https://huggingface.co/TheBloke/Llama-2-7B-Chat-GGUF/resolve/main/llama-2-7b-chat.Q2_K.gguf"

# Local llama.cpp-backed LLM used for both the smoke test and the query engine.
llm = LlamaCPP(
    model_url=MODEL_URL,
    # Sampling / length settings.
    temperature=0.1,
    max_new_tokens=512,
    context_window=4096,
    # No extra arguments are forwarded at generation time.
    generate_kwargs={},
    # NOTE(review): -1 presumably asks llama.cpp to offload every layer to the
    # GPU -- confirm against the installed llama-cpp-python version.
    model_kwargs={"n_gpu_layers": -1},
    # Prompt formatters imported from llama_utils above.
    messages_to_prompt=messages_to_prompt,
    completion_to_prompt=completion_to_prompt,
    # Verbose mode is on; the loader/timing logs appear in the cell output.
    verbose=True,
)

llama_model_loader: loaded meta data with 19 key-value pairs and 291 tensors from /tmp/llama_index/models/llama-2-7b-chat.Q2_K.gguf (version GGUF V2)
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = LLaMA v2
llama_model_loader: - kv   2:                       llama.context_length u32              = 4096
llama_model_loader: - kv   3:                     llama.embedding_length u32              = 4096
llama_model_loader: - kv   4:                          llama.block_count u32              = 32
llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 11008
llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
llama_model_loader: - kv   7:                 llama.attention.head_cou

In [4]:
# Smoke-test the model: stream a completion for a trivial prompt and echo each
# incremental piece as soon as it arrives (no newline between pieces).
response_iter = llm.stream_complete("Hello")
for chunk in response_iter:
    print(chunk.delta, end="", flush=True)

  Hello! I'm here to assist you in any way I can. Please provide the specific instructions or questions you have, and I will do my best to help.


llama_print_timings:        load time =   26398.82 ms
llama_print_timings:      sample time =      20.45 ms /    36 runs   (    0.57 ms per token,  1760.13 tokens per second)
llama_print_timings: prompt eval time =   26398.60 ms /    66 tokens (  399.98 ms per token,     2.50 tokens per second)
llama_print_timings:        eval time =   21003.97 ms /    35 runs   (  600.11 ms per token,     1.67 tokens per second)
llama_print_timings:       total time =   47602.11 ms /   101 tokens


In [5]:
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.core import Settings, SimpleDirectoryReader, VectorStoreIndex
from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.postprocessor import SimilarityPostprocessor

In [34]:
from llama_index.readers.json import JSONReader

# Reader that turns a JSON file into llama-index documents.
reader = JSONReader()

# Parse the downloaded data.json; extra_info={} passes an empty metadata dict.
documents = reader.load_data(
    input_file="data.json",
    extra_info={},
)


In [35]:
# Global llama-index configuration shared by the index and query engine below.
# The three assignments are independent of one another.
Settings.llm = llm
Settings.chunk_size = 256  # presumably measured in tokens -- TODO confirm
Settings.embed_model = HuggingFaceEmbedding(
    model_name="BAAI/bge-small-en-v1.5",
)

In [36]:
# Build the vector index over the loaded documents.
# NOTE(review): presumably picks up the embed model / chunk size from Settings.
index = VectorStoreIndex.from_documents(documents=documents)

In [37]:
# Retriever over the index that returns only the single best-matching node.
retriever = VectorIndexRetriever(similarity_top_k=1, index=index)

In [38]:
from llama_index.core import get_response_synthesizer

# Synthesizer configured for streaming, so answers can be consumed piece by piece.
synth = get_response_synthesizer(streaming=True)

# Query engine = retriever -> similarity filter (cutoff 0.5) -> streaming synthesizer.
query_engine = RetrieverQueryEngine(
    response_synthesizer=synth,
    node_postprocessors=[
        SimilarityPostprocessor(similarity_cutoff=0.5),
    ],
    retriever=retriever,
)


In [12]:
# Ask the engine about IVF pricing; the streamed answer is printed in the next cell.
response_stream = query_engine.query(
    "what is the average cost of a regular IVF cycle?"
)

Llama.generate: prefix-match hit


In [13]:
# Stream the answer to stdout. print_response_stream() prints each piece as it
# arrives and also caches the full text on the response object, so re-running
# this cell (or calling str(response_stream) later) still shows the answer
# instead of silently printing nothing from an exhausted generator.
response_stream.print_response_stream()
print()  # terminate the streamed line

 Based on the provided context information, the average cost of a regular IVF (in vitro fertilization) cycle is approximately Rs 2 to 4 lakhs (around $2,700 to $4,80


llama_print_timings:        load time =   26398.82 ms
llama_print_timings:      sample time =      28.04 ms /    50 runs   (    0.56 ms per token,  1783.23 tokens per second)
llama_print_timings: prompt eval time =  115377.55 ms /   279 tokens (  413.54 ms per token,     2.42 tokens per second)
llama_print_timings:        eval time =   30495.33 ms /    49 runs   (  622.35 ms per token,     1.61 tokens per second)
llama_print_timings:       total time =  561251.00 ms /   328 tokens





In [16]:
# Ask the engine about online delivery; the streamed answer is printed in the next cell.
response_stream = query_engine.query(
    "do you offer online delivery?"
)

Llama.generate: prefix-match hit


In [17]:
# Stream the answer to stdout. print_response_stream() prints each piece as it
# arrives and also caches the full text on the response object, so re-running
# this cell (or calling str(response_stream) later) still shows the answer
# instead of silently printing nothing from an exhausted generator.
response_stream.print_response_stream()
print()  # terminate the streamed line

 Yes, we offer online delivery services through major platforms like Swiggy and Zomato. You can also order directly from our website!


llama_print_timings:        load time =   26398.82 ms
llama_print_timings:      sample time =      17.52 ms /    30 runs   (    0.58 ms per token,  1712.23 tokens per second)
llama_print_timings: prompt eval time =  112976.94 ms /   283 tokens (  399.21 ms per token,     2.50 tokens per second)
llama_print_timings:        eval time =   17837.00 ms /    29 runs   (  615.07 ms per token,     1.63 tokens per second)
llama_print_timings:       total time =  166135.36 ms /   312 tokens





In [18]:
# Ask the engine how to sign up; the streamed answer is printed in the next cell.
response_stream = query_engine.query(
    "How can I sign up?"
)

Llama.generate: prefix-match hit


In [19]:
# Stream the answer to stdout. print_response_stream() prints each piece as it
# arrives and also caches the full text on the response object, so re-running
# this cell (or calling str(response_stream) later) still shows the answer
# instead of silently printing nothing from an exhausted generator.
response_stream.print_response_stream()
print()  # terminate the streamed line

 Based on the provided context information, it appears that you are interested in signing up for a program related to evidence log recording. To sign up, you can follow these steps:
1. Carry your log with you or keep notebooks


llama_print_timings:        load time =   26398.82 ms
llama_print_timings:      sample time =      29.96 ms /    50 runs   (    0.60 ms per token,  1668.67 tokens per second)
llama_print_timings: prompt eval time =  109232.92 ms /   251 tokens (  435.19 ms per token,     2.30 tokens per second)
llama_print_timings:        eval time =   30782.38 ms /    49 runs   (  628.21 ms per token,     1.59 tokens per second)
llama_print_timings:       total time =  252703.61 ms /   300 tokens





In [26]:
# Ask the engine about counseling methods; the streamed answer is printed in the next cell.
response_stream = query_engine.query(
    "What Method of Counseling do you offer?"
)

Llama.generate: prefix-match hit


In [27]:
# Stream the answer to stdout. print_response_stream() prints each piece as it
# arrives and also caches the full text on the response object, so re-running
# this cell (or calling str(response_stream) later) still shows the answer
# instead of silently printing nothing from an exhausted generator.
response_stream.print_response_stream()
print()  # terminate the streamed line

 Based on the provided context information, MindWorks Buddy offers various methods of counseling, including:
1. Online Counseling: You can connect with their expert psychologists through their website and receive immediate support.
2


llama_print_timings:        load time =   26398.82 ms
llama_print_timings:      sample time =      28.94 ms /    50 runs   (    0.58 ms per token,  1728.01 tokens per second)
llama_print_timings: prompt eval time =       0.00 ms /     0 tokens (    -nan ms per token,     -nan tokens per second)
llama_print_timings:        eval time =   31750.60 ms /    50 runs   (  635.01 ms per token,     1.57 tokens per second)
llama_print_timings:       total time =   37115.83 ms /    50 tokens





In [39]:
# Ask the engine about the first session; the streamed answer is printed in the next cell.
response_stream = query_engine.query(
    "What happens in the first session?"
)

In [40]:
# Stream the answer to stdout. print_response_stream() prints each piece as it
# arrives and also caches the full text on the response object, so re-running
# this cell (or calling str(response_stream) later) still shows the answer
# instead of silently printing nothing from an exhausted generator.
response_stream.print_response_stream()
print()  # terminate the streamed line

 Based on the given context information, it appears that the individual is seeking guidance on how to book a counseling session with a chosen counselor. The process involves logging into a website, selecting the counselor of choice, choosing a date and time, and scheduling the session. Once the session is booked, the chosen counselor will contact the individual and provide a Google link for the video session.
In terms of the first session, it is likely that the counselor will ask the individual questions to get to know them better and understand their reasons for seeking counseling. The counselor may also provide guidance on how to manage any challenges or issues that the individual is facing, and offer support and resources as needed. The exact content of the first session will depend on the individual's specific needs and goals, as well as the counselor's area of expertise.


llama_print_timings:        load time =  145097.58 ms
llama_print_timings:      sample time =     105.69 ms /   188 runs   (    0.56 ms per token,  1778.72 tokens per second)
llama_print_timings: prompt eval time =  145096.98 ms /   354 tokens (  409.88 ms per token,     2.44 tokens per second)
llama_print_timings:        eval time =  117244.48 ms /   187 runs   (  626.98 ms per token,     1.59 tokens per second)
llama_print_timings:       total time =  293084.37 ms /   541 tokens



