In [1]:
from langchain.document_loaders import PyMuPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.llms import LlamaCpp
from langchain.chains import RetrievalQA

In [2]:
# Load the source PDF; the loaded documents are kept in PDF_data for the
# splitting step that follows.
PDF_PATH = "Virtual_characters.pdf"

loader = PyMuPDFLoader(PDF_PATH)
PDF_data = loader.load()

In [3]:
# Break the loaded PDF documents into small, slightly overlapping chunks.
CHUNK_SIZE = 100    # chunk_size handed to the splitter
CHUNK_OVERLAP = 5   # chunk_overlap handed to the splitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE,
    chunk_overlap=CHUNK_OVERLAP,
)
all_splits = text_splitter.split_documents(PDF_data)

In [4]:
# Embed the chunks and store them in a Chroma collection.
# Supplying a persist_directory stores the embeddings on disk.
persist_directory = 'db'
model_name = "sentence-transformers/all-MiniLM-L6-v2"
model_kwargs = {'device': 'cpu'}
embedding = HuggingFaceEmbeddings(model_name=model_name,
                                  model_kwargs=model_kwargs)

# Give every chunk a stable id (source + position in all_splits). Without
# explicit ids a fresh random id is generated on every run, so re-running this
# cell against the same persist_directory stores every chunk again and the
# retriever starts returning duplicates. With stable ids a re-run overwrites
# the entries that are already on disk instead of appending new ones.
chunk_ids = [
    f"{doc.metadata.get('source', 'document')}-{position}"
    for position, doc in enumerate(all_splits)
]

vectordb = Chroma.from_documents(
    documents=all_splits,
    embedding=embedding,
    ids=chunk_ids,
    persist_directory=persist_directory,
)

In [5]:
from langchain.callbacks.manager import CallbackManager
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from langchain_community.llms import LlamaCpp

# Load the local GGUF model through llama.cpp. Generated tokens are streamed
# to stdout by the callback handler while the model is producing them.
MODEL_PATH = "llama-2_q4.gguf"
stream_to_stdout = CallbackManager([StreamingStdOutCallbackHandler()])

llama_settings = dict(
    model_path=MODEL_PATH,
    n_gpu_layers=100,   # presumably "offload every layer" — confirm for the target GPU
    n_batch=512,
    n_ctx=2048,
    f16_kv=True,
    callback_manager=stream_to_stdout,
    verbose=True,       # prints the llama.cpp loader log and timing lines
)
llm = LlamaCpp(**llama_settings)

llama_model_loader: loaded meta data with 16 key-value pairs and 291 tensors from llama-2_q4.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = llama
llama_model_loader: - kv   1:                               general.name str              = models
llama_model_loader: - kv   2:                       llama.context_length u32              = 2048
llama_model_loader: - kv   3:                     llama.embedding_length u32              = 4096
llama_model_loader: - kv   4:                          llama.block_count u32              = 32
llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 11008
llama_model_loader: - kv   6:                 llama.rope.dimension_count u32              = 128
llama_model_loader: - kv   7:                 llama.attention.head_count u32              = 32
ll

In [6]:
from langchain.chains import LLMChain
from langchain.chains.prompt_selector import ConditionalPromptSelector
from langchain.prompts import PromptTemplate

# Prompt texts. The whitespace inside them is part of what is sent to the
# model, so it is spelled out explicitly rather than hidden in a multi-line
# literal.
# NOTE(review): the Llama 2 chat format usually nests the <<SYS>> block inside
# [INST] ... [/INST]; the text is kept exactly as it was so the prompt does not
# change — confirm whether that layout is intended.
_LLAMA_TEMPLATE = (
    "<<SYS>> \n"
    "    You are a helpful assistant eager to assist with providing better Google search results.\n"
    "    <</SYS>> \n"
    "    \n"
    "    [INST] Provide an answer to the following question in 150 words. Ensure that the answer is informative, "
    "            relevant, and concise:\n"
    "            {question} \n"
    "    [/INST]"
)

_GENERIC_TEMPLATE = (
    "You are a helpful assistant eager to assist with providing better Google search results. "
    "        Provide an answer to the following question in about 150 words. Ensure that the answer is informative, "
    "        relevant, and concise: "
    "        {question}"
)


def _uses_llama_cpp(model):
    """Return True when `model` is a LlamaCpp instance."""
    return isinstance(model, LlamaCpp)


# Fallback prompt for any model that is not matched by a conditional below.
DEFAULT_SEARCH_PROMPT = PromptTemplate(
    template=_GENERIC_TEMPLATE,
    input_variables=["question"],
)

# Prompt with Llama-style <<SYS>> / [INST] markers, used for LlamaCpp models.
DEFAULT_LLAMA_SEARCH_PROMPT = PromptTemplate(
    template=_LLAMA_TEMPLATE,
    input_variables=["question"],
)

QUESTION_PROMPT_SELECTOR = ConditionalPromptSelector(
    default_prompt=DEFAULT_SEARCH_PROMPT,
    conditionals=[(_uses_llama_cpp, DEFAULT_LLAMA_SEARCH_PROMPT)],
)

# Pick the prompt that matches the loaded model and display it.
prompt = QUESTION_PROMPT_SELECTOR.get_prompt(llm)
prompt

PromptTemplate(input_variables=['question'], template='<<SYS>> \n    You are a helpful assistant eager to assist with providing better Google search results.\n    <</SYS>> \n    \n    [INST] Provide an answer to the following question in 150 words. Ensure that the answer is informative,             relevant, and concise:\n            {question} \n    [/INST]')

In [7]:
# Ask the model a question directly (no retrieval) through the selected prompt.
question = "What is Taiwan known for?"
llm_chain = LLMChain(llm=llm, prompt=prompt)
llm_chain.invoke({"question": question})

  Taiwan is known for its vibrant culture, rich history, and stunning natural beauty. Some of its notable attractions include the bustling Night Markets, where visitors can sample local street food and buy unique souvenirs; the ancient city of Lukang, with its well-preserved traditional architecture; and the beautiful Taroko National Park, featuring marble cliffs, waterfalls, and hiking trails. Taiwan is also famous for its delicious cuisine, including dishes like beef noodle soup, oyster omelets, and bubble tea. Additionally, Taiwan has a thriving tech industry and is home to many world-renowned brands, making it a hub of innovation and technology.


llama_print_timings:        load time =    5102.15 ms
llama_print_timings:      sample time =      49.11 ms /   159 runs   (    0.31 ms per token,  3237.96 tokens per second)
llama_print_timings: prompt eval time =    5102.02 ms /    84 tokens (   60.74 ms per token,    16.46 tokens per second)
llama_print_timings:        eval time =   22931.21 ms /   158 runs   (  145.13 ms per token,     6.89 tokens per second)
llama_print_timings:       total time =   28534.12 ms /   242 tokens


{'question': 'What is Taiwan known for?',
 'text': '  Taiwan is known for its vibrant culture, rich history, and stunning natural beauty. Some of its notable attractions include the bustling Night Markets, where visitors can sample local street food and buy unique souvenirs; the ancient city of Lukang, with its well-preserved traditional architecture; and the beautiful Taroko National Park, featuring marble cliffs, waterfalls, and hiking trails. Taiwan is also famous for its delicious cuisine, including dishes like beef noodle soup, oyster omelets, and bubble tea. Additionally, Taiwan has a thriving tech industry and is home to many world-renowned brands, making it a hub of innovation and technology.'}

In [8]:
# Build the question-answering chain over the PDF: the retriever is backed by
# the Chroma store and the local Llama model produces the answer.
retriever = vectordb.as_retriever()

qa = RetrievalQA.from_chain_type(
    llm=llm,
    retriever=retriever,
    chain_type="stuff",
    verbose=True,
)

In [10]:
# Ask a question through the retrieval chain and display the returned result.
query = "Tell me about Alison Hawk's career and age"
answer = qa.invoke(query)
answer



[1m> Entering new RetrievalQA chain...[0m
 Based

Llama.generate: prefix-match hit


 on the given context, Alison Hawk is a 28-year-old female researcher.
[1m> Finished chain.[0m



llama_print_timings:        load time =    5102.15 ms
llama_print_timings:      sample time =       6.77 ms /    24 runs   (    0.28 ms per token,  3543.48 tokens per second)
llama_print_timings: prompt eval time =       0.00 ms /     1 tokens (    0.00 ms per token,      inf tokens per second)
llama_print_timings:        eval time =    3174.29 ms /    24 runs   (  132.26 ms per token,     7.56 tokens per second)
llama_print_timings:       total time =    3245.13 ms /    25 tokens


{'query': "Tell me about Alison Hawk's career and age",
 'result': ' Based on the given context, Alison Hawk is a 28-year-old female researcher.'}