# Try with your Data

In [2]:
import os
import langchain
import textwrap
import warnings

In [3]:
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains import create_retrieval_chain
from langchain_community.llms import LlamaCpp
from langchain_core.callbacks import StreamingStdOutCallbackHandler
from langchain_core.documents import Document
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import PromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_qdrant import Qdrant
from langchain_huggingface import HuggingFaceEmbeddings

In [4]:
from llama_cpp import Llama
from scipy import spatial
from qdrant_client import QdrantClient

In [5]:
from ssec_tutorials import OLMO_MODEL, QDRANT_PATH, QDRANT_COLLECTION_NAME, download_qdrant_data

In [6]:
warnings.filterwarnings('ignore')

## Load OLMo

In [9]:
olmo = LlamaCpp(
    model_path=str(OLMO_MODEL),
    # TODO: What should be the other parameters here? Uncomment the next cell and run it. 
)

llama_model_loader: loaded meta data with 22 key-value pairs and 226 tensors from /Users/a42/.cache/ssec_tutorials/OLMo-7B-Instruct-Q4_K_M.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = olmo
llama_model_loader: - kv   1:                               general.name str              = OLMo-7B-Instruct-hf
llama_model_loader: - kv   2:                           olmo.block_count u32              = 32
llama_model_loader: - kv   3:                        olmo.context_length u32              = 2048
llama_model_loader: - kv   4:                      olmo.embedding_length u32              = 4096
llama_model_loader: - kv   5:                   olmo.feed_forward_length u32              = 11008
llama_model_loader: - kv   6:                  olmo.attention.head_count u32              = 32
llama_model_loader: - kv   7:           

In [10]:
# Uncomment this line to understand your available options. 
#LlamaCpp?

## Data Setup

Use notebooks from the Appendix folder to set up your data pipeline. You can use csv, pdfs, etc. as your data source. 

Note: if you're running things on Codespace, [refer to this link](https://stackoverflow.com/questions/62284623/how-can-i-upload-a-file-to-a-github-codespaces-environment) and upload your data to `resources/data` folder.  

In [12]:
# Add your code here.

In [13]:
# Ensure you can retrieve relevant context based on your prompt. 
# context = retriever.invoke("What's my relevant question?"))
# or 
# Load your data into the context variable
# context = <Add context from your data file>

## Prompt Engineering

In [15]:
# Create a prompt template using OLMo's tokenizer chat template we saw in module 1.
prompt_template = PromptTemplate.from_template(
    template=olmo.client.metadata['tokenizer.chat_template'], 
    template_format="jinja2",
    partial_variables={"add_generation_prompt": True, "eos_token": "<|endoftext|>"},
)

In [16]:
# Test the prompt you want to send to OLMo.

question = "What's the question?"
context = "Use context from above"

# Add prompt instructions
prompt_template.format(
    messages=[
        {
            "role": "user", 
            "content": f"""<Add your prompt instructions>:

            Context: {context}
            
            Question: {question}"""
        }
    ]
)

"<|endoftext|>\n\n<|user|>\n<Add your prompt instructions>:\n\n            Context: Use context from above\n            \n            Question: What's the question?\n\n\n<|assistant|>\n\n"

## RAG

In [17]:
# Chain the prompt template and olmo
llm_chain = prompt_template | olmo

In [18]:
question = "What's the question?"
context = "Use the context from above"

# Invoke the chain with a question and other parameters. 
llm_chain.invoke(
    {
        "messages":
            [{
                "role": "user", 
                "content": f"""<Add your prompt instructions>:
    
                Context: {context}
                
                Question: {question}"""
            }
        ], 
    },
    config={
        'callbacks' : [StreamingStdOutCallbackHandler()]
    }
)

 To provide the correct answer, I need a clear question based on the context provided. In the given text, no specific question is mentioned. If you could provide a specific question related to the context, I would be happy to help with that query.


llama_print_timings:        load time =    3706.46 ms
llama_print_timings:      sample time =       7.86 ms /    51 runs   (    0.15 ms per token,  6488.55 tokens per second)
llama_print_timings: prompt eval time =    5155.86 ms /    44 tokens (  117.18 ms per token,     8.53 tokens per second)
llama_print_timings:        eval time =    2631.83 ms /    50 runs   (   52.64 ms per token,    19.00 tokens per second)
llama_print_timings:       total time =    7829.53 ms /    94 tokens


' To provide the correct answer, I need a clear question based on the context provided. In the given text, no specific question is mentioned. If you could provide a specific question related to the context, I would be happy to help with that query.'