# RAG

In [None]:
from get_Sentence import querySentence
from chunk_Split import GetChunk
from sentence_Embedding import Embedding
import textwrap
from sentence_transformers import SentenceTransformer, util
from time import perf_counter as timer
import torch

file_path = 'data/DemoData.pdf'
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)
# extract sentences into a list
text = querySentence(file_path)
list_of_sentence = text.extractSentence()
print(list_of_sentence)
# split doc into chunks
chunks = GetChunk(list_of_sentence)
chunk = chunks.splitChunks()

# embedding chunks
embedded = Embedding(chunk)
embeddedChunk, pages_and_chunks = embedded.EmbedModel()
print("Embedded chunks: \n", embeddedChunk)

def print_wrapped(text, wrap_length=80):
    wrapped_text = textwrap.fill(text, wrap_length)
    print(wrapped_text)

def get_RAG():
    # query
    query = input('Input something to find: ')

    # 2. Embed the query to the same numerical space as the text examples
    embedding_model = SentenceTransformer(model_name_or_path="all-mpnet-base-v2", device=device) # choose the device to load the model to (note: GPU will often be *much* faster than CPU)
    query_embedding = embedding_model.encode(query, convert_to_tensor=True)

    # 3. Get similarity scores with the dot product (we'll time this for fun)
    start_time = timer()
    dot_scores = util.dot_score(a=query_embedding, b=embeddedChunk)[0]
    end_time = timer()
    
    rag_output = ""

    rag_output += f"Time take to get scores on {len(embeddedChunk)} embeddings: {end_time-start_time:.5f} seconds. \n"

    # 4. Get the top-k results (3)
    top_results_dot_product = torch.topk(dot_scores, k=3)

    # print("Results:")
    # Loop through zipped together scores and indicies from torch.topk
    assistant_messages = ""
    for score, idx in zip(top_results_dot_product[0], top_results_dot_product[1]):
        rag_output += f"Score: {score:.4f}\n"
        # Print relevant sentence chunk (since the scores are in descending order, the most relevant chunk will be first)
        rag_output += "Text: \n"
        rag_output += f"""{pages_and_chunks[idx]["sentence_chunk"]} \n"""
        # Print the page number too so we can reference the textbook further (and check the results)
        rag_output += f"Page number: {pages_and_chunks[idx]['page_number']} \n"
        assistant_messages += pages_and_chunks[idx]["sentence_chunk"]

    return query, assistant_messages, rag_output



  from .autonotebook import tqdm as notebook_tqdm


Chunk token count: 10.0 , Text: 7. How are trusted contexts used in DB2?
Embedded chunks: 
 tensor([[ 0.0116, -0.0120, -0.0648,  ...,  0.0090, -0.0040, -0.0219],
        [-0.0058, -0.0187, -0.0417,  ..., -0.0197, -0.0039, -0.0260],
        [ 0.0169, -0.0510, -0.0189,  ..., -0.0379, -0.0407, -0.0381],
        ...,
        [ 0.0420,  0.0210, -0.0310,  ...,  0.0171, -0.0196, -0.0173],
        [-0.0002, -0.0139, -0.0047,  ...,  0.0003, -0.0442, -0.0021],
        [ 0.0204, -0.0233, -0.0428,  ..., -0.0158,  0.0206, -0.0344]],
       device='cuda:0')


# LLM (Gemini)

In [3]:
# LLM model
import google.generativeai as genai
from dotenv import load_dotenv
import os

load_dotenv()
api_key = os.getenv("Gemini_API")

genai.configure(api_key=api_key)
model = genai.GenerativeModel("gemini-1.5-flash")
query, assistant_messages, rag_output = get_RAG()

message = f"""
            "You are an AI assistant that answers questions.",
            "You are a very honest person, never make up answer from what you do not know. If you do not know the answer, say exactly I DO NOT KNOW!",
            "content": {query}

"""

response = model.generate_content(message).text

if bool("I DO NOT KNOW!" in response):
    message += assistant_messages
    rag_response = model.generate_content(message).text
    print_wrapped(rag_output)
    print("__________________________________LLM answer______________________________________")
    print_wrapped(rag_response)
else:
    print_wrapped(response)

RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cpu and cuda:0! (when checking argument for argument mat2 in method wrapper_CUDA_mm)

: 