<a href="https://colab.research.google.com/github/winterForestStump/thesis/blob/main/notebooks/rag_x_phi3_generalQA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%%capture --no-stderr
# Install the LangChain stack, the Chroma vector store, and the embedding / reranking libraries.
# `%%capture --no-stderr` captures this cell's stdout (pip's progress output) while leaving stderr visible.
# Only the second install line passes -U, so only those packages are force-upgraded.
# NOTE(review): no versions are pinned, so a fresh runtime may resolve different releases — consider pinning.
%pip install langchain-nomic langchain langchain-core langchain-community --quiet
%pip install -U tiktoken langchainhub chromadb langgraph tavily-python langchain-text-splitters
%pip install sentence_transformers FlagEmbedding --quiet

In [2]:
# LlamaCpp x GPU usage
# CMAKE_ARGS is set in the shell environment of the pip command so the
# llama-cpp-python build receives -DLLAMA_CUBLAS=on.
# NOTE(review): the package version is not pinned, and the name of this CMake flag
# may differ between llama-cpp-python releases — confirm against the installed version.
!CMAKE_ARGS="-DLLAMA_CUBLAS=on" pip install llama-cpp-python



In [3]:
# Mounting Google Drive
# The Chroma index, the parent-document store and the evaluation outputs
# referenced by later cells all live under this mount point.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [4]:
from langchain_community.llms import LlamaCpp
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from langchain_core.prompts import PromptTemplate

import chromadb
from langchain.storage.file_system import LocalFileStore
from langchain.storage._lc_store import create_kv_docstore
from langchain.vectorstores import Chroma

from FlagEmbedding import FlagReranker

from langchain_core.output_parsers import JsonOutputParser

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.retrievers import ParentDocumentRetriever
from langchain_core.output_parsers import StrOutputParser

from tqdm import tqdm
import pandas as pd
import os

In [5]:
# Download the fp16 GGUF file of Phi-3-mini-4k-instruct into ./models (relative to the current working directory).
# The LlamaCpp cell loads the model from /content/models/, so this assumes the working directory is /content — TODO confirm.
!huggingface-cli download microsoft/Phi-3-mini-4k-instruct-gguf Phi-3-mini-4k-instruct-fp16.gguf --local-dir ./models --local-dir-use-symlinks False

Downloading 'Phi-3-mini-4k-instruct-fp16.gguf' to 'models/.huggingface/download/Phi-3-mini-4k-instruct-fp16.gguf.5d99003e395775659b0dde3f941d88ff378b2837a8dc3a2ea94222ab1420fad3.incomplete'
Phi-3-mini-4k-instruct-fp16.gguf: 100% 7.64G/7.64G [01:21<00:00, 93.6MB/s]
Download complete. Moving file to models/Phi-3-mini-4k-instruct-fp16.gguf
models/Phi-3-mini-4k-instruct-fp16.gguf


In [5]:
# Settings for the single Phi-3 instance that every chain in this notebook shares.
TEMP = 0        # sampling temperature
N_CTX = 4096    # context window size handed to llama.cpp
N_GPU_L = -1    # n_gpu_layers value; -1 presumably offloads all layers — confirm

MODEL_PATH = "/content/models/Phi-3-mini-4k-instruct-fp16.gguf"

# verbose=True makes llama.cpp print model metadata and per-call timings.
llm_phi3 = LlamaCpp(
    model_path=MODEL_PATH,
    n_ctx=N_CTX,
    n_gpu_layers=N_GPU_L,
    temperature=TEMP,
    verbose=True,
)

llama_model_loader: loaded meta data with 23 key-value pairs and 195 tensors from /content/models/Phi-3-mini-4k-instruct-fp16.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = phi3
llama_model_loader: - kv   1:                               general.name str              = Phi3
llama_model_loader: - kv   2:                        phi3.context_length u32              = 4096
llama_model_loader: - kv   3:                      phi3.embedding_length u32              = 3072
llama_model_loader: - kv   4:                   phi3.feed_forward_length u32              = 8192
llama_model_loader: - kv   5:                           phi3.block_count u32              = 32
llama_model_loader: - kv   6:                  phi3.attention.head_count u32              = 32
llama_model_loader: - kv   7:               phi3.attention.head_count

In [6]:
# Load the evaluation questions; each line of the text file becomes one row
# of the single 'question' column.
QUESTIONS_URL = "https://raw.githubusercontent.com/winterForestStump/thesis/main/questions/questions_ver2.txt"

# Without explicit colspecs, read_fwf infers column boundaries from whitespace
# alignment in the first rows, which could split free-text questions into
# several fields if the lines happen to share a blank column. The open-ended
# spec (0, None) keeps every line as one field.
questions = pd.read_fwf(QUESTIONS_URL, colspecs=[(0, None)], names=['question'])
questions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 35 entries, 0 to 34
Data columns (total 1 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   question  35 non-null     object
dtypes: object(1)
memory usage: 408.0+ bytes


In [7]:
# Embedding model handed to the Chroma vector store as its embedding function.
model_name = "BAAI/bge-small-en-v1.5"

# set True to compute cosine similarity
encode_kwargs = dict(normalize_embeddings=True)

bge_embeddings = HuggingFaceBgeEmbeddings(
    encode_kwargs=encode_kwargs,
    model_kwargs=dict(device='cuda'),  # gpu
    model_name=model_name,
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [8]:
# Reranker used in the evaluation loop to score [query, passage] pairs.
# Setting use_fp16 to True speeds up computation with a slight performance degradation
reranker = FlagReranker(
    'BAAI/bge-reranker-large',
    use_fp16=True,
)

In [9]:
### Metadata company name
# Prompt asking the model to map a free-form company mention onto one of the
# company names found in the vector-store metadata; the reply is requested as
# a JSON object with the single key 'company'.
METADATA_TEMPLATE = """
  <|assistant|> You are tasked with identifying the correct spelling of the company name mentioned in the user's input by searching through the list of fixed company names in the database metadata.
  This precise spelling will be crucial for SQL filtering purposes. \n
  Provide a concise response containing only the correct company name. \n
  Please format your response as a JSON object with only a single key 'company', WITHOUT any additional commentary. <|end|>
  <|user|> Database metadata with company names: \n\n {metadata_list} \n\n User question: {name_of_the_company} <|end|>
  <|assistant|>
"""

prompt_metadata = PromptTemplate(
    input_variables=["name_of_the_company", "metadata_list"],
    template=METADATA_TEMPLATE,
)

# prompt -> Phi-3 -> parsed JSON
retrieval_metadata = prompt_metadata | llm_phi3 | JsonOutputParser()

In [10]:
# Open the persisted Chroma index and the parent-document store on Google Drive.
CHROMA_DIR = '/content/drive/MyDrive/Thesis/chromadb'
DOCSTORE_DIR = '/content/drive/MyDrive/Thesis/reports_store_location'
COLLECTION_NAME = "reports_l2"

persistent_client = chromadb.PersistentClient(CHROMA_DIR)
# Make sure the collection exists before LangChain wraps it.
collection = persistent_client.get_or_create_collection(COLLECTION_NAME)

# File-backed key-value store, later used as the docstore of the ParentDocumentRetriever.
fs = LocalFileStore(DOCSTORE_DIR)
store = create_kv_docstore(fs)

vectorstore = Chroma(
    client=persistent_client,
    collection_name=COLLECTION_NAME,
    embedding_function=bge_embeddings,
    persist_directory=CHROMA_DIR,
)
# The former vectorstore.persist() call was dropped: it is deprecated (it only
# produced a deprecation warning here), and this cell writes no documents that
# would need persisting.

  warn_deprecated(


In [11]:
# Collect the distinct company names stored in the vector-store metadata.
metadata = vectorstore.get()['metadatas']

# sorted() instead of list(set(...)): set iteration order for strings changes
# between kernel sessions, which would change the text of the company-name
# prompt from run to run. Sorting keeps the prompt reproducible.
metadata_list = sorted({entry['company'] for entry in metadata})

In [12]:
### Retrieval Grader
# Asks the model for a yes/no judgement on whether the retrieved document is
# relevant to the question. Expects the inputs `question` and `document`.
RETRIEVAL_GRADER_TEMPLATE = """<|assistant|> You are a grader assessing relevance of a retrieved document to a user question.
    If the document contains information related to the user question, grade it as relevant. The goal is to filter out erroneous retrievals. \n
    Give a binary score 'yes' or 'no' score to indicate whether the document is relevant to the question.<|end|>
    <|user|> Here is the retrieved document: {document}\n Here is the user question: {question} <|end|>
    <|assistant|>
    """

llm_retrieval = llm_phi3

prompt_retrieval_grader = PromptTemplate(
    input_variables=["question", "document"],
    template=RETRIEVAL_GRADER_TEMPLATE,
)

# prompt -> Phi-3 -> raw string
retrieval_grader = prompt_retrieval_grader | llm_retrieval | StrOutputParser()

In [13]:
### Generate
# Answer-generation chain. The template declares the input variables
# `question` and `documents`, so callers must supply exactly those keys.
GENERATE_TEMPLATE = """<|assistant|> You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question.
    If you don't know the answer, just say that you don't know. Keep the answer concise <|end|>
    <|user|> Question: {question}. \n Context: {documents} \n Answer: <|end|>
    <|assistant|>"""

llm_generate = llm_phi3

prompt_generate = PromptTemplate(
    input_variables=["question", "documents"],
    template=GENERATE_TEMPLATE,
)

# prompt -> Phi-3 -> raw string
rag_chain = prompt_generate | llm_generate | StrOutputParser()

In [14]:
### Hallucination Grader
# Asks the model for a yes/no judgement on whether the generated answer is
# supported by the supplied documents. Expects `generation` and `documents`.
HALLUCINATION_GRADER_TEMPLATE = """ <|assistant|> You are a grader assessing whether an answer is grounded in / supported by a set of facts.
    Give a binary 'yes' or 'no' score to indicate whether the answer is grounded in / supported by a set of facts.<|end|>
    <|user|> Here are the facts: {documents} \n Here is the answer: {generation}  <|end|>
    <|assistant|>"""

llm_hallucination_grader = llm_phi3

# Prompt
prompt_hallucination_grader = PromptTemplate(
    input_variables=["generation", "documents"],
    template=HALLUCINATION_GRADER_TEMPLATE,
)

# prompt -> Phi-3 -> raw string
hallucination_grader = prompt_hallucination_grader | llm_hallucination_grader | StrOutputParser()

In [15]:
### Answer Grader
# Asks the model for a yes/no judgement on whether the generated answer is
# useful for the question. Expects `generation` and `question`.
ANSWER_GRADER_TEMPLATE = """<|assistant|> You are a grader assessing whether an answer is useful to resolve a question.
    Give a binary score 'yes' or 'no' to indicate whether the answer is useful to resolve a question.<|end|>
    <|user|> Here is the answer: {generation} \n Here is the question: {question} <|end|>
    <|assistant|>"""

llm_answer_grader = llm_phi3

# Prompt
prompt_answer_grader = PromptTemplate(
    input_variables=["generation", "question"],
    template=ANSWER_GRADER_TEMPLATE,
)

# prompt -> Phi-3 -> raw string
answer_grader = prompt_answer_grader | llm_answer_grader | StrOutputParser()

In [22]:
# Companies evaluated in this run. Commented-out entries are skipped
# (presumably already processed in an earlier session — confirm).
# NOTE(review): 'locheed martin' looks misspelled; the evaluation loop sends each
# name through retrieval_metadata to look up the stored spelling, so it may be deliberate.
company_names = [
    # 'coca cola', 'nike', '3M', 'amazon',
    # 'adobe',
    # 'amd',
    # 'bestbuy',
    'jpmorgan',
    'locheed martin',
    'microsoft',
    'paypal',
    'verizon',
    'walmart',
]

In [23]:
# Evaluation run: for every company and every question, retrieve parent chunks,
# rerank them, generate an answer and grade it, then write one JSON file per company.
NUM_PAR_CHUNKS = 20   # parent chunks requested from the retriever per question
N_DOCS_RETURN = 2     # top reranked chunks passed on to the LLM chains

EVAL_DIR = '/content/drive/MyDrive/Thesis/rag_evaluation/bge-reranker_x_phi3-4k'


def _top_reranked_docs(query, passages, top_n):
  """Return the texts of the `top_n` passages the reranker scores highest for `query`.

  Args:
    query: the question text.
    passages: documents returned by the retriever (each exposing `.page_content`).
    top_n: maximum number of passage texts to return.

  Returns:
    A list of passage strings, best first; empty when `passages` is empty.
  """
  if not passages:
    # Nothing matched the company filter, so there is nothing to score.
    return []
  pairs = [[query, passage.page_content] for passage in passages]
  scores = reranker.compute_score(pairs)
  # NOTE(review): some FlagEmbedding releases return a bare number for a single
  # pair; wrap it so the zip below always sees a sequence — confirm for the installed version.
  if not hasattr(scores, '__len__'):
    scores = [scores]
  ranked = sorted(zip(pairs, scores), key=lambda item: item[1], reverse=True)
  return [pair[1] for pair, _ in ranked[:top_n]]


# The splitters do not depend on the company or the question, so build them once.
parent_splitter = RecursiveCharacterTextSplitter(chunk_size=2000)
child_splitter = RecursiveCharacterTextSplitter(chunk_size=256)

for company_name in company_names:
  # Resolve the stored spelling once per company; previously this LLM call was
  # repeated for every question although its inputs never change inside the loop.
  company = retrieval_metadata.invoke({"name_of_the_company": company_name, "metadata_list": metadata_list})
  company_key = company['company']

  big_chunks_retriever = ParentDocumentRetriever(
    vectorstore=vectorstore, docstore=store, child_splitter=child_splitter, parent_splitter=parent_splitter,
    search_kwargs={'filter': {'company': company_key}, 'k': NUM_PAR_CHUNKS})

  # Reset per company so each output file holds only that company's rows
  # (previously the list kept growing across companies).
  results_list = []

  for query in tqdm(questions['question'], desc=company_name):
    passage = big_chunks_retriever.invoke(query)
    docs = _top_reranked_docs(query, passage, N_DOCS_RETURN)

    retrieval_grade = retrieval_grader.invoke({"question": query, "document": docs})
    # prompt_generate declares the variable `documents`; the former "context"
    # key left that variable unfilled.
    generation = rag_chain.invoke({"documents": docs, "question": query})
    hallucination_grade = hallucination_grader.invoke({"documents": docs, "generation": generation})
    answer_grade = answer_grader.invoke({"question": query, "generation": generation})

    results_list.append({
      'question': query,
      'response': generation,
      'context': docs,
      'retrieval_grade': retrieval_grade,
      'hallucination_grade': hallucination_grade,
      'answer_grade': answer_grade,
    })

  # Build the frame once from the collected records instead of concatenating
  # one-row DataFrames.
  results = pd.DataFrame(results_list)
  # Use the resolved company name in the file name; interpolating the parsed
  # dict itself produced names like eval_{'company': ...}.json.
  results.to_json(f'{EVAL_DIR}/eval_{company_key}.json')

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
llama_print_timings: prompt eval time =    4543.61 ms /   856 tokens (    5.31 ms per token,   188.40 tokens per second)
llama_print_timings:        eval time =      80.58 ms /     2 runs   (   40.29 ms per token,    24.82 tokens per second)
llama_print_timings:       total time =    4637.03 ms /   858 tokens
Llama.generate: prefix-match hit

llama_print_timings:        load time =     463.94 ms
llama_print_timings:      sample time =      57.33 ms /   104 runs   (    0.55 ms per token,  1814.19 tokens per second)
llama_print_timings: prompt eval time =    4416.88 ms /   824 tokens (    5.36 ms per token,   186.56 tokens per second)
llama_print_timings:        eval time =    4105.65 ms /   103 runs   (   39.86 ms per token,    25.09 tokens per second)
llama_print_timings:       total time =    8651.83 ms /   927 tokens
Llama.generate: prefix-match hit

llama_print_timings:        load time =     463.94 ms
llama_print_timi