<a href="https://colab.research.google.com/github/winterForestStump/thesis/blob/main/notebooks/rag_x_phi3_financebenchQA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%%capture --no-stderr
# Install the LangChain packages, the Chroma vector store client and the
# embedding / reranking libraries used below. Regular output is captured by
# %%capture; --no-stderr leaves stderr visible so install errors still show.
# NOTE(review): no versions are pinned, so a fresh run may resolve different
# releases than the ones this notebook was recorded with.
%pip install langchain-nomic langchain langchain-core langchain-community chromadb --quiet
%pip install sentence_transformers FlagEmbedding --quiet

In [2]:
# Build llama-cpp-python from source with the cuBLAS (CUDA) backend switched on
# so that LlamaCpp can offload the model to the Colab GPU.
# NOTE(review): the version is not pinned (the recorded run resolved 0.2.78);
# newer llama.cpp releases may use a different CMake flag name than
# LLAMA_CUBLAS -- confirm before re-running on a fresh runtime.
!CMAKE_ARGS="-DLLAMA_CUBLAS=on" pip install llama-cpp-python

Collecting llama-cpp-python
  Downloading llama_cpp_python-0.2.78.tar.gz (50.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.2/50.2 MB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Installing backend dependencies ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting diskcache>=5.6.1 (from llama-cpp-python)
  Downloading diskcache-5.6.3-py3-none-any.whl (45 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.5/45.5 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
Building wheels for collected packages: llama-cpp-python
  Building wheel for llama-cpp-python (pyproject.toml) ... [?25l[?25hdone
  Created wheel for llama-cpp-python: filename=llama_cpp_python-0.2.78-cp310-cp310-linux_x86_64.whl size=169130810 sha256=172100963ace4c30c4e8ac939346a319299a8f682788c03a1dbf91fc644f7cc3
  Stored in direct

In [3]:
# Mount Google Drive at /content/drive. The Chroma database, the parent-document
# store and the evaluation output written later in this notebook all live under
# /content/drive/MyDrive/Thesis.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
from langchain_community.llms import LlamaCpp
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from langchain_core.prompts import PromptTemplate

import chromadb
from langchain.storage.file_system import LocalFileStore
from langchain.storage._lc_store import create_kv_docstore
from langchain.vectorstores import Chroma

from FlagEmbedding import FlagReranker

from langchain_core.output_parsers import JsonOutputParser
from langchain_core.output_parsers import StrOutputParser

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.retrievers import ParentDocumentRetriever

from tqdm import tqdm
import pandas as pd
import os

In [5]:
# Download the fp16 GGUF build of Phi-3-mini-4k-instruct into ./models as a
# real file (symlinks disabled); the LlamaCpp cell below loads it from there.
!huggingface-cli download microsoft/Phi-3-mini-4k-instruct-gguf Phi-3-mini-4k-instruct-fp16.gguf --local-dir ./models --local-dir-use-symlinks False

Downloading 'Phi-3-mini-4k-instruct-fp16.gguf' to 'models/.huggingface/download/Phi-3-mini-4k-instruct-fp16.gguf.5d99003e395775659b0dde3f941d88ff378b2837a8dc3a2ea94222ab1420fad3.incomplete'
Phi-3-mini-4k-instruct-fp16.gguf: 100% 7.64G/7.64G [01:00<00:00, 127MB/s]
Download complete. Moving file to models/Phi-3-mini-4k-instruct-fp16.gguf
models/Phi-3-mini-4k-instruct-fp16.gguf


In [6]:
# Runtime settings for the local Phi-3 model, shared by every chain below:
#   TEMP    -- sampling temperature
#   N_CTX   -- context window in tokens (same value the loader reports as
#              phi3.context_length)
#   N_GPU_L -- number of layers to offload to the GPU
#              (presumably -1 means "all layers" in llama-cpp-python -- confirm)
TEMP, N_CTX, N_GPU_L = 0, 4096, -1

# Single LlamaCpp instance reused for metadata extraction, answer generation
# and both graders. verbose=True makes llama.cpp print its loader and timing logs.
llm_phi3 = LlamaCpp(
    verbose=True,
    n_gpu_layers=N_GPU_L,
    n_ctx=N_CTX,
    temperature=TEMP,
    model_path="/content/models/Phi-3-mini-4k-instruct-fp16.gguf",
)

llama_model_loader: loaded meta data with 23 key-value pairs and 195 tensors from /content/models/Phi-3-mini-4k-instruct-fp16.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = phi3
llama_model_loader: - kv   1:                               general.name str              = Phi3
llama_model_loader: - kv   2:                        phi3.context_length u32              = 4096
llama_model_loader: - kv   3:                      phi3.embedding_length u32              = 3072
llama_model_loader: - kv   4:                   phi3.feed_forward_length u32              = 8192
llama_model_loader: - kv   5:                           phi3.block_count u32              = 32
llama_model_loader: - kv   6:                  phi3.attention.head_count u32              = 32
llama_model_loader: - kv   7:               phi3.attention.head_count

In [27]:
# FinanceBench open-source question set, read straight from GitHub as JSON Lines
# (one question record per line).
FINANCEBENCH_URL = 'https://raw.githubusercontent.com/patronus-ai/financebench/main/data/financebench_open_source.jsonl'

questions = pd.read_json(FINANCEBENCH_URL, lines=True)
# Column / null-count overview of the loaded frame.
questions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 11 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   financebench_id       150 non-null    object
 1   company               150 non-null    object
 2   doc_name              150 non-null    object
 3   question_type         150 non-null    object
 4   question_reasoning    100 non-null    object
 5   domain_question_num   50 non-null     object
 6   question              150 non-null    object
 7   answer                150 non-null    object
 8   justification         100 non-null    object
 9   dataset_subset_label  150 non-null    object
 10  evidence              150 non-null    object
dtypes: object(11)
memory usage: 13.0+ KB


In [8]:
# Embedding model used to query the Chroma collection.
model_name = "BAAI/bge-small-en-v1.5"
# Normalised embeddings, so similarity is computed as cosine similarity.
encode_kwargs = dict(normalize_embeddings=True)

bge_embeddings = HuggingFaceBgeEmbeddings(
    encode_kwargs=encode_kwargs,
    model_kwargs=dict(device='cuda'),  # run the encoder on the GPU
    model_name=model_name,
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/94.8k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/743 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/133M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [65]:
# Cross-encoder used to re-score the retrieved (question, passage) pairs in the
# evaluation loop. use_fp16=True speeds up computation with a slight
# performance degradation.
reranker = FlagReranker('BAAI/bge-reranker-large', use_fp16=True)

tokenizer_config.json:   0%|          | 0.00/443 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/279 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/801 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/2.24G [00:00<?, ?B/s]

In [10]:
# Re-open the previously built retrieval index from Google Drive:
#   * a persistent Chroma collection ("reports_l2") that is searched by embedding, and
#   * a file-backed key-value docstore that ParentDocumentRetriever reads the
#     parent chunks from.
CHROMA_DIR = '/content/drive/MyDrive/Thesis/chromadb'

persistent_client = chromadb.PersistentClient(CHROMA_DIR)
collection = persistent_client.get_or_create_collection("reports_l2")
fs = LocalFileStore('/content/drive/MyDrive/Thesis/reports_store_location')
store = create_kv_docstore(fs)
vectorstore = Chroma(client=persistent_client,
                     collection_name="reports_l2",
                     embedding_function=bge_embeddings,
                     persist_directory=CHROMA_DIR)
# No vectorstore.persist() here: with a chromadb PersistentClient writes are
# persisted automatically, and the LangChain method is deprecated (calling it
# only emits a deprecation warning).

  warn_deprecated(


In [11]:
# Collect the distinct company names stored in the collection's metadata; this
# list is handed to the LLM so it can map a question to the stored spelling.
# Only the metadata is requested -- the document texts are not needed here.
metadata = vectorstore.get(include=['metadatas'])['metadatas']

# Sorted so the list (and therefore the prompt built from it) is identical on
# every run; a bare set() has no stable order across kernel sessions.
# Entries without metadata or without a 'company' key are skipped instead of
# raising KeyError.
metadata_list = sorted(
    {entry['company'] for entry in metadata if entry and 'company' in entry},
    key=str,
)

Use the LLM to resolve, for each question, the company name as it is spelled in the vector-store metadata:

In [24]:
### Metadata company name
# Prompt that asks the model to pick, from the list of company names found in
# the vector store ({metadata_list}), the spelling that matches the company
# mentioned in the user's question ({input}). The model is told to reply with
# a JSON object holding a single 'company' key and not to answer the question.
# NOTE(review): the instruction turn is opened with the <|assistant|> tag
# rather than a system/user tag -- confirm this matches the chat template the
# Phi-3 model expects.
prompt_metadata = PromptTemplate(
template="""
  <|assistant|> You need to identify the correct spelling of companies from the metadata list which are mentioned in the users input.
  Format your response as a JSON object with only a single key 'company', without any additional commentary or explanations. Do not try to answer the question itself.<|end|>
  <|user|>Database metadata list with company names: {metadata_list}. Users input: {input}.<|end|>
  <|assistant|>
""",
input_variables=["input", "metadata_list"])

# Chain: fill the prompt -> run Phi-3 -> parse the completion as JSON.
retrieval_metadata = prompt_metadata | llm_phi3 | JsonOutputParser()

In [28]:
import pandas as pd
from tqdm import tqdm
from json import JSONDecodeError
# Import the exception from the module that defines it rather than from the
# JSON parser module, which only happens to re-export it.
from langchain_core.exceptions import OutputParserException

# Ask the LLM, question by question, which stored company name the question
# refers to, and record the parsed JSON answer in questions['correct_name'].
# A failed call does not stop the run: the cell stores a marker string naming
# the failure instead, so problematic rows can be inspected afterwards.

# Create the target column only if it is missing, so the cell can be re-run.
if 'correct_name' not in questions.columns:
    questions['correct_name'] = None

for i in tqdm(range(len(questions))):
    try:
        company = retrieval_metadata.invoke({
            "input": questions.at[i, 'question'],
            "metadata_list": metadata_list,
        })
        questions.at[i, 'correct_name'] = company
    except JSONDecodeError:
        # The completion was not valid JSON.
        questions.at[i, 'correct_name'] = "JSONDecodeError"
    except OutputParserException:
        # The JSON output parser rejected the completion.
        questions.at[i, 'correct_name'] = "OutputParserException"
    except Exception as e:
        # Anything else: keep the message so the row can be diagnosed later.
        questions.at[i, 'correct_name'] = f"Exception: {str(e)}"

  0%|          | 0/150 [00:00<?, ?it/s]Llama.generate: prefix-match hit

llama_print_timings:        load time =     521.11 ms
llama_print_timings:      sample time =      29.95 ms /    52 runs   (    0.58 ms per token,  1736.34 tokens per second)
llama_print_timings: prompt eval time =     283.08 ms /    47 tokens (    6.02 ms per token,   166.03 tokens per second)
llama_print_timings:        eval time =    1897.67 ms /    51 runs   (   37.21 ms per token,    26.87 tokens per second)
llama_print_timings:       total time =    2254.82 ms /    98 tokens
  1%|          | 1/150 [00:02<05:38,  2.27s/it]Llama.generate: prefix-match hit

llama_print_timings:        load time =     521.11 ms
llama_print_timings:      sample time =      54.22 ms /    91 runs   (    0.60 ms per token,  1678.44 tokens per second)
llama_print_timings: prompt eval time =     284.42 ms /    56 tokens (    5.08 ms per token,   196.89 tokens per second)
llama_print_timings:        eval time =    3360.72 ms /    90 r

In [59]:
# Manually curated company names (spelled exactly as in the vector-store
# metadata) used as the retrieval filter for each question.
# Each entry is (start, stop, company): a half-open range of row POSITIONS in
# `questions`, so the table relies on the benchmark file's row order.
COMPANY_ROW_SPANS = [
    (0, 8, '3M CO'),
    (8, 10, 'Activision Blizzard, Inc.'),
    (10, 15, 'ADOBE INC.'),
    (15, 18, 'AES CORP'),
    (18, 21, 'AMAZON COM INC'),
    (21, 30, 'Amcor plc'),
    (30, 38, 'ADVANCED MICRO DEVICES INC'),
    (38, 45, 'AMERICAN EXPRESS CO'),
    (45, 48, 'American Water Works Company, Inc.'),
    (48, 56, 'BEST BUY CO INC'),
    (56, 59, 'Square, Inc.'),
    (59, 67, 'BOEING CO'),
    (67, 70, 'COCA COLA CO'),
    (70, 74, 'CORNING INC /NY'),
    (74, 75, 'COSTCO WHOLESALE CORP /NEW'),
    (75, 79, 'CVS HEALTH Corp'),
    (79, 81, 'FOOT LOCKER, INC.'),
    (81, 85, 'GENERAL MILLS INC'),
    (85, 94, 'JOHNSON & JOHNSON'),
    (94, 99, 'JPMORGAN CHASE & CO'),
    (99, 100, 'Kraft Heinz Co'),
    (100, 103, 'LOCKHEED MARTIN CORP'),
    (103, 110, 'MGM Resorts International'),
    (110, 112, 'MICROSOFT CORP'),
    (112, 114, 'NETFLIX INC'),
    (114, 118, 'NIKE, Inc.'),
    (118, 119, 'PayPal Holdings, Inc.'),
    (119, 130, 'PEPSICO INC'),
    (130, 136, 'PFIZER INC'),
    (136, 142, 'Ulta Beauty, Inc.'),
    (142, 147, 'VERIZON COMMUNICATIONS INC'),
    (147, 150, 'Walmart Inc.'),
]

questions['names_for_filter'] = None
# Write through the frame itself with .iloc. The chained form
# questions[col][a:b] = value raises SettingWithCopyWarning and silently does
# nothing once pandas Copy-on-Write is active.
filter_col = questions.columns.get_loc('names_for_filter')
for start, stop, company_name in COMPANY_ROW_SPANS:
    questions.iloc[start:stop, filter_col] = company_name

# Every question needs a filter value; a gap means the span table no longer
# matches the loaded benchmark file.
unlabelled_rows = questions.index[questions['names_for_filter'].isna()].tolist()
assert not unlabelled_rows, f"No company filter assigned for rows: {unlabelled_rows}"

In [None]:
#questions.to_json(f'/content/drive/MyDrive/Thesis/rag_evaluation/financebench150/questions_metadata_names_retrieved.json')

In [61]:
### Generate
# Answer generation reuses the single Phi-3 instance created above.
llm_generate = llm_phi3

# Question-answering prompt: {question} is the benchmark question and
# {documents} is the retrieved context supplied by the caller. The model is
# told to answer concisely and to say so when it does not know the answer.
# NOTE(review): the instruction turn is opened with the <|assistant|> tag
# rather than a system/user tag -- confirm this matches the chat template the
# Phi-3 model expects.
prompt_generate = PromptTemplate(
    template="""<|assistant|> You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question.
    If you don't know the answer, just say that you don't know. Keep the answer concise <|end|>
    <|user|> Question: {question}. \n Context: {documents} \n Answer: <|end|>
    <|assistant|>""",
    input_variables=["question", "documents"],
)

# Chain: fill the prompt -> run Phi-3 -> return the completion as a plain string.
rag_chain = prompt_generate | llm_generate | StrOutputParser()

In [62]:
### Hallucination Grader
# The grader is the same Phi-3 instance that generates the answers.
llm_hallucination_grader = llm_phi3

# Prompt
# Asks for a binary 'yes'/'no' verdict on whether the generated answer
# ({generation}) is grounded in / supported by the retrieved facts ({documents}).
# The raw completion is returned as text; nothing here enforces that it is
# exactly 'yes' or 'no'.
# NOTE(review): the instruction turn is opened with the <|assistant|> tag
# rather than a system/user tag -- confirm this matches the chat template the
# Phi-3 model expects.
prompt_hallucination_grader = PromptTemplate(
    template=""" <|assistant|> You are a grader assessing whether an answer is grounded in / supported by a set of facts.
    Give a binary 'yes' or 'no' score to indicate whether the answer is grounded in / supported by a set of facts.<|end|>
    <|user|> Here are the facts: {documents} \n Here is the answer: {generation}  <|end|>
    <|assistant|>""",
    input_variables=["generation", "documents"],
)

# Chain: fill the prompt -> run Phi-3 -> return the completion as a plain string.
hallucination_grader = prompt_hallucination_grader | llm_hallucination_grader | StrOutputParser()

In [63]:
### Answer Grader
# The grader is the same Phi-3 instance that generates the answers.
llm_answer_grader = llm_phi3

# Prompt
# Asks for a binary 'yes'/'no' verdict on whether the generated answer
# ({generation}) equals or contains the benchmark's reference answer
# ({ground_truth}). The raw completion is returned as text; nothing here
# enforces that it is exactly 'yes' or 'no'.
# NOTE(review): the instruction turn is opened with the <|assistant|> tag
# rather than a system/user tag -- confirm this matches the chat template the
# Phi-3 model expects.
prompt_answer_grader = PromptTemplate(
    template="""<|assistant|> You are a grader assessing whether a generated answer is correct or incorrect, comparing a generated answer with the ground truth.
    Give a binary score 'yes' or 'no' to indicate whether the generated answer equal to or contains the ground truth.<|end|>
    <|user|> Here is the answer: {generation} \n Here is the ground truth: {ground_truth} <|end|>
    <|assistant|>""",
    input_variables=["generation", "ground_truth"],
)

# Chain: fill the prompt -> run Phi-3 -> return the completion as a plain string.
answer_grader = prompt_answer_grader | llm_answer_grader | StrOutputParser()

In [66]:
# End-to-end evaluation over the benchmark questions:
#   1. retrieve parent chunks restricted to the question's company,
#   2. rerank them with the cross-encoder and keep the best N_DOCS_RETURN,
#   3. generate an answer from that context,
#   4. grade the answer for groundedness and against the reference answer,
#   5. collect one result row per question and save the table to Drive.
NUM_PAR_CHUNKS = 20  # 'k' passed to the vector-store search for each question
N_DOCS_RETURN = 2    # reranked passages handed to the LLM as context

parent_splitter = RecursiveCharacterTextSplitter(chunk_size=2000)
child_splitter = RecursiveCharacterTextSplitter(chunk_size=256)

RESULT_COLUMNS = ['question', 'response', 'context', 'hallucination_grade',
                  'answer_grade', 'ground_truth', 'evidence']


def _top_reranked_passages(query, passages, top_n):
    """Return the text of the `top_n` passages the reranker scores highest.

    Args:
        query: the question text.
        passages: retrieved documents exposing `.page_content`.
        top_n: maximum number of passage texts to return.

    Returns:
        A list of passage texts ordered from highest to lowest reranker score.
    """
    pairs = [[query, passage.page_content] for passage in passages]
    scores = reranker.compute_score(pairs)
    # For a single pair the reranker hands back one bare score instead of a
    # list, which would break zip(); normalise both cases to a list.
    try:
        scores = list(scores)
    except TypeError:
        scores = [scores]
    ranked = sorted(zip(pairs, scores), key=lambda item: item[1], reverse=True)
    return [pair[1] for pair, _score in ranked[:top_n]]


# One plain dict per question; the DataFrame is built once after the loop
# instead of creating and concatenating a one-row frame per question.
results_list = []

for i in tqdm(range(len(questions))):
    query = questions.at[i, 'question']
    company = questions.at[i, 'names_for_filter']
    ground_truth = questions.at[i, 'answer']
    evidence = questions.at[i, 'evidence']

    # The metadata filter differs per question, so the retriever is rebuilt
    # with that question's company in its search arguments.
    big_chunks_retriever = ParentDocumentRetriever(
        vectorstore=vectorstore,
        docstore=store,
        child_splitter=child_splitter,
        parent_splitter=parent_splitter,
        search_kwargs={'filter': {'company': company}, 'k': NUM_PAR_CHUNKS},
    )
    passage = big_chunks_retriever.invoke(query)

    if not passage:
        # Nothing retrieved for this company: record the skip in every output
        # field so the row stays in the results and can be found later.
        error_message = f"Skipping question {i} due to empty texts list"
        print(error_message)
        results_list.append({
            'question': query,
            'response': error_message,
            'context': error_message,
            'hallucination_grade': error_message,
            'answer_grade': error_message,
            'ground_truth': ground_truth,
            'evidence': evidence,
        })
        continue

    docs = _top_reranked_passages(query, passage, N_DOCS_RETURN)
    # Give the prompts the passages as plain text. Interpolating the list
    # itself would insert its Python repr (brackets, quotes, escaped newlines).
    # NOTE: this changes the prompt text, so scores may differ from earlier
    # runs that passed the raw list.
    context_text = "\n\n".join(docs)

    generation = rag_chain.invoke({"documents": context_text, "question": query})
    hallucination_grade = hallucination_grader.invoke({"documents": context_text, "generation": generation})
    answer_grade = answer_grader.invoke({"ground_truth": ground_truth, "generation": generation})

    results_list.append({
        'question': query,
        'response': generation,
        'context': docs,  # kept as a list of passages in the saved results
        'hallucination_grade': hallucination_grade,
        'answer_grade': answer_grade,
        'ground_truth': ground_truth,
        'evidence': evidence,
    })

results = pd.DataFrame(results_list, columns=RESULT_COLUMNS)
results.to_json('/content/drive/MyDrive/Thesis/rag_evaluation/financebench150/eval_v2.json')

  0%|          | 0/150 [00:00<?, ?it/s]Llama.generate: prefix-match hit

llama_print_timings:        load time =     521.11 ms
llama_print_timings:      sample time =      92.70 ms /   122 runs   (    0.76 ms per token,  1316.00 tokens per second)
llama_print_timings: prompt eval time =    6796.71 ms /  1166 tokens (    5.83 ms per token,   171.55 tokens per second)
llama_print_timings:        eval time =    4852.11 ms /   121 runs   (   40.10 ms per token,    24.94 tokens per second)
llama_print_timings:       total time =   11887.78 ms /  1287 tokens
Llama.generate: prefix-match hit

llama_print_timings:        load time =     521.11 ms
llama_print_timings:      sample time =       1.09 ms /     2 runs   (    0.55 ms per token,  1828.15 tokens per second)
llama_print_timings: prompt eval time =    6922.78 ms /  1248 tokens (    5.55 ms per token,   180.27 tokens per second)
llama_print_timings:        eval time =      78.27 ms /     2 runs   (   39.13 ms per token,    25.55 tokens p

In [67]:
# Sanity check: the results table should hold one row per benchmark question
# (150 questions were loaded above).
len(results)

150