<a href="https://colab.research.google.com/github/winterForestStump/thesis/blob/main/notebooks/noRag_x_phi3_financebenchQA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
%%capture --no-stderr
# Install the LangChain base packages used by this notebook.
# The cell output is captured so the pip log does not clutter the notebook;
# `--no-stderr` is passed so that error output is not hidden with it.
# NOTE(review): versions are not pinned, so a fresh run may pull newer,
# incompatible releases — consider pinning once the working versions are known.
%pip install langchain langchain-core

In [5]:
%%capture --no-stderr
# Install langchain-community, which provides the LlamaCpp wrapper imported below.
# NOTE(review): version is not pinned — consider pinning for reproducibility.
%pip install langchain-community --quiet

In [2]:
# LlamaCpp x GPU usage
# Build llama-cpp-python from source with CUDA support.
# - The version is pinned to the release this notebook was originally run with
#   (see the recorded pip output), because the CMake option names changed in
#   later releases and an unpinned install may fail or silently build CPU-only.
# - `LLAMA_CUDA` replaces the deprecated `LLAMA_CUBLAS` option for this release.
!CMAKE_ARGS="-DLLAMA_CUDA=on" pip install llama-cpp-python==0.2.79

Collecting llama-cpp-python
  Downloading llama_cpp_python-0.2.79.tar.gz (50.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.3/50.3 MB[0m [31m16.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Installing backend dependencies ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting diskcache>=5.6.1 (from llama-cpp-python)
  Downloading diskcache-5.6.3-py3-none-any.whl (45 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.5/45.5 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
Building wheels for collected packages: llama-cpp-python
  Building wheel for llama-cpp-python (pyproject.toml) ... [?25l[?25hdone
  Created wheel for llama-cpp-python: filename=llama_cpp_python-0.2.79-cp310-cp310-linux_x86_64.whl size=172368897 sha256=4afe777ab116d11a6b44b4b1094dcd0f5f59ad6fcf2958bd48c0a5ec7617a3da
  Stored in direc

In [3]:
# Attach Google Drive to the Colab filesystem; the evaluation results are
# written to a folder below this mount point at the end of the notebook.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [6]:
from pathlib import Path

import pandas as pd
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from langchain_community.llms import LlamaCpp
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import PromptTemplate
from tqdm import tqdm

In [7]:
# Download the fp16 GGUF file of microsoft/Phi-3-mini-4k-instruct-gguf into ./models.
# The model-loading cell reads it from /content/models/, so this cell must be
# run with /content as the working directory.
# NOTE(review): `--local-dir-use-symlinks` appears to be deprecated in recent
# huggingface_hub releases — confirm and drop the flag if it is no longer needed.
!huggingface-cli download microsoft/Phi-3-mini-4k-instruct-gguf Phi-3-mini-4k-instruct-fp16.gguf --local-dir ./models --local-dir-use-symlinks False

Downloading 'Phi-3-mini-4k-instruct-fp16.gguf' to 'models/.huggingface/download/Phi-3-mini-4k-instruct-fp16.gguf.5d99003e395775659b0dde3f941d88ff378b2837a8dc3a2ea94222ab1420fad3.incomplete'
Phi-3-mini-4k-instruct-fp16.gguf: 100% 7.64G/7.64G [00:44<00:00, 172MB/s]
Download complete. Moving file to models/Phi-3-mini-4k-instruct-fp16.gguf
models/Phi-3-mini-4k-instruct-fp16.gguf


In [8]:
# Generation / runtime settings for the local Phi-3 model.
TEMP = 0        # sampling temperature handed to LlamaCpp
N_CTX = 4096    # context window; matches phi3.context_length reported by the loader
N_GPU_L = -1    # n_gpu_layers; presumably -1 offloads every layer to the GPU — confirm

# Collect the constructor arguments first so the configuration can be
# inspected on its own, then build the LangChain LlamaCpp wrapper from it.
llm_phi3_settings = dict(
    model_path="/content/models/Phi-3-mini-4k-instruct-fp16.gguf",
    temperature=TEMP,
    n_ctx=N_CTX,
    n_gpu_layers=N_GPU_L,
    verbose=True,
)
llm_phi3 = LlamaCpp(**llm_phi3_settings)

llama_model_loader: loaded meta data with 23 key-value pairs and 195 tensors from /content/models/Phi-3-mini-4k-instruct-fp16.gguf (version GGUF V3 (latest))
llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.
llama_model_loader: - kv   0:                       general.architecture str              = phi3
llama_model_loader: - kv   1:                               general.name str              = Phi3
llama_model_loader: - kv   2:                        phi3.context_length u32              = 4096
llama_model_loader: - kv   3:                      phi3.embedding_length u32              = 3072
llama_model_loader: - kv   4:                   phi3.feed_forward_length u32              = 8192
llama_model_loader: - kv   5:                           phi3.block_count u32              = 32
llama_model_loader: - kv   6:                  phi3.attention.head_count u32              = 32
llama_model_loader: - kv   7:               phi3.attention.head_count

In [13]:
# FinanceBench open-source question set, stored as JSON Lines (one record per line).
FINANCEBENCH_URL = (
    'https://raw.githubusercontent.com/patronus-ai/financebench'
    '/main/data/financebench_open_source.jsonl'
)

questions = pd.read_json(FINANCEBENCH_URL, lines=True)
# Print the column overview (names, non-null counts, dtypes) as a sanity check.
questions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 11 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   financebench_id       150 non-null    object
 1   company               150 non-null    object
 2   doc_name              150 non-null    object
 3   question_type         150 non-null    object
 4   question_reasoning    100 non-null    object
 5   domain_question_num   50 non-null     object
 6   question              150 non-null    object
 7   answer                150 non-null    object
 8   justification         100 non-null    object
 9   dataset_subset_label  150 non-null    object
 10  evidence              150 non-null    object
dtypes: object(11)
memory usage: 13.0+ KB


In [15]:
### Generate
llm_generate = llm_phi3

# Prompt in the Phi-3 instruct chat layout:
#     <|user|>\n{message}<|end|>\n<|assistant|>\n
# The instructions and the question both belong to the *user* turn; the prompt
# ends with an open assistant turn so that the model writes the answer.
# (Previously the instructions were wrapped in an <|assistant|> turn, a "." was
# appended after the question, and "Answer:" sat inside the user turn.)
# The template is assembled from single-line pieces so that no source-code
# indentation leaks into the prompt text.
prompt_generate = PromptTemplate(
    template=(
        "<|user|>\n"
        "You are an assistant for question-answering tasks. Answer the user's question. "
        "If you don't know the answer, just say that you don't know. "
        "Keep the answer concise.\n"
        "Question: {question}<|end|>\n"
        "<|assistant|>\n"
    ),
    input_variables=["question"],
)

# No retrieval happens in this notebook (closed-book baseline); the chain keeps
# the name `rag_chain` because the evaluation cell below invokes it by that name.
rag_chain = prompt_generate | llm_generate | StrOutputParser()

In [16]:
# Closed-book evaluation: ask the model every FinanceBench question and store
# the question, the model response and the reference answer side by side.
RESULTS_PATH = Path('/content/drive/MyDrive/Thesis/rag_evaluation/phi-3_x_no_rag/financebench_eval_no_rag.json')

# Create the output folder *before* the long generation loop, so a missing
# directory cannot discard several minutes of model output at the final write.
# NOTE: if Google Drive is not mounted this creates the folder on the local
# Colab disk instead — make sure the mount cell has been run.
RESULTS_PATH.parent.mkdir(parents=True, exist_ok=True)

results_list = []

# Iterate over the two columns directly instead of indexing by label with a
# counter; this does not depend on the frame having a 0..n-1 index.
qa_pairs = zip(questions['question'], questions['answer'])
for question, correct_answer in tqdm(qa_pairs, total=len(questions)):
  generation = rag_chain.invoke({"question": question})
  results_list.append({
      'question': question,
      'response': generation,
      'correct_answer': correct_answer
  })

# Build the frame once from the collected records and write it as a JSON array.
results = pd.DataFrame(results_list)
results.to_json(RESULTS_PATH, orient='records')

  0%|          | 0/150 [00:00<?, ?it/s]Llama.generate: prefix-match hit

llama_print_timings:        load time =     492.45 ms
llama_print_timings:      sample time =      45.06 ms /    70 runs   (    0.64 ms per token,  1553.35 tokens per second)
llama_print_timings: prompt eval time =     303.38 ms /    51 tokens (    5.95 ms per token,   168.11 tokens per second)
llama_print_timings:        eval time =    2516.33 ms /    69 runs   (   36.47 ms per token,    27.42 tokens per second)
llama_print_timings:       total time =    2901.51 ms /   120 tokens
  1%|          | 1/150 [00:02<07:14,  2.92s/it]Llama.generate: prefix-match hit

llama_print_timings:        load time =     492.45 ms
llama_print_timings:      sample time =      56.94 ms /    97 runs   (    0.59 ms per token,  1703.46 tokens per second)
llama_print_timings: prompt eval time =     310.63 ms /    61 tokens (    5.09 ms per token,   196.38 tokens per second)
llama_print_timings:        eval time =    3507.39 ms /    96 r