# Week 10 - KPI
    - test data:

## 0. Installation and Setup

In [None]:
# hide output
%%capture output

! pip install peft
! pip install bitsandbytes==0.41.1
! pip install accelerate==0.24.1
! pip install trl==0.4.7
! pip install langchain
! pip install faiss-gpu
! pip install transformers
! pip install pypdfium2
! pip install sentence-transformers

In [1]:
import os, pandas as pd
from google.colab import drive
# Mount Google Drive so the project folder is reachable from the Colab runtime
drive.mount('/content/drive')
path = '/content/drive/MyDrive/Capstone/'

# Data folder of the project (result files are written here as well),
# and the sub-folder holding the company reports used for testing
output_path = os.path.join(path, 'Data')
pdf_path = os.path.join(output_path, 'Company Reports 2022')

Mounted at /content/drive


## 1. Data preparation

In [None]:
from langchain.document_loaders import PyPDFium2Loader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS

# Chunking strategy for the reports.
# Separators are tried in order, from coarsest to finest. The empty string
# matches everywhere and is the last-resort character split, so it has to come
# last: any separator listed after it (previously ".") can never be reached.
text_splitter = RecursiveCharacterTextSplitter(
    separators = ["\n\n", ".", " ", ""],
    chunk_size = 1000,     # maximum characters per chunk
    chunk_overlap = 500    # characters shared between consecutive chunks
)

# Sentence-embedding model used to index the chunks (library default model)
embeddings = HuggingFaceEmbeddings()

.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

train_script.py:   0%|          | 0.00/13.1k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

In [None]:
from tqdm import tqdm

# Get testing data: every PDF in the reports folder whose name does not contain 'Exxon'.
# - sorted(): os.listdir returns entries in arbitrary order; sorting keeps the
#   file order (and therefore the row order of the result tables) reproducible.
# - the '.pdf' filter keeps stray non-PDF entries away from the PDF loader.
test_files = sorted(
    f for f in os.listdir(pdf_path)
    if f.lower().endswith('.pdf') and 'Exxon' not in f
)

# One FAISS vector store per report, keyed by file name
vss = {}

for file in tqdm(test_files):
    file_path = os.path.join(pdf_path, file)
    loader = PyPDFium2Loader(file_path)
    # Load the PDF, split it into overlapping chunks, then embed and index them
    data = text_splitter.split_documents(loader.load())
    vss[file] = FAISS.from_documents(data, embeddings)

100%|██████████| 9/9 [02:31<00:00, 16.83s/it]


In [None]:
def get_answer(df, chain, vector_stores=None):
    """Answer every question in ``df`` with ``chain`` and record the results in place.

    For each row, the vector store registered under ``row['file']`` is searched
    with ``row['question']``; the retrieved documents and the question are passed
    to ``chain``. Two columns are written back to ``df``:

    * ``answer`` - the chain's ``'output_text'``
    * ``pages``  - the ``'page'`` metadata of the retrieved documents, joined by ``', '``

    Args:
        df: DataFrame with ``file`` and ``question`` columns. Modified in place.
        chain: Callable invoked as
            ``chain({"input_documents": ..., "question": ...}, return_only_outputs=True)``
            that returns a mapping containing ``'output_text'``.
        vector_stores: Optional mapping from file name to an object exposing
            ``similarity_search(question)``. Defaults to the notebook-level ``vss``.

    Returns:
        The same ``df`` object, for convenience.
    """
    if vector_stores is None:
        vector_stores = vss

    for i, row in tqdm(df.iterrows(), total=df.shape[0]):
        docs = vector_stores[row['file']].similarity_search(row['question'])
        result = chain(
            {"input_documents": docs, "question": row['question']},
            return_only_outputs=True,
        )

        df.loc[i, 'answer'] = result['output_text']
        df.loc[i, 'pages'] = ', '.join(str(d.metadata['page']) for d in docs)

    return df

## 2. Model

### The original Llama model

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer, BitsAndBytesConfig, pipeline
from langchain.llms import HuggingFacePipeline
from langchain.chains.question_answering import load_qa_chain
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model, PeftModel

import getpass

# Hugging Face access token for the gated Llama-2 weights.
# SECURITY: never hardcode the token in the notebook. It is read from the
# HF_TOKEN environment variable, falling back to an interactive prompt.
# Any token that was previously committed in this cell should be revoked.
access_token = os.environ.get('HF_TOKEN') or getpass.getpass('Hugging Face access token: ')
model_id = 'meta-llama/Llama-2-7b-chat-hf'

# QLoRA configuration: 4-bit NF4 weights, float16 compute, no double quantization
compute_dtype = torch.float16

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=False
)

# Load 4-bit quantized model, placed entirely on device 0
model_llama = AutoModelForCausalLM.from_pretrained(
    model_id,
    token = access_token,
    device_map={"": 0},
    quantization_config=bnb_config,
)
# NOTE(review): these two settings look carried over from the fine-tuning setup;
# use_cache = False presumably slows down generation - confirm it is wanted here.
model_llama.config.use_cache = False
model_llama.config.pretraining_tp = 1

# Tokenizer: reuse the EOS token for padding and pad on the right
tokenizer_llama = AutoTokenizer.from_pretrained(model_id, token = access_token)
tokenizer_llama.pad_token = tokenizer_llama.eos_token
tokenizer_llama.padding_side = "right"

config.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



generation_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.62k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

### get pipelines

In [None]:
# Baseline: question-answering chain on top of the original (untuned) Llama model.
# Generation is capped at 128 new tokens per answer.
pipe = pipeline(
    task = "text-generation",
    model = model_llama,
    tokenizer = tokenizer_llama,
    max_new_tokens = 128,
)

# Wrap the transformers pipeline as a LangChain LLM
llm = HuggingFacePipeline(pipeline = pipe)

# "stuff" chain: built with chain_type="stuff" on the wrapped LLM
chain_ori = load_qa_chain(llm, chain_type = "stuff")

In [None]:
# Tuned model: base Llama plus the QA adapter stored under Models/QA/v7
adapter_path = os.path.join(path, 'Models', 'QA', 'v7', 'llama-2-7b-qa')

# NOTE(review): PeftModel.from_pretrained is understood to inject the adapter
# layers into the base model object it is given (model_llama), which the baseline
# pipeline also references - confirm that chain_ori still runs without the adapter.
model = PeftModel.from_pretrained(model_llama, adapter_path)

pipe = pipeline(
    task = "text-generation",
    model = model,
    tokenizer = tokenizer_llama,
    max_new_tokens = 128,
)

# Wrap the transformers pipeline as a LangChain LLM and build the QA chain
llm = HuggingFacePipeline(pipeline = pipe)

chain_v7 = load_qa_chain(llm, chain_type = "stuff")

The model 'PeftModelForCausalLM' is not supported for text-generation. Supported models are ['BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'LlamaForCausalLM', 'CodeGenForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'ElectraForCausalLM', 'ErnieForCausalLM', 'FalconForCausalLM', 'FuyuForCausalLM', 'GitForCausalLM', 'GPT2LMHeadModel', 'GPT2LMHeadModel', 'GPTBigCodeForCausalLM', 'GPTNeoForCausalLM', 'GPTNeoXForCausalLM', 'GPTNeoXJapaneseForCausalLM', 'GPTJForCausalLM', 'LlamaForCausalLM', 'MarianForCausalLM', 'MBartForCausalLM', 'MegaForCausalLM', 'MegatronBertForCausalLM', 'MistralForCausalLM', 'MptForCausalLM', 'MusicgenForCausalLM', 'MvpForCausalLM', 'OpenLlamaForCausalLM', 'OpenAIGPTLMHeadModel', 'OPTForCausalLM', 'PegasusForCausalLM', 'PersimmonForCausalLM', 'PLBartFo

In [None]:
# Tuned model (financial dataset): base Llama plus the adapter stored under Models/FIN
adapter_path = os.path.join(path, 'Models', 'FIN')

# NOTE(review): this wraps the same model_llama object that earlier cells already
# passed to pipeline() / PeftModel.from_pretrained(); if adapters are injected in
# place, the chains may end up sharing weights - confirm each chain uses the
# intended adapter (or load a fresh base model per adapter).
model = PeftModel.from_pretrained(model_llama, adapter_path)

pipe = pipeline(
    task = "text-generation",
    model = model,
    tokenizer = tokenizer_llama,
    max_new_tokens = 128,
)

# Wrap the transformers pipeline as a LangChain LLM and build the QA chain
llm = HuggingFacePipeline(pipeline = pipe)

chain_fin = load_qa_chain(llm, chain_type = "stuff")

The model 'PeftModelForCausalLM' is not supported for text-generation. Supported models are ['BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'LlamaForCausalLM', 'CodeGenForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'ElectraForCausalLM', 'ErnieForCausalLM', 'FalconForCausalLM', 'FuyuForCausalLM', 'GitForCausalLM', 'GPT2LMHeadModel', 'GPT2LMHeadModel', 'GPTBigCodeForCausalLM', 'GPTNeoForCausalLM', 'GPTNeoXForCausalLM', 'GPTNeoXJapaneseForCausalLM', 'GPTJForCausalLM', 'LlamaForCausalLM', 'MarianForCausalLM', 'MBartForCausalLM', 'MegaForCausalLM', 'MegatronBertForCausalLM', 'MistralForCausalLM', 'MptForCausalLM', 'MusicgenForCausalLM', 'MvpForCausalLM', 'OpenLlamaForCausalLM', 'OpenAIGPTLMHeadModel', 'OPTForCausalLM', 'PegasusForCausalLM', 'PersimmonForCausalLM', 'PLBartFo

## 3. KPI calculation

In [None]:
# KPI names to query for each report. These strings are inserted verbatim into
# the questions. The backslash is escaped explicitly: a bare '\P' is an invalid
# escape sequence (a warning today, an error in future Python versions); the
# resulting string is unchanged.
KPIs = ['Total Assets' , 'Short-term. Securities & Non-cash Investment',
        'Property\\Plant & Equipment', 'Inventories']

In [None]:
# One question per KPI
questions = [f"What is the company's {KPI}?" for KPI in KPIs]

# Every (file, question) combination, files in the outer loop. The grid is built
# once from a list of records instead of growing three DataFrames row by row.
qa_grid = pd.DataFrame(
    [(f, q) for f in test_files for q in questions],
    columns = ['file', 'question'],
)

# Independent copies, one per model variant; each is filled in place later
df_ori = qa_grid.copy()
df_v7 = qa_grid.copy()
df_fin = qa_grid.copy()

In [None]:
# Answer all KPI questions with the original model, then save the table to Drive
get_answer(df_ori, chain_ori)
csv_ori = os.path.join(output_path, 'KPI_ori.csv')
df_ori.to_csv(csv_ori)

100%|██████████| 36/36 [05:28<00:00,  9.11s/it]


In [None]:
# Answer all KPI questions with the v7 QA-tuned model, then save the table to Drive
get_answer(df_v7, chain_v7)
csv_v7 = os.path.join(output_path, 'KPI_v7.csv')
df_v7.to_csv(csv_v7)

100%|██████████| 36/36 [05:54<00:00,  9.84s/it]


In [None]:
# Answer all KPI questions with the financial-dataset-tuned model, then save the table to Drive
get_answer(df_fin, chain_fin)
csv_fin = os.path.join(output_path, 'KPI_fin.csv')
df_fin.to_csv(csv_fin)

100%|██████████| 36/36 [05:41<00:00,  9.49s/it]


## 4. Result

In [2]:
# Reload the saved answer tables. The CSVs were written with their index, which
# comes back as an 'Unnamed: 0' column - drop it right after reading.
results_dir = os.path.join(path, 'Data')

ori = pd.read_csv(os.path.join(results_dir, 'KPI_ori.csv')).drop(columns = ['Unnamed: 0'])
v7 = pd.read_csv(os.path.join(results_dir, 'KPI_v7.csv')).drop(columns = ['Unnamed: 0'])
fin = pd.read_csv(os.path.join(results_dir, 'KPI_fin.csv')).drop(columns = ['Unnamed: 0'])

def adjust_len(row):
    """Truncate the last field of ``row`` to the length of its ``'Answers'`` field.

    Args:
        row: pandas Series with an ``'Answers'`` entry; both that entry and the
            last entry must support ``len`` / slicing (e.g. strings).

    Returns:
        The same ``row`` object (modified in place), so the function can be used
        with ``DataFrame.apply(adjust_len, axis=1)``.
    """
    limit = len(row['Answers'])
    # Positional access goes through .iloc: plain row[-1] on a label-indexed
    # Series is deprecated in pandas. Slicing already clamps to the value's
    # length, so no explicit min() is needed.
    row.iloc[-1] = row.iloc[-1][:limit]
    return row

In [7]:
# Preview the first rows of the answers loaded from KPI_ori.csv
ori.head()

Unnamed: 0,file,question,answer,pages
0,BP PLC_2022.pdf,What is the company's Total Assets?,"The company's Total Assets is 1,752 million. ...","293, 226, 184, 184"
1,BP PLC_2022.pdf,What is the company's Short-term. Securities &...,Short-term securities and non-cash investment...,"358, 242, 403, 254"
2,BP PLC_2022.pdf,What is the company's Property\Plant & Equipment?,"The company's Property, Plant and Equipment (...","191, 403, 220, 403"
3,BP PLC_2022.pdf,What is the company's Inventories?,The company's Inventories include raw materia...,"228, 227, 395, 213"
4,Chevron_2022.pdf,What is the company's Total Assets?,"The company's Total Assets were $557,709 mill...","59, 70, 59, 69"


In [None]:
# Preview the first rows of the answers loaded from KPI_v7.csv
v7.head()

Unnamed: 0,file,question,answer,pages
0,BP PLC_2022.pdf,What is the company's Total Assets?,"The company's Total Assets is 151,475 million...","293, 226, 184, 184"
1,BP PLC_2022.pdf,What is the company's Short-term. Securities &...,The company has short-term securities and non...,"358, 242, 403, 254"
2,BP PLC_2022.pdf,What is the company's Property\Plant & Equipment?,"The company's property, plant and equipment (...","191, 403, 220, 403"
3,BP PLC_2022.pdf,What is the company's Inventories?,The company's Inventories include a wide rang...,"228, 227, 395, 213"
4,Chevron_2022.pdf,What is the company's Total Assets?,"The company's Total Assets were $257,709 mill...","59, 70, 59, 69"


In [None]:
# Preview the first rows of the answers loaded from KPI_fin.csv
fin.head()

Unnamed: 0,file,question,answer,pages
0,BP PLC_2022.pdf,What is the company's Total Assets?,"The company's Total Assets is 151,475 million...","293, 226, 184, 184"
1,BP PLC_2022.pdf,What is the company's Short-term. Securities &...,The company's short-term securities and non-c...,"358, 242, 403, 254"
2,BP PLC_2022.pdf,What is the company's Property\Plant & Equipment?,"The company's Property, Plant and Equipment (...","191, 403, 220, 403"
3,BP PLC_2022.pdf,What is the company's Inventories?,"Inventories include raw materials, work in pr...","228, 227, 395, 213"
4,Chevron_2022.pdf,What is the company's Total Assets?,"The company's Total Assets were $257,709 mill...","59, 70, 59, 69"


In [3]:
# Answer-sheet template: file / question / pages copied from the baseline table,
# with two empty answer columns (answer_1, answer_2) left to be filled in.
ans = pd.DataFrame(columns = ['file', 'question', 'pages', 'answer_1', 'answer_2'])
for col in ('file', 'question', 'pages'):
    ans[col] = ori[col]

In [4]:
# Preview the template: the answer_1 / answer_2 columns are still empty at this point
ans.head()

Unnamed: 0,file,question,pages,answer_1,answer_2
0,BP PLC_2022.pdf,What is the company's Total Assets?,"293, 226, 184, 184",,
1,BP PLC_2022.pdf,What is the company's Short-term. Securities &...,"358, 242, 403, 254",,
2,BP PLC_2022.pdf,What is the company's Property\Plant & Equipment?,"191, 403, 220, 403",,
3,BP PLC_2022.pdf,What is the company's Inventories?,"228, 227, 395, 213",,
4,Chevron_2022.pdf,What is the company's Total Assets?,"59, 70, 59, 69",,


In [5]:
# Write the answer-sheet template to an Excel workbook in the Data folder
ans.to_excel(os.path.join(path, 'Data', 'KPI_ans.xlsx'))

In [33]:
# Read the answer workbook back from the Data folder.
# NOTE(review): presumably re-read after the answer columns were filled in outside the notebook - confirm.
t = pd.read_excel(os.path.join(path, 'Data', 'KPI_ans.xlsx'))