KPI test on Chevron 2022 annual report

In [1]:
# hide output
%%capture output

! pip install datasets
! pip install peft==0.6.0
! pip install bitsandbytes==0.41.1
! pip install accelerate==0.24.1
! pip install trl==0.4.7
! pip install langchain
! pip install faiss-gpu
! pip install transformers
! pip install pypdfium2
! pip install sentence-transformers

In [2]:
import os, pandas as pd
from google.colab import drive
# Mount Google Drive so the project files are reachable from the Colab runtime
drive.mount('/content/drive')
# Project root on Drive; later cells join report, adapter and output locations onto it
path = '/content/drive/MyDrive/Capstone/'

Mounted at /content/drive


## Data

In [7]:
from langchain.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.document_loaders import PyPDFium2Loader
from langchain.chains.question_answering import load_qa_chain


# Separators are tried in order, coarsest first. The empty string is the
# character-level fallback and always matches, so it has to be listed last:
# any separator placed after it is never tried.
text_splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", ".", " ", ""],
    chunk_size=1000,
    chunk_overlap=500,
)
# No model name is given, so the wrapper's default embedding model is used.
embeddings = HuggingFaceEmbeddings()

.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

train_script.py:   0%|          | 0.00/13.1k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

In [6]:
def load_store_doc(file):
    """Build a FAISS vector store from a PDF file.

    The PDF is loaded with ``PyPDFium2Loader``, split with the module-level
    ``text_splitter`` and indexed using the module-level ``embeddings``.

    Args:
        file: Path of the PDF to index.

    Returns:
        The FAISS store created from the split documents.
    """
    pages = PyPDFium2Loader(file).load()
    chunks = text_splitter.split_documents(pages)
    return FAISS.from_documents(chunks, embeddings)

def generate_response(prompt_input, llm, vs):
    """Answer a question from the documents most similar to it.

    Args:
        prompt_input: The question; used both as the similarity-search query
            and as the question passed to the QA chain.
        llm: Language model handed to ``load_qa_chain``.
        vs: Vector store providing ``similarity_search``.

    Returns:
        A ``(documents, answer)`` tuple: the search results and the chain's
        ``'output_text'``.
    """
    matches = vs.similarity_search(prompt_input)
    # "stuff" chain type: all retrieved documents go into a single prompt
    qa_chain = load_qa_chain(llm, chain_type="stuff")
    result = qa_chain(
        {"input_documents": matches, "question": prompt_input},
        return_only_outputs=True,
    )
    return matches, result['output_text']

In [9]:
# Location of the Chevron 2022 report PDF under the project folder
pdf_file = os.path.join(path, 'Company Reports', 'Chevron', 'Chevron_2022.pdf')
# Load, split and embed the report into a vector store.
# NOTE(review): needs the cells defining load_store_doc, text_splitter and embeddings to have run first.
vs = load_store_doc(pdf_file)

## Model


In [3]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer, BitsAndBytesConfig, pipeline
from langchain.llms import HuggingFacePipeline
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model, PeftModel

from getpass import getpass

# Credentials must not live in the notebook: take the Hugging Face token from
# the HF_TOKEN environment variable, or prompt for it when it is not set.
# NOTE(review): the token that used to be hardcoded here is exposed and should be revoked.
access_token = os.environ.get('HF_TOKEN') or getpass('Hugging Face access token: ')
model_id = 'meta-llama/Llama-2-7b-chat-hf'

# QLoRA configuration: 4-bit NF4 weights, float16 compute
compute_dtype = torch.float16

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=False
)

# Load 4-bit quantized base model, placed entirely on GPU 0
model_llama = AutoModelForCausalLM.from_pretrained(
    model_id,
    token = access_token,
    device_map={"": 0},
    quantization_config=bnb_config,
)
# Turning the KV cache off is a training-time setting (used with gradient
# checkpointing). This notebook only generates text, so caching stays enabled.
model_llama.config.use_cache = True
model_llama.config.pretraining_tp = 1

tokenizer_llama = AutoTokenizer.from_pretrained(model_id, token = access_token)
# The tokenizer is given the EOS token as its padding token
tokenizer_llama.pad_token = tokenizer_llama.eos_token
tokenizer_llama.padding_side = "right"

# Fine-tuned adapter weights stored under the project folder
adapter_path = os.path.join(path, 'exp', 'v7', 'llama-2-7b-qa')

# Attach the adapter to the quantized base model
model = PeftModel.from_pretrained(model_llama, adapter_path)

# Text-generation pipeline; each answer is capped at 128 newly generated tokens
pipe = pipeline(
    "text-generation",
    model = model,
    tokenizer = tokenizer_llama,
    max_new_tokens = 128
)

# Wrap the pipeline so it can be used as a LangChain LLM
llm = HuggingFacePipeline(pipeline = pipe)

config.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



generation_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/1.62k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

The model 'PeftModelForCausalLM' is not supported for text-generation. Supported models are ['BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'LlamaForCausalLM', 'CodeGenForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'ElectraForCausalLM', 'ErnieForCausalLM', 'FalconForCausalLM', 'FuyuForCausalLM', 'GitForCausalLM', 'GPT2LMHeadModel', 'GPT2LMHeadModel', 'GPTBigCodeForCausalLM', 'GPTNeoForCausalLM', 'GPTNeoXForCausalLM', 'GPTNeoXJapaneseForCausalLM', 'GPTJForCausalLM', 'LlamaForCausalLM', 'MarianForCausalLM', 'MBartForCausalLM', 'MegaForCausalLM', 'MegatronBertForCausalLM', 'MistralForCausalLM', 'MptForCausalLM', 'MusicgenForCausalLM', 'MvpForCausalLM', 'OpenLlamaForCausalLM', 'OpenAIGPTLMHeadModel', 'OPTForCausalLM', 'PegasusForCausalLM', 'PersimmonForCausalLM', 'PLBartFo

In [10]:
# KPI names to look up in the report; each becomes one question for the QA chain.
# NOTE(review): the period in 'Short-term.' and the backslash in 'Property\Plant'
# look like typos (perhaps 'Short-term' and 'Property, Plant') - confirm the
# intended names. Runtime values are unchanged here.
KPIs = [
    'Total Assets',
    'Short-term. Securities & Non-cash Investment',
    # Raw string: '\P' is an invalid escape sequence in a normal string literal
    r'Property\Plant & Equipment',
    'Inventories',
]

In [19]:
# One question per KPI. The space after "company's" keeps the KPI name from
# being fused onto the preceding word.
questions = [f"What is the company's {KPI}?" for KPI in KPIs]

In [5]:
# NOTE(review): load_qa_chain is already imported in the data cell, and
# generate_response builds its own chain on every call; this module-level
# `chain` is not referenced by any other visible cell.
from langchain.chains.question_answering import load_qa_chain
chain = load_qa_chain(llm, chain_type="stuff")

In [12]:
import csv

# Ask every KPI question against the indexed report and save the question,
# the model's answer and the retrieved documents to a CSV in the project folder.
KPI_output = os.path.join(path, 'KPI_output.csv')
ans = []  # not populated in this cell; kept in case other cells reference it
with open(KPI_output, 'w', newline='', encoding='utf-8') as csvfile:
    csv_writer = csv.writer(csvfile)
    # writerow() takes a single iterable of fields, not one argument per field
    csv_writer.writerow(['Question', 'Answer', 'Document'])
    for q in questions:
        doc, output = generate_response(q, llm, vs)
        # `doc` is the list of retrieved documents; csv writes its str() form
        csv_writer.writerow([q, output, doc])
