In [None]:
# hide output
%%capture output

! pip install datasets
! pip install peft==0.4.0
! pip install bitsandbytes==0.40.2
! pip install accelerate==0.21.0
! pip install trl==0.4.7
! pip install langchain
! pip install faiss-gpu
! pip install transformers
! pip install pypdfium2
! pip install sentence-transformers
! pip install rouge

In [None]:
import os
from google.colab import drive
# Mount Google Drive; every project file lives under `path`.
drive.mount('/content/drive')
path = '/content/drive/MyDrive/Capstone/'


# One sub-folder per company; print each with the index that get_reports accepts.
# NOTE(review): os.listdir gives no ordering guarantee, so these indices may
# differ between runs - confirm before relying on a hard-coded index.
companies = os.listdir(os.path.join(path, 'Company Reports'))
for i, comp in enumerate(companies):
    print(f"{i} :  {comp}")


# get reports
def get_reports(comp, year:int, rep_type:int = 1):
    """
    Look up the report PDFs stored for one company.

    comp:       company folder name, or its index in the `companies` list
    year:       a specific year (e.g. 2020), 1-10 for that many of the latest
                years in 2013-2022, or 0 for every year in 2013-2022
    rep_type:   1 for the annual report, 2 for the sustainability report, 0 for both
    ret:        list of report paths that exist on disk, oldest year first
                (annual before sustainability within a year); None is returned
                after printing an error when an argument is invalid
    """
    # Resolve `comp` to a folder name, rejecting unknown names and bad indices.
    if type(comp) is int:
        if not 0 <= comp < len(companies):
            print("Error: invalid index")
            return
        comp = companies[comp]
    elif type(comp) is str:
        if comp not in companies:
            print("Error: ", comp, " does not exist")
            return
    else:
        print("Error: invalid company")
        return

    report_dir = os.path.join(path, 'Company Reports', comp)
    available = set(os.listdir(report_dir))

    # Values 0-10 are a "how many recent years" count; anything else is a literal year.
    all_years = range(2013,2023)
    if year not in range(11):
        wanted_years = [year]
    elif year:
        wanted_years = all_years[-year:]
    else:
        wanted_years = all_years

    # Files are named <company>_<year>.pdf (annual) or <company>_<year>_sus.pdf.
    if rep_type == 1:
        suffixes = [""]
    elif rep_type == 2:
        suffixes = ["_sus"]
    elif rep_type == 0:
        suffixes = ["", "_sus"]
    else:
        print("Error: invalid report type")
        return

    candidates = (
        comp + '_' + str(yr) + suffix + '.pdf'
        for yr in wanted_years
        for suffix in suffixes
    )
    return [os.path.join(report_dir, name) for name in candidates if name in available]

Mounted at /content/drive
0 :  ExxonMobil
1 :  Shell plc
2 :  BP PLC
3 :  Saudi Aramco
4 :  Chevron
5 :  TotalEnergies
6 :  Valero Energy
7 :  Marathon Petroleum Corporation
8 :  Sinopec
9 :  PetroChina


In [None]:
# Company index 4, year 2020, both report types; keep only the first path
# returned (the annual report in the recorded run).
file = get_reports(4, 2020, 0)[0]
file

'/content/drive/MyDrive/Capstone/Company Reports/Chevron/Chevron_2020.pdf'

In [None]:
from langchain.document_loaders import PyPDFium2Loader

# Read the selected PDF into a list of LangChain documents.
# NOTE(review): presumably one Document per page - confirm against the loader docs.
loader = PyPDFium2Loader(file)
all_splits = loader.load()

In [None]:
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS

In [None]:
# Default sentence-transformers embedding model (downloaded on first use).
embeddings = HuggingFaceEmbeddings()

# Index only the first 20 documents of the report.
vs_faiss = FAISS.from_documents(documents=all_splits[:20], embedding=embeddings)

.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

train_script.py:   0%|          | 0.00/13.1k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

In [None]:
# Query used to locate the shareholder letter among the indexed documents.
question = "Dear Shareholders"

# Keep only the single closest match.
letter = vs_faiss.similarity_search(query=question, k=1)

In [None]:
# Page number recorded in the best match's metadata; used below as an index
# into all_splits.
# NOTE(review): assumes the 'page' metadata value equals the document's
# position in all_splits - confirm.
start = letter[0].metadata["page"]

In [None]:
# Preview the three documents starting at the matched page.
str(all_splits[start : start + 3])

"[Document(page_content='1149230_Annual_Report_v8.3.indd 3 3/5/21 5:56 AM\\r\\n \\r\\nbuilding on strengths \\r\\nEven before COVID-19, we were preparing to lead in a future marked by change. Our actions were proactive and disciplined – \\r\\nsimplifying and modernizing work; integrating teams, processes and value chains across business units and geographies; elevating \\r\\nleadership capabilities; advancing digital solutions; and empowering our workforce to make decisions quickly, safely and with \\r\\ngreater accountability. \\r\\nWe leveraged our strengths to design a better company for the long term – one that can act deliberately, seize opportunity and \\r\\ngenerate stronger returns. And we remained true to our values, prepared to succeed in any environment, and adaptive in a dynamic \\r\\nworld where disruption is routine. \\r\\nOur fnancial priorities have not changed: \\r\\ngrowing \\r\\nthe dividend \\r\\nWhile others are lowering dividends, we have maintained ours \\r\\nas 

### 5. Model

In [None]:
import os

import torch
from peft import PeftModel
from langchain.llms import HuggingFacePipeline
from transformers import AutoTokenizer, pipeline, AutoModelForSeq2SeqLM, AutoModelForCausalLM, BitsAndBytesConfig

# 4-bit NF4 quantization settings (QLoRA-style) used when loading the base model.
compute_dtype = torch.float16

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=False
)


# SECURITY: the Hugging Face access token is read from the HF_TOKEN environment
# variable instead of being hardcoded. The token that was previously committed
# in this cell must be treated as leaked and revoked.
# When HF_TOKEN is unset this is None and the Hub is accessed anonymously.
access_token = os.environ.get("HF_TOKEN")

# Fine-tuned PEFT adapter stored in the project folder on Drive.
adapter_path = os.path.join(path, 'T5_prompt_sum_tuned', 'flan-t5-small-sum-prompt')

# The "_llama" suffix is kept on the variable names because later cells refer
# to them; the checkpoint actually loaded is Flan-T5 (an encoder-decoder model).
model_id_llama ='google/flan-t5-small'
model_llama = AutoModelForSeq2SeqLM.from_pretrained(
    model_id_llama,
    token = access_token,
    quantization_config=bnb_config,
    device_map={"": 0},  # place the whole model on device 0
)

# Attach the fine-tuned adapter weights to the quantized base model.
model_new = PeftModel.from_pretrained(model_llama, adapter_path)

tokenizer_llama = AutoTokenizer.from_pretrained(model_id_llama, token = access_token)
# NOTE(review): these two settings look inherited from a decoder-only (Llama)
# setup; confirm they are still wanted for a T5 tokenizer.
tokenizer_llama.pad_token = tokenizer_llama.eos_token
tokenizer_llama.padding_side = "right"

# Flan-T5 is a sequence-to-sequence model, so it must be served through the
# "text2text-generation" task; "text-generation" is the causal-LM task and
# does not support seq2seq models.
pipe_llama_new = pipeline(
    "text2text-generation",
    model = model_new,
    tokenizer = tokenizer_llama,
    max_length = 4096
)

# LangChain wrapper used by the chains in the following cells.
llm_llama_new = HuggingFacePipeline(pipeline = pipe_llama_new)

The model 'PeftModelForCausalLM' is not supported for text-generation. Supported models are ['BartForCausalLM', 'BertLMHeadModel', 'BertGenerationDecoder', 'BigBirdForCausalLM', 'BigBirdPegasusForCausalLM', 'BioGptForCausalLM', 'BlenderbotForCausalLM', 'BlenderbotSmallForCausalLM', 'BloomForCausalLM', 'CamembertForCausalLM', 'LlamaForCausalLM', 'CodeGenForCausalLM', 'CpmAntForCausalLM', 'CTRLLMHeadModel', 'Data2VecTextForCausalLM', 'ElectraForCausalLM', 'ErnieForCausalLM', 'FalconForCausalLM', 'FuyuForCausalLM', 'GitForCausalLM', 'GPT2LMHeadModel', 'GPT2LMHeadModel', 'GPTBigCodeForCausalLM', 'GPTNeoForCausalLM', 'GPTNeoXForCausalLM', 'GPTNeoXJapaneseForCausalLM', 'GPTJForCausalLM', 'LlamaForCausalLM', 'MarianForCausalLM', 'MBartForCausalLM', 'MegaForCausalLM', 'MegatronBertForCausalLM', 'MistralForCausalLM', 'MptForCausalLM', 'MusicgenForCausalLM', 'MvpForCausalLM', 'OpenLlamaForCausalLM', 'OpenAIGPTLMHeadModel', 'OPTForCausalLM', 'PegasusForCausalLM', 'PersimmonForCausalLM', 'PLBartFo

In [None]:
#from langchain.chains.summarize import load_summarize_chain
#from langchain.prompts import PromptTemplate

#prompt = """Optimize the prompt "Summarize the company's performances and future plans according to the CEO's letter" according to context: \n {text} \n OPTIMIZATION:"""
#prompt_template = PromptTemplate(template=prompt, input_variables=["text"])

#summary_chain = load_summarize_chain(llm=llm_llama_new,
#                                     chain_type='stuff',
#                                     prompt=prompt_template)

#output = summary_chain.run(all_splits[start:start+3])
#output

In [None]:
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate

# Template asking the model to rewrite an instruction instead of carrying it out.
prompt_template = PromptTemplate(
    template="You are a prompt optimizer. Help me optimize this instruction: {prompt} \n Return the new instruction only. Do not give a response to my instruction. ",
    input_variables=["prompt"],
)

# Run the template through the fine-tuned model with the instruction to improve.
chain = LLMChain(prompt=prompt_template, llm=llm_llama_new)
output = chain.run("Summarize the company's performances and future plans according to the CEO's letter")
output

'y instruction. Do not give response to my instruction.'

In [None]:
from langchain.chains.question_answering import load_qa_chain

# Instruction passed to the chain as the "question".
prompt = " You are a prompt optimizer, help me improve this instruction, and do not give me a summarization: summarize the company's performances and future plans according to the CEO's letter."

# "stuff" places all input documents into a single prompt.
# NOTE(review): in the recorded run the combined input was 1876 tokens against
# a 512-token tokenizer limit and the result was an empty string.
chain = load_qa_chain(llm=llm_llama_new, chain_type="stuff")
res = chain(
    {
        "input_documents": all_splits[start : start + 3],
        "question": prompt,
    },
    return_only_outputs=True,
)["output_text"]
res

Token indices sequence length is longer than the specified maximum sequence length for this model (1876 > 512). Running this sequence through the model will result in indexing errors


''