In [1]:
# hide output
%%capture output

! pip install pdfplumber
! pip install chromadb
! pip install pymilvus
! pip install sentence-transformers
! pip install langchain
! pip install pypdf
! pip install faiss-gpu
! pip install happytransformer
! pip install faiss-gpu
! pip install happytransformer

In [5]:
# hide output
%%capture output

! pip install datasets
! pip install peft==0.4.0
! pip install bitsandbytes==0.40.2
! pip install accelerate==0.21.0
! pip install trl==0.4.7
! pip install langchain
! pip install faiss-gpu
! pip install transformers
! pip install pypdfium2
! pip install sentence-transformers
! pip install rouge

In [6]:
import os
from google.colab import drive
# Access drive
drive.mount('/content/drive')
path = '/content/drive/MyDrive/Capstone/'


# companies
companies = os.listdir(os.path.join(path, 'Company Reports'))
for i, comp in enumerate(companies):
    print(i, ": ", comp)


# get reports
def get_reports(comp, year:int, rep_type:int = 1):
    """
    comp:       string or index
    year:       specific year or # recent year, 0 for all
    rep_type:   report type, 1 for annual report, 2 for sustainability report, 0 for both
    ret:        list of report pathes
    """
    if type(comp) == str:
        if comp not in companies:
            print("Error: ", comp, " does not exist")
            return
    elif type(comp) == int:
        if comp not in range(len(companies)):
            print("Error: invalid index")
            return
        comp = companies[comp]
    else:
        print("Error: invalid company")
        return

    file_path = os.path.join(path, 'Company Reports', comp)
    files = os.listdir(file_path)
    files.sort(reverse=True)

    years = range(2013,2023)
    if year in range(11):
        if year:
            years = years[-year:]
    else:
        years = [year]

    if rep_type == 0:
        reps = ["", "_sus"]
    elif rep_type == 1:
        reps = [""]
    elif rep_type == 2:
        reps = ["_sus"]
    else:
        print("Error: invalid report type")
        return

    ret = []
    for year in years:
        for rep in reps:
            file = comp + '_' + str(year) + rep + '.pdf'
            if file in files:
                ret.append(file)
    return [os.path.join(file_path, file) for file in ret]

Mounted at /content/drive
0 :  ExxonMobil
1 :  Shell plc
2 :  BP PLC
3 :  Saudi Aramco
4 :  Chevron
5 :  TotalEnergies
6 :  Valero Energy
7 :  Marathon Petroleum Corporation
8 :  Sinopec
9 :  PetroChina


In [7]:
file = get_reports(4, 2020, 0)

file = file[0]
file

'/content/drive/MyDrive/Capstone/Company Reports/Chevron/Chevron_2020.pdf'

In [8]:
from langchain.document_loaders import PyPDFium2Loader

loader = PyPDFium2Loader(file)
all_splits = loader.load()

In [9]:
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS

In [10]:
embeddings = HuggingFaceEmbeddings()

vs_faiss = FAISS.from_documents(all_splits[:20], embeddings)

question = """Dear Shareholders"""

letter = vs_faiss.similarity_search(question, k=1)

start = letter[0].metadata['page']

(…)851d5dd1af673670cdb299753/.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

(…)1af673670cdb299753/1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

(…)6e48e851d5dd1af673670cdb299753/README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

(…)48e851d5dd1af673670cdb299753/config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

(…)299753/config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

(…)1d5dd1af673670cdb299753/data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

(…)73670cdb299753/sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

(…)f673670cdb299753/special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

(…)851d5dd1af673670cdb299753/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

(…)1af673670cdb299753/tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

(…)51d5dd1af673670cdb299753/train_script.py:   0%|          | 0.00/13.1k [00:00<?, ?B/s]

(…)6e48e851d5dd1af673670cdb299753/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

(…)8e851d5dd1af673670cdb299753/modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

In [2]:
from langchain.llms import HuggingFacePipeline
from transformers import AutoTokenizer, pipeline, AutoModelForSeq2SeqLM, AutoModelForCausalLM

access_token = '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'

model_id = "meta-llama/Llama-2-7b-chat-hf"
tokenizer = AutoTokenizer.from_pretrained(model_id, token = access_token)
model = AutoModelForCausalLM.from_pretrained(model_id, token = access_token)

pipe = pipeline(
    "text-generation",
    model = model,
    tokenizer = tokenizer,
    max_length = 4096
)

llm_ = HuggingFacePipeline(pipeline = pipe)

(…)at-hf/resolve/main/tokenizer_config.json:   0%|          | 0.00/1.62k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

(…)2-7b-chat-hf/resolve/main/tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

(…)-hf/resolve/main/special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

(…)ma-2-7b-chat-hf/resolve/main/config.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

(…)esolve/main/model.safetensors.index.json:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/9.98G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/3.50G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



(…)t-hf/resolve/main/generation_config.json:   0%|          | 0.00/188 [00:00<?, ?B/s]

In [3]:
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate

prompt_template = PromptTemplate(input_variables=["prompt"],template="You are a prompt optimizer. Help me optimize this instruction: {prompt} \n Return the new instruction only. Do not give a response to my instruction. ")

chain = LLMChain(llm=llm_, prompt=prompt_template)
output = chain.run("Summarize the company's performances and future plans according to the CEO's letter")
output

"\n\nHere is the original instruction: Summarize the company's performances and future plans according to the CEO's letter.\n\nPlease optimize this instruction by providing a new instruction that is more efficient, effective, or both."

In [19]:
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate

prompt_template = PromptTemplate(input_variables=["prompt"],template="You are an expert prompt engineer. Please help me improve this prompt to get a more helpful and harmless response: {prompt} \n ")

chain = LLMChain(llm=llm_, prompt=prompt_template)
output = chain.run("Summarize the company's performances and future plans according to the CEO's letter")
output

" to shareholders.\n\nI would like to know if there are any specific areas you would suggest I focus on or avoid when asking for a summary of a company's performance and future plans based on a CEO's letter to shareholders.\n\nThank you for your time and expertise!"

In [18]:
prompt_template = PromptTemplate(input_variables=["instruction"],template="You are a instruction optimizer. Help me optimize this instruction: {instruction}. Here is an example: we can optimimze 'Tell me about Harry Potter' to 'Provide a comprehensive overview of theHarry Potter franchise, including the books,movies, characters, themes, and impact. Beaccurate and informative in your response.'  ")

chain = LLMChain(llm=llm_, prompt=prompt_template)
output = chain.run("Summarize the company's performances and future plans")
output

'\n\nYou are a instruction optimizer. Help me optimize this instruction: Summarize the company\'s performances and future plans.\n\nOptimization suggestions:\n\n1. Use more specific keywords: Instead of using the generic term "company," try to use more specific keywords such as "corporation," "business," or "organization."\n2. Provide more context: Give more context to the instruction by including the company\'s industry, location, and any relevant background information.\n3. Break down the task into smaller parts: Instead of asking for a comprehensive overview of the company\'s performances and future plans, consider breaking down the task into smaller parts such as "Summarize the company\'s financial performances" or "Outline the company\'s strategic plans."\n4. Use a more conversational tone: Try to use a more conversational tone in your instruction to make it feel more natural and less robotic.\n\nHere is an optimized version of the instruction:\n\n"As a seasoned instruction optimi

In [14]:
from langchain.chains import LLMChain
from langchain.prompts import PromptTemplate

prompt_template = PromptTemplate(input_variables=["instruction","context"],template="You are a instruction optimizer. Help me optimize this instruction based on {context} : {instruction}. ")

chain = LLMChain(llm=llm_, prompt=prompt_template)
output = chain.predict(instruction = "Summarize the company's performances and future plans according to the CEO's letter", context=all_splits[start:start+3])
output

" Chevron Corporation's 2020 annual report highlights the company's resilience and agility in adjusting to extreme market conditions, while demonstrating its commitment to delivering higher returns through disciplined capital allocation and a balanced approach to portfolio management. The CEO's letter emphasizes the company's focus on reducing the carbon intensity of its operations and assets, increasing renewables and offsets, and investing in low-carbon technologies to enable commercial solutions. The letter also expresses gratitude to employees, partners, and stockholders for their support and trust. Overall, the report indicates that Chevron is well-positioned to navigate the challenges of the energy market and contribute to a lower-carbon future."

In [13]:
from langchain.chains.question_answering import load_qa_chain

prompt = """ You are a prompt optimizer. Help me optimize this instruction based on the context provided: summarize the company's performances and future plans according to the CEO's letter."""

chain = load_qa_chain(llm_, chain_type="stuff")
res = chain({"input_documents": all_splits[start:start+3], "question": prompt}, return_only_outputs=True)['output_text']
res

"  Based on the provided context, the CEO's letter highlights Chevron's financial priorities, which include growing the dividend, maintaining a strong balance sheet, reinvesting to grow future cash flows, and returning excess cash to stockholders. The CEO also mentions the company's commitment to delivering higher returns, anchored by its values and priorities, and its ability to navigate uncertain market conditions and address emerging opportunities. Additionally, the CEO notes the company's progress in advancing a lower-carbon future through reducing the carbon intensity of its operations and assets, increasing renewables and offsets, and investing in low-carbon technologies. Overall, the CEO's letter emphasizes Chevron's resilience and agility in the face of market challenges and its commitment to delivering long-term value for stockholders."