In [1]:
# hide output
%%capture output

! pip install pdfplumber
! pip install chromadb
! pip install pymilvus
! pip install sentence-transformers
! pip install langchain
! pip install pypdf
! pip install faiss-gpu
! pip install happytransformer

In [2]:
! pip install faiss-gpu
! pip install happytransformer



In [3]:
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS

# 1. Load Data

In [4]:
import os
from google.colab import drive
# Access drive
drive.mount('/content/drive')
path = '/content/drive/MyDrive/Capstone/'


# companies
companies = os.listdir(os.path.join(path, 'Company Reports'))
for i, comp in enumerate(companies):
    print(i, ": ", comp)


# get reports
def get_reports(comp, year:int, rep_type:int = 1):
    """
    comp:       string or index
    year:       specific year or # recent year, 0 for all
    rep_type:   report type, 1 for annual report, 2 for sustainability report, 0 for both
    ret:        list of report pathes
    """
    if type(comp) == str:
        if comp not in companies:
            print("Error: ", comp, " does not exist")
            return
    elif type(comp) == int:
        if comp not in range(len(companies)):
            print("Error: invalid index")
            return
        comp = companies[comp]
    else:
        print("Error: invalid company")
        return

    file_path = os.path.join(path, 'Company Reports', comp)
    files = os.listdir(file_path)
    files.sort(reverse=True)

    years = range(2013,2023)
    if year in range(11):
        if year:
            years = years[-year:]
    else:
        years = [year]

    if rep_type == 0:
        reps = ["", "_sus"]
    elif rep_type == 1:
        reps = [""]
    elif rep_type == 2:
        reps = ["_sus"]
    else:
        print("Error: invalid report type")
        return

    ret = []
    for year in years:
        for rep in reps:
            file = comp + '_' + str(year) + rep + '.pdf'
            if file in files:
                ret.append(file)
    return [os.path.join(file_path, file) for file in ret]

Mounted at /content/drive
0 :  ExxonMobil
1 :  Shell plc
2 :  BP PLC
3 :  Saudi Aramco
4 :  Chevron
5 :  TotalEnergies
6 :  Valero Energy
7 :  Marathon Petroleum Corporation
8 :  Sinopec
9 :  PetroChina


In [184]:
file = get_reports(6, 2020, 1)

file = file[0]
file

'/content/drive/MyDrive/Capstone/Company Reports/Valero Energy/Valero Energy_2020.pdf'

## 2. Load and Split


In [185]:
from langchain.document_loaders import PyPDFium2Loader

loader = PyPDFium2Loader(file)
all_splits = loader.load()

## 3. Store in Vector database

In [186]:
embeddings = HuggingFaceEmbeddings()

vs_faiss = FAISS.from_documents(all_splits[:20], embeddings)

## 4. Retrive and Store Relative Contents

In [187]:
question = """Dear Shareholders"""

letter = vs_faiss.similarity_search(question, k=1)
letter

[Document(page_content='UNITED STATES\r\nSECURITIES AND EXCHANGE COMMISSION\r\nWashington, D.C. 20549\r\nFORM 10-K\r\n(Mark One)\r\n☑ ANNUAL REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934\r\nFor the fiscal year ended December 31, 2020\r\nOR\r\n☐ TRANSITION REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934\r\nFor the transition period from _______________ to _______________\r\nCommission file number 001-13175\r\nVALERO ENERGY CORPORATION\r\n(Exact name of registrant as specified in its charter)\r\nDelaware 74-1828067\r\n(State or other jurisdiction of (I.R.S. Employer\r\nincorporation or organization) Identification No.)\r\nOne Valero Way\r\nSan Antonio, Texas 78249\r\n(Address of principal executive offices) (Zip Code)\r\nRegistrant’s telephone number, including area code: (210) 345-2000\r\nSecurities registered pursuant to Section 12(b) of the Act:\r\nTitle of each class Trading Symbol(s) Name of each exchange on which registe

In [188]:
start = letter[0].metadata['page']

In [189]:
str(all_splits[start:start+3])

"[Document(page_content='UNITED STATES\\r\\nSECURITIES AND EXCHANGE COMMISSION\\r\\nWashington, D.C. 20549\\r\\nFORM 10-K\\r\\n(Mark One)\\r\\n☑ ANNUAL REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934\\r\\nFor the fiscal year ended December 31, 2020\\r\\nOR\\r\\n☐ TRANSITION REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934\\r\\nFor the transition period from _______________ to _______________\\r\\nCommission file number 001-13175\\r\\nVALERO ENERGY CORPORATION\\r\\n(Exact name of registrant as specified in its charter)\\r\\nDelaware 74-1828067\\r\\n(State or other jurisdiction of (I.R.S. Employer\\r\\nincorporation or organization) Identification No.)\\r\\nOne Valero Way\\r\\nSan Antonio, Texas 78249\\r\\n(Address of principal executive offices) (Zip Code)\\r\\nRegistrant’s telephone number, including area code: (210) 345-2000\\r\\nSecurities registered pursuant to Section 12(b) of the Act:\\r\\nTitle of each class Trading Symbo

## 5. Model

###Vicuna 7b

In [None]:
from langchain.llms import HuggingFacePipeline
from transformers import AutoTokenizer, pipeline, AutoModelForSeq2SeqLM, AutoModelForCausalLM

model_id_vicuna = "lmsys/vicuna-7b-v1.5-16k"

tokenizer_vicuna = AutoTokenizer.from_pretrained(model_id_vicuna)
model_vicuna = AutoModelForCausalLM.from_pretrained(model_id_vicuna)

pipe_vicuna = pipeline(
    "text-generation",
    model = model_vicuna,
    tokenizer = tokenizer_vicuna,
    max_length = 4000,
    temperature = 0,
    top_p = 0
)

llm = HuggingFacePipeline(pipeline = pipe_vicuna)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



###Mistral-7b

In [None]:
from langchain.llms import HuggingFacePipeline
from transformers import AutoTokenizer, pipeline, AutoModelForSeq2SeqLM, AutoModelForCausalLM

model_id_mistral = "mistralai/Mistral-7B-Instruct-v0.1"

tokenizer_mistral = AutoTokenizer.from_pretrained(model_id_mistral)
model_mistral = AutoModelForCausalLM.from_pretrained(model_id_mistral)

pipe_mistral = pipeline(
    "text-generation",
    model = model_mistral,
    tokenizer = tokenizer_mistral,
    max_length = 4000
)

llm = HuggingFacePipeline(pipeline = pipe_mistral)

### Llama 2

In [None]:
from langchain.llms import HuggingFacePipeline
from transformers import AutoTokenizer, pipeline, AutoModelForSeq2SeqLM, AutoModelForCausalLM

access_token = "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"

model_id = "meta-llama/Llama-2-7b-chat-hf"
tokenizer = AutoTokenizer.from_pretrained(model_id, token = access_token)
model = AutoModelForCausalLM.from_pretrained(model_id, token = access_token)

pipe = pipeline(
    "text-generation",
    model = model,
    tokenizer = tokenizer,
    max_length = 4096
)

llm_ = HuggingFacePipeline(pipeline = pipe)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



##6. Result

In [None]:
from langchain.chains.summarize import load_summarize_chain
from langchain.prompts import PromptTemplate

prompt = """What is the company's performances and future plans according to the CEO's letter: /n {text} /n SUMMARY:"""
prompt_template = PromptTemplate(template=prompt, input_variables=["text"])

summary_chain = load_summarize_chain(llm=llm_,
                                     chain_type='stuff',
                                     prompt=prompt_template)

output = summary_chain.run(all_splits[start:start+3])
output

In [None]:
from langchain.chains.summarize import load_summarize_chain
from langchain.prompts import PromptTemplate

prompt = """Summarize the company's performances and future plans according to the CEO's letter: /n {text} /n SUMMARY:"""
prompt_template = PromptTemplate(template=prompt, input_variables=["text"])

summary_chain = load_summarize_chain(llm=llm_,
                                     chain_type='stuff',
                                     prompt=prompt_template)

output = summary_chain.run(all_splits[start:start+3])
output

"\nThe CEO's letter highlights the importance of affordable and reliable energy in the world economy, emphasizing the need for a lower carbon energy system that balances economic prosperity, energy security, and environmental protection. The company's strategy is to leverage its strengths to deliver lower carbon energy to a growing world, with a focus on investing in high-quality assets, delivering consistent dividend growth, maintaining a strong balance sheet, and repurchasing shares. In 2022, Chevron increased investment by over 75% versus 2021, achieving the highest U.S. production in its history, and delivering a return on capital employed of over 20%. The company also returned $11.25 billion to stockholders through dividend payments and share repurchases.\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n

## 7. Compare

**For Marathon Petroleum**

**Vicuna 7b gives**

Marathon Petroleum Corporation (MPC) had a successful year in 2022, with strong financial performance and progress in sustainability initiatives. The company's three strategic pillars - strengthening the competitive position of its assets, fostering a low-cost culture, and improving commercial performance - have been foundational in driving long-term value creation. MPC's focus on sustainable energy and reducing its carbon footprint, as well as its commitment to diversity, equity, and inclusion, demonstrate its dedication to being a responsible corporate citizen. The company's achievements are a testament to the skills and dedication of its workforce, and MPC is well-positioned to create value for its stakeholders in the future.

**Mistral 7b gives**

The CEO's letter highlights the company's achievements in 2022, including meeting robust customer demand, capturing strong margin opportunities, and running the refining system at 96% utilization. The company also made significant progress on its environmental, social, and governance (ESG) efforts, including optimizing its renewable fuels production capabilities and setting a new industry record for the number of refineries certified for superior energy efficiency performance in one year.

Looking forward, the CEO outlines the company's plans to continue to develop enduring capabilities, including strengthening the competitive position of its assets, fostering a low-cost culture, and improving commercial performance. The company also plans to continue to invest in low-carbon projects, particularly around increasing its production of renewable fuels and natural gas, and to explore and develop pathways for emerging opportunities around carbon capture, utilization, and sequestration (CCUS), as well as hydrogen energy production and utilization. The CEO emphasizes the importance of the company's steadfast commitment to returning capital to its shareholders.

**Llama 2 gives**

Based on the CEO's letter, Marathon Petroleum Corporation (MPC) performed well in 2022, demonstrating significant progress in its three strategic pillars: strengthening the competitive position of its assets, fostering a low-cost culture, and improving commercial performance. MPC generated \$16.4 billion of cash from operations and returned \$12 billion to shareholders through share repurchases and dividends. The company is committed to sustainability and has set meaningful targets to reduce greenhouse gas (GHG) emissions, methane emissions, and freshwater intensity. MPC continues to invest in renewable fuels and emerging technologies and is well-positioned to create value for its stakeholders in the long term.

**or**

* MPC achieved \$16.4 billion of cash from operations in 2022 and returned nearly \$12 billion to shareholders through share repurchases and dividends.
* MPC strengthened its competitive position by advancing its Martinez Renewables facility and optimizing its Dickinson Renewables facility, and it set a new industry record for the number of refineries certified for superior energy efficiency performance through the U.S. Environmental Protection Agency’s (EPA) ENERGY STAR® program.
* MPC continued to build upon its Focus on Energy (FOE) program, which uses key performance indicators to guide energy management and promote energy efficiency across its refineries, and it led in sustainable energy by reducing its carbon footprint and conserving natural resources, investing in renewables and emerging technologies, and embedding sustainability in decision-making and stakeholder engagement.
* MPC set meaningful targets to reduce greenhouse gas (GHG) emissions, methane emissions, and freshwater intensity and expanded its existing goal for reducing methane emissions intensity across its natural gas gathering and processing business.
* MPC remains focused on being a good neighbor in its communities through meaningful relationships with key stakeholders, and it invested more than $20 million in the communities where it lives and works and in broader causes that united many of its employees.
* MPC continues to allocate significant growth capital to low-carbon projects, particularly increasing its production of renewable fuels and natural gas, and it is actively involved in public-private alliances to explore and develop pathways for emerging opportunities around carbon capture, utilization, and sequestration, as well as hydrogen energy production and utilization.
* MPC anticipates that demand recovery for its products will continue and global supply constraints will persist in 2023, and it will continue to focus on how it runs the business, always striving to be well-positioned to create value for its stakeholders regardless of market tailwinds or turbulence.
* MPC's achievements are a testament to the skills, dedication, and adaptability of its people, who are enthusiastic about the opportunities ahead for the company, and its workforce demonstrated tremendous effort in 2022 to meet demand for its products, safely maintain and operate its assets, and support many value-creating programs and strategies that are critical for its long-term success.
* MPC's President and CEO, Michael J. Hennigan, is proud to lead the company and thankful and grateful for the work its people do each day to deliver value for the business and its shareholders.

**For BP PLC**

**Vicuna 7b gives**

 What is the company's performance and future plans according to the CEO's letter?

**or**

 The CEO's letter highlights the company's performance and future plans. The company has made substantial progress in transforming from an international oil company

 **Mistral 7b gives**

 7 bp Annual Report and Form 20-F 2022
Environmental, social and governance (ESG) performance
bp’s ESG performance is reported in accordance with the Global Reporting Initiative (GRI) Standards and the Task Force on Climate-related Financial Disclosures (TCFD) Recommendations.
Environmental performance
bp’s environmental performance is reported in accordance with the GRI Standards and the TCFD Recommendations.
Carbon emissions
bp’s carbon emissions decreased by 12.5%

**Llama 2 gives**

1. The energy industry is experiencing a period of significant change, driven by the Russia-Ukraine war, the energy transition, and the need to decarbonize the energy mix.

2. bp is transforming itself into an integrated energy company, with a focus on delivering sustainable value for shareholders while meeting the world's energy needs.

3. bp's financial performance in 2022 was strong, with underlying replacement cost profit up 27.7% and underlying ROACE of 30.5%.