In [4]:
# hide output
%%capture output

! pip install pdfplumber
! pip install chromadb
! pip install pymilvus
! pip install sentence-transformers
! pip install langchain
! pip install pypdf

## 1. Load Data

In [5]:
import os
from google.colab import drive
# Access drive
drive.mount('/content/drive')
path = '/content/drive/MyDrive/Capstone/'


# companies
companies = os.listdir(os.path.join(path, 'Company Reports'))
for i, comp in enumerate(companies):
    print(i, ": ", comp)


# get reports
def get_reports(comp, year:int, rep_type:int = 1):
    """
    comp:       string or index
    year:       specific year or # recent year, 0 for all
    rep_type:   report type, 1 for annual report, 2 for sustainability report, 0 for both
    ret:        list of report pathes
    """
    if type(comp) == str:
        if comp not in companies:
            print("Error: ", comp, " does not exist")
            return
    elif type(comp) == int:
        if comp not in range(len(companies)):
            print("Error: invalid index")
            return
        comp = companies[comp]
    else:
        print("Error: invalid company")
        return

    file_path = os.path.join(path, 'Company Reports', comp)
    files = os.listdir(file_path)
    files.sort(reverse=True)

    years = range(2013,2023)
    if year in range(11):
        if year:
            years = years[-year:]
    else:
        years = [year]

    if rep_type == 0:
        reps = ["", "_sus"]
    elif rep_type == 1:
        reps = [""]
    elif rep_type == 2:
        reps = ["_sus"]
    else:
        print("Error: invalid report type")
        return

    ret = []
    for year in years:
        for rep in reps:
            file = comp + '_' + str(year) + rep + '.pdf'
            if file in files:
                ret.append(file)
    return [os.path.join(file_path, file) for file in ret]

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
0 :  ExxonMobil
1 :  Shell plc
2 :  BP PLC
3 :  Saudi Aramco
4 :  Chevron
5 :  TotalEnergies
6 :  Valero Energy
7 :  Marathon Petroleum Corporation
8 :  Sinopec
9 :  PetroChina


In [3]:
file = get_reports(0, 2022, 1)
file

['/content/drive/MyDrive/Capstone/Company Reports/ExxonMobil/ExxonMobil_2022.pdf']

## 2. Load and Split

In [4]:
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

loader = PyPDFLoader(file[0])
text_splitter = RecursiveCharacterTextSplitter(chunk_size = 1500, chunk_overlap = 0)
all_splits = loader.load_and_split(text_splitter)
all_splits

[Document(page_content='2022Annual  \nReport', metadata={'source': '/content/drive/MyDrive/Capstone/Company Reports/ExxonMobil/ExxonMobil_2022.pdf', 'page': 0}),
 Document(page_content='See Cautionary Statement on Pa ge 139 for important information re gardin g forward-lookin g statements and terms used in this report.Abo ut th e cover:Our work to meet society’s needs is enabled by sites such as our Baytown, Texas, operations, amon g the world’s\nlargest inte grated and most technolo gically advanced refinin g and petrochemical complexes .Financial and operating performance significantly led peers1\nContinuing to be an industry leader in safety5EXXON MOBIL CORPORATION   | 2 022 ANNUAL REPOR T\nEarnings \n$56BCash flow from operations  \n$77B\nshareholder distributions$30B\nROCE with $23B in capex3 25%Structural cost savings2 \n$7B\ntotal shareholder return4 87%\nproduction growth in the\nPermian Basin and Guyan a >30%elimination of routine flarin g in \nPermian operated asset s7 10 0 %

## 3. Model - T5

In [2]:
from langchain.llms import HuggingFacePipeline
from transformers import AutoTokenizer, pipeline, AutoModelForSeq2SeqLM

model_id = 'google/flan-t5-xxl'
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForSeq2SeqLM.from_pretrained(model_id)

pipe = pipeline(
    "text2text-generation",
    model = model,
    tokenizer = tokenizer,
    max_length = 2000
)

llm = HuggingFacePipeline(pipeline = pipe)

Downloading (…)okenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

Downloading spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/674 [00:00<?, ?B/s]

Downloading (…)fetensors.index.json:   0%|          | 0.00/53.0k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/5 [00:00<?, ?it/s]

Downloading (…)of-00005.safetensors:   0%|          | 0.00/9.45G [00:00<?, ?B/s]

Downloading (…)of-00005.safetensors:   0%|          | 0.00/9.60G [00:00<?, ?B/s]

Downloading (…)of-00005.safetensors:   0%|          | 0.00/9.96G [00:00<?, ?B/s]

Downloading (…)of-00005.safetensors:   0%|          | 0.00/10.0G [00:00<?, ?B/s]

Downloading (…)of-00005.safetensors:   0%|          | 0.00/6.06G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

#### Refine

In [None]:
from langchain.chains.summarize import load_summarize_chain
from langchain.prompts import PromptTemplate

prompt_template = """Write a detailed summary of the following: \n {text} \n SUMMARY:"""
prompt = PromptTemplate.from_template(prompt_template)

refine_template = (
    "Your job is to produce a final detailed summary on how the company performed in the year with supportive data \n"
    "We have provided an existing summary up to a certain point: {existing_answer}\n"
    "We have the opportunity to refine the existing summary with the content below \n"
    "------------\n"
    "{text}\n"
    "------------\n"
    "Given the new context, refine the original summary"
)

refine_prompt = PromptTemplate.from_template(refine_template)
chain = load_summarize_chain(
    llm=llm,
    chain_type="refine",
    question_prompt=prompt,
    refine_prompt=refine_prompt,
    return_intermediate_steps=True,
    input_key="input_documents",
    output_key="output_text",
)

result = chain({"input_documents": all_splits[:20]}, return_only_outputs=True)

In [None]:
result

{'intermediate_steps': ['The report is a comprehensive review of the progress made by the Government in implementing the National Development Plan (NDP) and the National Economic and Development Strategy (NEDS) in the period from 2010 to 2022.',
  'Increased supply of reliable energy and essential productsAccelerated lower-emission opportunities6&',
  'Increased supply of reliable energy and essential productsAccelerated lower-emission opportunities6&',
  'a nd a dvanc ed recyclin g facility, capable o f processin g more than 8 0million pounds o f plastic waste per year . Our ability to si gnificantly improve our cost structure and profitably grow our businesses helped improve earnings resiliency and fortify our balance sheet while increasin g flexibility to navigate future down cycles. And through it all, we s hared our success wit h shareholders, rec laimin g',
  'a nd a dvanc ed recyclin g facility, capable o f processin g more than 8 0million pounds o f plastic waste per year . Our

In [None]:
result['output_text']

'We are an industry leader in liquefied natural gas (LNG), a key component to reducinggreenhouse gas emissions in the global energy mix. We participated in the production o f 81 million metric tons per year o f LNG – accounting for one- fifth of global demand. The Coral South Floating LNG development o ffshore Mozambique began production in October, contributing additional supply o f cleaner-burning, reliable, and transportable LNG at a time when it was great ly nee ded in European an d other internationa l mar kets. Upstream production growth in Permian and Guyana versus 2021>30% While achieving record production in 2022, we made progress toward our 2030 Scope 1 and 2 net-zero goal for our Permian Basin unconventional operated assets. During the year, we eliminated routine flaring7 and continued work to electrify operations with lower-emission power generated from wind, solar, and natural gas.'

#### Map-Reduce

In [None]:
from langchain.chains.summarize import load_summarize_chain
from langchain.prompts import PromptTemplate

map_prompt = """Write a summary of the following: /n {text} /n SUMMARY:"""
map_prompt_template = PromptTemplate(template=map_prompt, input_variables=["text"])

combine_prompt = """Given a list of summaries, combine them and tell me how the company performed in a detailed summary: /n {text} /n SUMMARY:"""
combine_prompt_template = PromptTemplate(template=combine_prompt, input_variables=["text"])

In [None]:
summary_chain = load_summarize_chain(llm=llm,
                                     chain_type='map_reduce',
                                     map_prompt=map_prompt_template,
                                     combine_prompt=combine_prompt_template,
                                     verbose=True)

In [None]:
output = summary_chain.run(all_splits[:20])



[1m> Entering new MapReduceDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mWrite a summary of the following: /n 2022Annual  
Report /n SUMMARY:[0m
Prompt after formatting:
[32;1m[1;3mWrite a summary of the following: /n See Cautionary Statement on Pa ge 139 for important information re gardin g forward-lookin g statements and terms used in this report.Abo ut th e cover:Our work to meet society’s needs is enabled by sites such as our Baytown, Texas, operations, amon g the world’s
largest inte grated and most technolo gically advanced refinin g and petrochemical complexes .Financial and operating performance significantly led peers1
Continuing to be an industry leader in safety5EXXON MOBIL CORPORATION   | 2 022 ANNUAL REPOR T
Earnings 
$56BCash flow from operations  
$77B
shareholder distributions$30B
ROCE with $23B in capex3 25%Structural cost savings2 
$7B
total shareholder return4 87%
production growth in the
Permian B

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (3342 > 1024). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (2905 > 512). Running this sequence through the model will result in indexing errors




[1m> Entering new StuffDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mGiven a list of summaries, combine them and tell me how the company performed in a detailed concise summary: /n The Annual Report for the year ended 31 December 2022 is available in English, French, Spanish, German, Italian, Portuguese, Chinese, Arabic, Russian, Arabic, Chinese, Korean, Japanese, Vietnamese, Thai, Arabic, Chinese, Korean, Japanese, Vietnamese, Thai, Arabic, Chinese, Korean, Japanese, Vietnamese, Thai, Arabic, Chinese, Korean, Japanese, Vietnamese, Thai, Arabic, Chinese, Korean, Japanese, Vietnamese, Thai, Arabic, Chinese, Korean, Japanese, Vietnamese, Thai, Arabic, Chinese, Korean, Japanese, Vietnamese, Thai, Arabic, Chinese, Korean, Japanese, Vietnamese, Thai, Arabic, Chinese, Korean, Japanese, Vietnamese, Thai, Arabic, Chinese, Korean, Japanese, Vietnamese, Thai, Arabic, Chinese, Korean, Japanese, Vietnamese, Thai, Arabic, Chinese, Ko

In [None]:
output

'In 2022, we continued to invest in our businesses, delivering strong earnings growth and a strong balance sheet. ExxonMobil’s energy security, In 2022, we continued to expand our global operations, including the development of the Liza Unity FPSO in Guyana. Our Upstream business works each day to provide reliable and affordable energy solutions. We continuously innovate and invest, using industry-leading technology and processes to safely increase oil and natural gas production to meet the needs of a growing and changing global population. In 2022, we continued to strengthen and actively manage our industry-leading port folio o f strategic projects w hile pro ducing 3.7 mi llion oi l-equiva lent barrels per day. In t he Permian Basin, we maximized the value o f our large acreage position through technology and the scale o f our integrated operations, increasing production by nearly 90,000 oil-equiva lent barrels per day year-over-year while making progress on reducing greenhouse gas e

## 4. Improve

#### Refine

In [None]:
from langchain.chains.summarize import load_summarize_chain
from langchain.prompts import PromptTemplate

prompt_template = """Summarize the key highlights and findings of the company's 2022 annual report, including financial performance, strategic initiatives, market position, challenges faced, and future outlook: \n {text} \n SUMMARY:"""
prompt = PromptTemplate.from_template(prompt_template)

refine_template = (
    "Your job is to provides a comprehensive overview of the 2022 annual report \n"
    "We have provided an existing summary up to a certain point: {existing_answer}\n"
    "We have the opportunity to refine the existing summary with the content below \n"
    "------------\n"
    "{text}\n"
    "------------\n"
    "Improve the clarity and coherence of the summary, ensuring it flows logically. Add specific data and figures from the annual report where available, and ensure the summary is concise and focused."
)

refine_prompt = PromptTemplate.from_template(refine_template)
chain = load_summarize_chain(
    llm=llm,
    chain_type="refine",
    question_prompt=prompt,
    refine_prompt=refine_prompt,
    return_intermediate_steps=True,
    input_key="input_documents",
    output_key="output_text",
)

result = chain({"input_documents": all_splits[:10]}, return_only_outputs=True)

In [None]:
result['output_text']

'By evo lving our operating mo del and conso lidating into three core businesses – U pstream, Pro duct So lutions, an d Low Carbon Solutions – we leveraged our advantagesof technolo gy, scale, inte gration, and world-class employees to improve effectiveness, efficiency, and earnin gs resiliency. The positive impact of these chan ges is clearly seen in our results: Our North America refineries collectively delivered their best-ever annual throu ghput ;8 we achieved record production in our Permian o perations; and we significantly increased volumes in Guyana to help ease substantial shorta ges. In addition, we increased earnings to $56billion, well ahead of our peers .1 We a lso increase d planned investments in lower-emission initiatives to approximately $17 billion from 2022 throu gh 2027 and made great strides to lower the emissions intensit y of our o perated assets. Low Carbon Solutions signed a first-of-its-kind agreement to capture, transport, and store carbon dioxide from a refi

#### Map-Reduce

In [None]:
loader = PyPDFLoader(file[0])
text_splitter = RecursiveCharacterTextSplitter(chunk_size = 1000, chunk_overlap = 0)
all_splits = loader.load_and_split(text_splitter)

In [None]:
from langchain.chains.summarize import load_summarize_chain
from langchain.prompts import PromptTemplate

map_prompt = """Summarize the key highlights and findings of the company's annual report for 2022, including financial performance, strategic initiatives, market position, challenges faced, and future outlook, if any: /n {text} /n SUMMARY:"""
map_prompt_template = PromptTemplate(template=map_prompt, input_variables=["text"])

combine_prompt = """Generate a structured summary of the company's annual report for 2022, incorporating the information on revenue and financial performance, expenses and cost management, operations and productivity, market position and competition, strategic initiatives, challenges and risks, governance and compliance, and sustainability and social responsibility, if any: /n {text} /n SUMMARY:"""
combine_prompt_template = PromptTemplate(template=combine_prompt, input_variables=["text"])

summary_chain = load_summarize_chain(llm=llm,
                                     chain_type='map_reduce',
                                     map_prompt=map_prompt_template,
                                     combine_prompt=combine_prompt_template,
                                     verbose=True)

output = summary_chain.run(all_splits[:10])



[1m> Entering new MapReduceDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mSummarize the key highlights and findings of the company's annual report for 2022, including financial performance, strategic initiatives, market position, challenges faced, and future outlook, if any: /n 2022Annual  
Report /n SUMMARY:[0m
Prompt after formatting:
[32;1m[1;3mSummarize the key highlights and findings of the company's annual report for 2022, including financial performance, strategic initiatives, market position, challenges faced, and future outlook, if any: /n See Cautionary Statement on Pa ge 139 for important information re gardin g forward-lookin g statements and terms used in this report.Abo ut th e cover:Our work to meet society’s needs is enabled by sites such as our Baytown, Texas, operations, amon g the world’s
largest inte grated and most technolo gically advanced refinin g and petrochemical complexes .Financial and opera

In [None]:
output

'In 2022, we continued to deliver on our commitment to deliver the energy and products society needs while reducing our own and others’ greenhouse gas emissions. /n Letter to shareholder s By evo lving our operating mo del and conso lidating into three core businesses – U pstream, Pro duct So lutions, an d Low Carbon Solutions – we leveraged our advantagesof technolo gy, scale, inte gration, and world-class employees to improve effectiveness, efficiency, and earnin gs resiliency. The positive impact of these chan ges is clearly seen in our results: Our North America refineries collectively delivered their best-ever annual throu ghput ;8 we achieved record production in our Permian o perations; and we significantly increased volumes in Guyana to help ease substantial shorta ges. In addition, we increased earnings to $56billion, well ahead of our peers .1 We a lso increase d planned investments in lower-emission initiatives to approximately $17 billion from 2022 throu gh 2027 and made gr

## 5. Test on Other Document

#### Shell PLC

In [None]:
file = get_reports(1, 2022, 1)
file

['/content/drive/MyDrive/Capstone/Company Reports/Shell plc/Shell plc_2022.pdf']

In [None]:
loader = PyPDFLoader(file[0])
text_splitter = RecursiveCharacterTextSplitter(chunk_size = 1000, chunk_overlap = 0)
all_splits = loader.load_and_split(text_splitter)
all_splits

[Document(page_content='Annual Report and Accounts for the year ended December 31, 2022 Shell plc\n#PoweringProgressPowering \nProgressShell plc\nAnnual Report \nand Accounts\nfor the year ended \nDecember 31, 2022', metadata={'source': '/content/drive/MyDrive/Capstone/Company Reports/Shell plc/Shell plc_2022.pdf', 'page': 0}),
 Document(page_content="Contents\nIntroduction\niii About this Report\niv Terms and abbreviations\nStrategic Report2 Chair's message\n4 Chief Executive Officer's review\n6 Powering Progress strategy\n15 Risk factors\n27 Progress on strategy – year in review\n27 Performance indicators\n29 Generating shareholder value\n29 Group results\n31 Financial framework\n35 Market overview\n38 Integrated Gas\n44 Upstream\n52 Oil and gas information\n60 Marketing\n65 Chemicals and Products\n73 Renewables and Energy Solutions\n77 Corporate\n78 Our journey to net zero\n106 Respecting nature\n112 Powering lives\n121 Safety\n125 Principal decisions & stakeholders (Section \n172(1

In [None]:
from langchain.chains.summarize import load_summarize_chain
from langchain.prompts import PromptTemplate

prompt_template = """Summarize the key highlights and findings of the company's 2022 annual report, including financial performance, strategic initiatives, market position, challenges faced, and future outlook, if any: \n {text} \n SUMMARY:"""
prompt = PromptTemplate.from_template(prompt_template)

refine_template = (
    "Your job is to provides a comprehensive overview of the 2022 annual report \n"
    "We have provided an existing summary up to a certain point: {existing_answer}\n"
    "We have the opportunity to refine the existing summary with the content below \n"
    "------------\n"
    "{text}\n"
    "------------\n"
    "Improve the clarity and coherence of the summary, ensuring it flows logically. Add specific data and figures from the annual report where available, and ensure the summary is concise and focused."
)

refine_prompt = PromptTemplate.from_template(refine_template)
chain = load_summarize_chain(
    llm=llm,
    chain_type="refine",
    question_prompt=prompt,
    refine_prompt=refine_prompt,
    return_intermediate_steps=True,
    input_key="input_documents",
    output_key="output_text",
)

result = chain({"input_documents": all_splits[:20]}, return_only_outputs=True)

In [None]:
result['output_text']

'The Shell plc Annual Report and Accounts for the year ended December 31, 2022 is the 88th annual report of the company.'

In [None]:
from langchain.chains.summarize import load_summarize_chain
from langchain.prompts import PromptTemplate

map_prompt = """Summarize the key highlights and findings of the company's annual report for 2022, including financial performance, strategic initiatives, market position, challenges faced, and future outlook, if any: /n {text} /n SUMMARY:"""
map_prompt_template = PromptTemplate(template=map_prompt, input_variables=["text"])

combine_prompt = """Generate a structured summary of the company's annual report for 2022, incorporating the information on revenue and financial performance, expenses and cost management, operations and productivity, market position and competition, strategic initiatives, challenges and risks, governance and compliance, and sustainability and social responsibility, if any: /n {text} /n SUMMARY:"""
combine_prompt_template = PromptTemplate(template=combine_prompt, input_variables=["text"])

summary_chain = load_summarize_chain(llm=llm,
                                     chain_type='map_reduce',
                                     map_prompt=map_prompt_template,
                                     combine_prompt=combine_prompt_template,
                                     verbose=True)

output = summary_chain.run(all_splits[:20])



[1m> Entering new MapReduceDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mSummarize the key highlights and findings of the company's annual report for 2022, including financial performance, strategic initiatives, market position, challenges faced, and future outlook, if any: /n Annual Report and Accounts for the year ended December 31, 2022 Shell plc
#PoweringProgressPowering 
ProgressShell plc
Annual Report 
and Accounts
for the year ended 
December 31, 2022 /n SUMMARY:[0m
Prompt after formatting:
[32;1m[1;3mSummarize the key highlights and findings of the company's annual report for 2022, including financial performance, strategic initiatives, market position, challenges faced, and future outlook, if any: /n Contents
Introduction
iii About this Report
iv Terms and abbreviations
Strategic Report2 Chair's message
4 Chief Executive Officer's review
6 Powering Progress strategy
15 Risk factors
27 Progress on strategy – y

In [None]:
output

'The report covers the period from 1 January to 31 December 2022. The report is a comprehensive overview of the company\'s activities and performance in the year. The Shell plc Annual Report serves as the Annual Report and Accounts in accordance with UK requirements for the year ended December 31, 2022, for Shell plc (the "Company") and its subsidiaries (collectively referred to as "Shell"). This Report presents the Consolidated Financial Statements of Shell (pages 237-307), the Parent Company Financial Statements of Shell (pages 340-349) and the Financial Statements of the Royal Dutch Shell Dividend Access Trust (pages 352-356 ). Except for these Financial Statements, the numbers presented throughout this Report may not sum precisely to the totals provided and percentages may not precisely reflect the absolute figures due to rounding. The Consolidated Financial Statements have been prepared in accordance with The following table sets out the key financial highlights for the year ended

#### BP PLC

In [None]:
file = get_reports(2, 2022, 1)
file

['/content/drive/MyDrive/Capstone/Company Reports/BP PLC/BP PLC_2022.pdf']

In [None]:
loader = PyPDFLoader(file[0])
text_splitter = RecursiveCharacterTextSplitter(chunk_size = 1000, chunk_overlap = 0)
all_splits = loader.load_and_split(text_splitter)
all_splits

In [None]:
from langchain.chains.summarize import load_summarize_chain
from langchain.prompts import PromptTemplate

prompt_template = """Summarize the key highlights and findings of the company's 2022 annual report, including financial performance, strategic initiatives, market position, challenges faced, and future outlook, if any: \n {text} \n SUMMARY:"""
prompt = PromptTemplate.from_template(prompt_template)

refine_template = (
    "Your job is to provides a comprehensive overview of the 2022 annual report \n"
    "We have provided an existing summary up to a certain point: {existing_answer}\n"
    "We have the opportunity to refine the existing summary with the content below \n"
    "------------\n"
    "{text}\n"
    "------------\n"
    "Improve the clarity and coherence of the summary, ensuring it flows logically. Add specific data and figures from the annual report where available, and ensure the summary is concise and focused."
)

refine_prompt = PromptTemplate.from_template(refine_template)
chain = load_summarize_chain(
    llm=llm,
    chain_type="refine",
    question_prompt=prompt,
    refine_prompt=refine_prompt,
    return_intermediate_steps=True,
    input_key="input_documents",
    output_key="output_text",
)

result = chain({"input_documents": all_splits[:20]}, return_only_outputs=True)

In [None]:
result['output_text']

'bp is performing while transforming Our three-pillar strategy is unchanged – it is focused on investing in our transition growth engines and, at the same time, investing in today’s energy system. And integration connects it all.'

In [None]:
from langchain.chains.summarize import load_summarize_chain
from langchain.prompts import PromptTemplate

map_prompt = """Summarize the key highlights and findings of the company's annual report for 2022, including financial performance, strategic initiatives, market position, challenges faced, and future outlook, if any: /n {text} /n SUMMARY:"""
map_prompt_template = PromptTemplate(template=map_prompt, input_variables=["text"])

combine_prompt = """Generate a structured summary of the company's annual report for 2022, incorporating the information on revenue and financial performance, expenses and cost management, operations and productivity, market position and competition, strategic initiatives, challenges and risks, governance and compliance, and sustainability and social responsibility, if any: /n {text} /n SUMMARY:"""
combine_prompt_template = PromptTemplate(template=combine_prompt, input_variables=["text"])

summary_chain = load_summarize_chain(llm=llm,
                                     chain_type='map_reduce',
                                     map_prompt=map_prompt_template,
                                     combine_prompt=combine_prompt_template,
                                     verbose=True)

output = summary_chain.run(all_splits[:10])

In [None]:
output

"bp’s 2022 annual report is a comprehensive report on the company’s performance and strategy. The report covers the company's financial performance, strategic initiatives, market position, challenges faced, and future outlook. The bp annual report for 2022 is a comprehensive report on the company's activities, performance and governance. bp shareholders divided by total equity finance debt a 4Q 2022 vs 4Q 2021 growth in dividend per ordinary share. b Share buybacks announced from 2022 surplus cash flow . The report covers the period from 1 January 2022 to 31 December 2022. bp-operated upstream plant reliability (2021 94.0%) bp-operated refining availability (2021 94.8%) /n The group’s strategy is to be a leading global energy company, delivering value to shareholders through safe, reliable and efficient energy. /n"

#### Saudi Aramco

In [6]:
file = get_reports(3, 2022, 1)
file

['/content/drive/MyDrive/Capstone/Company Reports/Saudi Aramco/Saudi Aramco_2022.pdf']

In [7]:
from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

loader = PyPDFLoader(file[0])
text_splitter = RecursiveCharacterTextSplitter(chunk_size = 1000, chunk_overlap = 0)
all_splits = loader.load_and_split(text_splitter)
all_splits

[Document(page_content='annual report 2022', metadata={'source': '/content/drive/MyDrive/Capstone/Company Reports/Saudi Aramco/Saudi Aramco_2022.pdf', 'page': 0}),
 Document(page_content='Investing in growth  \nInnovating for sustainability\nWhy have we embarked on our largest  \ncapital expenditure program ever?\nBecause we believe the world’s need for affordable, \nreliable, and sustainable energy will continue to grow, \nand we plan to grow our lower carbon intensity  \ncrude oil production with it.\nBecause increasing our gas production will meet \ngrowing domestic demand, expanding our  \nliquids-to-chemicals capabilities de-risks our  \nUpstream production, and using innovation  \nand technology helps us find potential new  \npathways to a lower-carbon future.\nAnd because we support a practical, stable  \nand inclusive energy transition.\nCover image:  \nKhurais Facility, Saudi ArabiaThis Annual Report covers financial and operational aspects of Aramco and is issued in both Arab

In [None]:
from langchain.chains.summarize import load_summarize_chain
from langchain.prompts import PromptTemplate

prompt_template = """Summarize the key highlights and findings of the company's 2022 annual report, including financial performance, strategic initiatives, market position, challenges faced, and future outlook, if any: \n {text} \n SUMMARY:"""
prompt = PromptTemplate.from_template(prompt_template)

refine_template = (
    "Your job is to provides a comprehensive overview of the 2022 annual report \n"
    "We have provided an existing summary up to a certain point: {existing_answer}\n"
    "We have the opportunity to refine the existing summary with the content below \n"
    "------------\n"
    "{text}\n"
    "------------\n"
    "Improve the clarity and coherence of the summary, ensuring it flows logically. Add specific data and figures from the annual report where available, and ensure the summary is concise and focused."
)

refine_prompt = PromptTemplate.from_template(refine_template)
chain = load_summarize_chain(
    llm=llm,
    chain_type="refine",
    question_prompt=prompt,
    refine_prompt=refine_prompt,
    return_intermediate_steps=True,
    input_key="input_documents",
    output_key="output_text",
)

result = chain({"input_documents": all_splits[:20]}, return_only_outputs=True)

In [None]:
result['output_text']

'Aramco’s vision is to be the world’s preeminent integrated energy and chemicals company, operating in a safe, sustainable and reliable manner.'

In [None]:
from langchain.chains.summarize import load_summarize_chain
from langchain.prompts import PromptTemplate

map_prompt = """Summarize the key highlights and findings of the company's annual report for 2022, including financial performance, strategic initiatives, market position, challenges faced, and future outlook, if any: /n {text} /n SUMMARY:"""
map_prompt_template = PromptTemplate(template=map_prompt, input_variables=["text"])

combine_prompt = """Generate a structured summary of the company's annual report for 2022, incorporating the information on revenue and financial performance, expenses and cost management, operations and productivity, market position and competition, strategic initiatives, challenges and risks, governance and compliance, and sustainability and social responsibility, if any: /n {text} /n SUMMARY:"""
combine_prompt_template = PromptTemplate(template=combine_prompt, input_variables=["text"])

summary_chain = load_summarize_chain(llm=llm,
                                     chain_type='map_reduce',
                                     map_prompt=map_prompt_template,
                                     combine_prompt=combine_prompt_template,
                                     verbose=True)

output = summary_chain.run(all_splits[:20])



[1m> Entering new MapReduceDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mSummarize the key highlights and findings of the company's annual report for 2022, including financial performance, strategic initiatives, market position, challenges faced, and future outlook, if any: /n annual report 2022 /n SUMMARY:[0m
Prompt after formatting:
[32;1m[1;3mSummarize the key highlights and findings of the company's annual report for 2022, including financial performance, strategic initiatives, market position, challenges faced, and future outlook, if any: /n Investing in growth  
Innovating for sustainability
Why have we embarked on our largest  
capital expenditure program ever?
Because we believe the world’s need for affordable, 
reliable, and sustainable energy will continue to grow, 
and we plan to grow our lower carbon intensity  
crude oil production with it.
Because increasing our gas production will meet 
growing domestic

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (2900 > 1024). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (2943 > 512). Running this sequence through the model will result in indexing errors




[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mGenerate a structured summary of the company's annual report for 2022, incorporating the information on revenue and financial performance, expenses and cost management, operations and productivity, market position and competition, strategic initiatives, challenges and risks, governance and compliance, and sustainability and social responsibility, if any: /n The annual report for 2022 is available in English, French, German, Spanish, Italian, Dutch, Swedish, Finnish, Norwegian, Danish, Swedish, Finnish, Norwegian, Swedish, Danish, Swedish, Swedish, Swedish, Swedish, Swedish, Swedish, Swedish, Swedish, Swedish, Swedish, Swedish, Swedish, Swedish, Swedish, Swedish, Swedish, Swedish, Swedish, Swedish, Swedish, Swedish, Swedish, Swedish, Swedish, Swedish, Swedish, Swedish, Swedish, Swedish, Swedish, Swedish, Swedish, Swedish, Swedish, Swedish, Swedish, Swedish, Swedish, Swedish, Swedish, Swedish, Swedish, Swedi

In [None]:
output

'Saudi Aramco’s 2022 Annual Report highlights the company’s growth and innovation, and its commitment to a sustainable energy future. The report covers the period from January 1, 2022 to December 31, 2022. Saudi Aramco’s vision is to be the world’s preeminent integrated energy and chemicals company, operating in a safe, sustainable and reliable manner. Saudi Aramco’s 2022 highlights ............................................. Business model ............................................. Aramco’s operations .................................... Business overview ........................................ Market overview ................................................... Strategy .......................................................... 21 2. Results and performance Key 2022 metrics .......................................... CFO’s message .............................................. Financial performance ................................. Upstream .......................................

### Try Map reduce in more pages

In [10]:
all_splits[20:30]

[Document(page_content='At the same time, we are continuing to build  \nour Downstream business into a world-class \noperation. In 2022 we progressed with refining \nand chemicals projects in Saudi Arabia and Asia, \nwhile also expanding our presence in Europe.\nAramco’s Downstream strategy reflects our  \nbelief that a significant share of future oil \ndemand will come from petrochemicals, not  \nleast from products which are essential inputs  \nfor the energy transition. We estimate, for \nexample, that eight to 11 tons of chemical \nproducts are required to produce one megawatt  \nof renewable energy, on average. Aramco, for  \nits part, is preparing for this future by expanding \nthe integration of our refining and chemicals \nfacilities, and investing in large-scale liquids-to-\nchemicals projects. \nInvesting in the Kingdom’s economic \ndevelopment\nIn 2022 we continued to scale our investments  \nin the Kingdom’s industrial economy, which  \neach year grows more diversified and 

In [11]:
from langchain.chains.summarize import load_summarize_chain
from langchain.prompts import PromptTemplate

map_prompt = """Summarize the key highlights and findings of the company's annual report for 2022, including financial performance, strategic initiatives, market position, challenges faced, and future outlook, if any: /n {text} /n SUMMARY:"""
map_prompt_template = PromptTemplate(template=map_prompt, input_variables=["text"])

combine_prompt = """Generate a structured summary of the company's annual report for 2022, incorporating the information on revenue and financial performance, expenses and cost management, operations and productivity, market position and competition, strategic initiatives, challenges and risks, governance and compliance, and sustainability and social responsibility, if any: /n {text} /n SUMMARY:"""
combine_prompt_template = PromptTemplate(template=combine_prompt, input_variables=["text"])

summary_chain = load_summarize_chain(llm=llm,
                                     chain_type='map_reduce',
                                     map_prompt=map_prompt_template,
                                     combine_prompt=combine_prompt_template,
                                     verbose=True)

output = summary_chain.run(all_splits[30:40])



[1m> Entering new MapReduceDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mSummarize the key highlights and findings of the company's annual report for 2022, including financial performance, strategic initiatives, market position, challenges faced, and future outlook, if any: /n In Europe, we closed transactions to acquire  
a 30% stake in a refinery in Poland, sole 
ownership of an associated wholesale business, 
and a 50% stake in a jet fuel marketing joint 
venture. Through these transactions we 
expanded our downstream European presence 
and increased our integrated refining and 
petrochemicals capacity.
In August 2022, we signed an equity purchase 
agreement for Valvoline’s global products 
business. This acquisition transaction, which  
is expected to complement Aramco’s own line  
of premium-branded lubricant products, closed 
in March 2023.
Here in the Kingdom, we have made a final 
investment decision with our par

Token indices sequence length is longer than the specified maximum sequence length for this model (570 > 512). Running this sequence through the model will result in indexing errors



[1m> Finished chain.[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mGenerate a structured summary of the company's annual report for 2022, incorporating the information on revenue and financial performance, expenses and cost management, operations and productivity, market position and competition, strategic initiatives, challenges and risks, governance and compliance, and sustainability and social responsibility, if any: /n In 2022, we continued to expand our global presence and increase our integrated refining and petrochemicals capacity.

Aramco’s strategy is to leverage the significant potential of our products to meet rising global demand for petrochemicals, which we believe will be critical to the materials transition required to support a lower-carbon future. Advancing our net-zero ambition We see the world’s shift towards a lower-carbon energy future as not only an obligation, but also an opportunity. Indeed, as one of the more reliable su

In [12]:
output

'Saudi Aramco’s (Aramco) annual report for 2022 highlights the Company’s achievements in the past year. Aramco’s financial performance in 2022 was characterized by strong cash flow generation, a reduction in debt, and a strong balance sheet. Aramco signed a share purchase agreement with the Government of the Kingdom of Saudi Arabia to raise SAR 2.8 billion ($0.6 billion) in cash.'

## Vicuna

In [None]:
from langchain.llms import HuggingFacePipeline
from transformers import AutoTokenizer, pipeline, AutoModelForSeq2SeqLM, AutoModelForCausalLM

model_id = 'lmsys/vicuna-13b-v1.5-16k'
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id)

pipe = pipeline(
    "text-generation",
    model = model,
    tokenizer = tokenizer,
    max_length = 2000
)

vicuna = HuggingFacePipeline(pipeline = pipe)