In [None]:
# hide output
%%capture output

! pip install pdfplumber
! pip install chromadb
! pip install pymilvus
! pip install sentence-transformers
! pip install langchain
! pip install pypdf

In [None]:
! pip install faiss-gpu

Collecting faiss-gpu
  Downloading faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (85.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.5/85.5 MB[0m [31m22.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-gpu
Successfully installed faiss-gpu-1.7.2


In [None]:
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS

## 1. Load Data

In [None]:
import os
from google.colab import drive

# Mount Google Drive so the report PDFs are reachable from the Colab runtime.
drive.mount('/content/drive')
path = '/content/drive/MyDrive/Capstone/'


# Every entry of the "Company Reports" folder is treated as one company;
# print each with the index that get_reports() accepts.
companies = os.listdir(os.path.join(path, 'Company Reports'))
for i, comp in enumerate(companies):
    print(f"{i} :  {comp}")


# get reports
def get_reports(comp, year:int, rep_type:int = 1):
    """
    Collect the paths of the report PDFs available for one company.

    comp:       company folder name (str) or its position in `companies` (int)
    year:       0 selects every year from 2013 to 2022, 1-10 selects that many
                of the most recent years, any other value is used as one
                specific year
    rep_type:   1 for annual report, 2 for sustainability report, 0 for both
    ret:        list of report paths, oldest year first (annual before
                sustainability within a year); None when an input is rejected
    """
    # Resolve `comp` to a folder name; anything unknown is reported and dropped.
    if type(comp) == int:
        if comp not in range(len(companies)):
            print("Error: invalid index")
            return
        comp = companies[comp]
    elif type(comp) == str:
        if comp not in companies:
            print("Error: ", comp, " does not exist")
            return
    else:
        print("Error: invalid company")
        return

    file_path = os.path.join(path, 'Company Reports', comp)
    available = os.listdir(file_path)

    # Values 0-10 are a "how many recent years" count; anything else is a year.
    covered_years = range(2013, 2023)
    if year not in range(11):
        wanted_years = [year]
    elif year:
        wanted_years = covered_years[-year:]
    else:
        wanted_years = covered_years

    # File-name suffix per report type: annual reports carry no suffix.
    for code, suffixes in ((0, ["", "_sus"]), (1, [""]), (2, ["_sus"])):
        if rep_type == code:
            break
    else:
        print("Error: invalid report type")
        return

    # Build every expected file name and keep the ones present on disk.
    expected_names = (
        comp + '_' + str(yr) + suffix + '.pdf'
        for yr in wanted_years
        for suffix in suffixes
    )
    return [os.path.join(file_path, name) for name in expected_names if name in available]

Mounted at /content/drive
0 :  ExxonMobil
1 :  Shell plc
2 :  BP PLC
3 :  Saudi Aramco
4 :  Chevron
5 :  TotalEnergies
6 :  Valero Energy
7 :  Marathon Petroleum Corporation
8 :  Sinopec
9 :  PetroChina


In [None]:
# Annual report (rep_type=1) for 2022 of the company listed at index 0.
report_paths = get_reports(0, 2022, 1)

# get_reports() returns None for rejected input and [] when no file matches;
# stop with a clear message instead of an opaque TypeError/IndexError.
if not report_paths:
    raise FileNotFoundError("No report found for the requested company/year")

file = report_paths[0]
file

'/content/drive/MyDrive/Capstone/Company Reports/ExxonMobil/ExxonMobil_2022.pdf'

## 2. Load and Split


In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import PyPDFium2Loader

# Read the selected report with the pdfium-based loader.
loader = PyPDFium2Loader(file)

# Separators are tried in order and the empty string matches any text, so it
# must be the last entry: with "" listed ahead of "." the "." separator could
# never be reached. The relative order of the other separators is unchanged.
# NOTE(review): if sentence-level splitting is wanted before word-level
# splitting, "." should be moved ahead of " ".
text_splitter = RecursiveCharacterTextSplitter(
    separators = ["\n\n", " ", ".", ""],
    chunk_size = 1000,
    chunk_overlap = 0
)

## 3. Store in Vector database

In [None]:
def load(file_path):
    """
    Return a FAISS vector store for the PDF at `file_path`.

    If the directory `<file_path minus its last 4 characters>/faiss` exists,
    the store saved there is loaded; otherwise the PDF is read, split with
    the notebook-level `text_splitter`, and embedded into a new store.

    file_path:  path of the report PDF (expected to end in '.pdf')
    ret:        FAISS vector store
    """
    # Drop the 4-character '.pdf' extension to get the per-report folder.
    vs_path_faiss = os.path.join(file_path[:-4], 'faiss')
    if os.path.exists(vs_path_faiss):
        return FAISS.load_local(vs_path_faiss, embeddings)

    # from_documents() needs the chunked documents themselves; the splitter
    # object that used to be passed here is not a list of documents.
    documents = text_splitter.split_documents(PyPDFium2Loader(file_path).load())
    # NOTE(review): nothing visible in this notebook writes the cached index,
    # so this branch runs every time unless the index is saved elsewhere.
    return FAISS.from_documents(documents, embeddings)

In [None]:
# Embedding model created with the wrapper's default settings.
embeddings = HuggingFaceEmbeddings()

# Load the PDF, cut it into chunks, then index the chunks in FAISS.
pages = loader.load()
data = text_splitter.split_documents(pages)
vs_faiss = FAISS.from_documents(data, embeddings)

Downloading (…)a8e1d/.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)b20bca8e1d/README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Downloading (…)0bca8e1d/config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading (…)e1d/data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading (…)a8e1d/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

Downloading (…)8e1d/train_script.py:   0%|          | 0.00/13.1k [00:00<?, ?B/s]

Downloading (…)b20bca8e1d/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)bca8e1d/modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

## 4. Retrieve and Store Relevant Contents

In [None]:
def print_doc(doc):
    """
    Print every document in `doc` with its 1-based rank and page number.

    doc:    iterable of documents exposing `metadata['page']` and `page_content`
    """
    wide_rule = '-' * 100
    short_rule = '-' * 14
    for rank, d in enumerate(doc, start=1):
        print(wide_rule)
        print(f"| {rank}. Page {d.metadata['page']} |")
        print(short_rule)
        print(d.page_content)
    # Closing rule, printed even when `doc` is empty.
    print(wide_rule)

In [None]:
def store_result(doc):
  """Return the `page_content` of every document in `doc`, in order."""
  return [d.page_content for d in doc]


In [None]:
# Search the FAISS store with the net-income question.
question = "What is the company's net income?"
income = vs_faiss.similarity_search(question)

In [None]:
# Search the FAISS store with the power-generation question.
question = "What is the company's total power generation?"
power = vs_faiss.similarity_search(question)

In [None]:
# Search the FAISS store with the major-project question.
question = "What is the company's major project delivery?"
project = vs_faiss.similarity_search(question)


In [None]:
# Search the FAISS store with the cash-flow question.
question = "What is the company's cash flow?"
cashflow = vs_faiss.similarity_search(question)

In [None]:
# Search the FAISS store with the inventories question.
question = "What is the company's inventories?"
inventory = vs_faiss.similarity_search(question)

In [None]:
# Concatenate the five result lists, keeping the order of the queries.
relative_content = [*income, *power, *project, *cashflow, *inventory]
relative_content

[Document(page_content='6,989\r\n-3,030 -170 -250\r\n3,543\r\n2021 Earnings Margins Volume/Mix Other 2022 Earnings', metadata={'source': '/content/drive/MyDrive/Capstone/Company Reports/ExxonMobil/ExxonMobil_2022.pdf', 'page': 68}),
 Document(page_content='2,257\r\n+4,370 +130 +130 +105 6,989\r\n2020 Earnings Margins Volume/Mix Other Identified Items (1) 2021 Earnings', metadata={'source': '/content/drive/MyDrive/Capstone/Company Reports/ExxonMobil/ExxonMobil_2022.pdf', 'page': 69}),
 Document(page_content='1,201\r\n+680\r\n+300 +220\r\n+860 3,259\r\n2020 Earnings Margins Volume/Mix Other Identified Items (1) 2021 Earnings', metadata={'source': '/content/drive/MyDrive/Capstone/Company Reports/ExxonMobil/ExxonMobil_2022.pdf', 'page': 72}),
 Document(page_content='3,259\r\n-220\r\n+20 +30\r\n-670\r\n2,415\r\n2021 Earnings Margins Volume/Mix Other Identified Items (1) 2022 Earnings', metadata={'source': '/content/drive/MyDrive/Capstone/Company Reports/ExxonMobil/ExxonMobil_2022.pdf', 'pag

## 5. Model: Flan-T5

In [None]:
from langchain.llms import HuggingFacePipeline
from transformers import AutoTokenizer, pipeline, AutoModelForSeq2SeqLM

# Checkpoint id used for both the tokenizer and the seq2seq model weights.
model_id = 'google/flan-t5-xxl'
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForSeq2SeqLM.from_pretrained(model_id)

# Text-to-text generation pipeline built from the objects loaded above.
# NOTE(review): the chain output in this notebook shows a "1202 > 512"
# sequence-length warning, presumably from this tokenizer — confirm that
# max_length = 2000 is intended.
pipe = pipeline(
    "text2text-generation",
    model = model,
    tokenizer = tokenizer,
    max_length = 2000
)

# Wrap the pipeline so it can be passed as `llm` to LangChain chains.
llm = HuggingFacePipeline(pipeline = pipe)

Downloading (…)okenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

Downloading spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/674 [00:00<?, ?B/s]

Downloading (…)fetensors.index.json:   0%|          | 0.00/53.0k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/5 [00:00<?, ?it/s]

Downloading (…)of-00005.safetensors:   0%|          | 0.00/9.45G [00:00<?, ?B/s]

Downloading (…)of-00005.safetensors:   0%|          | 0.00/9.60G [00:00<?, ?B/s]

Downloading (…)of-00005.safetensors:   0%|          | 0.00/9.96G [00:00<?, ?B/s]

Downloading (…)of-00005.safetensors:   0%|          | 0.00/10.0G [00:00<?, ?B/s]

Downloading (…)of-00005.safetensors:   0%|          | 0.00/6.06G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

## 6. Map-reduce Chain

In [None]:
from langchain.chains.summarize import load_summarize_chain
from langchain.prompts import PromptTemplate

# The prompts used to contain the literal text "/n" (forward slash), which was
# sent to the model verbatim; "\n" is the real line break around {text}.

# Prompt applied to each retrieved chunk on its own (map step).
map_prompt = """Summarize the key highlights and findings of the company's annual report for 2022, including net income, total power generation, major project delivery, cash flow and inventories, if any:\n{text}\nSUMMARY:"""
map_prompt_template = PromptTemplate(template=map_prompt, input_variables=["text"])

# Prompt applied to the collected per-chunk summaries (combine step).
combine_prompt = """Generate a structured summary of the company's annual report for 2022, incorporating the information on net income, total power generation, major project delivery, cash flow and inventories, if any:\n{text}\nSUMMARY:"""
combine_prompt_template = PromptTemplate(template=combine_prompt, input_variables=["text"])

summary_chain = load_summarize_chain(llm=llm,
                                     chain_type='map_reduce',
                                     map_prompt=map_prompt_template,
                                     combine_prompt=combine_prompt_template,
                                     verbose=True)

# Run the chain over the retrieved chunks; `output` holds the final summary.
output = summary_chain.run(relative_content)



[1m> Entering new MapReduceDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mSummarize the key highlights and findings of the company's annual report for 2022, including net income, total power generation, major project delivery, cash flow and inventories, if any: /n 6,989
-3,030 -170 -250
3,543
2021 Earnings Margins Volume/Mix Other 2022 Earnings /n SUMMARY:[0m
Prompt after formatting:
[32;1m[1;3mSummarize the key highlights and findings of the company's annual report for 2022, including net income, total power generation, major project delivery, cash flow and inventories, if any: /n 2,257
+4,370 +130 +130 +105 6,989
2020 Earnings Margins Volume/Mix Other Identified Items (1) 2021 Earnings /n SUMMARY:[0m
Prompt after formatting:
[32;1m[1;3mSummarize the key highlights and findings of the company's annual report for 2022, including net income, total power generation, major project delivery, cash flow and inventori

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (1067 > 1024). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1202 > 512). Running this sequence through the model will result in indexing errors




[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mGenerate a structured summary of the company's annual report for 2022, incorporating the information on net income, total power generation, major project delivery, cash flow and inventories, if any: /n The company's annual report for 2022, including net income, total power generation, major project delivery, cash flow and inventories, if any: /n

The company's annual report for 2022, including net income, total power generation, major project delivery, cash flow and inventories, if any: /n 2,257 +4,370 +130 +130 +105 6,989 2020 Earnings Margins Volume/Mix Other Identified Items (1) 2021 Earnings /n

The company's annual report for 2022, including net income, total power generation, major project delivery, cash flow and inventories, if any: /n 1,201 +680 +300 +220 +860 3,259 2020 Earnings Margins Volume/Mix Other Identified Items (1) 2021 Earnings /n

The company's annual report for 2022, including net inco

In [None]:
# Display the summary returned by the map-reduce chain.
output

"Exxon Mobil Corporation (Exxon Mobil) is a leading international energy company. Exxon Mobil is a Fortune 500 company with a market capitalization of 1 trillion. Exxon Mobil is a leading international energy company with a market capitalization of 1 trillion. The Corporation's business is conducted in a highly competitive, changing global energy business environment where decisions and risks play out over time horizons that are often decades in length. This long-term orientation underpins the Corporation's philosophy on talent development. Talent development begins with recruiting exceptional candidates and continues with individually planned experiences and training designed to facilitate broad development and a deep understanding of our business across the business cycle. Our career-oriented approach to talent development results in strong retention and an average length of service of about 30 years for our career employees. Compensation, benefits, and workplace programs support the

## 7. Refine Chain

In [None]:
from langchain.chains.summarize import load_summarize_chain
from langchain.prompts import PromptTemplate

# Initial prompt, used to summarise the first chunk. The literal "/n" that used
# to surround {text} was sent to the model verbatim; "\n" is the line break.
prompt_template = """Summarize the key highlights and findings of the company's annual report for 2022, including net income, total power generation, major project delivery, cash flow and inventories, if any:\n{text}\nSUMMARY:"""

prompt = PromptTemplate.from_template(prompt_template)

# Refinement prompt: receives the summary so far ({existing_answer}) together
# with the next chunk ({text}).
refine_template = (
    "Your job is to provides a comprehensive overview of the 2022 annual report \n"
    "We have provided an existing summary up to a certain point: {existing_answer}\n"
    "We have the opportunity to refine the existing summary with the content below \n"
    "------------\n"
    "{text}\n"
    "------------\n"
    "Improve the clarity and coherence of the summary, ensuring it flows logically. Add specific data and figures from the annual report where available, and ensure the summary is concise and focused."
)

refine_prompt = PromptTemplate.from_template(refine_template)
chain = load_summarize_chain(
    llm=llm,
    chain_type="refine",
    question_prompt=prompt,
    refine_prompt=refine_prompt,
    return_intermediate_steps=True,
    input_key="input_documents",
    output_key="output_text",
)

# Run the chain over the retrieved chunks; only the chain outputs are returned.
result = chain({"input_documents": relative_content}, return_only_outputs=True)

In [None]:
# Display the summary text stored under the chain's output key.
result['output_text']

'Our unique competitive advantages have been built over decades, bringing our shareholders exceptional results through the right strategic priorities and extraordinary execution by our employees around the world. Our five-year plan is expected to drive leading business outcomes and is a continuation of the path that delivered industry-leading performance in 2022. Corporate plan through 2027 VIII EXXON MOBIL CORPORATION | 2022 ANNUAL REPORT Our winning proposition Upstream Low Carbon Solutions Product Solutions 500K 40-50% oil-equivalent barrels of expected growth by 2027 versus 2023 reduction in Upstream greenhouse gas intensity by 203018 2X 1B volume of high-value products with differentiated performance by 2027 versus 2019 pounds per year of advanced recycling capacity expected by 2026 >10% 1B overall return on the portfolio of investments from 2022-202719 cubic feet of low-carbon hydrogen per day expected from our facility in Baytown, Tex., by 2027.'