In [None]:
# hide output
%%capture output

! pip install pdfplumber
! pip install chromadb
! pip install pymilvus
! pip install sentence-transformers
! pip install langchain
! pip install pypdf

## 1. Load Data

In [None]:
import os
from google.colab import drive

# Mount Google Drive; all project files are read from the Capstone folder.
drive.mount('/content/drive')
path = '/content/drive/MyDrive/Capstone/'


# Every entry of 'Company Reports' is treated as one company. The index is
# printed next to the name because get_reports() accepts either of them.
# NOTE(review): os.listdir() does not promise a stable order, so the index of a
# company may differ between sessions — confirm before relying on indices.
reports_root = os.path.join(path, 'Company Reports')
companies = os.listdir(reports_root)
for idx, name in enumerate(companies):
    print(idx, ": ", name)


# get reports
def get_reports(comp, year:int, rep_type:int = 1, *, base_dir=None, company_names=None):
    """
    Return the paths of the report PDFs stored for one company.

    Reports are looked up in <base_dir>/Company Reports/<company>/ and must be
    named <company>_<year>.pdf (annual) or <company>_<year>_sus.pdf
    (sustainability). Only files that actually exist are returned.

    comp:           company name (string) or its index in the company list
    year:           0 for every year 2013-2022, 1-10 for that many of the most
                    recent years of that window, any other value is taken as
                    one specific year
    rep_type:       report type, 1 for annual report, 2 for sustainability
                    report, 0 for both
    base_dir:       folder that contains 'Company Reports'; defaults to the
                    notebook-level `path`
    company_names:  list of valid company names; defaults to the notebook-level
                    `companies`
    ret:            list of report paths, ordered by ascending year (annual
                    before sustainability within a year); None, after printing
                    an error message, when an argument is invalid
    """
    # Fall back to the notebook globals only when the caller gave no override.
    if base_dir is None:
        base_dir = path
    if company_names is None:
        company_names = companies

    if isinstance(comp, str):
        if comp not in company_names:
            print("Error: ", comp, " does not exist")
            return
    # bool is a subclass of int; keep rejecting it as the exact-type check did.
    elif isinstance(comp, int) and not isinstance(comp, bool):
        if comp not in range(len(company_names)):
            print("Error: invalid index")
            return
        comp = company_names[comp]
    else:
        print("Error: invalid company")
        return

    file_path = os.path.join(base_dir, 'Company Reports', comp)
    # Only membership is tested below, so a set is enough (no sorting needed).
    available = set(os.listdir(file_path))

    all_years = range(2013, 2023)
    if year in range(11):
        # 0 keeps the whole window, 1-10 keeps the last `year` years of it.
        wanted_years = all_years[-year:] if year else all_years
    else:
        wanted_years = [year]

    if rep_type == 0:
        suffixes = ["", "_sus"]
    elif rep_type == 1:
        suffixes = [""]
    elif rep_type == 2:
        suffixes = ["_sus"]
    else:
        print("Error: invalid report type")
        return

    ret = []
    for report_year in wanted_years:
        for suffix in suffixes:
            file_name = comp + '_' + str(report_year) + suffix + '.pdf'
            if file_name in available:
                ret.append(os.path.join(file_path, file_name))
    return ret

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
0 :  ExxonMobil
1 :  Shell plc
2 :  BP PLC
3 :  Saudi Aramco
4 :  Chevron
5 :  TotalEnergies
6 :  Valero Energy
7 :  Marathon Petroleum Corporation
8 :  Sinopec
9 :  PetroChina


In [None]:
# get report for 2018
get_reports(0, 2018)

['/content/drive/MyDrive/Capstone/Company Reports/ExxonMobil/ExxonMobil_2018.pdf',
 '/content/drive/MyDrive/Capstone/Company Reports/ExxonMobil/ExxonMobil_2018']

In [None]:
# get reports for recent 3 years
get_reports(0, 3)

['/content/drive/MyDrive/Capstone/Company Reports/ExxonMobil/ExxonMobil_2022_sus.pdf',
 '/content/drive/MyDrive/Capstone/Company Reports/ExxonMobil/ExxonMobil_2022.pdf',
 '/content/drive/MyDrive/Capstone/Company Reports/ExxonMobil/ExxonMobil_2021.pdf']

In [None]:
from langchain.document_loaders import PyPDFLoader

# get_reports() returns None for invalid arguments and an empty list when no
# matching PDF exists; fail with a clear message instead of an opaque error on
# file[0].
file = get_reports(0, 2018)
if not file:
    raise FileNotFoundError("No annual report found for company index 0, year 2018")

loader = PyPDFLoader(file[0])
# NOTE(review): load_and_split() presumably already splits with the loader's
# default splitter before the explicit splitter cells run — confirm whether
# loader.load() was intended.
data = loader.load_and_split()

## 2. Split the data

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Re-chunk the loaded documents: chunk_size 2000, no overlap between chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=2000,
    chunk_overlap=0,
)
all_splits = text_splitter.split_documents(data)

In [None]:
all_splits

[Document(page_content='Exxon Mobil Corporation\nCorporate headquarters5959 Las Colinas Blvd.Irving, Texas  75039-2298exxonmobil.com\nPrinted in U.S.A.NOTE: SPINE WIDTH (X ) TO BE DETERMINED BY PRINTER, TEXT TO CENTER ON X. SPINE TEXT MAY NOT BE POSSIBLE IF THIN BOOK.EXXON MOBIL CORPORATION • SUMMARY ANNUAL REPORT 2018X\nFSC/Recycle Info (by printer)\n002CSN9B53\n2018  SUMMARY\nANNUAL REPORT', metadata={'source': '/content/drive/MyDrive/Capstone/Company Reports/ExxonMobil/ExxonMobil_2018.pdf', 'page': 0}),
 Document(page_content='2 To our shareholders\n 4 2018 results and highlights 5 Competitive advantages 6 Technology 10 Scale 14 Integration 18 Functional excellence 22 People 26 Global operations 28 Upstream highlights 30 Downstream highlights 32 Chemical highlights 34 Corporate sustainability 35 Financial information 40 Frequently used terms 42 Board of Directors, Officers, and Affiliated Companies 44 Investor information 45 General information\nStatements of future events or condit

## 3. Model - T5

In [None]:
from langchain.llms import HuggingFacePipeline
from transformers import AutoTokenizer, pipeline, AutoModelForSeq2SeqLM

# Load the FLAN-T5 large checkpoint and its tokenizer as a seq2seq model.
model_id = 'google/flan-t5-large'
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForSeq2SeqLM.from_pretrained(model_id)

# Wrap model and tokenizer in a text2text-generation pipeline and expose it to
# LangChain as `llm`.
# NOTE(review): the recorded runs warn that prompts exceed the 512-token
# maximum of this model; max_length=2000 does not appear to prevent that —
# confirm what limit was intended.
pipe = pipeline(
    task="text2text-generation",
    model=model,
    tokenizer=tokenizer,
    max_length=2000,
)

llm = HuggingFacePipeline(pipeline=pipe)

Downloading (…)okenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

Downloading spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/662 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/3.13G [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

## 4. Generate Answer

### Refine

In [None]:
from langchain.chains.summarize import load_summarize_chain
from langchain.prompts import PromptTemplate

# Passed as question_prompt: asks for a summary of a chunk given as {text}.
prompt_template = """Write a summary of the following on how the company performed in the year: \n {text} \n SUMMARY:"""
prompt = PromptTemplate.from_template(prompt_template)

# Passed as refine_prompt: receives the summary so far ({existing_answer}) and
# a further chunk ({text}).
# Adjacent string literals are joined with nothing in between, so each fragment
# must carry its own trailing space or newline; the fragment ending in
# "existing summary" was missing it and produced "summary(only if needed)".
refine_template = (
    "Your job is to produce a final summary on how the company performed in the year \n"
    "We have provided an existing summary up to a certain point: {existing_answer}\n"
    "We have the opportunity to refine the existing summary "
    "(only if needed) with some more context below.\n"
    "------------\n"
    "{text}\n"
    "------------\n"
    "Given the new context, refine the original summary"
)

refine_prompt = PromptTemplate.from_template(refine_template)
chain = load_summarize_chain(
    llm=llm,
    chain_type="refine",
    question_prompt=prompt,
    refine_prompt=refine_prompt,
    return_intermediate_steps=True,
    input_key="input_documents",
    output_key="output_text",
)

# Only the first 10 chunks are summarised here.
result = chain({"input_documents": all_splits[:10]}, return_only_outputs=True)

Token indices sequence length is longer than the specified maximum sequence length for this model (582 > 512). Running this sequence through the model will result in indexing errors


In [None]:
result

{'intermediate_steps': ["The annual report summarizes the company's performance for the year ending December 31, 2018.",
  'The following is a list of the key findings and conclusions of the annual report for the year ended December 31, 2018.',
  'The following is a list of the key findings and conclusions of the annual report for the year ended December 31, 2018.',
  'The company reported a net income of $1.5 billion for the year ended December 31, 2018.',
  'In 2018, we laid out a growth strategy designed to take advantage of the unique strengths that have made ExxonMobil an industry leader. These include a sharp focus on fundamentals; dedication to innovative technology; deep integration across our businesses; disciplined investment in advantaged projects; and industry-leading execution from our highly skilled workforce. Our goal is to significantly increase the earnings and cash flow generation capacity of our business. In last year’s volatile margin and price environment, we gener

In [None]:
result['output_text']

"The company reported a net income of $36 billion, the highest since 2014, and a net income of $36 billion, the highest since 2014, according to the company's annual report on Form 10-K."

### Map-Reduce

In [None]:
from langchain.chains.summarize import load_summarize_chain
from langchain.prompts import PromptTemplate

# Per-chunk prompt of the map-reduce chain. The separators were written "/n",
# a typo for the newline escape, which put the literal characters "/n" into
# the prompt text.
map_prompt = "Write a summary of the following on how the company performed in the year: \n {text} \n SUMMARY:"
map_prompt_template = PromptTemplate(template=map_prompt, input_variables=["text"])

In [None]:
# Prompt that merges the per-chunk summaries. "/n" was a typo for the newline
# escape and ended up verbatim in the prompt; use real newlines.
combine_prompt = "Given a list of summaries, combine them to generate a final summary on how the company performed in the year \n {text} \n SUMMARY:"
combine_prompt_template = PromptTemplate(template=combine_prompt, input_variables=["text"])

In [None]:
# Build the map-reduce summarisation chain from the two prompt templates
# defined in the preceding cells.
summary_chain = load_summarize_chain(
    llm=llm,
    chain_type='map_reduce',
    map_prompt=map_prompt_template,
    combine_prompt=combine_prompt_template,
)

In [None]:
output = summary_chain.run(all_splits[:10])

Token indices sequence length is longer than the specified maximum sequence length for this model (601 > 512). Running this sequence through the model will result in indexing errors


Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

In [None]:
output

'ExxonMobil’s annual report for 2018 provides an overview of the company’s operations and financial performance. ExxonMobil has reported a record year for earnings, revenue and net income. The company’s annual report for 2018 shows that the company’s growth strategy has paid off.'

## 5. Improve

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Smaller chunks than the baseline: chunk_size 1500, still without overlap.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1500,
    chunk_overlap=0,
)
all_splits = text_splitter.split_documents(data)

In [None]:
len(all_splits)

107

In [None]:
from langchain.llms import HuggingFacePipeline
from transformers import AutoTokenizer, pipeline, AutoModelForSeq2SeqLM

# NOTE(review): same checkpoint and max_length as the T5 loading cell in
# section 3; it looks like this repeat exists so the section can be run in a
# fresh runtime — confirm before removing it.
model_id = 'google/flan-t5-large'
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForSeq2SeqLM.from_pretrained(model_id)

# text2text-generation pipeline wrapped for LangChain as `llm`.
pipe = pipeline(
    task="text2text-generation",
    model=model,
    tokenizer=tokenizer,
    max_length=2000,
)

llm = HuggingFacePipeline(pipeline=pipe)

Downloading (…)okenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

Downloading spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/662 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/3.13G [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

### Refine

In [None]:
from langchain.chains.summarize import load_summarize_chain
from langchain.prompts import PromptTemplate

# Passed as question_prompt: asks for a detailed, data-backed summary of a
# chunk given as {text}.
prompt_template = """Write a detailed summary of the following on how the company performed
in the year with supportive data to provide an insightful and concise overview of the company's
overall financial health and business performance: \n {text} \n SUMMARY:"""
prompt = PromptTemplate.from_template(prompt_template)

# Passed as refine_prompt: receives the summary so far ({existing_answer}) and
# a further chunk ({text}).
# Adjacent string literals are joined with nothing in between; the fragment
# ending in "existing summary" lacked a trailing space and produced
# "summary(only if needed)".
refine_template = (
    "Your job is to produce a final detailed summary on how the company performed in the year with supportive data \n"
    "We have provided an existing summary up to a certain point: {existing_answer}\n"
    "We have the opportunity to refine the existing summary "
    "(only if needed) with some more context below.\n"
    "------------\n"
    "{text}\n"
    "------------\n"
    "Given the new context, refine the original summary"
)

refine_prompt = PromptTemplate.from_template(refine_template)
chain = load_summarize_chain(
    llm=llm,
    chain_type="refine",
    question_prompt=prompt,
    refine_prompt=refine_prompt,
    return_intermediate_steps=True,
    input_key="input_documents",
    output_key="output_text",
)

# Only the first 20 chunks are summarised here.
result = chain({"input_documents": all_splits[:20]}, return_only_outputs=True)

In [None]:
result['output_text']

'In 2018, we laid out a growth strategy designed to take advantage of the unique strengths that have made ExxonMobil an industry leader. These include a sharp focus on fundamentals; dedication to innovative technology; deep integration across our businesses; disciplined investment in advantaged projects; and industry-leading execution from our highly skilled workforce. Our goal is to significantly increase the earnings and cash flow generation capacity of our business. In last year’s volatile margin and price environment, we generated earnings of $21 billion and cash flow from operations and asset sales of $40 billion – the highest since 2014. This strong performance enabled us to fund attractive investments, reduce debt, and increase the dividend by 6 percent – the 36th consecutive annual increase.'

### Map-Reduce

In [None]:
from langchain.chains.summarize import load_summarize_chain
from langchain.prompts import PromptTemplate

# Per-chunk prompt of the map-reduce chain. "/n" was a typo for the newline
# escape and ended up verbatim in the prompt; use real newlines.
map_prompt = "Write a summary of the following: \n {text} \n SUMMARY:"
map_prompt_template = PromptTemplate(template=map_prompt, input_variables=["text"])

In [None]:
# Prompt that merges the per-chunk summaries. "/n" was a typo for the newline
# escape and ended up verbatim in the prompt; use real newlines.
combine_prompt = "Given a list of summaries, combine them and tell me how the company performed: \n {text} \n SUMMARY:"
combine_prompt_template = PromptTemplate(template=combine_prompt, input_variables=["text"])

In [None]:
# Map-reduce summarisation chain; verbose=True prints the formatted prompts
# while the chain runs.
summary_chain = load_summarize_chain(
    llm=llm,
    chain_type='map_reduce',
    map_prompt=map_prompt_template,
    combine_prompt=combine_prompt_template,
    verbose=True,
)

In [None]:
output = summary_chain.run(all_splits[:20])

In [None]:
output

"The Exxon Mobil annual report provides a snapshot of the company's operations and provides a snapshot of the industry. ExxonMobil is a leading global energy company that provides the energy and products that raise living standards. The company’s annual report on Wednesday, December 5, 2018 – the first time the company has reported on its annual results since the company’s founding in 1908 – outlines the company’s growth strategy. The ExxonMobil annual report for the year ended December 31, 2018 shows that the company's operating activities generated a cash flow of $36 billion, the highest since 2014, and that five additional discoveries in offshore Guyana increased the resource estimate to more than 5 billion barrels. The advan tages of our business are reflected in our competitive advantages. The diversified business of BP is a unique combination of strengths and capabilities. We strive to hire the best and develop world-class capabilities through challenging, cross-functional assign

### Improve on Splitter for Map-Reduce Chain



#### (1) overlap = 100

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Experiment (1): chunk_size 1500 with an overlap of 100 between chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1500,
    chunk_overlap=100,
)
all_splits = text_splitter.split_documents(data)

In [None]:
len(all_splits)

108

In [None]:
from langchain.llms import HuggingFacePipeline
from transformers import AutoTokenizer, pipeline, AutoModelForSeq2SeqLM

# NOTE(review): same checkpoint as the earlier T5 loading cells; only
# max_length differs (3000 here, 2000 before). Reloading the weights looks
# redundant when the notebook is run top to bottom — confirm.
model_id = 'google/flan-t5-large'
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForSeq2SeqLM.from_pretrained(model_id)

# text2text-generation pipeline wrapped for LangChain as `llm`.
pipe = pipeline(
    task="text2text-generation",
    model=model,
    tokenizer=tokenizer,
    max_length=3000,
)

llm = HuggingFacePipeline(pipeline=pipe)

In [None]:
from langchain.chains.summarize import load_summarize_chain
from langchain.prompts import PromptTemplate

# "/n" in both templates was a typo for the newline escape and ended up
# verbatim in the prompts; use real newlines.

# Per-chunk prompt of the map-reduce chain.
map_prompt = "Write a summary of the following: \n {text} \n SUMMARY:"
map_prompt_template = PromptTemplate(template=map_prompt, input_variables=["text"])

# Prompt that merges the per-chunk summaries.
combine_prompt = "Given a list of summaries, combine them and tell me how the company performed in a detailed concise summary: \n {text} \n SUMMARY:"
combine_prompt_template = PromptTemplate(template=combine_prompt, input_variables=["text"])

In [None]:
# Map-reduce summarisation chain for the overlap-100 experiment; verbose=True
# prints the formatted prompts while the chain runs.
summary_chain = load_summarize_chain(
    llm=llm,
    chain_type='map_reduce',
    map_prompt=map_prompt_template,
    combine_prompt=combine_prompt_template,
    verbose=True,
)

In [None]:
output = summary_chain.run(all_splits[:20])

In [None]:
output

"The Exxon Mobil annual report provides a snapshot of the company’s operations and financial performance. ExxonMobil is a leading global energy company that provides the energy and products that raise living standards. The company’s annual report on Wednesday, December 5, 2018 – the first time the company has reported on its annual results since the company’s founding in 1908 – outlines the company’s growth strategy. The 2018 annual report of Exxon Mobil, the world's largest oil and gas company, summarizes the company's progress in 2018. We are advancing our efforts to expand our presence in the Middle East, where we are advancing our efforts to expand our presence in the Middle East. ExxonMobil’s Chairman and CEO Darren Woods writes in a letter to shareholders that the company is committed to a safe work environment, a culture of open communication and trust, and a commitment to reducing environmental impacts and managing the risks of climate change. Darren Woods, chairman and ceo of 

#### (2) overlap = 500

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Experiment (2): chunk_size 1500 with an overlap of 500 between chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1500,
    chunk_overlap=500,
)
all_splits = text_splitter.split_documents(data)

In [None]:
len(all_splits)

114

In [None]:
from langchain.llms import HuggingFacePipeline
from transformers import AutoTokenizer, pipeline, AutoModelForSeq2SeqLM

# NOTE(review): identical to the T5 loading cell of experiment (1) (same
# checkpoint, max_length 3000); it looks like this repeat exists so the
# section can be run in a fresh runtime — confirm before removing it.
model_id = 'google/flan-t5-large'
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForSeq2SeqLM.from_pretrained(model_id)

# text2text-generation pipeline wrapped for LangChain as `llm`.
pipe = pipeline(
    task="text2text-generation",
    model=model,
    tokenizer=tokenizer,
    max_length=3000,
)

llm = HuggingFacePipeline(pipeline=pipe)

Downloading (…)okenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

Downloading spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/662 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/3.13G [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [None]:
from langchain.chains.summarize import load_summarize_chain
from langchain.prompts import PromptTemplate

# "/n" in both templates was a typo for the newline escape and ended up
# verbatim in the prompts; use real newlines.

# Per-chunk prompt of the map-reduce chain.
map_prompt = "Write a summary of the following: \n {text} \n SUMMARY:"
map_prompt_template = PromptTemplate(template=map_prompt, input_variables=["text"])

# Prompt that merges the per-chunk summaries.
combine_prompt = "Given a list of summaries, combine them and tell me how the company performed in a detailed concise summary: \n {text} \n SUMMARY:"
combine_prompt_template = PromptTemplate(template=combine_prompt, input_variables=["text"])

In [None]:
# Map-reduce summarisation chain for the overlap-500 experiment (not verbose).
summary_chain = load_summarize_chain(
    llm=llm,
    chain_type='map_reduce',
    map_prompt=map_prompt_template,
    combine_prompt=combine_prompt_template,
)

In [None]:
output = summary_chain.run(all_splits[:20])

Token indices sequence length is longer than the specified maximum sequence length for this model (3634 > 512). Running this sequence through the model will result in indexing errors


In [None]:
output

"The ExxonMobil annual report for 2018 summarizes the company's activities for the year ending December 31, 2018."

## Model - Mistral-7b

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Chunks for the Mistral runs: chunk_size 1000, no overlap.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=0,
)
all_splits = text_splitter.split_documents(data)

In [None]:
from langchain.llms import HuggingFacePipeline
from transformers import AutoTokenizer, pipeline, AutoModelForSeq2SeqLM, AutoModelForCausalLM

# Mistral-7B-Instruct is a causal (decoder-only) model, hence
# AutoModelForCausalLM and the "text-generation" task. Separate *_mistral names
# keep the T5 objects (`model`, `tokenizer`, `llm`) available.
model_id_mistral = "mistralai/Mistral-7B-Instruct-v0.1"

tokenizer_mistral = AutoTokenizer.from_pretrained(model_id_mistral)
model_mistral = AutoModelForCausalLM.from_pretrained(model_id_mistral)

pipe_mistral = pipeline(
    "text-generation",
    model = model_mistral,
    tokenizer = tokenizer_mistral,
    max_length = 2000,
    # Without an explicit pad token every generation call logs
    # "Setting `pad_token_id` to `eos_token_id`"; setting the same value up
    # front keeps the behaviour and silences the repeated warning.
    pad_token_id = tokenizer_mistral.eos_token_id,
)

llm_mistral = HuggingFacePipeline(pipeline = pipe_mistral)

Downloading (…)okenizer_config.json:   0%|          | 0.00/1.47k [00:00<?, ?B/s]

Downloading tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/72.0 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Downloading (…)lve/main/config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

Downloading (…)model.bin.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading (…)l-00001-of-00002.bin:   0%|          | 0.00/9.94G [00:00<?, ?B/s]

Downloading (…)l-00002-of-00002.bin:   0%|          | 0.00/5.06G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading (…)neration_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

### Refine

In [None]:
from langchain.chains.summarize import load_summarize_chain
from langchain.prompts import PromptTemplate

# Passed as question_prompt: asks for a summary of a chunk given as {text}.
prompt_template = """Write a summary of the following on how the company performed in the year: \n {text} \n SUMMARY:"""
prompt = PromptTemplate.from_template(prompt_template)

# Passed as refine_prompt: receives the summary so far ({existing_answer}) and
# a further chunk ({text}).
# Adjacent string literals are joined with nothing in between; the fragment
# ending in "existing summary" lacked a trailing space and produced
# "summary(only if needed)".
refine_template = (
    "Your job is to produce a summary on how the company performed in the year \n"
    "We have provided an existing summary up to a certain point: {existing_answer}\n"
    "We have the opportunity to refine the existing summary "
    "(only if needed) with some more context below.\n"
    "------------\n"
    "{text}\n"
    "------------\n"
    "Given the new context, refine the original summary"
)

refine_prompt = PromptTemplate.from_template(refine_template)
chain = load_summarize_chain(
    llm=llm_mistral,
    chain_type="refine",
    question_prompt=prompt,
    refine_prompt=refine_prompt,
    return_intermediate_steps=True,
    input_key="input_documents",
    output_key="output_text",
)

# Only the first 10 chunks are summarised here.
result = chain({"input_documents": all_splits[:10]}, return_only_outputs=True)

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


In [None]:
result['output_text']

' as follows:\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\

### Map-Reduce

In [None]:
from langchain.chains.summarize import load_summarize_chain
from langchain.prompts import PromptTemplate

# "/n" in both templates was a typo for the newline escape and ended up
# verbatim in the prompts; use real newlines.

# Per-chunk prompt of the map-reduce chain.
map_prompt = "Write a summary of the following: \n {text} \n SUMMARY:"
map_prompt_template = PromptTemplate(template=map_prompt, input_variables=["text"])

# Prompt that merges the per-chunk summaries.
combine_prompt = "Given a list of summaries, combine them to generate a final summary on how the company performed in the year \n {text} \n SUMMARY:"
combine_prompt_template = PromptTemplate(template=combine_prompt, input_variables=["text"])

In [None]:
# Map-reduce summarisation chain driven by the Mistral LLM wrapper.
summary_chain = load_summarize_chain(
    llm=llm_mistral,
    chain_type='map_reduce',
    map_prompt=map_prompt_template,
    combine_prompt=combine_prompt_template,
)

In [None]:
output = summary_chain.run(all_splits[:10])

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


Downloading (…)olve/main/vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (1344 > 1024). Running this sequence through the model will result in indexing errors
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


In [None]:
output

"\n\nExxonMobil Corporation is a multinational oil and gas corporation headquartered in Irving, Texas, United States. The company was founded in 1870 as the Standard Oil Company of New Jersey and has since grown to become one of the largest and most profitable companies in the world.\n\nIn 2018, ExxonMobil reported net income of $11.1 billion on revenue of $231.1 billion. The company's operations are supported by a strong balance sheet, with total assets of $224.4 billion and total equity of $104.4 billion as of December 31, 2018.\n\nExxonMobil is committed to sustainability and has set ambitious goals to reduce its greenhouse gas emissions and increase its use of renewable energy sources. The company is also focused on improving its environmental performance and reducing its impact on local communities and ecosystems.\n\nOverall, ExxonMobil is a leading player in the global oil and gas industry and is well-positioned to continue its growth and success in the years to come."

## Test on another Document with Our Current Best Model

In [None]:
from langchain.document_loaders import PyPDFLoader

# get_reports() returns None for invalid arguments and an empty list when no
# matching PDF exists; fail with a clear message instead of an opaque error on
# file[0].
file = get_reports(0, 2022, 1)
if not file:
    raise FileNotFoundError("No annual report found for company index 0, year 2022")

loader = PyPDFLoader(file[0])
# NOTE(review): load_and_split() presumably already splits with the loader's
# default splitter before the explicit splitter cell runs — confirm whether
# loader.load() was intended.
data = loader.load_and_split()

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Chunk the 2022 report: chunk_size 1500, no overlap.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1500,
    chunk_overlap=0,
)
all_splits = text_splitter.split_documents(data)

In [None]:
from langchain.llms import HuggingFacePipeline
from transformers import AutoTokenizer, pipeline, AutoModelForSeq2SeqLM

# NOTE(review): same checkpoint as the earlier T5 loading cells, with
# max_length back at 2000; reloading the weights looks redundant when the
# notebook is run top to bottom — confirm.
model_id = 'google/flan-t5-large'
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForSeq2SeqLM.from_pretrained(model_id)

# text2text-generation pipeline wrapped for LangChain as `llm`.
pipe = pipeline(
    task="text2text-generation",
    model=model,
    tokenizer=tokenizer,
    max_length=2000,
)

llm = HuggingFacePipeline(pipeline=pipe)

#### Refine

In [None]:
from langchain.chains.summarize import load_summarize_chain
from langchain.prompts import PromptTemplate

# Passed as question_prompt: asks for a detailed summary of a chunk ({text}).
prompt_template = "Write a detailed summary of the following: \n {text} \n SUMMARY:"
prompt = PromptTemplate.from_template(prompt_template)

# Passed as refine_prompt: receives the summary so far ({existing_answer}) and
# a further chunk ({text}). The lines are joined with single newlines.
refine_template = "\n".join([
    "Your job is to produce a final detailed summary on how the company performed in the year with supportive data ",
    "We have provided an existing summary up to a certain point: {existing_answer}",
    "We have the opportunity to refine the existing summary with the content below ",
    "------------",
    "{text}",
    "------------",
    "Given the new context, refine the original summary",
])
refine_prompt = PromptTemplate.from_template(refine_template)

chain = load_summarize_chain(
    llm=llm,
    chain_type="refine",
    question_prompt=prompt,
    refine_prompt=refine_prompt,
    return_intermediate_steps=True,
    input_key="input_documents",
    output_key="output_text",
)

# Quick check on the first 3 chunks only.
first_chunks = all_splits[:3]
result = chain({"input_documents": first_chunks}, return_only_outputs=True)

In [None]:
result['output_text']

'By evo lving our operating mo del and conso lidating into three core businesses – U pstream, Pro duct So lutions, and Low Carbon Solutions – we leveraged our advantagesof technolo gy, scale, inte gration, and world-class employees to improve effectiveness, efficiency, and earnin gs resiliency. The positive impact of these chan ges is clearly seen in our results: Our North America refineries collectively delivered their best-ever annual throu ghput ;8 we achieved record production in our Permian o perations; and we significantly increased volumes in Guyana to help ease substantial shorta ges. In addition, we increased earnings to $56billion, well ahead of our peers.1 We a lso increase d planne d investments in lower-emission initiatives to approximately $17 billion from 2022 throu gh 2027 and made great strides to lower the emissions intensit y of our o perated assets. Low Carbon Solutions signed a first-of-its-kind agreement to capture, transport, and permanentl y store u p to 2 milli

In [None]:
# The prompts and the refine chain are exactly the ones built in the previous
# cell; only the number of chunks changes (first 20 instead of first 3), so
# reuse `chain` instead of repeating its definition.
result = chain({"input_documents": all_splits[:20]}, return_only_outputs=True)

In [None]:
result

{'intermediate_steps': ['The 2022 Annual Report is the most recent annual report of the United States Department of Agriculture.',
  'The following is a list of the key financial metrics used in the 2022 Annual Report:',
  'By evo lving our operating mo del and conso lidating into three core businesses – U pstream, Pro duct So lutions, and Low Carbon Solutions – we leveraged our advantagesof technolo gy, scale, inte gration, and world-class employees to improve effectiveness, efficiency, and earnin gs resiliency. The positive impact of these chan ges is clearly seen in our results: Our North America refineries collectively delivered their best-ever annual throu ghput ;8 we achieved record production in our Permian o perations; and we significantly increased volumes in Guyana to help ease substantial shorta ges. In addition, we increased earnings to $56billion, well ahead of our peers.1 We a lso increase d planne d investments in lower-emission initiatives to approximately $17 billion f

In [None]:
result['output_text']

'For the fiscal year ended December 31, 2022 or TRANSITION REPORT PURSUANT TO SECTION 13 OR 15(d) OF THE SECURITIES EXCHANGE ACT OF 1934 New Jersey (State or other jurisdiction of incorporation or organization) Title of Each Class Common Stock, without par value 0.142% Notes due 2024 0.524% Notes due 2028 0.835% Notes due 2032 1.408% Notes due 2039 For the transition period from ___ to ___'

#### Map-Reduce

In [None]:
from langchain.chains.summarize import load_summarize_chain
from langchain.prompts import PromptTemplate

# "/n" in both templates was a typo for the newline escape; the recorded
# verbose output shows the literal "/n" inside the formatted prompt. Use real
# newlines.

# Per-chunk prompt of the map-reduce chain.
map_prompt = "Write a summary of the following: \n {text} \n SUMMARY:"
map_prompt_template = PromptTemplate(template=map_prompt, input_variables=["text"])

# Prompt that merges the per-chunk summaries.
combine_prompt = "Given a list of summaries, combine them and tell me how the company performed in a detailed concise summary: \n {text} \n SUMMARY:"
combine_prompt_template = PromptTemplate(template=combine_prompt, input_variables=["text"])

In [None]:
# Map-reduce summarisation chain for the 2022 report; verbose=True prints the
# formatted prompts while the chain runs.
summary_chain = load_summarize_chain(
    llm=llm,
    chain_type='map_reduce',
    map_prompt=map_prompt_template,
    combine_prompt=combine_prompt_template,
    verbose=True,
)

In [None]:
output = summary_chain.run(all_splits[:20])



[1m> Entering new MapReduceDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mWrite a summary of the following: /n 2022Annual  
Report /n SUMMARY:[0m
Prompt after formatting:
[32;1m[1;3mWrite a summary of the following: /n See Cautionary Statement on Pa ge 139 for important information re gardin g forward-lookin g statements and terms used in this report.Abo ut th e cover:Our work to meet society’s needs is enabled by sites such as our Baytown, Texas, operations, amon g the world’s
largest inte grated and most technolo gically advanced refinin g and petrochemical complexes .Financial and operating performance significantly led peers1
Continuing to be an industry leader in safety5EXXON MOBIL CORPORATION   | 2 022 ANNUAL REPOR T
Earnings 
$56BCash flow from operations  
$77B
shareholder distributions$30B
ROCE with $23B in capex3 25%Structural cost savings2 
$7B
total shareholder return4 87%
production growth in the
Permian B

In [None]:
output

'The Exxon Mobil Corp. annual report on financial performance for the year ended December 31, 2017 includes a discussion of the company’s operating performance, earnings per share, dividends and other financial information.'