In [19]:
# hide output
%%capture output

! pip install pdfplumber
! pip install chromadb
! pip install pymilvus
! pip install sentence-transformers
! pip install langchain
! pip install pypdf

In [None]:
! pip install faiss-gpu



In [None]:
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS

# 1. Load Data

In [None]:
import os
from google.colab import drive
# Access drive
drive.mount('/content/drive')
path = '/content/drive/MyDrive/Capstone/'


# companies
companies = os.listdir(os.path.join(path, 'Company Reports'))
for i, comp in enumerate(companies):
    print(i, ": ", comp)


# get reports
def get_reports(comp, year:int, rep_type:int = 1):
    """
    comp:       string or index
    year:       specific year or # recent year, 0 for all
    rep_type:   report type, 1 for annual report, 2 for sustainability report, 0 for both
    ret:        list of report pathes
    """
    if type(comp) == str:
        if comp not in companies:
            print("Error: ", comp, " does not exist")
            return
    elif type(comp) == int:
        if comp not in range(len(companies)):
            print("Error: invalid index")
            return
        comp = companies[comp]
    else:
        print("Error: invalid company")
        return

    file_path = os.path.join(path, 'Company Reports', comp)
    files = os.listdir(file_path)
    files.sort(reverse=True)

    years = range(2013,2023)
    if year in range(11):
        if year:
            years = years[-year:]
    else:
        years = [year]

    if rep_type == 0:
        reps = ["", "_sus"]
    elif rep_type == 1:
        reps = [""]
    elif rep_type == 2:
        reps = ["_sus"]
    else:
        print("Error: invalid report type")
        return

    ret = []
    for year in years:
        for rep in reps:
            file = comp + '_' + str(year) + rep + '.pdf'
            if file in files:
                ret.append(file)
    return [os.path.join(file_path, file) for file in ret]

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
0 :  ExxonMobil
1 :  Shell plc
2 :  BP PLC
3 :  Saudi Aramco
4 :  Chevron
5 :  TotalEnergies
6 :  Valero Energy
7 :  Marathon Petroleum Corporation
8 :  Sinopec
9 :  PetroChina


In [None]:
file = get_reports(2, 2022, 1)

file = file[0]
file

'/content/drive/MyDrive/Capstone/Company Reports/BP PLC/BP PLC_2022.pdf'

## 2. Load and Split


In [None]:
#from langchain.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

#loader = PyPDFLoader(file)

from langchain.document_loaders import PyPDFium2Loader

loader = PyPDFium2Loader(file)

#text_splitter = RecursiveCharacterTextSplitter(chunk_size = 1500, chunk_overlap = 0)
#all_splits = loader.load_and_split(text_splitter)


text_splitter = RecursiveCharacterTextSplitter(
    separators = ["\n\n", " ", "", "."],
    chunk_size = 1000,
    chunk_overlap = 0
)

In [None]:
all_splits = loader.load_and_split(text_splitter)


## 3. Store in Vector database

In [None]:
def load(file_path):
    vs_path_faiss = os.path.join(file_path[:-4], 'faiss')
    if os.path.exists(vs_path_faiss):
        return FAISS.load_local(vs_path_faiss, embeddings)
    else:
        return FAISS.from_documents(text_splitter, embeddings)

In [None]:
embeddings = HuggingFaceEmbeddings()

data = text_splitter.split_documents(loader.load())

vs_faiss = FAISS.from_documents(data, embeddings)

## 4. Retrive and Store Relative Contents

In [None]:
def print_doc(doc):
    for i, d in enumerate(doc):
        print('-'*100)
        print('|', str(i+1)+'. Page', d.metadata['page'], '|')
        print('-'*14)
        print(d.page_content)
    print('-'*100)

In [None]:
def store_result(doc):
  result = []
  for i, d in enumerate(doc):
    result.append(d.page_content)
  return result


In [None]:
question = """what is the company's financial performance?"""

#print_doc(vs_faiss.similarity_search(question))
financial = vs_faiss.similarity_search(question)

In [None]:
question = """what are the company's strategic initiatives?"""
strategy = vs_faiss.similarity_search(question)

In [None]:
question = """what is the company's market position and competition?"""

position = vs_faiss.similarity_search(question)


In [None]:
question = """what are the challenges the company faces?"""

challenges = vs_faiss.similarity_search(question)

In [None]:
question = """what is the company's future outlook?"""

future = vs_faiss.similarity_search(question)

In [None]:
relative_content = financial+strategy+position+challenges+future
relative_content

[Document(page_content='316 720 1,495 — 1,873 193 9,299 \r\nResults of operations for the year ended 31 Decembera\r\nSales and other operating revenuesd\r\nThird parties 549 — 2,101 420 2,977 3,836 — 6,551 1,588 18,022 \r\nSales between businesses 5,747 — 12,746 — 538 2,146 — 9,932 1,472 32,581 \r\n 6,296 — 14,847 420 3,515 5,982 — 16,483 3,060 50,603 \r\nExploration expenditure 11 — 144 109 172 57 — 94 (2) 585 \r\nProduction costs 498 — 2,102 83 327 592 — 723 107 4,432 \r\nProduction taxes 1 — 194 — 513 — — 1,544 73 2,325 \r\nOther costs (income)e\r\n (210) (47) 2,926 63 96 206 32 (44) 300 3,322 \r\nDepreciation, depletion and amortization 1,242 — 3,122 18 680 2,075 1 2,495 384 10,017 \r\nNet impairments and (gains) losses on sale of \r\nbusinesses and fixed assetsf\r\n (433) (901) 217 (3) 1,570 (1,189) 1,523 (341) (43) 400 \r\n 1,109 (948) 8,705 270 3,358 1,741 1,556 4,471 819 21,081 \r\nProfit (loss) before taxationg\r\n 5,187 948 6,142 150 157 4,241 (1,556) 12,012 2,241 29,522 \r\n

## 5. Model T-5

In [None]:
from langchain.llms import HuggingFacePipeline
from transformers import AutoTokenizer, pipeline, AutoModelForSeq2SeqLM

model_id = 'google/flan-t5-xxl'
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForSeq2SeqLM.from_pretrained(model_id)

pipe = pipeline(
    "text2text-generation",
    model = model,
    tokenizer = tokenizer,
    max_length = 2000
)

llm = HuggingFacePipeline(pipeline = pipe)

Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

## 6. Map-reduce Chain

In [None]:
from langchain.chains.summarize import load_summarize_chain
from langchain.prompts import PromptTemplate

map_prompt = """Summarize the key highlights and findings of the company's annual report for 2022, including financial performance, strategic initiatives, market position, challenges faced, and future outlook, if any: /n {text} /n SUMMARY:"""
map_prompt_template = PromptTemplate(template=map_prompt, input_variables=["text"])

combine_prompt = """Generate a structured summary of the company's annual report for 2022, incorporating the information on revenue and financial performance, expenses and cost management, operations and productivity, market position and competition, strategic initiatives, challenges and risks, governance and compliance, and sustainability and social responsibility, if any: /n {text} /n SUMMARY:"""
combine_prompt_template = PromptTemplate(template=combine_prompt, input_variables=["text"])

summary_chain = load_summarize_chain(llm=llm,
                                     chain_type='map_reduce',
                                     map_prompt=map_prompt_template,
                                     combine_prompt=combine_prompt_template,
                                     verbose=True)

output = summary_chain.run(relative_content)



[1m> Entering new MapReduceDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mSummarize the key highlights and findings of the company's annual report for 2022, including financial performance, strategic initiatives, market position, challenges faced, and future outlook, if any: /n 316 720 1,495 — 1,873 193 9,299 
Results of operations for the year ended 31 Decembera
Sales and other operating revenuesd
Third parties 549 — 2,101 420 2,977 3,836 — 6,551 1,588 18,022 
Sales between businesses 5,747 — 12,746 — 538 2,146 — 9,932 1,472 32,581 
 6,296 — 14,847 420 3,515 5,982 — 16,483 3,060 50,603 
Exploration expenditure 11 — 144 109 172 57 — 94 (2) 585 
Production costs 498 — 2,102 83 327 592 — 723 107 4,432 
Production taxes 1 — 194 — 513 — — 1,544 73 2,325 
Other costs (income)e
 (210) (47) 2,926 63 96 206 32 (44) 300 3,322 
Depreciation, depletion and amortization 1,242 — 3,122 18 680 2,075 1 2,495 384 10,017 
Net i

Token indices sequence length is longer than the specified maximum sequence length for this model (2161 > 1024). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (2855 > 512). Running this sequence through the model will result in indexing errors




[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mGenerate a structured summary of the company's annual report for 2022, incorporating the information on revenue and financial performance, expenses and cost management, operations and productivity, market position and competition, strategic initiatives, challenges and risks, governance and compliance, and sustainability and social responsibility, if any: /n The company's financial performance was impacted by the weakening of the oil price and the impact of the weaker pound.

The company's sales and other operating revenues increased by 6% to $27.7 billion. The company's exploration expenditure increased by 6% to $1.8 billion. The company's production costs increased by 6% to $1.95 billion. The company's net impairments and (gains) losses on sale of businesses and fixed assets increased by 6% to $2.5 billion. The company's profit before taxation increased by 2% to $2.1 billion.

The company's balance sheet 

### output on file 0,2022,1

In [None]:
output

"ExxonMobil Corporation (Exxon Mobil) is a leading international energy company. Exxon Mobil is a Fortune 500 company with a market capitalization of 1 trillion. Exxon Mobil is a member of the Standard & Poor's 500 Index and the Dow Jones Industrial Average. Exxon Mobil is a Fortune 100 company and a member of the Dow Jones Sustainability Index."

### output on file 2,2022,1

In [None]:
from langchain.chains.summarize import load_summarize_chain
from langchain.prompts import PromptTemplate

map_prompt = """Summarize the key highlights and findings of the company's annual report for 2022, including financial performance, strategic initiatives, market position, challenges faced, and future outlook, if any: /n {text} /n SUMMARY:"""
map_prompt_template = PromptTemplate(template=map_prompt, input_variables=["text"])

combine_prompt = """Generate a structured summary of the company's annual report for 2022, incorporating the information on revenue and financial performance, expenses and cost management, operations and productivity, market position and competition, strategic initiatives, challenges and risks, governance and compliance, and sustainability and social responsibility, if any: /n {text} /n SUMMARY:"""
combine_prompt_template = PromptTemplate(template=combine_prompt, input_variables=["text"])

summary_chain = load_summarize_chain(llm=llm,
                                     chain_type='map_reduce',
                                     map_prompt=map_prompt_template,
                                     combine_prompt=combine_prompt_template,
                                     verbose=True)

output = summary_chain.run(relative_content)



[1m> Entering new MapReduceDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mSummarize the key highlights and findings of the company's annual report for 2022, including financial performance, strategic initiatives, market position, challenges faced, and future outlook, if any: /n 316 720 1,495 — 1,873 193 9,299 
Results of operations for the year ended 31 Decembera
Sales and other operating revenuesd
Third parties 549 — 2,101 420 2,977 3,836 — 6,551 1,588 18,022 
Sales between businesses 5,747 — 12,746 — 538 2,146 — 9,932 1,472 32,581 
 6,296 — 14,847 420 3,515 5,982 — 16,483 3,060 50,603 
Exploration expenditure 11 — 144 109 172 57 — 94 (2) 585 
Production costs 498 — 2,102 83 327 592 — 723 107 4,432 
Production taxes 1 — 194 — 513 — — 1,544 73 2,325 
Other costs (income)e
 (210) (47) 2,926 63 96 206 32 (44) 300 3,322 
Depreciation, depletion and amortization 1,242 — 3,122 18 680 2,075 1 2,495 384 10,017 
Net i

Token indices sequence length is longer than the specified maximum sequence length for this model (2161 > 1024). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (2855 > 512). Running this sequence through the model will result in indexing errors




[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mGenerate a structured summary of the company's annual report for 2022, incorporating the information on revenue and financial performance, expenses and cost management, operations and productivity, market position and competition, strategic initiatives, challenges and risks, governance and compliance, and sustainability and social responsibility, if any: /n The company's financial performance was impacted by the weakening of the oil price and the impact of the weaker pound.

The company's sales and other operating revenues increased by 6% to $27.7 billion. The company's exploration expenditure increased by 6% to $1.8 billion. The company's production costs increased by 6% to $1.95 billion. The company's net impairments and (gains) losses on sale of businesses and fixed assets increased by 6% to $2.5 billion. The company's profit before taxation increased by 2% to $2.1 billion.

The company's balance sheet 

In [None]:
output

NameError: ignored

In [2]:
! pip install happytransformer

Collecting happytransformer
  Downloading happytransformer-3.0.0-py3-none-any.whl (24 kB)
Collecting transformers<5.0.0,>=4.30.1 (from happytransformer)
  Downloading transformers-4.34.1-py3-none-any.whl (7.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m33.3 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets<3.0.0,>=2.13.1 (from happytransformer)
  Downloading datasets-2.14.6-py3-none-any.whl (493 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m493.7/493.7 kB[0m [31m37.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting sentencepiece (from happytransformer)
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m66.0 MB/s[0m eta [36m0:00:00[0m
Collecting accelerate<1.0.0,>=0.20.1 (from happytransformer)
  Downloading accelerate-0.24.1-py3-none-any.whl (261 kB)
[2K     [90m━━━━━━━━━━━━━━

In [3]:
from happytransformer import HappyTextToText, TTSettings

In [4]:
happy_tt = HappyTextToText("T5", "vennify/t5-base-grammar-correction")

args = TTSettings(num_beams=5, min_length=1)



Downloading (…)lve/main/config.json:   0%|          | 0.00/1.42k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/892M [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/1.92k [00:00<?, ?B/s]

Downloading spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/1.79k [00:00<?, ?B/s]

This sentence has bad grammar.


In [20]:
output = "What we’ve achieved Increased renewable energy generating capacity to FID More clean energy What we’ve achieved Forged partnerships and collaborations to help communities benefit from the energy transition Just transition Sustainable livelihoods What we’ve achieved Confirmed that in 2022 all bp employees worldwide were paid a fair wagea Greater equity What we’ve achieved Launched a social mobility framework for action and business resource group Enhance wellbeing What we’ve achieved Provided access to health and wellbeing programmes for all employees"

In [21]:
# Add the prefix "grammar: " before each input
result = happy_tt.generate_text("grammar:"+output, args=args)

print(result.text) # This sentence has bad grammar.

What we’ve achieved Increased renewable energy generating capacity to FID More clean energy What we’ve achieved Forged partnerships and collaborations to help communities benefit from the energy transition Just transition Sustainable livelihoods What we’ve achieved Confirm
