In [None]:
# hide output
%%capture output

! pip install pdfplumber
! pip install chromadb
! pip install pymilvus
! pip install sentence-transformers
! pip install langchain
! pip install pypdf

In [None]:
! pip install faiss-gpu



In [None]:
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS

## 1. Load Data

In [None]:
import os
from google.colab import drive

# Mount Google Drive; all project data is read from the Capstone folder.
drive.mount('/content/drive')
path = '/content/drive/MyDrive/Capstone/'


# Every entry of 'Company Reports' is treated as one company. The listing is
# printed with positions because get_reports() also accepts a company index.
# The order is whatever os.listdir() returns; it is deliberately left as is.
companies = os.listdir(os.path.join(path, 'Company Reports'))
for i in range(len(companies)):
    comp = companies[i]
    print(i, ": ", comp)


# get reports
def get_reports(comp, year:int, rep_type:int = 1):
    """
    Collect the paths of the report PDFs that exist for one company.

    comp:       company folder name (str) or its index in `companies` (int)
    year:       a value in 0..10 selects that many of the most recent years
                of 2013-2022 (0 selects all of them); any other value is
                used as one specific year
    rep_type:   1 for annual report, 2 for sustainability report, 0 for both
    ret:        list of report paths in ascending year order; None (after
                printing an error) when comp or rep_type is invalid
    """
    # Resolve `comp` to a company folder name; anything else is rejected.
    comp_kind = type(comp)
    if comp_kind is int:
        if not 0 <= comp < len(companies):
            print("Error: invalid index")
            return
        comp = companies[comp]
    elif comp_kind is str:
        if comp not in companies:
            print("Error: ", comp, " does not exist")
            return
    else:
        print("Error: invalid company")
        return

    file_path = os.path.join(path, 'Company Reports', comp)
    # Only membership is needed below, so the listing is kept as a set.
    available = set(os.listdir(file_path))

    # Small values count back from the latest year; other values name a year.
    all_years = range(2013, 2023)
    if year in range(11):
        wanted_years = all_years[-year:] if year else all_years
    else:
        wanted_years = [year]

    # Annual reports carry no suffix; sustainability reports end in "_sus".
    for code, suffixes in ((0, ["", "_sus"]), (1, [""]), (2, ["_sus"])):
        if rep_type == code:
            break
    else:
        print("Error: invalid report type")
        return

    # Expected file name pattern: <company>_<year>[_sus].pdf
    candidates = (
        comp + '_' + str(yr) + suffix + '.pdf'
        for yr in wanted_years
        for suffix in suffixes
    )
    return [os.path.join(file_path, name) for name in candidates if name in available]

Mounted at /content/drive
0 :  ExxonMobil
1 :  Shell plc
2 :  BP PLC
3 :  Saudi Aramco
4 :  Chevron
5 :  TotalEnergies
6 :  Valero Energy
7 :  Marathon Petroleum Corporation
8 :  Sinopec
9 :  PetroChina


In [None]:
# Company index 0, year 2022, annual report (rep_type 1); keep the first match.
file = get_reports(0, 2022, 1)[0]
file

'/content/drive/MyDrive/Capstone/Company Reports/ExxonMobil/ExxonMobil_2022.pdf'

## 2. Load and Split


In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import PyPDFium2Loader

# Loader for the report selected above; documents are produced by loader.load().
loader = PyPDFium2Loader(file)

# Separators are tried in order, coarsest first. The empty string always
# matches, so it must come last: any separator listed after it is never tried
# (which is what happened to "." when it followed "").
text_splitter = RecursiveCharacterTextSplitter(
    separators = ["\n\n", ".", " ", ""],
    chunk_size = 1000,
    chunk_overlap = 0
)

## 3. Store in Vector database

In [None]:
def load(file_path):
    """
    Return a FAISS vector store for the PDF at `file_path`.

    The index is looked up in a 'faiss' folder inside a directory named after
    the PDF without its extension. When that folder exists the stored index is
    loaded; otherwise the PDF is read, split with the notebook-level
    `text_splitter`, embedded with the notebook-level `embeddings`, and the new
    index is saved to that folder so that the next call can reuse it.

    file_path:  path of the PDF report
    ret:        FAISS vector store
    """
    vs_path_faiss = os.path.join(os.path.splitext(file_path)[0], 'faiss')
    if os.path.exists(vs_path_faiss):
        return FAISS.load_local(vs_path_faiss, embeddings)

    # from_documents() needs the split documents, not the splitter itself.
    documents = text_splitter.split_documents(PyPDFium2Loader(file_path).load())
    store = FAISS.from_documents(documents, embeddings)
    # Persist the index; without this the existence check above never succeeds.
    store.save_local(vs_path_faiss)
    return store

In [None]:
# Embedding model created with the wrapper's default settings.
embeddings = HuggingFaceEmbeddings()

# Read the report, cut it into chunks, then embed and index the chunks in FAISS.
raw_documents = loader.load()
data = text_splitter.split_documents(raw_documents)
vs_faiss = FAISS.from_documents(data, embeddings)

Downloading (…)a8e1d/.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)b20bca8e1d/README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Downloading (…)0bca8e1d/config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading (…)e1d/data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading (…)a8e1d/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

Downloading (…)8e1d/train_script.py:   0%|          | 0.00/13.1k [00:00<?, ?B/s]

Downloading (…)b20bca8e1d/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)bca8e1d/modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

## 4. QA Model

In [None]:
from langchain.llms import HuggingFacePipeline
from transformers import AutoTokenizer, pipeline, AutoModelForSeq2SeqLM, AutoModelForCausalLM

# Causal LM used to answer the questions asked about the report.
model_id_mistral = "mistralai/Mistral-7B-Instruct-v0.1"
tokenizer_mistral = AutoTokenizer.from_pretrained(model_id_mistral)
model_mistral = AutoModelForCausalLM.from_pretrained(model_id_mistral)

# The model's end-of-sequence id is reused as the padding id.
pipe_mistral = pipeline(
    task = "text-generation",
    tokenizer = tokenizer_mistral,
    model = model_mistral,
    pad_token_id = model_mistral.config.eos_token_id,
    max_length = 1500,
)

# Wrap the transformers pipeline so LangChain chains can call it as an LLM.
llm_mistral = HuggingFacePipeline(pipeline = pipe_mistral)

Downloading (…)okenizer_config.json:   0%|          | 0.00/1.47k [00:00<?, ?B/s]

Downloading tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/72.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

Downloading (…)model.bin.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading (…)l-00001-of-00002.bin:   0%|          | 0.00/9.94G [00:00<?, ?B/s]

Downloading (…)l-00002-of-00002.bin:   0%|          | 0.00/5.06G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading (…)neration_config.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

## 5. Retrieve and Store Relevant Contents

In [None]:
from langchain.chains.question_answering import load_qa_chain

def get_answer(doc, question):
    """
    Answer `question` with the Mistral LLM, using `doc` as context.

    doc:        documents passed to the QA chain as "input_documents"
    question:   question text
    ret:        the chain's 'output_text'
    """
    qa_chain = load_qa_chain(llm_mistral, chain_type="stuff")
    chain_inputs = {"input_documents": doc, "question": question}
    outputs = qa_chain(chain_inputs, return_only_outputs=True)
    return outputs['output_text']

In [None]:
# Key figures to extract from the report, asked one at a time.
questions = [
    """What is the company's net income?""",
    """What is the company's operating income?""",
    """What is the company's major project delivery?""",
    """What is the company's capital expenditure?""",
    """What is the company's ROACE?""",
]

# `res` is rebuilt from scratch on every run, so re-running this cell can
# never leave duplicate answers behind.
res = []
for question in questions:
    # The chunks returned by the similarity search are the QA context.
    doc = vs_faiss.similarity_search(question)
    res.append(get_answer(doc, question))

res

[" The company's net income is $2,415.",
 " The company's operating income is $3,259.",
 " The company's major project delivery is expected to drive leading business outcomes and is a continuation of the path that delivered industry-leading performance in 2022. The company's five-year plan is expected to drive leading business outcomes and is a continuation of the path that delivered industry-leading performance in 2022.",
 " The company's capital expenditure is $6,989 million.",
 " The company's ROACE (Return on Assets, Capital Employment, and Asset Turnover) is $23B."]

In [None]:
# Answers copied verbatim from the output of the question-answering cells above.
# NOTE(review): presumably hard-coded so the summarization cells below can run
# without loading Mistral again -- confirm. The values go stale if the report
# or the questions change.
res = [" The company's net income is $2,415.",
       " The company's operating income is $3,259.",
       " The company's major project delivery is expected to drive leading business outcomes and is a continuation of the path that delivered industry-leading performance in 2022. The company's five-year plan is expected to drive leading business outcomes and is a continuation of the path that delivered industry-leading performance in 2022.",
       " The company's capital expenditure is $6,989 million.",
       " The company's ROACE (Return on Assets, Capital Employment, and Asset Turnover) is $23B."]

## 6. Model FLAN-T5

In [None]:
from langchain.llms import HuggingFacePipeline
from transformers import AutoTokenizer, pipeline, AutoModelForSeq2SeqLM

# Sequence-to-sequence model used for the summary, hence the text2text task.
model_id = 'google/flan-t5-xxl'
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForSeq2SeqLM.from_pretrained(model_id)

pipe = pipeline(
    task = "text2text-generation",
    tokenizer = tokenizer,
    model = model,
    max_length = 2000,
)

# Wrap the transformers pipeline so LangChain chains can call it as an LLM.
llm = HuggingFacePipeline(pipeline = pipe)

Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

In [None]:
# Turn the answer strings into Document objects for the summarize chain.
# Note: this rebinds `doc`, which earlier held similarity-search results.
doc = text_splitter.create_documents(res)

In [None]:
doc

[Document(page_content="The company's net income is $2,415."),
 Document(page_content="The company's operating income is $3,259."),
 Document(page_content="The company's major project delivery is expected to drive leading business outcomes and is a continuation of the path that delivered industry-leading performance in 2022. The company's five-year plan is expected to drive leading business outcomes and is a continuation of the path that delivered industry-leading performance in 2022."),
 Document(page_content="The company's capital expenditure is $6,989 million."),
 Document(page_content="The company's ROACE (Return on Assets, Capital Employment, and Asset Turnover) is $23B.")]

In [None]:
from langchain.chains.summarize import load_summarize_chain
from langchain.prompts import PromptTemplate

# "\n" is the newline escape; "/n" would reach the model as literal text.
# {text} is filled by the chain with the content of the input documents.
prompt = """Summarize the performance of the company in a detailed and well-structured paragraph:\n{text}\nSUMMARY:"""
prompt_template = PromptTemplate(template=prompt, input_variables=["text"])

# verbose=True prints the fully formatted prompt, which makes it easy to check
# exactly what is sent to the model.
summary_chain = load_summarize_chain(llm=llm,
                                     chain_type='stuff',
                                     prompt=prompt_template,
                                     verbose=True)

summary_chain.run(doc)



[1m> Entering new StuffDocumentsChain chain...[0m


[1m> Entering new LLMChain chain...[0m
Prompt after formatting:
[32;1m[1;3mSummarize the performance of the company in a detailed and well-structured paragraph: /n The company's net income is $2,415.

The company's operating income is $3,259.

The company's major project delivery is expected to drive leading business outcomes and is a continuation of the path that delivered industry-leading performance in 2022. The company's five-year plan is expected to drive leading business outcomes and is a continuation of the path that delivered industry-leading performance in 2022.

The company's capital expenditure is $6,989 million.

The company's ROACE (Return on Assets, Capital Employment, and Asset Turnover) is $23B. /n SUMMARY:[0m

[1m> Finished chain.[0m

[1m> Finished chain.[0m


"The company's major project delivery is expected to drive leading business outcomes and is a continuation of the path that delivered industry-leading performance in 2022. The company's five-year plan is expected to drive leading business outcomes and is a continuation of the path that delivered industry-leading performance in 2022. The company's capital expenditure is $6,989 million. The company's ROACE (Return on Assets, Capital Employment, and Asset Turnover) is $23B."