In [1]:
# hide output
%%capture output

! pip install pdfplumber
! pip install chromadb
! pip install pymilvus
! pip install sentence-transformers
! pip install langchain
! pip install pypdf

In [2]:
! pip install faiss-gpu

Collecting faiss-gpu
  Downloading faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (85.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.5/85.5 MB[0m [31m20.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-gpu
Successfully installed faiss-gpu-1.7.2


In [3]:
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.document_loaders import PyPDFium2Loader
from langchain.llms import HuggingFacePipeline
from transformers import AutoTokenizer, pipeline, AutoModelForSeq2SeqLM
from langchain.chains.summarize import load_summarize_chain
from langchain.prompts import PromptTemplate

## 1. Load Data

In [4]:
import os
from google.colab import drive
# Mount Google Drive so the report PDFs under MyDrive become reachable.
drive.mount('/content/drive')
path = '/content/drive/MyDrive/Capstone/'


# One sub-folder per company. The printed position is the index accepted by
# get_reports; os.listdir gives no ordering guarantee, so the positions can
# differ between runs or machines.
companies = os.listdir(os.path.join(path, 'Company Reports'))
for idx in range(len(companies)):
    print(idx, ": ", companies[idx])


# get reports
def get_reports(comp, year:int, rep_type:int = 1):
    """
    Collect the paths of the report PDFs stored for one company.

    Files are expected to be named `<company>_<year>.pdf` (annual report) or
    `<company>_<year>_sus.pdf` (sustainability report) inside the company's
    folder under `Company Reports`.

    comp:       company folder name (str) or its position in `companies` (int)
    year:       0 selects every year from 2013 to 2022; 1..10 selects that many
                of the most recent years in that window; any other value is
                taken as one specific year
    rep_type:   1 for annual report, 2 for sustainability report, 0 for both
    ret:        list of full paths of the matching files that exist; on invalid
                input an error message is printed and None is returned
    """
    # Resolve `comp` to a folder name, rejecting unknown names and indices.
    kind = type(comp)
    if kind == int:
        if comp not in range(len(companies)):
            print("Error: invalid index")
            return
        comp = companies[comp]
    elif kind == str:
        if comp not in companies:
            print("Error: ", comp, " does not exist")
            return
    else:
        print("Error: invalid company")
        return

    report_dir = os.path.join(path, 'Company Reports', comp)
    available = sorted(os.listdir(report_dir), reverse=True)

    # Translate `year` into the concrete years to look for.
    window = range(2013, 2023)
    if year not in range(11):
        wanted_years = [year]
    elif year:
        wanted_years = window[-year:]
    else:
        wanted_years = window

    # Map the report-type code to the filename suffixes it stands for.
    for code, suffixes in ((0, ["", "_sus"]), (1, [""]), (2, ["_sus"])):
        if rep_type == code:
            break
    else:
        print("Error: invalid report type")
        return

    # Keep only the candidate filenames that are actually present on disk.
    candidates = (
        f"{comp}_{yr}{suffix}.pdf"
        for yr in wanted_years
        for suffix in suffixes
    )
    return [os.path.join(report_dir, name) for name in candidates if name in available]

Mounted at /content/drive
0 :  ExxonMobil
1 :  Shell plc
2 :  BP PLC
3 :  Saudi Aramco
4 :  Chevron
5 :  TotalEnergies
6 :  Valero Energy
7 :  Marathon Petroleum Corporation
8 :  Sinopec
9 :  PetroChina


In [5]:
# Look the company up by folder name rather than by list position: the
# positions printed earlier come from os.listdir, whose order is not guaranteed.
report_paths = get_reports('Chevron', 2022, 1)
# get_reports returns None on invalid input and [] when no file matches.
assert report_paths, "no 2022 annual report found for Chevron"

file = report_paths[0]
file

'/content/drive/MyDrive/Capstone/Company Reports/Chevron/Chevron_2022.pdf'

## 2. Model

In [None]:
from langchain.llms import HuggingFacePipeline
from transformers import AutoTokenizer, pipeline, AutoModelForSeq2SeqLM

# Checkpoint used for summarisation; tokenizer and weights are loaded by id.
# NOTE(review): this is a large multi-shard checkpoint loaded with default
# settings — confirm the runtime has enough memory for it.
model_id = 'google/flan-t5-xxl'
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForSeq2SeqLM.from_pretrained(model_id)

# Wrap model + tokenizer in a text2text-generation pipeline ...
pipe = pipeline(
    task="text2text-generation",
    model=model,
    tokenizer=tokenizer,
    max_length=2000,
)

# ... and expose that pipeline to LangChain as an LLM.
llm = HuggingFacePipeline(pipeline=pipe)

Loading checkpoint shards:   0%|          | 0/5 [00:00<?, ?it/s]

## 3. Functions

In [None]:
def getSplits(file):
  """Load the PDF at `file` with PyPDFium2Loader and return the loaded documents.

  file:   path of the PDF to read
  ret:    whatever `PyPDFium2Loader(file).load()` returns
          (presumably one document per page — confirm against the loader)
  """
  return PyPDFium2Loader(file).load()

def get_letter(all_splits, search_pages=20, query="Dear shareholders"):
  """Locate the document that best matches `query` near the start of a report.

  The first `search_pages` documents are embedded into a FAISS index and the
  single nearest neighbour of `query` is returned.

  all_splits:    documents of the report, as returned by getSplits
  search_pages:  how many leading documents to search (default 20)
  query:         text to search for (default "Dear shareholders")
  ret:           (letter, start) where `letter` is the one-element list of
                 matching documents and `start` is that document's
                 `metadata['page']` value
  """
  # Only the leading documents are indexed, which keeps embedding cheap.
  embeddings = HuggingFaceEmbeddings()
  vs_faiss = FAISS.from_documents(all_splits[:search_pages], embeddings)

  letter = vs_faiss.similarity_search(query, k=1)

  # NOTE(review): callers use `start` as an index into the document list, so
  # this assumes the page number equals the list position — confirm.
  start = letter[0].metadata['page']

  return letter, start

In [None]:
def getSummarization(letter, start, splits=None, n_pages=3):
  """Summarise the CEO letter that begins at document index `start`.

  letter:   matching document(s) from get_letter; currently not used here
  start:    index of the first document of the letter
  splits:   documents to slice from; defaults to the notebook-level
            `all_splits` so existing two-argument calls keep working
  n_pages:  number of consecutive documents to summarise (default 3)
  ret:      the text produced by the summarisation chain
  """
  # Real newlines ("\n") separate the instruction, the text and the answer
  # cue; the previous "/n" was inserted into the prompt literally.
  prompt = """What is the company's performances and future plans according to the CEO's letter: \n {text} \n SUMMARY:"""
  prompt_template = PromptTemplate(template=prompt, input_variables=["text"])

  summary_chain = load_summarize_chain(llm=llm,
                                       chain_type='stuff',
                                       prompt=prompt_template)

  docs = all_splits if splits is None else splits
  # NOTE(review): the 'stuff' chain presumably puts all selected documents into
  # one prompt; a recorded run warned that the input (2918 tokens) exceeded the
  # tokenizer's 512-token maximum — confirm the model handles this.
  output = summary_chain.run(docs[start:start+n_pages])
  return output

In [None]:
# Load the selected report; `all_splits` is also read as a global by getSummarization.
all_splits = getSplits(file)


In [None]:
# Find the document that best matches "Dear shareholders" and keep its page number.
letter, start = get_letter(all_splits)

In [None]:

# Summarise the documents starting at the detected letter page; the cell output is the summary.
getSummarization(letter, start)


Token indices sequence length is longer than the specified maximum sequence length for this model (2918 > 512). Running this sequence through the model will result in indexing errors
