In [None]:
# hide output
%%capture output

! pip install pdfplumber
! pip install chromadb
! pip install pymilvus
! pip install sentence-transformers
! pip install langchain
! pip install pypdf
! pip install faiss-gpu
! pip install happytransformer

In [None]:
! pip install faiss-gpu
! pip install happytransformer



In [None]:
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS

## 1. Load Data

In [None]:
import os
from google.colab import drive

# Mount Google Drive; everything this notebook reads lives under `path`.
drive.mount('/content/drive')
path = '/content/drive/MyDrive/Capstone/'

# Each entry of 'Company Reports' is one company. The index printed next to a
# name is the integer that get_reports() accepts in place of the name.
# NOTE(review): os.listdir() does not promise any particular order, so these
# indices may differ between environments.
companies = os.listdir(os.path.join(path, 'Company Reports'))
for position, company_name in enumerate(companies):
    print(position, ": ", company_name)


# get reports
def get_reports(comp, year:int, rep_type:int = 1):
    """
    Collect the paths of the report PDFs stored for one company.

    Relies on the module-level `path` (data root) and `companies` (folder
    names found under 'Company Reports').

    comp:       company folder name (str) or its index in `companies` (int)
    year:       0 selects every year from 2013 to 2022, 1-10 selects that many
                of the most recent years in that span, any other value is
                taken as one specific year
    rep_type:   report type, 1 for annual report, 2 for sustainability report
                ("_sus" file suffix), 0 for both
    ret:        list of full paths of the matching files that exist, ordered by
                year ascending (annual before sustainability within a year);
                None, after printing an error message, when `comp` or
                `rep_type` is invalid
    """
    # Resolve `comp` to a folder name; only exact str / int values are accepted.
    comp_kind = type(comp)
    if comp_kind is int:
        if not 0 <= comp < len(companies):
            print("Error: invalid index")
            return
        comp = companies[comp]
    elif comp_kind is not str:
        print("Error: invalid company")
        return
    elif comp not in companies:
        print("Error: ", comp, " does not exist")
        return

    report_dir = os.path.join(path, 'Company Reports', comp)
    available = sorted(os.listdir(report_dir), reverse=True)

    # Small values (0-10) mean "how many recent years"; anything else is a year.
    covered_years = range(2013,2023)
    if year not in range(11):
        wanted_years = [year]
    elif year:
        wanted_years = covered_years[-year:]
    else:
        wanted_years = covered_years

    # File-name suffixes that distinguish the report types.
    if rep_type == 1:
        suffixes = [""]
    elif rep_type == 2:
        suffixes = ["_sus"]
    elif rep_type == 0:
        suffixes = ["", "_sus"]
    else:
        print("Error: invalid report type")
        return

    # Build every candidate file name and keep those present in the folder.
    candidates = (
        comp + '_' + str(wanted_year) + suffix + '.pdf'
        for wanted_year in wanted_years
        for suffix in suffixes
    )
    return [
        os.path.join(report_dir, file_name)
        for file_name in candidates
        if file_name in available
    ]

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
0 :  ExxonMobil
1 :  Shell plc
2 :  BP PLC
3 :  Saudi Aramco
4 :  Chevron
5 :  TotalEnergies
6 :  Valero Energy
7 :  Marathon Petroleum Corporation
8 :  Sinopec
9 :  PetroChina


In [None]:
# Select the company by folder name rather than by position: positions come
# from os.listdir(), whose ordering is arbitrary, so index 9 is not guaranteed
# to keep pointing at PetroChina.
report_paths = get_reports('PetroChina', 2020, 0)

# get_reports returns None for invalid input and [] when no file matches;
# fail here with a clear message instead of on the indexing below.
assert report_paths, "No 2020 report found for PetroChina"

# First match for the requested year.
file = report_paths[0]
file

'/content/drive/MyDrive/Capstone/Company Reports/PetroChina/PetroChina_2020.pdf'

## 2. Load and Split


In [None]:
from langchain.document_loaders import PyPDFium2Loader

# Parse the selected report into LangChain Documents. Each Document carries a
# 'page' entry in its metadata, which a later cell uses as an index back into
# `all_splits`.
loader = PyPDFium2Loader(file_path=file)
all_splits = loader.load()

## 3. Store in Vector database

In [None]:
# Default HuggingFaceEmbeddings model (no model name is passed).
embeddings = HuggingFaceEmbeddings()

# Only the first 20 loaded documents are embedded and indexed.
# NOTE(review): presumably because the shareholder letter sits in the front
# matter of the report - confirm this holds for every company.
vs_faiss = FAISS.from_documents(documents=all_splits[:20], embedding=embeddings)

## 4. Retrieve and Store Relevant Contents

In [None]:
# Query text used to look up the page on which the letter to shareholders starts.
question = "Dear Shareholders"

# Keep only the single closest match from the indexed pages.
letter = vs_faiss.similarity_search(query=question, k=1)
letter

[Document(page_content='012 PETROCHINA COMPANY LIMITED\r\nCHANGES IN SHAREHOLDINGS AND INFORMATION ON SHAREHOLDERS\r\nUnit: Shares\r\nName of \r\nshareholders\r\nNature of \r\nshareholding Number of shares Capacity\r\nPercentage of such \r\nshares in the same \r\nclass of the issued \r\nshare capital (%)\r\nPercentage \r\nof total share \r\ncapital (%)\r\nCNPC\r\nA Shares 146,882,339,136 (L) Beneficial Owner 90.71 80.25\r\nH Shares 291,518,000 (L) (1)\r\nInterest of Corporation \r\nControlled by the \r\nSubstantial Shareholder 1.38 0.16\r\nBlackRock, Inc. (2) H Shares\r\n1,263,923,364 (L) Interest of Corporation \r\nControlled by the \r\nSubstantial Shareholder\r\n5.99 0.69\r\n492,000 (S) 0.00 0.00\r\n(L) Long position (S) Short position \r\nNotes: (1) 291,518,000 H shares (long position) were held by Fairy King Investments Limited, an overseas wholly-owned subsidiary of \r\nCNPC. CNPC is deemed to be interested in the H shares held by Fairy King Investments Limited.\r\n(2) Blackrock, 

In [None]:
# 'page' value recorded in the metadata of the best match; used below as the
# first index of the slice taken from `all_splits`.
best_match = letter[0]
start = best_match.metadata['page']

In [None]:
# Preview the three consecutive documents starting at the matched page.
letter_pages = all_splits[start:start + 3]
str(letter_pages)

"[Document(page_content='012 PETROCHINA COMPANY LIMITED\\r\\nCHANGES IN SHAREHOLDINGS AND INFORMATION ON SHAREHOLDERS\\r\\nUnit: Shares\\r\\nName of \\r\\nshareholders\\r\\nNature of \\r\\nshareholding Number of shares Capacity\\r\\nPercentage of such \\r\\nshares in the same \\r\\nclass of the issued \\r\\nshare capital (%)\\r\\nPercentage \\r\\nof total share \\r\\ncapital (%)\\r\\nCNPC\\r\\nA Shares 146,882,339,136 (L) Beneficial Owner 90.71 80.25\\r\\nH Shares 291,518,000 (L) (1)\\r\\nInterest of Corporation \\r\\nControlled by the \\r\\nSubstantial Shareholder 1.38 0.16\\r\\nBlackRock, Inc. (2) H Shares\\r\\n1,263,923,364 (L) Interest of Corporation \\r\\nControlled by the \\r\\nSubstantial Shareholder\\r\\n5.99 0.69\\r\\n492,000 (S) 0.00 0.00\\r\\n(L) Long position (S) Short position \\r\\nNotes: (1) 291,518,000 H shares (long position) were held by Fairy King Investments Limited, an overseas wholly-owned subsidiary of \\r\\nCNPC. CNPC is deemed to be interested in the H shares h

## 5. Model

### Llama 2

In [None]:
import os
from getpass import getpass

from langchain.llms import HuggingFacePipeline
from transformers import AutoTokenizer, pipeline, AutoModelForSeq2SeqLM, AutoModelForCausalLM

# SECURITY: never commit an access token to the notebook. The token is read
# from the HF_TOKEN environment variable, falling back to an interactive prompt
# whose input is not echoed or stored in the file.
# The token that used to be hard-coded here must be considered leaked and
# should be revoked in the Hugging Face account settings.
access_token = os.environ.get("HF_TOKEN") or getpass("Hugging Face access token: ")

# Tokenizer and causal language model for the Llama 2 chat checkpoint; the
# token is passed along to authorise the download.
model_id = "meta-llama/Llama-2-7b-chat-hf"
tokenizer = AutoTokenizer.from_pretrained(model_id, token = access_token)
model = AutoModelForCausalLM.from_pretrained(model_id, token = access_token)

# Text-generation pipeline with max_length set to 4096.
pipe = pipeline(
    "text-generation",
    model = model,
    tokenizer = tokenizer,
    max_length = 4096
)

# Wrap the pipeline so it can be used as the LLM of a LangChain chain.
llm_ = HuggingFacePipeline(pipeline = pipe)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



## 6. Result

In [None]:
from langchain.chains.summarize import load_summarize_chain
from langchain.prompts import PromptTemplate

# The separators are real newlines ("\n"); the previous template contained the
# literal text "/n", which was sent to the model verbatim.
# {text} is the placeholder the chain fills with the document contents.
prompt = """Summarize the company's performances and future plans according to the CEO's letter: \n {text} \n SUMMARY:"""
prompt_template = PromptTemplate(template=prompt, input_variables=["text"])

# Summarisation chain of type 'stuff' driven by the Llama 2 pipeline and the
# custom prompt above.
summary_chain = load_summarize_chain(llm=llm_,
                                     chain_type='stuff',
                                     prompt=prompt_template)

# Summarise the three consecutive documents starting at the matched page.
output = summary_chain.run(all_splits[start:start+3])
output

"\nAccording to the CEO's letter, the company's performance in 2020 was affected by the COVID-19 pandemic, but it managed to deliver a steady production and operation under control. The company also implemented requirements for high-quality development, coordinated the advancement of COVID-19 prevention and control, resumption of work and production, production and operation, and reform and innovation. The CEO highlighted the company's efforts to ensure the health and safety of employees while maintaining the stable and controlled advancement of production and operation, deepen the improvement of quality and profitability, and successfully complete the pipeline assets restructuring.\n\nThe CEO also provided information on shareholdings and information on shareholders, including the changes in shareholdings and information on shareholders as of December 31, 2020. The company's controlling shareholder is CNPC, and the ultimate controller is State-owned Assets Supervision and Administrati