## 0. Installation and Setup

In [None]:
# hide output
%%capture output

! pip install pdfplumber
! pip install chromadb
! pip install pymilvus
! pip install sentence-transformers
! pip install langchain
! pip install pypdf

## 1. Load Data
In LangChain, we use document loaders to load our data. We can simply import the loader we need from langchain.document_loaders according to the data source:
1. folder: DirectoryLoader
2. Azure: AzureBlobStorageContainerLoader
3. CSV file: CSVLoader
4. Google Drive: GoogleDriveLoader
5. Website: UnstructuredHTMLLoader
6. PDF: PyPDFLoader
7. Youtube: YoutubeLoader

For more data loaders, refer to the following link:
https://python.langchain.com/docs/modules/data_connection/document_loaders.html

In [None]:
import os
from google.colab import drive
# Access drive
# Mount Google Drive so the report folders under `path` become readable.
drive.mount('/content/drive')
# Root folder of the project data on the mounted drive.
path = '/content/drive/MyDrive/Capstone/'


# companies
# Every entry of the 'Company Reports' folder is treated as one company.
# The printed index is what get_reports() accepts as an integer `comp`.
# NOTE(review): os.listdir returns entries in arbitrary order, so these
# indices are presumably not stable across environments - confirm before
# hard-coding them elsewhere.
companies = os.listdir(os.path.join(path, 'Company Reports'))
for i, comp in enumerate(companies):
    print(i, ": ", comp)


# get reports
def get_reports(comp, year):
    """
    Return the report file paths of one company.

    The files of the company folder are sorted by file name in descending
    order before any selection is applied.

    comp:   company folder name (str) or index into ``companies`` (int)
    year:   int selector:
              0       -> every report
              1..10   -> the first ``year`` files of the descending listing
              other   -> only files whose name contains ``str(year)``
    ret:    list of report paths, or None (after printing an error message)
            when ``comp`` or ``year`` is invalid
    """
    if isinstance(comp, str):
        if comp not in companies:
            print("Error: ", comp, " does not exist")
            return None
    # bool is a subclass of int; exclude it so True/False are not silently
    # accepted as company indices.
    elif isinstance(comp, int) and not isinstance(comp, bool):
        if comp not in range(len(companies)):
            print("Error: invalid index")
            return None
        comp = companies[comp]
    else:
        print("Error: invalid company")
        return None

    if not isinstance(year, int) or isinstance(year, bool):
        print("Error: invalid year")
        return None

    file_path = os.path.join(path, 'Company Reports', comp)
    files = sorted(os.listdir(file_path), reverse=True)

    if 1 <= year <= 10:
        # Small positive numbers mean "this many files", not a calendar year.
        files = files[:year]
    elif year != 0:
        # Anything else is matched as a substring of the file name.
        files = [name for name in files if str(year) in name]
    return [os.path.join(file_path, name) for name in files]

Mounted at /content/drive
0 :  ExxonMobil
1 :  Shell plc
2 :  BP PLC
3 :  Saudi Aramco
4 :  Chevron
5 :  TotalEnergies
6 :  Valero Energy
7 :  Marathon Petroleum Corporation
8 :  Sinopec
9 :  PetroChina


In [None]:
! pip install img2table
! sudo apt install tesseract-ocr
! pip install pytesseract
import pytesseract
from img2table.document import PDF
from img2table.ocr import TesseractOCR

# Instantiation of the pdf
# get_reports(1, 2018) selects company index 1 and keeps the files whose name
# contains "2018"; only the first returned path is parsed here.
file = get_reports(1, 2018)
pdf = PDF(src=file[0])

# Instantiation of the OCR, Tesseract, which requires prior installation
ocr = TesseractOCR(lang="eng")

# Table identification and extraction
# NOTE(review): the recorded run of this cell ended with KeyboardInterrupt, so
# extraction is presumably slow on a full report - confirm before relying on it.
pdf_tables = pdf.extract_tables(ocr=ocr)
pdf_tables

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
tesseract-ocr is already the newest version (4.1.1-2.1build1).
0 upgraded, 0 newly installed, 0 to remove and 18 not upgraded.


KeyboardInterrupt: ignored

In [None]:
# Take a PDF as an example. This is helpful if we download the documents directly from the company website.
from langchain.document_loaders import PyPDFLoader

# Company index 1, files whose name contains "2018".
file = get_reports(1, 2018)

# Only the first matching report is loaded.
# NOTE(review): load_and_split presumably already chunks the document with a
# default splitter before the explicit split done later - confirm.
loader = PyPDFLoader(file[0])
data = loader.load_and_split()


# We can also use GitHub (website type) to store our original data.

# from langchain.document_loaders import WebBaseLoader

# loader = WebBaseLoader("https://drive.google.com/file/d/1EA8Iifu4kSIfziXAYz33P7Zon_u_beWb/view?usp=drive_link")
# data = loader.load()

## 2. Split the data
Once we loaded documents, we need to transform them to better suit our application. The simplest example is to split a long document into smaller chunks that can fit into our model's context window. The most common Splitter in LangChain includes:

1. RecursiveCharacterTextSplitter()
2. CharacterTextSplitter()

The parameters of the above functions:
 - length_function: how the length of chunks is calculated. Defaults to just counting number of characters, but it's pretty common to pass a token counter here.
 - chunk_size: the maximum size of your chunks (as measured by the length function).
 - chunk_overlap: the maximum overlap between chunks. It can be nice to have some overlap to maintain some continuity between chunks (e.g. do a sliding window).
 - add_start_index: whether to include the starting position of each chunk within the original document in the metadata.

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Split the loaded documents into chunks of at most 200 units (as measured by
# the splitter's length function) with no overlap between neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(chunk_size = 200, chunk_overlap = 0)
all_splits = text_splitter.split_documents(data)


## 3. Vectorstores
Since the model takes vectors rather than characters as input, we need to convert the text data into a vector space (embedding). There are already some useful vector databases such as ChromaDB, Milvus, pgvector...

Before we load the data into a vector database, we need a suitable embedding model. The Embeddings class is designed for interfacing with text embedding models. There are many embedding model providers (OpenAI, Cohere, Hugging Face, etc.).

https://python.langchain.com/en/latest/modules/indexes/vectorstores.html

In [None]:
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Chroma

# No model name is given, so the library's default embedding model is used.
embeddings = HuggingFaceEmbeddings()


# The chunks in all_splits were loaded with get_reports(1, 2018), so the
# persisted collection must be labelled with the same company index and year;
# otherwise the store for one company is saved under another company's name.
comp = companies[1]
year = 2018

comp_path = os.path.join(path, 'chroma_db', comp + '_' + str(year))

# Embed every chunk and persist the Chroma collection under comp_path.
vectorstore = Chroma.from_documents(all_splits, embeddings, persist_directory=comp_path)

## 4. Retrieve
Retrieve the relevant splits for any question using similarity search. There are several ways to do retrieval; a vectorstore with similarity_search is the most commonly used. We can also use the SVM Retriever.

In [None]:
question = "What's the upstream earnings after income tax in 2017?"

# Vectorstore + similarity_search
# Fetch the stored chunks most similar to the question.
docs = vectorstore.similarity_search(question)


## Another algo SVM Retriever
from langchain.retrievers import SVMRetriever

# Build an SVM-based retriever over the same chunks and embeddings.
# NOTE(review): the recorded run of this cell ended with KeyboardInterrupt,
# so this step is presumably slow on the full set of chunks - confirm.
svm_retriever = SVMRetriever.from_documents(all_splits, embeddings)
docs_svm=svm_retriever.get_relevant_documents(question)

KeyboardInterrupt: ignored

## 5. Model
The LLM we are using

In [None]:
from langchain.llms import HuggingFacePipeline
from transformers import AutoTokenizer, pipeline, AutoModelForSeq2SeqLM

# Load the tokenizer and seq2seq model weights for the chosen checkpoint.
model_id = 'google/flan-t5-large'
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForSeq2SeqLM.from_pretrained(model_id)

# Text-to-text generation pipeline around the model.
# max_length = 100 is passed to the pipeline (presumably the cap on the
# length of each generated answer - confirm against the transformers docs).
pipe = pipeline(
    "text2text-generation",
    model = model,
    tokenizer = tokenizer,
    max_length = 100
)

# Wrap the pipeline so LangChain chains can use it as their LLM.
llm = HuggingFacePipeline(pipeline = pipe)

Downloading (…)okenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

Downloading spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/662 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/3.13G [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

## 6. Generate Answer
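The key function of this part is RetrievalQA(). We need to feed our model, retriever and prompt into the function to create a Q&A object. The cell below uses the simpler load_qa_chain() with the "stuff" chain type on the documents retrieved above; a RetrievalQA() version is kept in the "Old code" section.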

For details on RetrievalQA, refer to
https://api.python.langchain.com/en/latest/chains/langchain.chains.retrieval_qa.base.RetrievalQA.html

In [None]:
from langchain.chains.question_answering import load_qa_chain

# Build a "stuff" question-answering chain around the LLM, run it on the
# previously retrieved docs, and display only the generated answer text.
chain = load_qa_chain(llm, chain_type="stuff")
chain({"input_documents": docs, "question": question}, return_only_outputs=True)['output_text']

'14,079'

## 7. Q&A Wrapper
The retriever is wrapped here.

In [None]:
# wrapper function
def get_answer(question):
    """Return the QA chain's answer text for *question*.

    The chunks most similar to the question are fetched from the
    module-level ``vectorstore`` and handed, together with the question,
    to the module-level ``chain``.
    """
    context_docs = vectorstore.similarity_search(question)
    payload = {"input_documents": context_docs, "question": question}
    return chain(payload, return_only_outputs=True)['output_text']

In [None]:
# Same question as in the retrieval step, now answered through the wrapper.
question = "What's the upstream earnings after income tax in 2017?"
get_answer(question)

'14,079'

## 8. Conversation


In [None]:
from langchain.chains import ConversationalRetrievalChain

# Conversational chain: each call receives the new question plus the earlier
# (question, answer) turns collected in chat_history.
# svm_retriever = SVMRetriever.from_documents(all_splits, embeddings)
qa = ConversationalRetrievalChain.from_llm(llm, vectorstore.as_retriever())

chat_history = []
while True:
    user_input = input('Send a question:').strip()
    # An empty line or "exit"/"quit" ends the chat; without this check the
    # loop could only be stopped by interrupting the kernel.
    if user_input.lower() in ('', 'exit', 'quit'):
        break
    # Only real questions are stored in `question`, so later cells that reuse
    # it never see the exit command.
    question = user_input
    # Use chat_history to store history data
    result = qa({'question': question, 'chat_history': chat_history})
    chat_history.append((question, result['answer']))
    print(result['answer'])

## 9. Old code

In [None]:
from langchain.chains.llm import LLMChain
from langchain.prompts import PromptTemplate
from langchain.llms import HuggingFacePipeline
from transformers import AutoTokenizer, pipeline, AutoModelForSeq2SeqLM
from langchain.chains.combine_documents.stuff import StuffDocumentsChain
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate

# Rebuild the flan-t5-large generation pipeline and wrap it for LangChain.
model_id = 'google/flan-t5-large'
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForSeq2SeqLM.from_pretrained(model_id)

pipe = pipeline(
    "text2text-generation",
    model=model,
    tokenizer=tokenizer,
    max_length=100
)

llm = HuggingFacePipeline(pipeline=pipe)


# Create Q&A object
# The prompt has two placeholders: {context} and {question}.
template = """Use the following pieces of context to answer the question at the end.
If you don't know the answer, just say that you don't know, don't try to make up an answer.
Use three sentences maximum and keep the answer as concise as possible.
Always say "thanks for asking!" at the end of the answer.
{context}
Question: {question}
Helpful Answer:"""
QA_CHAIN_PROMPT = PromptTemplate.from_template(template)



# This variant retrieves context with the SVM retriever built in the
# retrieval section and passes the custom prompt to the chain.
qa_chain = RetrievalQA.from_chain_type(
    llm,
    retriever = svm_retriever,
    chain_type_kwargs={"prompt": QA_CHAIN_PROMPT}
)

# Feed our question and get the answer.
# `question` is whatever the global currently holds from earlier cells.
result = qa_chain({"query": question})
result["result"]

'14,079 13,355 196 7,101 27,548'

In [None]:
from langchain.chains.llm import LLMChain
from langchain.prompts import PromptTemplate
from langchain.llms import HuggingFacePipeline
from transformers import AutoTokenizer, pipeline, AutoModelForSeq2SeqLM
from langchain.chains.combine_documents.stuff import StuffDocumentsChain
from langchain.chains import RetrievalQA
from langchain.prompts import PromptTemplate

# Rebuild the flan-t5-large generation pipeline and wrap it for LangChain.
model_id = 'google/flan-t5-large'
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForSeq2SeqLM.from_pretrained(model_id)

pipe = pipeline(
    "text2text-generation",
    model=model,
    tokenizer=tokenizer,
    max_length=100
)

llm = HuggingFacePipeline(pipeline=pipe)


# Create Q&A object
# The prompt has two placeholders: {context} and {question}.
template = """Use the following pieces of context to answer the question at the end.
If you don't know the answer, just say that you don't know, don't try to make up an answer.
Use three sentences maximum and keep the answer as concise as possible.
Always say "thanks for asking!" at the end of the answer.
{context}
Question: {question}
Helpful Answer:"""
QA_CHAIN_PROMPT = PromptTemplate.from_template(template)



# This variant retrieves context through the Chroma vectorstore's retriever
# and passes the custom prompt to the chain.
qa_chain = RetrievalQA.from_chain_type(
    llm,
    retriever=vectorstore.as_retriever(),
    chain_type_kwargs={"prompt": QA_CHAIN_PROMPT}
)

# Feed our question and get the answer.
# `question` is whatever the global currently holds from earlier cells.
result = qa_chain({"query": question})
result["result"]

'14,079 13,355 196 7,101 27,548'

Reference:
https://python.langchain.com/docs/use_cases/question_answering/#step-4-retrieve