<a href="https://colab.research.google.com/github/yohanesnuwara/PetroRAG/blob/main/notebooks/RAG_for_Reports_v1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# PetroRAG v1



PetroRAG is a Retrieval-Augmented Generation (RAG) tool for question answering over oil and gas reports. It can load multiple files. This is the first generation, which uses the OpenAI service.

Credit: [Yohanes Nuwara](https://github.com/yohanesnuwara/PetroRAG)



In [14]:
# @title Setup (click Play to run)

!pip -q install openai==0.28
!pip -q install pypdf2
!pip -q install tiktoken

import numpy as np
import pandas as pd
import tiktoken
import openai
import PyPDF2
import nltk
import os
import glob
from tqdm import tqdm
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [15]:
# @title Input your OpenAI API Key
# @markdown Find here: https://www.howtogeek.com/885918/how-to-get-an-openai-api-key/

# The value typed into this form field is later assigned to openai.api_key in the
# "Process documents" cell. Leave the field empty when saving or sharing the
# notebook so that the key is not stored in the file.
API_key = ''  # @param {type: "string"}


In [23]:
# @title Process documents

# OpenAI API key (taken from the API_key form field of the previous cell)
openai.api_key = API_key

# Model setup
# Name of the chat model.
COMPLETIONS_MODEL = "gpt-3.5-turbo"
# Default model used by get_embedding() for both documents and queries.
EMBEDDING_MODEL = "text-embedding-ada-002"

# Encoding generator
# Budget that construct_prompt() compares its running section total against.
MAX_SECTION_LEN = 500
# Text prepended to every context section placed in the prompt.
SEPARATOR = "\n* "
# tiktoken encoding name, used further down to measure SEPARATOR in tokens.
ENCODING = "cl100k_base"  # Suitable for gpt-3.5-turbo and gpt-4


# Functions
def trim(text, n_start, n_end):
  """Return sentences ``n_start`` up to (not including) ``n_end`` of ``text``.

  The text is split into sentences with ``nltk.sent_tokenize`` and the selected
  slice is joined back together with single spaces. Bounds follow ordinary
  Python slice semantics, so a slice that selects nothing yields ``""``.
  """
  return " ".join(nltk.sent_tokenize(text)[n_start:n_end])

def get_embedding(text: str, model: str=EMBEDDING_MODEL) -> list[float]:
    """Request an embedding for ``text`` via ``openai.Embedding.create``.

    Args:
        text: The string to embed.
        model: Embedding model name; defaults to ``EMBEDDING_MODEL``.

    Returns:
        The ``"embedding"`` value of the first entry in the response's
        ``"data"`` list.
    """
    response = openai.Embedding.create(model=model, input=text)
    first_entry = response["data"][0]
    return first_entry["embedding"]

def compute_doc_embeddings(df: pd.DataFrame) -> dict[tuple[str, str], list[float]]:
    """
    Embed the ``Text`` value of every row of ``df`` with ``get_embedding``.

    A tqdm progress bar labelled "Computing embeddings" tracks the rows.

    Returns a dictionary whose keys are the row index labels of ``df`` and whose
    values are the embeddings returned for the corresponding rows.
    """
    rows = tqdm(df.iterrows(), total=len(df), desc="Computing embeddings")
    return {row_label: get_embedding(row.Text) for row_label, row in rows}

def vector_similarity(x: list[float], y: list[float]) -> float:
    """
    Return the dot product of ``x`` and ``y``.

    The dot product equals the cosine similarity only when both vectors have
    length 1; the original author notes that OpenAI embeddings are normalized
    that way, which is why no explicit normalization is done here.
    """
    left = np.array(x)
    right = np.array(y)
    return np.dot(left, right)

def order_document_sections_by_query_similarity(query: str, contexts: dict[(str, str), np.array]) -> list[(float, (str, str))]:
    """
    Rank the pre-computed document embeddings against ``query``.

    The query is embedded with ``get_embedding`` and scored against every
    entry of ``contexts`` with ``vector_similarity``.

    Returns a list of ``(similarity, key)`` tuples, where ``key`` is the key
    from ``contexts``, sorted in descending order (highest similarity first).
    """
    query_vector = get_embedding(query)

    scored_sections = []
    for section_key, section_vector in contexts.items():
        score = vector_similarity(query_vector, section_vector)
        scored_sections.append((score, section_key))

    # Tuples compare by score first, then by key, exactly like a reverse sorted().
    scored_sections.sort(reverse=True)
    return scored_sections

def construct_prompt(question: str, context_embeddings: dict, df: pd.DataFrame) -> str:
    """
    Build the prompt for ``question`` from the most similar document sections.

    Sections are ranked with ``order_document_sections_by_query_similarity``
    and added in that order. Every section is charged a flat 100 plus
    ``separator_len`` against ``MAX_SECTION_LEN``; selection stops at the first
    section that would push the running total over that limit.
    NOTE(review): the section's real token count is not measured, so the 100 is
    only a fixed per-section allowance.

    Returns the instruction header, the selected sections (each prefixed with
    ``SEPARATOR`` and with newlines replaced by spaces), and the question.
    """
    ranked_sections = order_document_sections_by_query_similarity(question, context_embeddings)

    budget_used = 0
    context_blocks = []

    for _score, row_label in ranked_sections:
        row = df.loc[row_label]

        # Flat per-section charge; see the note in the docstring.
        budget_used += 100 + separator_len
        if budget_used > MAX_SECTION_LEN:
            break

        context_blocks.append(SEPARATOR + row.Text.replace("\n", " "))

    header = """Answer the question as truthfully as possible using the provided context, and if the answer is not contained within the text below, say "I don't know."\n\nContext:\n"""
    question_part = "\n\n Q: " + question + "\n A:"

    return header + "".join(context_blocks) + question_part



def answer_query_with_context(
    query: str,
    df: pd.DataFrame,
    document_embeddings: dict[tuple[str, str], np.ndarray],
    show_prompt: bool = False
) -> str:
    """Answer ``query`` using context retrieved from the embedded documents.

    Args:
        query: The user's question.
        df: DataFrame holding the text chunks; passed on to ``construct_prompt``.
        document_embeddings: Pre-computed chunk embeddings; passed on to
            ``construct_prompt``.
        show_prompt: When True, print the constructed prompt before sending it.

    Returns:
        The content of the first choice's message, stripped of surrounding
        whitespace.
    """
    prompt = construct_prompt(
        query,
        document_embeddings,
        df
    )

    if show_prompt:
        print("Constructed Prompt:\n", prompt)

    # The model name and sampling settings all come from COMPLETIONS_API_PARAMS.
    response = openai.ChatCompletion.create(
        messages=[
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": prompt}
        ],
        **COMPLETIONS_API_PARAMS
    )

    # Return only the model's text; nothing is appended to the answer.
    return response["choices"][0]["message"]["content"].strip()

"""
Read files
"""

# Directory that is scanned for PDF reports
pdf_directory = "/content"

# Gather both extension spellings; the "*.PDF" matches are listed first
pdf_paths = []
for pattern in ("*.PDF", "*.pdf"):
    pdf_paths.extend(glob.glob(os.path.join(pdf_directory, pattern)))

# Pair every path with a sequential label: file1, file2, file3, ...
pdf_files = [[f"file{number}", path] for number, path in enumerate(pdf_paths, start=1)]

"""
Text chunking
"""

# Number of consecutive sentences grouped into one chunk
interval = 10

# One record per chunk; the DataFrame is built once after the loop instead of
# concatenating a single-row DataFrame per chunk.
chunk_records = []

# Iterate over pdf_files with tqdm for progress tracking
for f in tqdm(pdf_files, desc="Creating chunks of texts from reports"):
    company_name = f[0]

    # Read inside a context manager so the file handle is closed once the text
    # has been extracted.
    with open(f[1], "rb") as pdf_file:
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        # `or ""` keeps the join safe if a page yields no text at all.
        text = "".join(page.extract_text() or "" for page in pdf_reader.pages)

    # Clean and format document
    training_data = text.replace("\n", " ")

    # Tokenize once per document; chunks are slices of this list, so the whole
    # document is not re-tokenized for every chunk.
    sentences = nltk.sent_tokenize(training_data)

    for start in range(0, len(sentences), interval):
        chunk_text = " ".join(sentences[start:start + interval])
        # Article_ID is "<file label>_<index of the chunk's first sentence>"
        chunk_records.append({"Article_ID": f"{company_name}_{start}", "Text": chunk_text})

# Fail with an actionable message rather than an opaque pandas error when no
# PDF was found or none of them contained extractable text.
if not chunk_records:
    raise ValueError(
        f"No text chunks were produced from {len(pdf_files)} PDF file(s); "
        "upload PDF reports with extractable text and re-run this cell."
    )

# Single DataFrame with a fresh 0..n-1 index
df = pd.DataFrame(chunk_records, columns=["Article_ID", "Text"])
print("\n")

"""
Compute embeddings
"""

# Despite its name, embed_df is a dict that maps each row index of df to the
# embedding of that row's text.
embed_df = compute_doc_embeddings(df)

# Answer query

# Length of SEPARATOR in tokens; construct_prompt() reads separator_len when it is called
encoding = tiktoken.get_encoding(ENCODING)
separator_len = len(encoding.encode(SEPARATOR))

COMPLETIONS_API_PARAMS = {
    # Sampling temperature passed to the chat completion call. The earlier
    # comment claimed 0.5, but the configured value is 0.9.
    "temperature": 0.9,
    "max_tokens": 2000,
    # Use the constant from the model setup so the model is configured in one place.
    "model": COMPLETIONS_MODEL,
}

Creating chunks of texts from reports: 100%|██████████| 2/2 [00:13<00:00,  6.59s/it]






Computing embeddings: 100%|██████████| 161/161 [00:34<00:00,  4.61it/s]


In [44]:
# @title Ask the report
# @markdown Put your question or prompt here:

Question = 'can you describe the drilling activities?'  # @param {type: "string"}

# Process the prompt
# The "Process documents" cell must have been run first: it defines df (the text
# chunks) and embed_df (the dict of chunk embeddings). Because the call is the
# last expression of the cell, the returned answer string is shown as its output.
answer_query_with_context(Question, df, embed_df, show_prompt=False)

'The drilling activities involved drilling an 8½" hole from 3210m to 3473m, taking pressure points and circulating the hole clean, POOH with 8½ BHA to shoe at 2531m, laying out the same BHA, and drilling to total depth (TD) at 3498m. Additionally, activities included laying out an 8 1/2" drilling BHA and running the casing with various specifications and procedures such as cleaning, drifting, measuring, and doping at Statoil onshore base.'