<a href="https://colab.research.google.com/github/yohanesnuwara/energy-rag/blob/main/notebooks/RAG_OpenAI.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# RAG with OpenAI

In [None]:
!pip -q install openai==0.28
!pip -q install pypdf2
!pip -q install tiktoken

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m76.5/76.5 kB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m232.6/232.6 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m12.9 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import numpy as np
import pandas as pd
import tiktoken
import openai
import PyPDF2
import nltk
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [None]:
import os

# Read the key from the OPENAI_API_KEY environment variable so the secret is never
# stored in the notebook itself; falls back to an empty string when it is unset.
openai.api_key = os.environ.get("OPENAI_API_KEY", "")

In [None]:
def trim(text, n_start, n_end):
    """
    Return sentences n_start (inclusive) to n_end (exclusive) of `text` as one string.

    The text is split into sentences with nltk.sent_tokenize, the requested
    slice is taken, and the selected sentences are joined with single spaces.
    """
    return " ".join(nltk.sent_tokenize(text)[n_start:n_end])

# trim(training_data, 0, 2)

In [None]:
import pandas as pd

# Each entry pairs a report label (used as the chunk ID prefix) with the path of its PDF.
pdf_files = [
    ["Geological Report", "/content/Discovery_report.pdf"],
    ["Completion Report", "/content/COMPLETION_REPORT_1.PDF"],
    ["Drilling Report", "/content/DRILLING_PROGRAMME_1.PDF"]]

# Number of consecutive sentences grouped into one text chunk.
interval = 10

# One record per chunk; converted into a DataFrame once, after the loop.
records = []

for report_name, pdf_path in pdf_files:
    # The context manager closes the file handle even if PDF parsing fails.
    with open(pdf_path, "rb") as pdf_file:
        pdf_reader = PyPDF2.PdfReader(pdf_file)
        # Treat a page without extractable text as an empty string.
        text = "".join(page.extract_text() or "" for page in pdf_reader.pages)

    # Flatten line breaks before sentence splitting.
    training_data = text.replace("\n", " ")

    # Tokenize the document once and slice the result, instead of
    # re-tokenizing the whole document for every chunk.
    sentences = nltk.sent_tokenize(training_data)

    for start in range(0, len(sentences), interval):
        records.append({
            "Article_ID": f"{report_name}_{start}",
            "Text": " ".join(sentences[start:start + interval]),
        })

# Single DataFrame with a 0..n-1 index and the columns "Article_ID" and "Text".
df = pd.DataFrame(records, columns=["Article_ID", "Text"])


In [None]:
df

Unnamed: 0,Article_ID,Text
0,Geological Report_0,Denne rapport tilhører6 STATOIL LTEK DOK.SENTE...
1,Geological Report_10,This includes seismic and structural interpret...
2,Geological Report_20,. 74 5.6 Jurassic and Triassic sediment distri...
3,Geological Report_30,"progradation) 99 Figure 5.14 Isochore map, Bas..."
4,Geological Report_40,"Scale 1:50000. 4 Time map, Top Mesozoic Sandst..."
...,...,...
355,Drilling Report_650,no Volve F-15 & F-15A Page 52 of 66 B.1.2 ...
356,Drilling Report_660,No. 0 Well: 15/9-F-15 & F-15A Date 16.03.2...
357,Drilling Report_670,0 Well: 15/9-F-15 & F-15A Date 16.03.2009 ...
358,Drilling Report_680,no Volve F-15 & F-15A Page 62 of 66 C.1.3 ...


In [None]:
# Chat model name for answer generation (same value as the "model" entry of COMPLETIONS_API_PARAMS).
COMPLETIONS_MODEL = "gpt-3.5-turbo"
# Default embedding model used by get_embedding.
EMBEDDING_MODEL = "text-embedding-ada-002"

In [None]:
def get_embedding(text: str, model: str=EMBEDDING_MODEL) -> list[float]:
    """
    Request an embedding for `text` from the OpenAI Embeddings API.

    `model` selects the embedding model and defaults to EMBEDDING_MODEL.
    Returns the "embedding" field of the first item in the response's "data" list.
    """
    response = openai.Embedding.create(model=model, input=text)
    first_item = response["data"][0]
    return first_item["embedding"]

def compute_doc_embeddings(df: pd.DataFrame) -> dict[int, list[float]]:
    """
    Create an embedding for each row in the dataframe using the OpenAI Embeddings API.

    One get_embedding call is made per row, on the value of the row's "Text" column.

    Return a dictionary that maps each row's index label (an integer for the
    default 0..n-1 index) to the embedding vector of that row.
    """
    # Iterating the single column avoids building a full row Series per chunk, as iterrows() does.
    return {idx: get_embedding(text) for idx, text in df["Text"].items()}

def load_embeddings(df):
    """
    Build an embedding lookup from a dataframe of stored embeddings.

    `df` must have the columns "title" and "heading" plus one column per
    vector component named "0", "1", ... up to the last dimension.

    Returns a dictionary mapping (title, heading) to the list of component
    values of that row, ordered by dimension.
    """
    last_dim = max(int(name) for name in df.columns if name not in ("title", "heading"))
    component_columns = [str(dim) for dim in range(last_dim + 1)]

    embeddings = {}
    for _, row in df.iterrows():
        embeddings[(row.title, row.heading)] = [row[column] for column in component_columns]
    return embeddings

In [None]:
embed_df = compute_doc_embeddings(df)

# Show only how many chunks were embedded; displaying the dict itself would
# print every full-length vector into the notebook output.
len(embed_df)

In [None]:
# An example embedding: print the first key, the first five components of its
# vector, and the total number of components.
example_entry = list(embed_df.items())[0]
print(f"{example_entry[0]} : {example_entry[1][:5]}... ({len(example_entry[1])} entries)")

0 : [-0.006125323474407196, -0.008757434785366058, 0.012355233542621136, -0.03977102413773537, -0.007028259336948395]... (1536 entries)


In [None]:
import numpy as np

def vector_similarity(x: list[float], y: list[float]) -> float:
    """
    Compute the dot product of two vectors as their similarity score.

    OpenAI embeddings are normalized to length 1, so for them the dot product
    equals the cosine similarity.
    """
    lhs = np.array(x)
    rhs = np.array(y)
    return np.dot(lhs, rhs)

def order_document_sections_by_query_similarity(query: str, contexts: dict[int, list[float]]) -> list[tuple[float, int]]:
    """
    Embed the supplied query and rank all pre-calculated document embeddings against it.

    `contexts` maps a document key (the row index label produced by
    compute_doc_embeddings) to that document's embedding vector.

    Return a list of (similarity, key) pairs sorted in descending order;
    pairs with equal similarity are ordered by key, also descending.
    """
    query_embedding = get_embedding(query)

    scored_sections = [
        (vector_similarity(query_embedding, doc_embedding), doc_index)
        for doc_index, doc_embedding in contexts.items()
    ]
    # Tuples compare element-wise, so this sorts by similarity first, then by key.
    scored_sections.sort(reverse=True)

    return scored_sections

In [None]:
# Five best (similarity, row index) matches for a sample question.
order_document_sections_by_query_similarity("What is the lithology of the field?", embed_df)[:5]

[(0.8497804042188715, 57),
 (0.8453651125238638, 61),
 (0.8397181506960563, 60),
 (0.8374355996977627, 59),
 (0.8328094379563438, 72)]

In [None]:
import tiktoken

# Budget that construct_prompt compares the accumulated context length against.
MAX_SECTION_LEN = 500
# Bullet prefix placed in front of every context section in the prompt.
SEPARATOR = "\n* "
ENCODING = "cl100k_base"  # Suitable for gpt-3.5-turbo and gpt-4

encoding = tiktoken.get_encoding(ENCODING)
# Number of tokens the separator itself occupies.
separator_len = len(encoding.encode(SEPARATOR))

COMPLETIONS_API_PARAMS = {
    # Temperature of 0.9: fairly varied wording. Lower it for more repeatable answers.
    "temperature": 0.9,
    "max_tokens": 2000,
    # Reuse the constant defined earlier so the model name lives in one place.
    "model": COMPLETIONS_MODEL,
}

In [None]:
def construct_prompt(question: str, context_embeddings: dict, df: pd.DataFrame) -> str:
    """
    Build the prompt for `question` from the most relevant text chunks.

    Sections are ranked with order_document_sections_by_query_similarity and
    added, most similar first, until the MAX_SECTION_LEN token budget is used
    up. Each section is charged its real token count (measured with the
    module-level `encoding`) plus the separator's token count. A section that
    does not fit completely is cut down to the remaining budget and ends the
    selection, so the best match always contributes some context.

    Parameters:
        question: the user question, appended after the context.
        context_embeddings: mapping from row index label of `df` to embedding vector.
        df: dataframe with a "Text" column, indexed by the keys of `context_embeddings`.

    Returns:
        The instruction header, the selected context sections and the question as one string.
    """
    most_relevant_document_sections = order_document_sections_by_query_similarity(question, context_embeddings)

    chosen_sections = []
    remaining_tokens = MAX_SECTION_LEN

    for _, section_index in most_relevant_document_sections:
        # Every section costs its separator in addition to its own tokens.
        remaining_tokens -= separator_len
        if remaining_tokens <= 0:
            break

        section_text = df.loc[section_index, "Text"].replace("\n", " ")
        section_tokens = encoding.encode(section_text)

        if len(section_tokens) > remaining_tokens:
            # Keep only the part that still fits into the budget, then stop.
            truncated_text = encoding.decode(section_tokens[:remaining_tokens])
            chosen_sections.append(SEPARATOR + truncated_text)
            break

        chosen_sections.append(SEPARATOR + section_text)
        remaining_tokens -= len(section_tokens)

    header = """Answer the question as truthfully as possible using the provided context, and if the answer is not contained within the text below, say "I don't know."\n\nContext:\n"""

    return header + "".join(chosen_sections) + "\n\n Q: " + question + "\n A:"



In [None]:
def answer_query_with_context(
    query: str,
    df: pd.DataFrame,
    document_embeddings: dict[tuple[str, str], np.ndarray],
    show_prompt: bool = False
) -> str:
    """
    Answer `query` with the chat model, using context retrieved from `df`.

    The prompt is built by construct_prompt from the query, the document
    embeddings and the dataframe; when `show_prompt` is true it is printed
    first. The prompt is sent as the user message of a ChatCompletion request
    configured by COMPLETIONS_API_PARAMS.

    Returns the content of the first choice's message, stripped of
    surrounding whitespace.
    """
    prompt = construct_prompt(query, document_embeddings, df)

    if show_prompt:
        print("Constructed Prompt:\n", prompt)

    chat_messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": prompt},
    ]
    response = openai.ChatCompletion.create(messages=chat_messages, **COMPLETIONS_API_PARAMS)

    first_choice = response["choices"][0]
    return first_choice["message"]["content"].strip()

In [None]:
# Ask a sample question; show_prompt=True also prints the prompt that was sent to the model.
answer_query_with_context("is there any second attempt to perform second casing pressure?",
                          df, embed_df, show_prompt=True)

Constructed Prompt:
 Answer the question as truthfully as possible using the provided context, and if the answer is not contained within the text below, say "I don't know."

Context:

* When pressuring up the well to 75 bar a pressure drop of 1. 6 bar/10 min decreasing to 1.3 bar/10 min was experienced. Drilled out 20" casing shoetrack, spotted LCM pill and attempted to perform second casing pressure test to 75 bar without  success. This time having a 1 bar/10 min pressure drop. Decided to leave out the casing pressure test and instead perform sufficient FIT below the 20” csg shoe as a part of the 17 ½”  section, ref dispensation nr 76943. BIT RUNS   26" Smith Bits, Bit Type: XR+C (rerun) 2x22 1x20 1x18 TFA=1.298, IADC 115  S/N MY6318    DRILLING FLUID   The 26" section was drilled with seawater and hi-vis sweeps. At TD the well was circulated clean with 30 m 3 hi-vis, 30 m3 seawater and finally another 30 m3 hi-vis sweep before displacing  well to 1.40 SG mud. Due to tight spots while

'Yes, there was a second attempt to perform a second casing pressure test to 75 bar, but it was not successful.'

In [None]:
import json

# Save the embeddings dict as indented JSON.
# NOTE(review): JSON object keys are always strings, so integer keys are written
# as "0", "1", ...; convert them back when reloading if they are used as row labels.
with open("embeddings.json", "w", encoding="utf-8") as json_file:
    json.dump(embed_df, json_file, indent=4)

In [None]:
# Save the text chunks; with default arguments the DataFrame index is written as the first column.
df.to_csv("text-chunks.csv")