In [None]:
# Date: 18.02.25
# Purpose: Testing manual set up of LLM as a judge v2
# Theme: Eval
# Status: WIP

In [None]:
# Note: includes experiments with retrieval (hybrid search) before LLM as judge (see below for reasons)

#Terms of reference: 
#1. **Relevance:** Does the response directly answer the query?
#2. **Faithfulness:** Is the response factually grounded in the provided documents?
#3. **Completeness:** Does the response cover all necessary details?
#4. **Conciseness:** Is the response free from unnecessary information?
#5. **Coherence:** Is the response well-structured and readable?

#Status checking: A logical (and common) approach is to compare the answer to the retrieved chunks to check for **Faithfulness**.
#That obviously pre-supposes the retrieved chunks hold the information to answer the question (otherwise comparison of retrieved to 
#answer means nothing). 

#So before starting on assessing **Faithfulness**, steps need to be taken to validate the retrieved chunks.

#Here are some steps to try to improve Chunk Recall
#- 1.0 Chunk Overlap & Adaptive Sizing
#Use overlapping chunks to reduce cases where key information is split across boundaries.
#Dynamically expand chunks if retrieval confidence is low.

#- 2.0 Hybrid Retrieval (BM25 + Embeddings)
#Combine dense vector search (embeddings) with BM25 (keyword-based) to improve recall.
#This balances semantic and exact-match retrieval.

#- 3.0 Query Expansion / Rewriting
#Automatically rewrite the query to include synonyms or more context.
#Use an LLM to generate multiple variations of the query.

#- 4.0 Multi-Hop Retrieval
#If a query requires multiple steps (e.g., cross-referencing different sections), use iterative retrieval instead of single-pass. 

In [None]:
# Lib:
# --- Install lib (1st set is for rag)
##!pip3 install --upgrade google-cloud-aiplatform
##!pip3 install ipython pandas[output_formatting] google-cloud-language==2.10.0
# !pip install langchain # 10.11.24 comment out as not sure using & trying to simplify code.
# !pip install langchain-community # 10.11.24 comment out as not sure using & trying to simplify code.

!pip install PyPDF2
!pip install pypdf
!pip install sentence-transformers
#!pip install torch
#!pip install pandas # 10.11.24 comment out as not sure using & trying to simplify code.
!pip install pdfplumber
#!pip install llama_index # 10.11.24 comment out as not sure using & trying to simplify code.
pip install pandas
pip install spacy
pip install rank_bm25
pip install markdown
#pip install google-generativeai langchain-community # wrong
pip install langchain-google-genai


In [None]:
# # RAG:

## Includes two parts.
# The 2nd part replaced the 1st. Now both commented out and replaced with next cell
# Which is the 2nd code here + overlap

# Part i
# # Spacy1
# # -- Do NOT Delete. Splitting by sentence and maintaining chunk size. --

# # Trying to replace above - 1st time using spacy.
# import os
# import pandas as pd
# import pdfplumber
# import spacy
# from spacy.lang.en import English

# # Load English language model and add sentencizer
# nlp = English()
# nlp.add_pipe('sentencizer')

# def process_pdf(filepath, max_chunk_size=1000):
#     """Processes a PDF, splits by sentences using spaCy, and returns a DataFrame"""
#     chunks = []
#     with pdfplumber.open(filepath) as pdf:
#         for page_number, page in enumerate(pdf.pages):
#             page_text = page.extract_text()
#             doc = nlp(page_text)  # Process text with spaCy
#             sentences = [str(sent).strip() for sent in doc.sents]  # Extract sentences

#             chunk_text = ""
#             for sentence in sentences:
#                 # Combine sentences within chunk limit
#                 if len(chunk_text + sentence) <= max_chunk_size:
#                     chunk_text += " " + sentence
#                 else:
#                     # Add current chunk and reset
#                     chunks.append({
#                         "page_number": page_number+1,  # Page number is 1-indexed
#                         "chunk_text": chunk_text.strip(),
#                         "chunk_length": len(chunk_text.strip())
#                     })
#                     chunk_text = sentence
#             # Add remaining text in the last chunk
#             if chunk_text:
#                 chunks.append({
#                     "page_number": page_number+1,
#                     "chunk_text": chunk_text.strip(),
#                     "chunk_length": len(chunk_text.strip())
#                 })
#     return pd.DataFrame(chunks)

# def process_pdf_directory(directory_path, max_chunk_size=1000):
#     """Processes all PDFs in a directory and combines results into a single DataFrame"""
#     all_chunks = []
#     for filename in os.listdir(directory_path):
#         if filename.endswith(".pdf"):
#             file_path = os.path.join(directory_path, filename)
#             pdf_df = process_pdf(file_path, max_chunk_size)
#             all_chunks.append(pdf_df)
#     return pd.concat(all_chunks, ignore_index=True)

# # Example usage
# #directory_path = "/kaggle/input/payment-hltc"
# directory_path = "/content/Untitled Folder"
# max_chunk_size = 1000
# all_chunks_df = process_pdf_directory(directory_path, max_chunk_size)

# display(all_chunks_df.head())






# Part ii
# # -- Replaces above: Key difference being this includes the doc name:
# import os
# import pandas as pd
# import pdfplumber
# import spacy
# from spacy.lang.en import English

# # Load English language model and add sentencizer
# nlp = English()
# nlp.add_pipe('sentencizer')

# def process_pdf(filepath, doc_name, max_chunk_size=1000):
#     """Processes a PDF, splits by sentences using spaCy, and returns a DataFrame"""
#     chunks = []
#     with pdfplumber.open(filepath) as pdf:
#         for page_number, page in enumerate(pdf.pages):
#             page_text = page.extract_text()
#             doc = nlp(page_text)  # Process text with spaCy
#             sentences = [str(sent).strip() for sent in doc.sents]  # Extract sentences

#             chunk_text = ""
#             for sentence in sentences:
#                 # Combine sentences within chunk limit
#                 if len(chunk_text + sentence) <= max_chunk_size:
#                     chunk_text += " " + sentence
#                 else:
#                     # Add current chunk and reset
#                     chunks.append({
#                         "document_name": doc_name,  # Include document name
#                         "page_number": page_number + 1,  # Page number is 1-indexed
#                         "chunk_text": chunk_text.strip(),
#                         "chunk_length": len(chunk_text.strip())
#                     })
#                     chunk_text = sentence

#             # Add remaining text in the last chunk
#             if chunk_text:
#                 chunks.append({
#                     "document_name": doc_name,  # Include document name
#                     "page_number": page_number + 1,
#                     "chunk_text": chunk_text.strip(),
#                     "chunk_length": len(chunk_text.strip())
#                 })

#     return pd.DataFrame(chunks)

# def process_pdf_directory(directory_path, max_chunk_size=1000):
#     """Processes all PDFs in a directory and combines results into a single DataFrame"""
#     all_chunks = []
#     for filename in os.listdir(directory_path):
#         if filename.endswith(".pdf"):
#             file_path = os.path.join(directory_path, filename)
#             pdf_df = process_pdf(file_path, filename, max_chunk_size)  # Pass filename as document name
#             all_chunks.append(pdf_df)
#     return pd.concat(all_chunks, ignore_index=True)

# # Example usage
# # directory_path = "/kaggle/input/payment-hltc"
# directory_path = "data/pdf"
# max_chunk_size = 1000
# all_chunks_df = process_pdf_directory(directory_path, max_chunk_size)

# display(all_chunks_df.head())


In [None]:
# Copy of the last set of code above, but checking if overlap can be added?
import os
import pandas as pd
import pdfplumber
import spacy
from spacy.lang.en import English

# Sentence splitter: a bare English pipeline with only the rule-based
# 'sentencizer' component attached (no other pipeline component is added here).
nlp = English()
nlp.add_pipe("sentencizer")

def _overlap_tail(text, chunk_overlap):
    """Return at most `chunk_overlap` trailing characters of `text`, starting on a word boundary."""
    if chunk_overlap <= 0 or not text:
        return ""
    tail = text[-chunk_overlap:]
    # A raw character slice usually starts in the middle of a word. That fragment is
    # noise for both keyword and embedding search, so drop it.
    starts_mid_word = (
        len(text) > chunk_overlap
        and not text[-chunk_overlap - 1].isspace()
        and not tail[0].isspace()
    )
    if starts_mid_word:
        pieces = tail.split(None, 1)
        tail = pieces[1] if len(pieces) == 2 else ""
    return tail.strip()


def _chunk_sentences(sentences, max_chunk_size, chunk_overlap):
    """Greedily pack sentences into chunk strings, carrying an overlap between consecutive chunks."""
    chunk_texts = []
    current = ""
    for sentence in sentences:
        candidate = f"{current} {sentence}".strip()
        if len(candidate) <= max_chunk_size:
            current = candidate
            continue
        if current:
            # Flush the full chunk, then seed the next one with its tail.
            chunk_texts.append(current)
            current = f"{_overlap_tail(current, chunk_overlap)} {sentence}".strip()
        else:
            # A single sentence longer than max_chunk_size: keep it whole rather
            # than emitting an empty chunk in front of it.
            current = sentence
    if current:
        chunk_texts.append(current)
    return chunk_texts


def process_pdf(filepath, doc_name, max_chunk_size=1000, chunk_overlap=200):
    """Split a PDF into overlapping, sentence-aligned text chunks.

    Each page is processed on its own: its text is split into sentences with the
    module-level spaCy pipeline `nlp`, and the sentences are packed into chunks
    of roughly `max_chunk_size` characters. Chunks never span two pages.

    Args:
        filepath: Path of the PDF, opened with pdfplumber.
        doc_name: Value stored in the `document_name` column of every row.
        max_chunk_size: Target maximum chunk length in characters. A chunk can
            exceed it when a single sentence (plus the carried overlap) is longer.
        chunk_overlap: Number of trailing characters of a finished chunk that are
            repeated at the start of the next one, trimmed to a word boundary.
            Values <= 0 disable the overlap.

    Returns:
        A DataFrame with the columns `document_name`, `page_number` (1-indexed),
        `chunk_text` and `chunk_length`. The columns are present even when the
        PDF yields no text, and no row has an empty `chunk_text`.
    """
    columns = ["document_name", "page_number", "chunk_text", "chunk_length"]
    chunks = []
    with pdfplumber.open(filepath) as pdf:
        for page_index, page in enumerate(pdf.pages):
            page_text = page.extract_text()
            if not page_text:
                continue  # Skip pages without extractable text

            doc = nlp(page_text)
            sentences = [str(sent).strip() for sent in doc.sents if sent.text.strip()]

            for text in _chunk_sentences(sentences, max_chunk_size, chunk_overlap):
                chunks.append({
                    "document_name": doc_name,
                    "page_number": page_index + 1,  # pages are reported 1-indexed
                    "chunk_text": text,
                    "chunk_length": len(text),
                })

    return pd.DataFrame(chunks, columns=columns)

def process_pdf_directory(directory_path, max_chunk_size=1000, chunk_overlap=200):
    """Chunk every PDF in a directory and combine the results into one DataFrame.

    Args:
        directory_path: Directory to scan (not recursive).
        max_chunk_size: Passed through to `process_pdf`.
        chunk_overlap: Passed through to `process_pdf`.

    Returns:
        The per-file DataFrames from `process_pdf`, concatenated with a fresh
        0..n-1 index. Files are processed in sorted filename order so the row
        order is reproducible between runs.

    Raises:
        ValueError: If the directory contains no PDF files.
    """
    # os.listdir() returns entries in arbitrary order; sort for a stable row order.
    # The extension check is case-insensitive so "REPORT.PDF" is not silently skipped.
    pdf_names = sorted(
        name for name in os.listdir(directory_path) if name.lower().endswith(".pdf")
    )
    if not pdf_names:
        # pd.concat([]) would raise a ValueError that does not mention the cause.
        raise ValueError(f"No PDF files found in directory: {directory_path}")

    all_chunks = [
        process_pdf(os.path.join(directory_path, name), name, max_chunk_size, chunk_overlap)
        for name in pdf_names
    ]
    return pd.concat(all_chunks, ignore_index=True)

# Chunking run: source folder, target chunk size and overlap (both in characters).
directory_path, max_chunk_size, chunk_overlap = "data/pdf", 1000, 200
all_chunks_df = process_pdf_directory(
    directory_path,
    max_chunk_size=max_chunk_size,
    chunk_overlap=chunk_overlap,
)

# Preview the first five chunks.
display(all_chunks_df.iloc[:5])


In [None]:
# NOTE(review): looks like a leftover smoke-test cell (nothing else in the notebook
# depends on this output) - presumably safe to delete; confirm.
print('heelloo')

In [None]:
# Derive a separate frame from the chunk table and expose the chunk text under
# the 'page_content' column name; all_chunks_df itself is left untouched.
df1 = all_chunks_df.assign(page_content=all_chunks_df['chunk_text'])
df1.head(1)

In [None]:
# Embedding:
# -- Embed every chunk once and attach the vectors to the chunk table as df3.
pd.set_option("display.max_colwidth", 50)
#!pip install --upgrade sentence-transformers
#import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer

# Load the Sentence Transformer model
model_trans = SentenceTransformer('sentence-transformers/all-mpnet-base-v2')

# Fail fast with a clear message if the text column is missing, instead of printing
# a warning and then crashing on the next line anyway.
if 'page_content' not in df1.columns:
    raise KeyError("Error: 'page_content' column not found in df1.")

# Encode the chunk text a single time (this is the expensive step). A plain list of
# strings is passed rather than the pandas Series.
embeddings = model_trans.encode(df1['page_content'].tolist())
#embeddings = model_trans.encode(df1['filtered_text'])

# Store one flat (1-D) vector per row in an 'embeddings' column; df1 is not modified.
df3 = df1.assign(
    embeddings=pd.Series([np.ravel(vector) for vector in embeddings], index=df1.index)
)
display(df3.head(3))

In [None]:
# QA check: list the distinct source documents present in the embedded chunk table.
qadf3 = df3.loc[:, ['document_name']]
qadf3unique = qadf3['document_name'].unique()
print(qadf3unique)

In [None]:
# Model
import google.generativeai as genai
from dotenv import load_dotenv
import os

# Load variables from a local .env file (if one exists) into the environment;
# importing load_dotenv without calling it has no effect.
load_dotenv()
api_key = os.getenv('GOOGLE_API_KEY')
# Hand the key to the SDK explicitly. When no key is found the SDK's own default
# credential lookup is left untouched.
if api_key:
    genai.configure(api_key=api_key)

# temperature 0 keeps generations as repeatable as possible for evaluation runs.
generation_config = {
    "temperature": 0,
    "top_p": 0.95,
    "top_k": 20,
    "max_output_tokens": 8192,
    "response_mime_type": "text/plain",
}
model = genai.GenerativeModel(
    model_name="gemini-1.5-flash",
    generation_config=generation_config,
)
chat_session = model.start_chat(history=[])
# response = chat_session.send_message(prompt)
# response.text.strip()


##### Hybrid retrieval

In [None]:
# Uses BM25 + embedding search via code i developed.
# Langchain offers an ensemble approach, which abstracts away code, but granularity of control is lost
# ----- Langchain approach:
# ensemble_retriever = EnsembleRetriever(retrievers=[bm25_retriever, vector_store_retriever], weights=[0.5, 0.5])
# source: https://python.langchain.com/docs/concepts/retrievers/

In [None]:
# Adding the BM25, so set up - 2.0 Hybrid Retrieval (BM25 + Embeddings)
# ---------------------------------------------------------------------

from rank_bm25 import BM25Okapi
from sklearn.preprocessing import normalize
from sentence_transformers import SentenceTransformer
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

# Build the BM25 keyword index over the chunk text held in df3["chunk_text"].
def _tokenize(text):
    """Simple tokenization: lower-case the text and split it on whitespace."""
    return text.lower().split()

df3["tokenized_text"] = df3["chunk_text"].map(_tokenize)
# Initialize BM25 model with one token list per chunk
bm25 = BM25Okapi(list(df3["tokenized_text"]))


In [None]:
# Merging of BM25 and my existing embedding code
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import normalize

import textwrap
import markdown
from IPython.display import display, Markdown

pd.set_option("display.max_colwidth", 200)

###### --------------------- BM25 --------------------------
# User query
#user_query = "Looking at my Bendigo home loan, when is interest debited to your loan account?"
#user_query = "Do you offer off-set facilities with your loans & how do they work?"
user_query = "Can i make additional repayments?"

# 1) BM25 (keyword) score of every chunk; the query is tokenised with the same
#    lower-case / whitespace split used for the corpus.
tokenized_query = user_query.lower().split()
bm25_scores = np.asarray(bm25.get_scores(tokenized_query), dtype=float)

# ###### --------------------- my code --------------------------
# 2) Embedding (semantic) score of every chunk.
# Assuming 'model_trans' is a pre-loaded SentenceTransformer model
question_embedding = model_trans.encode(user_query, convert_to_tensor=True).cpu().numpy()

# Normalize document embeddings (if needed)
normalized_embeddings = normalize(df3['embeddings'].tolist(), axis=1)

# Calculate cosine similarity between the user's question and each document's embedding
cos_sim = cosine_similarity([question_embedding], normalized_embeddings)[0]

# # ------------------------ Inserting BM25 ------------------------------------
# 3) Hybrid scoring (weighted combination).
# Raw BM25 scores are unbounded while cosine similarity lies in [-1, 1], so mixing
# them directly lets BM25 dominate regardless of the weight. Rescale BM25 to [0, 1]
# first so lambda_weight really expresses the keyword-vs-semantic balance.
bm25_span = bm25_scores.max() - bm25_scores.min()
if bm25_span > 0:
    bm25_scaled = (bm25_scores - bm25_scores.min()) / bm25_span
else:
    # No query term matched any chunk (or all scores are equal): BM25 adds no signal.
    bm25_scaled = np.zeros_like(bm25_scores)

# NOTE(review): 0.1 was presumably chosen against raw BM25 scores - re-tune now that
# both signals are on a comparable scale.
lambda_weight = 0.1  # Adjust this to balance BM25 vs. Embeddings
hybrid_scores = lambda_weight * bm25_scaled + (1 - lambda_weight) * cos_sim

# Retrieve Top-K Results
top_k = 20
most_similar_indices = np.argsort(hybrid_scores)[::-1][:top_k]
most_similar_rows = df3.iloc[most_similar_indices]

# -----The original code ------
# most_similar_indices = np.argsort(cos_sim)[::-1][:15]
# most_similar_rows = df3.iloc[most_similar_indices]
# # ----------------------------------------------------------------------------

# Show the three best-scoring chunks with their source document and page.
qachunk = most_similar_rows[['document_name', 'page_number', 'chunk_text']]
display(qachunk.head(3))

# Keep the question and the retrieved chunks clearly separated in the prompt;
# plain concatenation glued the question onto the first chunk.
context = "\n\n".join(most_similar_rows["chunk_text"].tolist())
prompt = f"Question: {user_query}\n\nContext:\n{context}"

# Single-turn call: the shared chat_session accumulates history, so answers from
# earlier experiments would leak into this one and skew the comparison.
response = model.generate_content(prompt)
#wrapped_text = textwrap.fill(response.text.strip(), width=110)
#print(wrapped_text)
display(Markdown(response.text))


##### Embedding retrieval

In [None]:
# Embedding-only retrieval (my pre-BM25 retrieval code), kept so its results can be
# compared with the hybrid version above.
import markdown
from IPython.display import display, Markdown

question_embedding = model_trans.encode(user_query, convert_to_tensor=True).cpu().numpy()
# Normalize document embeddings (if needed)
normalized_embeddings = normalize(df3['embeddings'].tolist(), axis=1)
# Calculate cosine similarity between the user's question and each document's embedding
cos_sim = cosine_similarity([question_embedding], normalized_embeddings)[0]
# Get the indices of the 12 most similar chunks
#most_similar_indices = np.argsort(cos_sim)[::-1][:5]
most_similar_indices = np.argsort(cos_sim)[::-1][:12]
most_similar_rows = df3.iloc[most_similar_indices]
qachunk2 = most_similar_rows[['document_name', 'page_number', 'chunk_text']]
display(qachunk2.head(3))

# Keep the question and the retrieved chunks clearly separated in the prompt;
# plain concatenation glued the question onto the first chunk.
context = "\n\n".join(most_similar_rows["chunk_text"].tolist())
prompt = f"Question: {user_query}\n\nContext:\n{context}"

# Single-turn call: sending this through the shared chat_session would include the
# hybrid run's prompt and answer as history, so the two approaches could not be
# compared fairly.
response = model.generate_content(prompt)
# response.text.strip()
# import textwrap
# wrapped_text = textwrap.fill(response.text.strip(), width=110)
display(Markdown(response.text))

In [None]:
# LLM as a judge prompt - v1
import markdown
from IPython.display import display, Markdown

judge = """You are an AI assistant evaluating the quality of an answer generated by a retrieval-augmented generation (RAG) system. Your task is to assess how well the generated answer aligns with the retrieved content.
Instructions:
Compare the Generated Answer with the Retrieved Content

Identify whether all factual claims in the answer are supported by the retrieved content.
Determine if any part of the answer introduces information not found in the retrieved content.
Check for contradictions or inconsistencies.
Faithfulness Rating (1-5):

1 - Completely unfaithful: The answer contains major factual inaccuracies, hallucinations, or contradictions.
2 - Mostly unfaithful: The answer has some correct details but includes substantial information not found in the retrieved content.
3 - Partially faithful: The answer is mostly correct but includes minor unsupported details or slight misinterpretations.
4 - Mostly faithful: The answer is well-grounded in the retrieved content but may have minor rewording or slight extrapolations.
5 - Completely faithful: The answer is fully supported by the retrieved content with no additional or misleading information.
Justification for the Score:

Highlight specific parts of the answer that are directly supported by the retrieved content.
Point out any parts that are unsupported, incorrect, or misleading.
Explain why the given faithfulness rating was chosen.
"""

# Label the two inputs with the terms the rubric itself uses ("Generated Answer",
# "Retrieved Content") and separate the sections with blank lines; concatenating
# them without delimiters ran the answer, the context and the rubric together.
res = "Generated Answer:\n" + response.text
con = "Retrieved Content:\n" + context
prompt = "\n\n".join([judge, res, con])

# Judge in a single-turn call: the shared chat_session already holds the earlier
# generation turns, and the judge should only see what is in this prompt.
responsej = model.generate_content(prompt)
# response.text.strip()
# import textwrap
# wrapped_text = textwrap.fill(response.text.strip(), width=110)
display(Markdown(responsej.text))

In [None]:
# LLM as a judge prompt - v2 (asks for a table linking the answer to the context)
import markdown
from IPython.display import display, Markdown

judge = """You are an AI assistant evaluating the quality of an answer generated by a retrieval-augmented generation (RAG) system. Your task is to assess how well the generated answer aligns with the retrieved content.
Instructions:
Compare the Generated Answer with the Retrieved Content

Identify whether all factual claims in the answer are supported by the retrieved content.
Determine if any part of the answer introduces information not found in the retrieved content.
Check for contradictions or inconsistencies.
Faithfulness Rating (1-5):

1 - Completely unfaithful: The answer contains major factual inaccuracies, hallucinations, or contradictions.
2 - Mostly unfaithful: The answer has some correct details but includes substantial information not found in the retrieved content.
3 - Partially faithful: The answer is mostly correct but includes minor unsupported details or slight misinterpretations.
4 - Mostly faithful: The answer is well-grounded in the retrieved content but may have minor rewording or slight extrapolations.
5 - Completely faithful: The answer is fully supported by the retrieved content with no additional or misleading information.
Justification for the Score:

Provide a table with key parts of the response linked to the relevant sections of the context.
Explain why the given faithfulness rating was chosen.
"""

# Label the two inputs with the terms the rubric itself uses ("Generated Answer",
# "Retrieved Content") and separate the sections with blank lines; concatenating
# them without delimiters ran the answer, the context and the rubric together.
res = "Generated Answer:\n" + response.text
con = "Retrieved Content:\n" + context
prompt = "\n\n".join([judge, res, con])

# Judge in a single-turn call: the shared chat_session already holds earlier turns
# (including any previous judge run), and the judge should only see this prompt.
responsej = model.generate_content(prompt)
# response.text.strip()
# import textwrap
# wrapped_text = textwrap.fill(response.text.strip(), width=110)
display(Markdown(responsej.text))

In [None]:
import google.generativeai
from langchain_google_genai import ChatGoogleGenerativeAI
# LangChain chat wrapper around Gemini; settings gathered in one place so they are easy to tweak.
llm_settings = {
    "model": "gemini-1.5-flash",
    "temperature": 0,
    "max_retries": 2,
}
llm = ChatGoogleGenerativeAI(**llm_settings)

In [None]:
# Langchain
from langchain_google_genai import ChatGoogleGenerativeAI
from langchain.evaluation import load_evaluator

# Gemini 1.5 Flash as the judge model. temperature=0 keeps repeated evaluations
# of the same answer as consistent as possible.
llm = ChatGoogleGenerativeAI(model="gemini-1.5-flash", temperature=0)

# Load an evaluator for faithfulness. "labeled_criteria" grades the prediction
# against a reference text, which here is the retrieved content.
evaluator = load_evaluator("labeled_criteria", llm=llm, criteria={"faithfulness": "Is the answer grounded in the retrieved content?"})

# Example evaluation. String evaluators are called through evaluate_strings():
#   input      - the question that was asked
#   prediction - the generated answer being judged
#   reference  - the retrieved content the answer must be grounded in
eval_result = evaluator.evaluate_strings(
    input="What drove the company's revenue increase in Q4?",
    prediction="The company had a 10% revenue increase due to tech and healthcare.",  # Generated answer
    reference="The company reported a 10% revenue increase in Q4, driven by strong sales in the technology sector.",  # Retrieved content
)

print(eval_result)



In [None]:
# (original code - preBM25)
# # Retrieval:
# # -- Above 3 cells combined to function (but keeping above 3 if I need to work on the separately)

# # Above cell, converted to function
# # import numpy as np
# import pandas as pd
# pd.set_option('display.max_colwidth', 600)
# from sentence_transformers import SentenceTransformer


# #user = "Are you allowed to change the terms and conditions"
# #user = "What is the eligibility of the Easy Money Card?"
# #user = "Can the Youth Debit Mastercard be used to purchase beer?"
# #user = "Can the Youth Debit Mastercard be used to purchase alcohol?" # for some reason this question crashes output
# #user = "Can i make foreign currency transactions on my card?"
# #user = "Can I use my Bendigo Bank debit card at a non-Bendigo atm?"
# user = "Can addtional payments be made on my home loan?"

# #def answer_question(user, df3):
# # --- part one ---
# question_embedding = model_trans.encode(user, convert_to_tensor=True).cpu()
# similarities = df3['embeddings'].apply(lambda x: np.dot(question_embedding, x))

# # The idea of creating two, is if no good answer from 1st, then summarise 2nd & see if the helps.
# most_similar_indices = similarities.nlargest(5).index # 5
# #top_rank = similarities.nlargest(1).index

# most_similar_rows = df3.iloc[most_similar_indices]
# #top_rank_rows = df3.iloc[top_rank]

# cont = most_similar_rows['page_content'].tolist()
# #cont = most_similar_rows['page_content_org'].tolist()
# context = ' '.join(cont)


# # -- get source --
# # -- Extract from func (could make output of func)
# #ans = (answer_question(user, df3))
# question_embedding = model_trans.encode(user, convert_to_tensor=True).cpu()
# similarities = df3['embeddings'].apply(lambda x: np.dot(question_embedding, x))
# most_similar_indices = similarities.nlargest(5).index
# most_similar_rows = df3.iloc[most_similar_indices]

# pd.set_option('display.max_colwidth', 50)
# display(most_similar_rows)

# # # -- Get the target index
# # question_embedding1 = model_trans.encode(ans, convert_to_tensor=True).cpu()
# # similarities1 = most_similar_rows['embeddings'].apply(lambda x: np.dot(question_embedding1, x))
# # most_similar_indices1 = similarities1.nlargest(5).index

# # # -- Print source
# # pd.set_option('display.max_colwidth', None)
# # filtered_df = most_similar_rows.loc[most_similar_indices1]
# # #filtered_df = filtered_df[['Title', 'page', 'page_content']]
# # #filtered_df = filtered_df[['page_content', 'source']]
# # #filtered_df = filtered_df[['page_content_org', 'Title']]
# # #filtered_df = filtered_df[['page_content', 'metadata']]
# # filtered_df = filtered_df[['page_content', 'page_number', 'filename']]

# # # get page no & source for output
# # filtered_df = most_similar_rows.loc[most_similar_indices1]
# # first_row = filtered_df.iloc[0]
# # filename = first_row['filename']
# # page_number = first_row['page_number']
# # source = first_row['page_content']

# # # -- Print source details
# # print('File name:', filename)
# # print('Page no:', page_number)
# # print('Extract:', source)

# # # # print extract of top row for citation:
# # # filtered_df1a = filtered_df.head(1)
# # # joined_stringa = filtered_df1a['page_content'].str.cat(sep='\n')
# # # print('Extract:','\n', joined_stringa)

# # # # Print dataframe for ref:
# # filtered_df = filtered_df[['page_number','chunk_text','chunk_length','filename']]
# # display(filtered_df)


# -- Same function as above, but newer version (as only seems to be pulling from the 1st doc)
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import normalize

# Assuming 'model_trans' is a pre-loaded SentenceTransformer model
user = "Looking at my Bendigo home loan, when interest is debit to your loan account?"

# Encode the user's query into an embedding
question_embedding = model_trans.encode(user, convert_to_tensor=True).cpu().numpy()

# Debug: show only the first few values of two chunk embeddings - enough to confirm
# they differ - instead of printing the full vectors.
print("Embedding for Document 1 (first 5 values):", df3['embeddings'].iloc[0][:5])
print("Embedding for Document 2 (first 5 values):", df3['embeddings'].iloc[1][:5])

# Normalize document embeddings (if needed)
normalized_embeddings = normalize(df3['embeddings'].tolist(), axis=1)

# Calculate cosine similarity between the user's question and each document's embedding
cos_sim = cosine_similarity([question_embedding], normalized_embeddings)[0]

# Debug: summarise the similarity scores rather than printing one value per chunk.
print(f"Cosine similarities: n={len(cos_sim)}, min={cos_sim.min():.3f}, max={cos_sim.max():.3f}")

# Get the indices of the top 5 most similar documents
most_similar_indices = np.argsort(cos_sim)[::-1][:5]
most_similar_rows = df3.iloc[most_similar_indices]

# Rebuild the context for THIS question. Without this, `context` still holds the
# chunks retrieved for `user_query` in the earlier retrieval cells.
context = "\n\n".join(most_similar_rows["chunk_text"].tolist())

# Display the most similar rows (wider columns for this output only)
qachunk = most_similar_rows[['document_name', 'page_number', 'chunk_text']]
with pd.option_context('display.max_colwidth', 600):
    display(qachunk)

# Print out the selected indices for debugging
print("Most Similar Indices:", most_similar_indices)

# Optionally, check the full content of the most similar documents
for idx in most_similar_indices:
    print(f"Document {idx}: {df3.iloc[idx]['document_name']}, Content: {df3.iloc[idx]['page_content']}")



In [None]:
# QA check: list the distinct source documents among the retrieved chunks.
qaqachunk = qachunk.loc[:, ['document_name']]
qaqachunkunique = qaqachunk['document_name'].unique()
print(qaqachunkunique)

In [None]:
# Generation:
# -------------------------
# ----- This is the one ---
# -------------------------
# Following some learnings from discord, it looks like the eval is to eval Q&A NOT generate the A, which is odd. So this will combine part 1 (to generate the A)
# and part 2 (the eval - using the Q&A from part 1)

# ----------------------------------------------

# context3 = "As I was going to St. Ives,\
#             I met a man with seven wives.\
#             Every wife had seven sacks,\
#             Every sack had seven cats,\
#             Every cat had seven kits:\
#             Kits, cats, sacks, and wives.\
#             How many were going to St. Ives?"

# prompt = "What is the key themse of this text??"
#prompt = prompt + context4

#system = "Please provide the answer in a concise and correctly formatted response."
system = "Please provide the answer in a concise, plain text format (i.e. a sentence, paragraph, etc.) with no bullet points, special characters, etc."

# Build the context from the rows most recently retrieved (most_similar_rows), so the
# question `user` is answered from its own chunks rather than from a `context`
# left over from an earlier query.
context = "\n\n".join(most_similar_rows["chunk_text"].tolist())

# Keep instruction, question and context clearly separated; plain concatenation ran
# the three together with no delimiter.
prompt = f"{system}\n\nQuestion: {user}\n\nContext:\n{context}"

# # This is a deliberate attempt to get the answer to be of poor quality and see if that is reflected in the score
# misdirection = """you are to deliberately say the opposite of what you find as the answer, i.e. if you can see the correct answer is up, then you must say the answer is down.
#                   Additionally, you are to include information that has nothing to do with this topic at hand, i.e. if the question is about soccer, you are to talk about fishing"""
# prompt = misdirection + prompt
# # Comment out the above section if trying to get proper scores.

# -- Part 1a: this is vanilla call to llm
print('-' * 15, 'part 1', '-' * 15)
# WIP: recreate part 1
# Replaces previous version. This one adds temperature
# --- Ground truth calls from llm ---

import google.generativeai as genai
from dotenv import load_dotenv
import os

# Load variables from a local .env file (if one exists) into the environment;
# importing load_dotenv without calling it has no effect.
load_dotenv()
api_key = os.getenv('GOOGLE_API_KEY')
# Hand the key to the SDK explicitly. When no key is found the SDK's own default
# credential lookup is left untouched.
if api_key:
    genai.configure(api_key=api_key)

generation_config = {
    "temperature": 0,
    "top_p": 0.95,
    "top_k": 20,
    "max_output_tokens": 8192,
    "response_mime_type": "text/plain",
}
model = genai.GenerativeModel(
    model_name="gemini-1.5-flash",
    generation_config=generation_config,
)
# A fresh chat session, so this answer is generated without any earlier history.
chat_session = model.start_chat(history=[])
response = chat_session.send_message(prompt)
response.text.strip()



In [None]:
# -- Inspect the retrieved chunks, with their source document and page --
pd.set_option('display.max_colwidth', 500)
context_columns = ['document_name', 'page_number', 'chunk_text']
qa = most_similar_rows[context_columns]
display(qa)

In [None]:
# Show the first of the retrieved rows with all of its columns.
most_similar_rows.iloc[:1]