In [None]:
%pip install langchainhub gpt4all

## Fetching Relevant Documents

In [None]:
from langchain_community.embeddings import GPT4AllEmbeddings
from langchain_elasticsearch import ElasticsearchStore
import numpy as np
from dotenv import load_dotenv
import os

In [None]:
load_dotenv()  # This loads the .env file at the application start
# Elasticsearch credentials come from the environment variables 'passwd' and 'api_key'.
# os.getenv returns None for a missing variable, so no error is raised in this cell
# if an entry is absent.
password = os.getenv('passwd')
api_key = os.getenv('api_key')

In [None]:
# basic model used for embeddings, can be improved by using a more complex model
model_name = "all-MiniLM-L6-v2.gguf2.f16.gguf"

In [None]:
embedding = GPT4AllEmbeddings(model_name=model_name)

In [None]:
# Identifier of the Elastic Cloud deployment that hosts the vector index.
# NOTE(review): this value and the user name below are hard-coded; consider reading them
# from the .env file like the password and API key.
cloud_id  = '802f868877384e9798b731802ffa4827:ZXVyb3BlLXdlc3QzLmdjcC5jbG91ZC5lcy5pbyQ0NzYyZTQ2YzQ5NDg0ODY5YTAzZDMxYzg5NjY2MjY3YyQ1ZjQ3NWI2NTQxOTI0NmZiODcxNDc3NjZlMTI4YWE2YQ=='
# Vector store over the "embeddings_index" index, using the embedding model created above.
# NOTE(review): both user/password and an API key are supplied — confirm which one the
# client actually authenticates with.
elastic_vector_search = ElasticsearchStore(
    es_cloud_id=cloud_id,
    index_name="embeddings_index",
    embedding=embedding,
    es_user="group13",
    es_password=password,
    es_api_key=api_key
)

In [None]:
question = "Inwieweit wird in der Organisation Informationssicherheit gemanagt?"

In [None]:
# using the most basic retrieval method for now, to be experimented with
# Similarity search that asks for k=20 results per query.
retriever = elastic_vector_search.as_retriever(search_type="similarity", search_kwargs={"k": 20})

# Run the retriever once on the sample question so the results can be inspected below.
retrieved_docs = retriever.invoke(question)

In [None]:
retrieved_docs

In [None]:
# Collect the text of every retrieved document
document_texts = []
for doc in retrieved_docs:
    document_texts.append(doc.page_content)

# Merge the texts into one space-separated string to provide as context
context = " ".join(document_texts)

## Generating Prompt for Question Answering

In [None]:
from langchain_core.prompts import PromptTemplate
from langchain.chains import LLMChain
from langchain_community.llms import GPT4All
from langchain_core.callbacks import StreamingStdOutCallbackHandler
from langchain import hub

# The pulled prompt says max 3 sentences; can be changed according to the requirement
prompt = hub.pull("rlm/rag-prompt")

# Fill the prompt with placeholder values so its rendered wording can be inspected.
example_messages = prompt.invoke(
    {"context": "filler context", "question": "filler question"}
).to_messages()

In [None]:
print(example_messages[0].content)

In [None]:
import os
# os.chdir('c:\\Users\\rafay\\OneDrive\\Desktop\\Masters\\DS')

In [None]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough


# Model file downloaded using the GPT4All UI; model_path points at it.
# The location is machine-specific, so it can be overridden with the GPT4ALL_MODEL_PATH
# environment variable (e.g. in the .env file). The fallback is the path this notebook
# was originally run with.
# Another location that has been used:
# "/Users/I748655/Library/Application Support/nomic.ai/GPT4All/Meta-Llama-3-8B-Instruct.Q4_0.gguf"
model_path = os.getenv(
    "GPT4ALL_MODEL_PATH",
    "/Users/omeriqbal/Downloads/Meta-Llama-3-8B-Instruct.Q4_0.gguf",
)
# Callbacks support token-wise streaming
callbacks = [StreamingStdOutCallbackHandler()]

llm = GPT4All(model=model_path, callbacks=callbacks, verbose=True)

In [None]:
def build_context(results):
    """Concatenate the ``page_content`` of each result, separated by a blank line."""
    separator = "\n\n"
    contents = [item.page_content for item in results]
    return separator.join(contents)

In [None]:
# RAG pipeline: the input question goes to the retriever, whose documents are merged by
# build_context into the prompt's "context" slot, while RunnablePassthrough forwards the
# question unchanged into the "question" slot. The filled prompt is sent to the LLM and
# the output is parsed into a plain string.
rag_chain = (
    {"context": retriever | build_context, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

In [None]:
for chunk in rag_chain.stream(question):
    print(chunk, end="", flush=True)

 The risk analysis is conducted by the management and auditors. The methodology used includes document review or interviews with users, administrators, and other relevant stakeholders. The risk analysis is conducted by the management and auditors. The methodology used includes document review or interviews with users, administrators, and other relevant stakeholders.

## Testing

In [None]:
from test_data import test_questions

# Answer every test question with the RAG chain and keep the full answer text.
generated_responses2 = []
for question_no, question in enumerate(test_questions, start=1):
    print("\nQuestion no: ", question_no)
    pieces = []
    for chunk in rag_chain.stream(question):
        pieces.append(chunk)
        # Echo each chunk right away so output left over from the previous question
        # does not linger in the buffer.
        print(chunk, end="", flush=True)

    generated_responses2.append("".join(pieces))
        

In [None]:
import numpy as np

np.save('generated_responses2.npy',generated_responses2)

In [None]:
resp = llm.generate(["What is the capital of France?"])

In [None]:
resp.generations[0][0].text

In [None]:
from test_data import test_questions,test_responses

print(len(test_questions))
print(len(test_responses))

In [None]:
# Answer every test question with the RAG chain and keep the full answer text.
generated_responses = []
for question_no, question in enumerate(test_questions, start=1):
    print("\nQuestion no: ", question_no)
    answer_parts = []
    for chunk in rag_chain.stream(question):
        answer_parts.append(chunk)
        # Echo each chunk right away so output left over from the previous question
        # does not linger in the buffer.
        print(chunk, end="", flush=True)

    generated_responses.append("".join(answer_parts))
        
    

In [None]:
import numpy as np

np.save('DataScienceGroup13/src/generated_responses.npy',generated_responses)

In [None]:
# Reload previously generated answers from disk; this rebinds generated_responses.
# NOTE(review): this reads "generated_responses2.npy" (the file written for
# generated_responses2), not "DataScienceGroup13/src/generated_responses.npy" saved in the
# cell above — confirm this is the intended file.
generated_responses=np.load("generated_responses2.npy")

In [None]:
from collections import Counter

from nltk.tokenize import word_tokenize


def _content_tokens(text):
    """Tokenize ``text`` and keep only purely alphanumeric tokens (drops punctuation)."""
    return [token for token in word_tokenize(text) if token.isalnum()]


# Token-overlap precision / recall / F1 of each generated answer against its reference.
precisions = []
recalls = []
f1s = []

for test_response, gen_response in zip(test_responses, generated_responses):
    golden_words = _content_tokens(test_response)    # reference ("gold") answer
    response_words = _content_tokens(gen_response)   # answer produced by the RAG chain

    # Multiset intersection: a repeated word is counted only as often as it occurs in
    # BOTH texts, which keeps the numerator consistent with the length denominators.
    overlap = Counter(response_words) & Counter(golden_words)
    num_shared_elements = sum(overlap.values())
    pred_length = len(response_words)
    gold_length = len(golden_words)

    # Precision is relative to the prediction, recall to the reference.
    # Empty texts or zero overlap score 0 instead of raising ZeroDivisionError.
    precision = num_shared_elements / pred_length if pred_length else 0.0
    recall = num_shared_elements / gold_length if gold_length else 0.0
    f1 = 2 * precision * recall / (precision + recall) if num_shared_elements else 0.0

    precisions.append(precision)
    recalls.append(recall)
    f1s.append(f1)

print("Precision:", np.mean(precisions))
print("Recall:", np.mean(recalls))
print("F1 score:", np.mean(f1s))

In [None]:
# Reference answers as a NumPy unicode array; test_unicode is what the metric cells
# below receive as `references`.
test_responses = np.array(test_responses)
# astype(str) keeps a unicode dtype wide enough for the longest reference. A fixed
# width such as '<U1049' would silently cut off any longer text.
test_unicode = test_responses.astype(str)

In [None]:
# BLEU wants matching 4-grams, which usually aren't found in the German texts.
# This cell is an experiment: it tokenizes the first reference/prediction pair for
# inspection, but the BLEU score printed at the end is computed on a hard-coded
# three-token toy sentence, NOT on the tokenized data.

from nltk.translate.bleu_score import corpus_bleu


# tokenized_references = [word_tokenize(resp) for resp in test_responses[0]]
# tokenized_predictions = [word_tokenize(gen) for gen in generated_responses[0]]

# Tokens of the first reference answer and of the first generated answer
tokenized_references = word_tokenize(test_responses[0]) 
tokenized_predictions = word_tokenize(generated_responses[0])



# Show both token lists and their lengths for a manual comparison
print(tokenized_predictions)
print(tokenized_references)
print(len(tokenized_references))
print(len(tokenized_predictions))

# # Calculate BLEU score
# bleu_score = corpus_bleu(list_of_references=[[tokenized_references]],hypotheses= [tokenized_predictions])
# Toy check: hypothesis and reference are the same three tokens.
# NOTE(review): presumably meant to show that a text with fewer than four tokens has no
# 4-grams to match — confirm against the printed value.
bleu_score = corpus_bleu(list_of_references=[[['zur', 'Informationssicherheit','in']]],hypotheses= [['zur', 'Informationssicherheit','in']])
print(f"BLEU score: {bleu_score:.4f}")

In [None]:
# Requires the `evaluate` package (pip install evaluate)
from evaluate import load
bertscore = load("bertscore")

# Compare every generated answer with its reference; lang="de" is passed because the
# texts are German.
results = bertscore.compute(predictions=generated_responses, references=test_unicode, lang="de")



In [None]:
print("BERT precision:" ,np.mean(results['precision']))
print("BERT Recall:" ,np.mean(results['recall']))
print("BERT F1:", np.mean(results['f1']))

In [None]:
bleurt = load("bleurt", module_type="metric")
bleurt_results = bleurt.compute(predictions=generated_responses, references=test_unicode)

In [None]:
print("BLEURT SCORE: ", np.mean(bleurt_results['scores']))

In [None]:
meteor= load("meteor")

meteor_results = meteor.compute(predictions=generated_responses, references=test_unicode)

In [None]:
print(meteor_results)

In [None]:
bleu= load("bleu")

bleu_results = bleu.compute(predictions=generated_responses, references=test_unicode)
print(bleu_results)

In [None]:
# For every test question, store the text of each document the retriever returns.
retrieved_docs_list = []
for question in test_questions:
    retrieved_docs = retriever.invoke(question)
    retrieved_docs_list.append([doc.page_content for doc in retrieved_docs])

In [None]:
np.save("retrieved_docs_list.npy",retrieved_docs_list)

In [None]:
# Reload the stored retrieval results and keep only the first 5 documents per question.
# NOTE(review): the file is read from "DataScienceGroup13/src/", while the save cell above
# writes "retrieved_docs_list.npy" to the working directory — confirm the paths match.
retrieved_documents_list = np.load("DataScienceGroup13/src/retrieved_docs_list.npy")
retrieved_docs_list = [docs[:5] for docs in retrieved_documents_list]

In [None]:
# Smoke test of the judge model: ask for a 1-5 score only, limited to 5 generated tokens.
# NOTE(review): `llmJudge` is not created in any visible cell of this notebook — it must
# be defined before this cell runs on a fresh kernel.
de_resp=llmJudge.generate(
prompts = ["GEBEN SIE NUR PUNKTE!!! Geben Sie dem folgenden Filmtitel aus dem Horror-Genre eine Punktzahl zwischen 1 und 5:Flüstern der Ewigkeit"],\
        max_tokens=5,temp=0,top_p=1)

In [None]:
de_resp

In [None]:
# German prompt template for the LLM judge. It is filled with str.format and expects
# these placeholders: Kriterien (criteria), Schritte (evaluation steps), Abfrage (query),
# retrieved_documents, ground_truth, Antwort (generated answer) and metric_name.
# The text instructs the judge to answer with the score only.
EVALUATION_PROMPT_TEMPLATE = """
GEBEN SIE NUR PUNKTE!!! Sie müssen mit der Bewertung beginnen
Sie erhalten eine Antwort, die von einem RAG-Modell (Retrieval-Augmented Generation) für eine bestimmte Abfrage generiert wird. Ihre Aufgabe besteht darin, die Antwort anhand einer Metrik zu bewerten.
Bitte stellen Sie sicher, dass Sie diese Anweisungen sorgfältig lesen und verstehen. 
Bitte lassen Sie dieses Dokument während der Durchsicht geöffnet und schlagen Sie bei Bedarf darin nach.

Evaluationskriterien:

{Kriterien}

Bewertungsschritte:

{Schritte}

Beispiel:

Abfrage:

{Abfrage}

Abgerufene Dokumente:

{retrieved_documents}

Grundwahrheit:

{ground_truth}

Generierte Antwort:

{Antwort}

Bewertungsformular (NUR Ergebnisse):

- {metric_name}
"""

In [None]:
CONTEXT_PRECISION_CRITERIA = """
Kontextgenauigkeit (1–5) – misst das Signal-Rausch-Verhältnis des abgerufenen Kontexts. \
Die abgerufenen Dokumente sollten einen hohen Anteil relevanter Informationen enthalten, die zur Beantwortung der Anfrage erforderlich sind, sowie einen minimalen Anteil irrelevanter Informationen.
"""

CONTEXT_PRECISION_STEPS = """
1. Lesen Sie die Anfrage und die abgerufenen Dokumente sorgfältig durch.
2. Identifizieren Sie, wie viele der abgerufenen Informationen für die Beantwortung der Anfrage relevant sind.
3. Weisen Sie basierend auf dem Anteil relevanter Informationen eine Kontextpräzisionsbewertung von 1 bis 5 zu.
"""
CONTEXT_RECALL_CRITERIA = """
Kontextrückruf (1-5) – misst, ob alle relevanten Informationen, die zur Beantwortung der Anfrage erforderlich sind, abgerufen wurden. \
Die abgerufenen Dokumente sollten alle notwendigen Informationen enthalten, die zur umfassenden Beantwortung der Anfrage erforderlich sind.
"""

CONTEXT_RECALL_STEPS = """
1. Lesen Sie die Anfrage, die abgerufenen Dokumente und die Ground-Truth-Informationen sorgfältig durch.
2. Stellen Sie fest, ob die abgerufenen Dokumente alle relevanten Informationen enthalten, die zur Beantwortung der Anfrage erforderlich sind.
3. Weisen Sie basierend auf der Vollständigkeit der abgerufenen Informationen einen Kontextrückruf-Score von 1 bis 5 zu.
"""
FAITHFULNESS_CRITERIA = """
Treue (1-5) – misst die sachliche Richtigkeit der generierten Antwort. \
Die Antwort sollte nur Aussagen enthalten, die von den abgerufenen Dokumenten unterstützt werden.
"""

FAITHFULNESS_STEPS = """
1. Lesen Sie die Anfrage, die abgerufenen Dokumente und die generierte Antwort sorgfältig durch.
2. Identifizieren Sie die Aussagen in der Antwort und vergleichen Sie sie jeweils mit den abgerufenen Dokumenten auf sachliche Richtigkeit.
3. Weisen Sie basierend auf dem Anteil richtiger Aussagen einen Treuewert von 1 bis 5 zu.
"""

ANSWER_RELEVANCY_CRITERIA = """
Antwortrelevanz (1–5) – misst, wie relevant die generierte Antwort für die Anfrage ist. \
Die Antwort sollte alle Teile der Anfrage umfassend und genau beantworten.
"""

ANSWER_RELEVANCY_STEPS = """
1. Lesen Sie die Abfrage und die generierte Antwort sorgfältig durch.
2. Bestimmen Sie, wie gut die Antwort auf die Anfrage eingeht, einschließlich aller Aspekte der Frage.
3. Weisen Sie eine Antwortrelevanzbewertung von 1 bis 5 zu, basierend auf der Relevanz und Vollständigkeit der Antwort.
"""

In [None]:
def get_rag_score(
    criteria: str, steps: str, query: str, retrieved_documents: list, ground_truth: str, response: str, metric_name: str
):
    """Ask the judge model to rate one generated answer for a single metric.

    The evaluation prompt template is filled with the criteria, the evaluation steps,
    the query, the retrieved documents (joined with newlines), the ground truth, the
    generated answer and the metric name, then sent to ``llmJudge``.

    Returns:
        The text of the judge's first generation (not yet parsed into a number).
    """
    judge_prompt = EVALUATION_PROMPT_TEMPLATE.format(
        Abfrage=query,
        Antwort=response,
        Kriterien=criteria,
        Schritte=steps,
        ground_truth=ground_truth,
        metric_name=metric_name,
        retrieved_documents="\n".join(retrieved_documents),
    )
    # The reply is capped at 5 tokens because only the score itself is wanted.
    judge_result = llmJudge.generate(prompts = [judge_prompt],max_tokens=5,temp=0.1,top_p=1)

    first_generation = judge_result.generations[0][0]
    return first_generation.text


In [None]:
import re

def get_first_german_number(text):
    """Return the first number found in ``text``, reading a comma as the decimal separator.

    A number with a decimal part (e.g. "3,5") is returned as a float, a plain digit
    run as an int. If ``text`` contains no digits at all, 0 is returned.
    """
    found = re.search(r'\d+(,\d+)?', text)
    if found is None:
        return 0

    literal = found.group()
    if ',' in literal:
        # German decimal comma -> dot so that float() can parse it
        return float(literal.replace(',', '.'))
    return int(literal)

In [None]:
import copy


# Metric name -> (criteria text, evaluation steps text) handed to the judge prompt.
evaluation_metrics = {
    "Context Precision": (CONTEXT_PRECISION_CRITERIA, CONTEXT_PRECISION_STEPS),
    "Context Recall": (CONTEXT_RECALL_CRITERIA, CONTEXT_RECALL_STEPS),
    "Faithfulness": (FAITHFULNESS_CRITERIA, FAITHFULNESS_STEPS),
    "Answer Relevancy": (ANSWER_RELEVANCY_CRITERIA, ANSWER_RELEVANCY_STEPS),
}

# Work on copies so the evaluation cannot modify the original inputs.
queries = copy.deepcopy(test_questions)
retrieved_documents_list = copy.deepcopy(retrieved_docs_list)
responses = copy.deepcopy(generated_responses)
ground_truths = copy.deepcopy(test_responses)

# Column-oriented results: one entry per (metric, query) pair in each list.
data = {"Evaluation Type": [], "Query Type": [], "Score": []}

for eval_type, (criteria, steps) in evaluation_metrics.items():
    for i, (query, retrieved_documents, response, ground_truth) in enumerate(zip(queries, retrieved_documents_list, responses, ground_truths)):
        result = get_rag_score(criteria, steps, query, retrieved_documents, ground_truth, response, eval_type)
        print(result)
        score_num = get_first_german_number(result)
        print(score_num)
        # Append all three columns together, only after scoring succeeded, so the lists
        # stay the same length even if the judge call raises part-way through the run.
        data["Evaluation Type"].append(eval_type)
        data["Query Type"].append(f"Query {i + 1}")
        data["Score"].append(score_num)


In [None]:
# Persist the collected judge scores.
# NOTE(review): np.save appends ".npy" when the file name does not already end with it,
# so this writes "G_Eval.py.npy". `data` is a dict, so it is stored as a pickled object
# array and needs np.load(..., allow_pickle=True) to be read back.
np.save("G_Eval.py",data)

In [None]:
def _scores_for(metric):
    """Return the scores recorded in ``data`` for one evaluation type, in query order."""
    return [
        score
        for eval_type, score in zip(data["Evaluation Type"], data["Score"])
        if eval_type == metric
    ]


# Select by the recorded evaluation type instead of fixed 41-wide slices, so the split
# stays correct for any number of scored queries.
context_precision = _scores_for("Context Precision")
context_recall = _scores_for("Context Recall")
faithfulness = _scores_for("Faithfulness")
answer_relevancy = _scores_for("Answer Relevancy")


In [None]:
# Remove scores equal to 0 (the value get_first_german_number returns when the judge's
# reply contained no number) so they do not enter the averages.
context_precision = list(filter(lambda score: score != 0, context_precision))
context_recall = list(filter(lambda score: score != 0, context_recall))
faithfulness = list(filter(lambda score: score != 0, faithfulness))
answer_relevancy = list(filter(lambda score: score != 0, answer_relevancy))

In [None]:
# Report the mean judge score per metric.
for label, scores in (
    ("Context Precision Score", context_precision),
    ("Context Recall Score", context_recall),
    ("Faithfulness Score", faithfulness),
    ("Answer Relevancy Score", answer_relevancy),
):
    print(label, np.mean(scores))