In [7]:
%pip install langchainhub gpt4all


Collecting gpt4all
  Obtaining dependency information for gpt4all from https://files.pythonhosted.org/packages/cf/de/09e681f1a97fb2dd80730732468d15666a9089625a30b5bcfb24015c1b06/gpt4all-2.7.0-py3-none-win_amd64.whl.metadata
  Downloading gpt4all-2.7.0-py3-none-win_amd64.whl.metadata (4.7 kB)
Downloading gpt4all-2.7.0-py3-none-win_amd64.whl (28.6 MB)
   ---------------------------------------- 0.0/28.6 MB ? eta -:--:--
   ---------------------------------------- 0.0/28.6 MB ? eta -:--:--
   ---------------------------------------- 0.0/28.6 MB 667.8 kB/s eta 0:00:43
   ---------------------------------------- 0.2/28.6 MB 2.1 MB/s eta 0:00:14
   ---------------------------------------- 0.3/28.6 MB 2.1 MB/s eta 0:00:14
    --------------------------------------- 0.4/28.6 MB 2.5 MB/s eta 0:00:12
    --------------------------------------- 0.6/28.6 MB 2.9 MB/s eta 0:00:10
    --------------------------------------- 0.6/28.6 MB 2.9 MB/s eta 0:00:10
   - --------------------------------------


[notice] A new release of pip is available: 23.2.1 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip


## Fetching Relevant Documents

In [2]:
from langchain_community.embeddings import GPT4AllEmbeddings
from langchain_elasticsearch import ElasticsearchStore
import numpy as np
from dotenv import load_dotenv
import os

In [3]:
load_dotenv()  # This loads the .env file at the application start
password = os.getenv('passwd')
api_key = os.getenv('api_key')

In [4]:
# basic model used for embeddings, can be improved by using a more complex model
model_name = "all-MiniLM-L6-v2.gguf2.f16.gguf"

In [8]:
embedding = GPT4AllEmbeddings(model_name=model_name)

Downloading: 100%|██████████| 45.9M/45.9M [00:03<00:00, 15.3MiB/s]
Verifying: 100%|██████████| 45.9M/45.9M [00:00<00:00, 796MiB/s]


In [9]:
cloud_id  = '802f868877384e9798b731802ffa4827:ZXVyb3BlLXdlc3QzLmdjcC5jbG91ZC5lcy5pbyQ0NzYyZTQ2YzQ5NDg0ODY5YTAzZDMxYzg5NjY2MjY3YyQ1ZjQ3NWI2NTQxOTI0NmZiODcxNDc3NjZlMTI4YWE2YQ=='
elastic_vector_search = ElasticsearchStore(
    es_cloud_id=cloud_id,
    index_name="embeddings_index",
    embedding=embedding,
    es_user="group13",
    es_password=password,
    es_api_key=api_key
)

In [35]:
question = "Inwieweit wird in der Organisation Informationssicherheit gemanagt?"

In [36]:
# using the most basic retrieval method for now, to be experimented with
retriever = elastic_vector_search.as_retriever(search_type="similarity", search_kwargs={"k": 20})

retrieved_docs = retriever.invoke(question)

In [37]:
retrieved_docs

[Document(page_content='Informationssicherheit in \norganisationsweite Abläufe und \nProzesse  Ja Die Informationssicherheit ist in so weit in die', metadata={'source': 'KnowledgeBase/Beschreibung_Recplast.pdf', 'page': 54}),
 Document(page_content='Managementsystems für Informationssicherheit wurden diverse weiterführende Regelungen geschaffen, die', metadata={'source': 'KnowledgeBase/A01_Sicherheitsleitlinie.pdf', 'page': 10}),
 Document(page_content='4 Organisation des Managementsystems für \nInformationssicherheit\nGrundsätzlich sind folgende Verantwortlichkeiten innerhalb des ISMS definiert:', metadata={'source': 'KnowledgeBase/A01_Sicherheitsleitlinie.pdf', 'page': 7}),
 Document(page_content='4 . Organisation des Managementsystems für Informationssicherheit ..................................................... 8', metadata={'source': 'KnowledgeBase/A01_Sicherheitsleitlinie.pdf', 'page': 2}),
 Document(page_content='Informationsverbund\nORP.1 Organisation\nORP.1.A2 Zuweisung der 

In [9]:
document_texts = [result.page_content for result in results]  # adjust the key according to your result structure

# Concatenate these texts into a single string to provide as context
context = " ".join(document_texts)

## Generating Prompt for Question Answering

In [13]:
from langchain_core.prompts import PromptTemplate
from langchain.chains import LLMChain
from langchain_community.llms import GPT4All
from langchain_core.callbacks import StreamingStdOutCallbackHandler
from langchain import hub

# Says max 3 sentences, can change accoriding to the requirement
prompt = hub.pull("rlm/rag-prompt")

example_messages = prompt.invoke(
    {"context": "filler context", "question": "filler question"}
).to_messages()

In [14]:
print(example_messages[0].content)

You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.
Question: filler question 
Context: filler context 
Answer:


In [22]:
import os
os.chdir('c:\\Users\\rafay\\OneDrive\\Desktop\\Masters\\DS')

In [23]:
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough


# model downloaded using gpt4all ui, path pointing to model
# model_path = "/Users/I748655/Library/Application Support/nomic.ai/GPT4All/Meta-Llama-3-8B-Instruct.Q4_0.gguf"
model_path = "./Meta-Llama-3-8B-Instruct.Q4_0.gguf"
# Callbacks support token-wise streaming
callbacks = [StreamingStdOutCallbackHandler()]

llm = GPT4All(model=model_path, callbacks=callbacks, verbose=True)

In [24]:
def build_context(results):
    return "\n\n".join(result.page_content for result in results)

In [25]:
rag_chain = (
    {"context": retriever | build_context, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)

In [38]:
for chunk in rag_chain.stream(question):
    print(chunk, end="", flush=True)

 In der Organisation wird die Informationssicherheit in organisationsweite Abläufe und Prozesse integriert. Der ISB informiert die Geschäftsführung monatlich über den aktuellen Stand der Informationssicherheit anhand eines Management-Reports. Die Geschäftsführung hat die Gesamtverantwortung für die Informationssicherheit übernommen. In der Organisation wird die Informationssicherheit in organisationsweite Abläufe und Prozesse integriert. Der ISB informiert die Geschäftsführung monatlich über den aktuellen Stand der Informationssicherheit anhand eines Management-Reports. Die Geschäftsführung hat die Gesamtverantwortung für die Informationssicherheit übernommen.

 The risk analysis is conducted by the management and auditors. The methodology used includes document review or interviews with users, administrators, and other relevant stakeholders. The risk analysis is conducted by the management and auditors. The methodology used includes document review or interviews with users, administrators, and other relevant stakeholders.

<h2>Testing</h2>

In [27]:
%pip install nltk

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 23.2.1 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip


In [28]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\rafay\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [41]:
from test_data import test_questions,test_responses

print(len(test_questions))
print(len(test_responses))

41
41


In [50]:
generated_responses = []
for i, question in enumerate(test_questions):
    response = ""
    print("\nQuestion no: ", i+1)
    for chunk in rag_chain.stream(question):
        response+=chunk
        #I added a print just in case that the chunks from the previous question remain inside the buffer for the next question
        print(chunk, end="", flush=True)
    
    generated_responses.append(response)
        
    


Question no:  1
 Die Richtlinien zur Informationssicherheit sind vorhanden. In dieser Leitlinie werden die Stellenwert der Informationstechnologie und Informationssicherheit, die Verantwortung des Informationssicherheitsbeauftragten (ISB) und weitere Aspekte der Informationssicherheit dargestellt. Die Richtlinien sollen den Mitarbeitern und anderen relevanten Stellen bekanntgegeben werden. Die Richtlinien zur Informationssicherheit sind vorhanden. In dieser Leitlinie werden die Stellenwert der Informationstechnologie und Informationssicherheit, die Verantwortung des Informationssicherheitsbeauftragten (ISB) und weitere Aspekte der Informationssicherheit dargestellt. Die Richtlinien sollen den Mitarbeitern und anderen relevanten Stellen bekanntgegeben werden.
Question no:  2
 In der Organisation wird die Informationssicherheit in organisationsweite Abläufe und Prozesse integriert. Der ISB informiert die Geschäftsführung monatlich über den aktuellen Stand der Informationssicherheit anha

In [69]:
from nltk.tokenize import word_tokenize

precisions = []
recalls = []
f1s=[]

for test_response, gen_response in zip(test_responses,generated_responses):
    # Tokenize the sentence
    response_words = word_tokenize(test_response)
    golden_words = word_tokenize(gen_response)
    
    # Filter out punctuation
    response_words = [word for word in response_words if word.isalnum()]
    golden_words = [word for word in golden_words if word.isalnum()]

    # Convert arrays to sets
    response_set = set(response_words)
    gen_set = set(golden_words)

    # Find the intersection of the two sets
    intersection = response_set.intersection(gen_set)

    # Get the number of shared elements
    num_shared_elements = len(intersection)
    pred_length = len(response_words)
    gold_length = len(golden_words)
    
    precision= num_shared_elements/pred_length
    recall= num_shared_elements/gold_length
    f1=2*precision*recall/(precision+recall)
    
    precisions.append(precision)
    recalls.append(recall)
    f1s.append(f1)

print("Precision:", np.mean(precisions))
print("Recall:", np.mean(recalls))
print("F1 score:", np.mean(f1s))

Precision: 0.1646918042478449
Recall: 0.1625383027652491
F1 score: 0.14993494129962556


In [56]:
import numpy as np

np.save('DataScienceGroup13/src/generated_responses.npy',generated_responses)

In [79]:
from nltk.translate.bleu_score import corpus_bleu

tokenized_references = [[word_tokenize(resp)] for resp in test_responses]
tokenized_predictions = [word_tokenize(gen) for gen in generated_responses]

# Calculate BLEU score
bleu_score = corpus_bleu(tokenized_references, tokenized_predictions)
print(f"BLEU score: {bleu_score:.4f}")

TypeError: Fraction.__new__() got an unexpected keyword argument '_normalize'

In [81]:
import math
from collections import Counter
from nltk.tokenize import word_tokenize

def tokenize(sentence):
    return word_tokenize(sentence)

def ngrams(tokens, n):
    return [tuple(tokens[i:i+n]) for i in range(len(tokens)-n+1)]

def count_ngrams(tokens, n):
    return Counter(ngrams(tokens, n))

def modified_precision(reference, hypothesis, n):
    ref_ngrams = count_ngrams(reference, n)
    hyp_ngrams = count_ngrams(hypothesis, n)
    overlap = sum((hyp_ngrams & ref_ngrams).values())
    total = sum(hyp_ngrams.values())
    if total == 0:
        return 0
    return overlap / total

def brevity_penalty(reference, hypothesis):
    ref_len = len(reference)
    hyp_len = len(hypothesis)
    if hyp_len == 0:
        return 0
    if hyp_len > ref_len:
        return 1
    return math.exp(1 - ref_len / hyp_len)

def bleu_score(reference, hypothesis, max_n=4):
    precisions = [modified_precision(reference, hypothesis, i) for i in range(1, max_n+1)]
    if min(precisions) == 0:
        return 0
    log_precisions = [math.log(p) for p in precisions]
    geometric_mean = math.exp(sum(log_precisions) / max_n)
    bp = brevity_penalty(reference, hypothesis)
    return bp * geometric_mean

# Tokenize references and hypotheses
tokenized_references = [tokenize(ref) for ref in test_responses]
tokenized_hypotheses = [tokenize(hyp) for hyp in generated_responses]

# Calculate BLEU score for each hypothesis-reference pair and average them
scores = [bleu_score(ref, hyp) for ref, hyp in zip(tokenized_references, tokenized_hypotheses)]
average_bleu_score = sum(scores) / len(scores)
print(f"Average BLEU score: {average_bleu_score:.4f}")


Average BLEU score: 0.0066


In [72]:
%pip install sacrebleu

Collecting sacrebleu
  Obtaining dependency information for sacrebleu from https://files.pythonhosted.org/packages/df/d5/f07d3c37bd98db883330276d77e7b04b6c50564c68fb95a76e05422a2850/sacrebleu-2.4.2-py3-none-any.whl.metadata
  Downloading sacrebleu-2.4.2-py3-none-any.whl.metadata (58 kB)
     ---------------------------------------- 0.0/58.0 kB ? eta -:--:--
     ------- -------------------------------- 10.2/58.0 kB ? eta -:--:--
     -------------------------- ----------- 41.0/58.0 kB 495.5 kB/s eta 0:00:01
     -------------------------------------- 58.0/58.0 kB 508.5 kB/s eta 0:00:00
Collecting portalocker (from sacrebleu)
  Obtaining dependency information for portalocker from https://files.pythonhosted.org/packages/17/9e/87671efcca80ba6203811540ed1f9c0462c1609d2281d7b7f53cef05da3d/portalocker-2.8.2-py3-none-any.whl.metadata
  Downloading portalocker-2.8.2-py3-none-any.whl.metadata (8.5 kB)
Downloading sacrebleu-2.4.2-py3-none-any.whl (106 kB)
   ------------------------------------


[notice] A new release of pip is available: 23.2.1 -> 24.0
[notice] To update, run: python.exe -m pip install --upgrade pip


In [80]:
import sacrebleu
tokenized_references = [word_tokenize(resp) for resp in test_responses]
transposed_references = [tokenized_references]

# Calculate BLEU score using sacrebleu
bleu = sacrebleu.corpus_bleu(tokenized_predictions, transposed_references)
print(f"BLEU score: {bleu.score:.4f}")

TypeError: BLEU: `refs` should be a sequence of sequence of strings.

In [None]:
Information security guidelines are in place. This guideline presents the importance of information technology and information security, the responsibility of the information security officer (ISB) and other aspects of information security. The guidelines should be communicated to employees and other relevant departments. Information security guidelines are in place. This guideline presents the importance of information technology and information security, the responsibility of the information security officer (ISB) and other aspects of information security. The guidelines should be communicated to employees and other relevant departments.

In [78]:
type(generated_responses)

list