In [None]:
%pip  install cohere
%pip  install kaleido
%pip  install gradio
%pip  install openai
%pip  install torch
%pip  install farm-haystack[colab,ocr,preprocessing,inference,faiss-gpu,pdf]

# Retrieval-Augmented Generation

In [None]:
# Index name passed to the FAISS document store and to update_embeddings below.
doc_index = "doc_index"
# Not referenced by the cells shown here.
# NOTE(review): presumably reserved for evaluation labels — confirm or remove.
label_index = "label_index"

In [None]:
from haystack.document_stores import FAISSDocumentStore
# FAISS-backed document store for the dense OpenAI embeddings.
# `embedding_dim` is used instead of the deprecated `vector_dim` argument;
# 1536 must match the vector size of the embedding model configured on the
# retriever ("text-embedding-ada-002").
document_store_dense_openai = FAISSDocumentStore(
    faiss_index_factory_str="Flat",
    index=doc_index,
    similarity="dot_product",
    embedding_dim=1536,
)

## Dataset hinzufügen

In [None]:
from haystack.nodes import  PDFToTextConverter

# Load a single PDF; the converter returns a list, of which the first entry is kept.
converter = PDFToTextConverter()
converted_pdf_docs = converter.convert(file_path="SPO-AIN3.pdf", meta=None)
doc_pdf = converted_pdf_docs[0]

In [None]:
from haystack.utils import convert_files_to_docs

# Folder that contains the source documents to index.
doc_dir = "ordner"
# Load every document in the folder (the matching format converter is detected automatically).
all_docs = convert_files_to_docs(dir_path=doc_dir)

In [None]:
from haystack.nodes import PreProcessor

# Cleaning and splitting options: documents are split by word into pieces of
# length 100 while respecting sentence boundaries; header/footer cleaning is off.
preprocessor_settings = dict(
    clean_empty_lines=True,
    clean_whitespace=True,
    clean_header_footer=False,
    split_by="word",
    split_length=100,
    split_respect_sentence_boundary=True,
)
preprocessor = PreProcessor(**preprocessor_settings)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [None]:
# Split the single PDF document and report how many pieces the preprocessor produced.
docs_default = preprocessor.process([doc_pdf])
print(f"n_docs_input: 1\nn_docs_output: {len(docs_default)}")

In [None]:
# Split all documents loaded from the folder; `docs` is what gets written to the store.
docs = preprocessor.process(all_docs)

Preprocessing: 100%|██████████| 5/5 [00:00<00:00, 19.50docs/s]


## Retriever & Reader

In [None]:
from haystack.nodes import EmbeddingRetriever

import os

# Read the key from the OPENAI_API_KEY environment variable so that no
# credential has to be stored in the notebook. The original placeholder is
# kept as the fallback value, so the cell behaves as before when the variable
# is not set.
openai_api_key = os.getenv("OPENAI_API_KEY", "openai_api_key")

# Retriever that embeds documents and queries through the OpenAI embedding model.
emb_retriever_openai = EmbeddingRetriever(
    document_store=document_store_dense_openai,
    batch_size=8,
    # The embedding size needs to match the FAISS document store: 1536.
    embedding_model="text-embedding-ada-002",
    api_key=openai_api_key,
    # NOTE(review): 1536 equals the embedding size; confirm it is really meant
    # as the maximum input sequence length.
    max_seq_len=1536,
)


### Embeddings aktualisieren

In [None]:
# Alternative kept for reference: index only the pieces of the single PDF.
#document_store_dense_openai.write_documents(docs_default)
# Store the preprocessed folder documents in the FAISS document store.
document_store_dense_openai.write_documents(docs)

# Compute embeddings for the stored documents with the OpenAI retriever.
document_store_dense_openai.update_embeddings(emb_retriever_openai, index=doc_index) # update_existing_embeddings=False

Writing Documents: 10000it [00:01, 8534.30it/s]
Updating Embedding:   0%|          | 0/406 [00:00<?, ? docs/s]
Calculating embeddings:   0%|          | 0/51 [00:00<?, ?it/s][A
Calculating embeddings:   2%|▏         | 1/51 [00:00<00:25,  2.00it/s][A
Calculating embeddings:   4%|▍         | 2/51 [00:01<00:26,  1.88it/s][A
Calculating embeddings:   6%|▌         | 3/51 [00:01<00:24,  2.00it/s][A
Calculating embeddings:   8%|▊         | 4/51 [00:02<00:24,  1.96it/s][A
Calculating embeddings:  10%|▉         | 5/51 [00:02<00:23,  1.92it/s][A
Calculating embeddings:  12%|█▏        | 6/51 [00:03<00:30,  1.46it/s][A
Calculating embeddings:  14%|█▎        | 7/51 [00:04<00:26,  1.66it/s][A
Calculating embeddings:  16%|█▌        | 8/51 [00:04<00:23,  1.80it/s][A
Calculating embeddings:  18%|█▊        | 9/51 [00:04<00:20,  2.07it/s][A
Calculating embeddings:  20%|█▉        | 10/51 [00:05<00:19,  2.08it/s][A
Calculating embeddings:  22%|██▏       | 11/51 [00:05<00:21,  1.85it/s][A
Calcula

In [None]:
from haystack import Pipeline
from haystack.nodes import PromptNode, PromptTemplate, AnswerParser, PromptModel
import os

# Read the key from the OPENAI_API_KEY environment variable instead of keeping
# a literal in the notebook; the original placeholder stays as the fallback so
# the cell behaves as before when the variable is not set.
openai_api_key = os.getenv("OPENAI_API_KEY", "openai_api_key")

# Prompt for the generation step: `{join(documents)}` is filled with the
# retrieved documents and `{query}` with the user's question. AnswerParser
# turns the model output into Answer objects (read later via `.answer`).
rag_prompt = PromptTemplate(
    prompt="""Synthesize a comprehensive answer from the following text for the given question.
                             Provide a clear and concise response in german that summarizes the key points and information presented in the text.
                             Your answer should be using the given words and be no longer than 50 words.
                             \n\n Related text: {join(documents)} \n\n Question: {query} \n\n Answer:""",
    output_parser=AnswerParser(),
)

# Prompt node that sends the filled template to the OpenAI chat model.
prompt_node = PromptNode(
    model_name_or_path="gpt-3.5-turbo-1106",
    api_key=openai_api_key,
    default_prompt_template=rag_prompt,
)


In [None]:
# Two-node pipeline: the query goes to the embedding retriever, whose output
# is the input of the prompt node.
pipe_gpt_openai = Pipeline()
pipe_gpt_openai.add_node(component=emb_retriever_openai, name="Retriever", inputs=["Query"])
# The prompt node is registered under the node name "Reader".
pipe_gpt_openai.add_node(component=prompt_node, name="Reader", inputs=["Retriever"])

### Query

In [None]:
# Run one example question through the pipeline with top_k=5 for the retriever.
res = pipe_gpt_openai.run(query="Welche Vertiefungsrichtungen gibt es im Studiengang AIN?", params={"Retriever": {"top_k": 5}})
# Show the text of the first returned answer as the cell output.
res["answers"][0].answer

Calculating embeddings: 100%|██████████| 1/1 [00:00<00:00,  2.66it/s]


'Im Studiengang AIN gibt es die Vertiefungsrichtungen Artificial Intelligence, Embedded Systems und Software Engineering.'

# Frontend

In [None]:
# top_k value handed to the retriever by answer_question.
n_retriever_results = 5
# Not referenced by the cells shown here.
# NOTE(review): presumably a leftover from an earlier reader setup — confirm or remove.
n_answers = 3

In [None]:
def answer_question(question, top_k=None):
    """Answer a question with the retrieval-augmented pipeline.

    Args:
        question: The user's question as text.
        top_k: Number of documents the retriever should return. When omitted,
            the notebook-level ``n_retriever_results`` is used.

    Returns:
        The text of the first answer produced by the pipeline, or
        ``"No answer found"`` when the question is blank or the pipeline
        returns no answers.
    """
    # A blank question has nothing to embed, so skip the pipeline (and the
    # remote API calls it makes) entirely.
    if not question or not question.strip():
        return "No answer found"

    if top_k is None:
        top_k = n_retriever_results

    prediction = pipe_gpt_openai.run(query=question, params={"Retriever": {"top_k": top_k}})
    # Tolerate a result without an "answers" entry instead of raising KeyError.
    answers = prediction.get("answers") if prediction else None
    return answers[0].answer if answers else "No answer found"


In [None]:
# Dummy answer function for exercising the chat layout without calling the
# pipeline. Defining it directly under the name `answer_question` would replace
# the pipeline-backed function whenever the notebook is run top to bottom, so
# it is only aliased to that name when the flag below is switched on.
USE_MOCK_ANSWERS = False


def mock_answer_question(question):
    """Return a fixed filler text, ignoring ``question`` (UI testing only)."""
    return "No answer found, asdlkjasdokn  asojdha sh ahfiouahsfi uahlkjasdokn  asojdha sh ahfiouahsfi uahlkjasdokn  asojdha sh ahfiouahsfi uah iufha isuhf iauhs iauhsfahsfip uha auhf puah fp9auh s9fuh aiushf ap9hf aiushfiah."


if USE_MOCK_ANSWERS:
    answer_question = mock_answer_question

In [None]:
from ipywidgets import Layout, VBox, HBox, Text, HTML, Button, Output
import random

def on_submit(sender):
    """Handle a click on the send button.

    Reads the question from ``input_text``, asks ``answer_question`` for a
    reply, appends both as chat bubbles to ``chat_display`` and clears the
    input field.

    Args:
        sender: The widget that triggered the callback (not used).
    """
    # Local import of the stdlib helper; unrelated to the ipywidgets `HTML` widget.
    from html import escape

    question = input_text.value
    # Nothing to ask: clear the field and do not add empty bubbles.
    if not question.strip():
        input_text.value = ''
        return

    output = answer_question(question)
    # The typed question and the generated answer are arbitrary text, so they
    # are escaped before being interpolated into the markup.
    output_html = HTML(f'<div style="background-color: #e5e5ea; border-radius: 0 10px 10px 10px; padding: 10px; margin-bottom: 10px; margin-right: 400px; margin-left: 100px;">{escape(str(output))}</div>')
    question_html = HTML(f'<div style="background-color: #dcf8c6; border-radius: 10px 0 10px 10px; padding: 10px; margin-left: 800px; margin-right: auto; ">{escape(question)}</div>')
    chat_display.children += (question_html, output_html)
    input_text.value = ''

# Text field for the user's message.
# NOTE(review): the previous border-radius settings were dropped because
# neither Layout nor the Text style has such a trait — confirm against the
# installed ipywidgets version.
input_text = Text(layout=Layout(width='50%', height='80px'), placeholder='Type your message...')

# Send button. ButtonStyle is configured through its own traits
# (`button_color`, `font_weight`) rather than CSS property names.
# NOTE(review): text colour and positioning are not set here; verify which
# traits the installed ipywidgets version offers for them.
button = Button(description='🠝', layout=Layout(width='7%', height='50px'))
button.style.button_color = '#00af91'
button.style.font_weight = 'bold'
button.on_click(on_submit)

# Input row (text field + button) centred below the chat history.
input_box = HBox([input_text, button], layout=Layout(justify_content='center'))
# Container that on_submit appends the question/answer bubbles to.
chat_display = VBox([], layout=Layout(align_items='flex-start', width='90%', margin='10px'))

display(VBox([chat_display, input_box]))


VBox(children=(VBox(layout=Layout(align_items='flex-start', margin='10px', width='90%')), HBox(children=(Text(…

Calculating embeddings: 100%|██████████| 1/1 [00:00<00:00,  2.47it/s]
Calculating embeddings: 100%|██████████| 1/1 [00:00<00:00,  2.23it/s]
Calculating embeddings: 100%|██████████| 1/1 [00:00<00:00,  3.23it/s]
Calculating embeddings: 100%|██████████| 1/1 [00:00<00:00,  3.63it/s]
Calculating embeddings: 100%|██████████| 1/1 [00:00<00:00,  4.87it/s]
Calculating embeddings: 100%|██████████| 1/1 [00:00<00:00,  3.45it/s]


#### Mögliche Fragen:
Welche Vertiefungen gibts bei AIN?

Welche Vertiefungen gibts bei WIN?

Welche Fächer hat man im ersten Semester in Wirtschaftsinformatik?

Was ist Geschäftsprozessmanagement?

Welche Studiengänge gibt es an der HTWG?

Wo und wie kann ich Geld auf meinen Studentenausweis laden?

Ich hab viel stress mit Lernen

Welche SPO habe ich, wenn ich seit Sommer 2021 AIN studiere?

Welche SPO habe ich, wenn ich seit Sommer 2023 WIN studiere?

