## RAG pipeline

In [7]:
import pymupdf4llm
import pathlib
import os
from langchain_community.document_transformers import Html2TextTransformer
from langchain_community.document_loaders import AsyncHtmlLoader
from langchain_community.utilities import SearxSearchWrapper
from langchain.embeddings import HuggingFaceEmbeddings
from llama_index.core import StorageContext, load_index_from_storage
from llama_index.core import Settings
from tqdm import tqdm

In [9]:
# Locations used by the PDF -> Markdown conversion step.
# Both values keep their trailing slash because later cells build paths by string concatenation.
folder_path = "./pdfs/pt1_2/"    # input: source PDF files
md_folder_path = "./mds/pt1_2/"  # output: generated Markdown files

### Text extraction from PDFs

In [10]:
# Convert every PDF in `folder_path` to a Markdown file of the same stem in `md_folder_path`.
md_dir = pathlib.Path(md_folder_path)
# Create the output folder up front so the first write cannot fail on a fresh checkout.
md_dir.mkdir(parents=True, exist_ok=True)

# Only PDF files are converted (case-insensitive match); sorting makes the
# processing order reproducible, since os.listdir returns entries in arbitrary order.
files = sorted(name for name in os.listdir(folder_path) if name.lower().endswith('.pdf'))

for file in tqdm(files):
    file_path = os.path.join(folder_path, file)
    md_text = pymupdf4llm.to_markdown(file_path)
    # Swap only the extension: str.replace('.pdf', '.md') would also rewrite a
    # '.pdf' occurring inside the file stem and would miss upper-case '.PDF'.
    md_file_path = md_dir / (pathlib.Path(file).stem + '.md')
    md_file_path.write_bytes(md_text.encode())

  0%|          | 0/9 [00:00<?, ?it/s]

Processing ./pdfs/pt1_2/LU01_machining basics.pdf...


 11%|█         | 1/9 [00:06<00:49,  6.21s/it]

Processing ./pdfs/pt1_2/LU02_machining_defined.pdf...

 22%|██▏       | 2/9 [00:10<00:34,  4.95s/it]

]
Processing ./pdfs/pt1_2/LU03_machining_undefined.pdf...

 33%|███▎      | 3/9 [00:13<00:25,  4.30s/it]

]
Processing ./pdfs/pt1_2/LU04_coating_changing_material_properties.pdf...

 44%|████▍     | 4/9 [00:21<00:27,  5.54s/it]

]
Processing ./pdfs/pt1_2/LU05-LU08_Casting Technology.pdf...


 56%|█████▌    | 5/9 [00:33<00:31,  7.97s/it]

Processing ./pdfs/pt1_2/LU05_Waterjet_Plasmajet.pdf...

 67%|██████▋   | 6/9 [00:35<00:18,  6.01s/it]

]
Processing ./pdfs/pt1_2/LU06_Electron_Beam.pdf...


 78%|███████▊  | 7/9 [00:37<00:09,  4.52s/it]

Processing ./pdfs/pt1_2/LU07_Laser_based Joining and Cutting.pdf...


 89%|████████▉ | 8/9 [00:42<00:04,  4.88s/it]

Processing ./pdfs/pt1_2/LU08_Addtive_manufacturing.pdf...

100%|██████████| 9/9 [00:45<00:00,  5.02s/it]

]





In [23]:
# llamaIndex imports
from llama_index.core.node_parser import SentenceSplitter, TokenTextSplitter
from llama_index.core import SimpleDirectoryReader, VectorStoreIndex
from llama_index.core.node_parser import SemanticSplitterNodeParser
from llama_index.embeddings.ollama import OllamaEmbedding
from llama_index.core import Settings

### Sentence text splitter

In [32]:
# Load the source documents and cut them into chunks with SentenceSplitter.
# NOTE(review): this reads `folder_path` (the PDFs), not the Markdown files
# written to `md_folder_path` — confirm that is the intended input.
reader = SimpleDirectoryReader(folder_path)
docs = reader.load_data()

splitter = SentenceSplitter(chunk_size=1024, chunk_overlap=20)
nodes = splitter.get_nodes_from_documents(docs)

# Quick sanity check: show the second chunk and the total chunk count.
print(nodes[1].text)
print(f'length of nodes: {len(nodes)}')

Einführung
Introduction
length of nodes: 488


In [33]:
# Embedding backend: the `nomic-embed-text` model served by Ollama on localhost:11434.
ollama_embedding = OllamaEmbedding(model_name="nomic-embed-text:latest", base_url="http://localhost:11434")

# Register it on the global LlamaIndex Settings object.
Settings.embed_model = ollama_embedding

In [34]:
# Build the vector index over the chunks produced by the splitter.
# `embed_model` is the keyword VectorStoreIndex reads for the embedding model;
# an unrecognised keyword such as `embeddings=` is not applied to the index.
vector_index = VectorStoreIndex(nodes, embed_model=ollama_embedding)

# Persist the index to disk so it can be reloaded later without rebuilding it.
vector_index.storage_context.persist(persist_dir="vector_stores/sentence_splitter_pdf")

# Retriever returning the 10 most similar chunks per query.
retriever = vector_index.as_retriever(similarity_top_k=10)

In [None]:
# Alternative to building the index: restore it from the directory it was persisted to.
vector_store_path = "vector_stores/sentence_splitter_pdf"

storage_context = StorageContext.from_defaults(
    persist_dir=vector_store_path,
)
index = load_index_from_storage(storage_context)

# Same retrieval depth as the freshly built index: top 10 chunks per query.
retriever = index.as_retriever(similarity_top_k=10)

In [35]:
# Prompt template for answer generation. It is filled with str.format and has
# two placeholders: {context} (the retrieved text) and {question} (the user query).
# NOTE(review): the <|system|>/<|user|>/<|assistant|> and </s> markers presumably
# match the chat format of the target LLM — confirm against the model actually used.
RAG_PROMPT_TEMPLATE = """
<|system|>
Using the information contained in the context,
give a comprehensive answer to the question.
Respond only to the question asked, response should be concise and relevant to the question.
Provide the number of the source document when relevant.
If the answer cannot be deduced from the context, do not give an answer.</s>
<|user|>
Context:
{context}
---
Now here is the question you need to answer.

Question: {question}
</s>
<|assistant|>
"""

In [36]:
# Retrieve the most similar chunks for the query and assemble the final prompt.
query = 'what is the Solidification morphologies in metals?'

ret_docs = retriever.retrieve(query)

# RAG_PROMPT_TEMPLATE asks the model to cite "the number of the source document",
# so every retrieved chunk is labelled with its rank in the result list.
# A single join also avoids repeated string concatenation in a loop.
text_ret = ''.join(
    f'Document {doc_number}:::\n{hit.text}\n'
    for doc_number, hit in enumerate(ret_docs)
)

final_prompt = RAG_PROMPT_TEMPLATE.format(context=text_ret, question=query)

In [None]:
# Prompt template for grading a generated answer against a reference answer on a
# 1-5 rubric. Placeholders: {instruction}, {response}, {reference_answer}.
# The doubled braces ({{ ... }}) are literal braces escaped for str.format, so the
# judge sees "{write a feedback for criteria}" style hints in the output format.
# The expected reply format is "Feedback: ... [RESULT] <integer 1-5>".
EVALUATION_PROMPT = """###Task Description:
An instruction (might include an Input inside it), a response to evaluate, a reference answer that gets a score of 5, and a score rubric representing a evaluation criteria are given.
1. Write a detailed feedback that assess the quality of the response strictly based on the given score rubric, not evaluating in general.
2. After writing a feedback, write a score that is an integer between 1 and 5. You should refer to the score rubric.
3. The output format should look as follows: \"Feedback: {{write a feedback for criteria}} [RESULT] {{an integer number between 1 and 5}}\"
4. Please do not generate any other opening, closing, and explanations. Be sure to include [RESULT] in your output.

###The instruction to evaluate:
{instruction}

###Response to evaluate:
{response}

###Reference Answer (Score 5):
{reference_answer}

###Score Rubrics:
[Is the response correct, accurate, and factual based on the reference answer?]
Score 1: The response is completely incorrect, inaccurate, and/or not factual.
Score 2: The response is mostly incorrect, inaccurate, and/or not factual.
Score 3: The response is somewhat correct, accurate, and/or factual.
Score 4: The response is mostly correct, accurate, and factual.
Score 5: The response is completely correct, accurate, and factual.

###Feedback:"""

[
    [chunk1] : [chunk1_embedding],
    [chunk1_summary]: [summary1_embedding]
],
[
    [chunk2] : [chunk2_embedding],
    [chunk2_summary]: [summary2_embedding]
]

We can ask our main LLM to summarize each chunk. The resulting chunk and summary embeddings, structured as above, can then be stored as the actual vector index in a JSONL file.