- pdf_name = "Analysis1_Theorie"
- pdf_name = "Algebra1_Theorie"
- pdf_name = "Stochastik1_Theorie"
...

# 1) OCR Mistral 

for PDF Files in data/Aufgaben & 

In [28]:
# Import required libraries
from pathlib import Path
from pydantic import BaseModel
from mistralai import Mistral
import json
import re

In [29]:
# Initialize Mistral client with API key
from mistralai import Mistral

api_key = "k392FkLbQh8uCZDPjXA0g44yUnRnekPJ" # https://admin.mistral.ai/organization/api-keys
client = Mistral(api_key=api_key)

### 1.1 OCR Processor for PDFs in data/Analysis1

In [40]:
# Represents OCR output for a single PDF page
class PDFPageOutput(BaseModel):
    page_number: int
    markdown: str
    raw_page_json: dict

# Represents OCR output for a full PDF document
class PDFDocumentOutput(BaseModel):
    file_name: str
    pages: list[PDFPageOutput]


# PDF OCR Processor (similar to your image OCR structure)
class PDFOCRProcessor:
    def __init__(self, client: Mistral, output_dir: Path):
        # Store client and output directory
        self.client = client
        self.output_dir = Path(output_dir)
        self.output_dir.mkdir(parents=True, exist_ok=True)

    # Replace image placeholders with base64 inline images
    def _replace_images(self, markdown_str: str, images: dict[str, str]) -> str:
        for img_id, base64_data in images.items():
            markdown_str = markdown_str.replace(
                f"![{img_id}]({img_id})",
                f"![{img_id}](data:image/png;base64,{base64_data})"
            )
        return markdown_str

    # Main pipeline: PDF -> OCR -> markdown per page
    def process_pdf(self, pdf_path: Path) -> PDFDocumentOutput:
        # Validate PDF
        assert pdf_path.is_file(), f"PDF not found: {pdf_path}"

        # Upload PDF to Mistral
        uploaded = self.client.files.upload(
            file={
                "file_name": pdf_path.stem,
                "content": pdf_path.read_bytes(),
            },
            purpose="ocr",
        )

        # Generate signed URL for OCR
        signed_url = self.client.files.get_signed_url(
            file_id=uploaded.id,
            expiry=1
        )

        # Run OCR
        ocr_response = self.client.ocr.process(
            document={"document_url": signed_url.url},
            model="mistral-ocr-latest",
            include_image_base64=True
        )

        pages_output = []
        doc_dir = self.output_dir / pdf_path.stem
        doc_dir.mkdir(parents=True, exist_ok=True)

        # Process each page
        for i, page in enumerate(ocr_response.pages):
            page_number = i + 1

            # Extract images
            image_map = {img.id: img.image_base64 for img in page.images}

            # Replace placeholders in markdown
            markdown = self._replace_images(page.markdown, image_map)

            # Save markdown
            (doc_dir / f"page_{page_number}.md").write_text(
                markdown, encoding="utf-8"
            )

            # Save raw JSON
            raw_json = json.loads(page.model_dump_json())
            (doc_dir / f"page_{page_number}.json").write_text(
                json.dumps(raw_json, indent=4), encoding="utf-8"
            )

            # Add output model
            pages_output.append(
                PDFPageOutput(
                    page_number=page_number,
                    markdown=markdown,
                    raw_page_json=raw_json
                )
            )

        # Return structured document result
        return PDFDocumentOutput(
            file_name=pdf_path.stem,
            pages=pages_output
        )


### 1.2 Output PDFs for ANALYSIS Semester 1

In [41]:
from pathlib import Path
from mistralai import Mistral

# Initialize processor
processor = PDFOCRProcessor(
    client=client,
    output_dir=Path("data_processed/Analysis1")
)

# Directory containing your PDFs
pdf_dir = Path("data/Analysis1")
pdf_files = list(pdf_dir.glob("*.pdf"))

# Process every PDF in the folder
for pdf in pdf_files:
    print(f"Processing: {pdf.name}")
    processor.process_pdf(pdf)

print("All PDFs processed successfully.")


Processing: Übungsblatt-CDS-Mathe1_Ana8_Lösungen.pdf
Processing: Probeprüfung_CDS-Ana_HS2023.pdf
Processing: Übungsblatt-CDS-Mathe1_Ana1_Lösungen-2.pdf
Processing: Übungsblatt-CDS-Mathe1_Ana5_Lösungen.pdf
Processing: Übungsblatt-CDS-Mathe1_Ana11_Lösungen.pdf
Processing: Übungsblatt-CDS-Mathe1_Ana12_Lösungen.pdf
Processing: Übungsblatt-CDS-Mathe1_Ana2_Lösungen.pdf
Processing: Übungsblatt-CDS-Mathe1_Ana6_Lösungen.pdf
Processing: Übungsblatt-CDS-Mathe1_Ana13_Lösungen.pdf
Processing: Übungsblatt-CDS-Mathe1_Ana7_Lösungen.pdf
Processing: Übungsblatt-CDS-Mathe1_Ana3_Lösungen.pdf
Processing: Übungsblatt-CDS-Mathe1_Ana4_Lösungen.pdf
Processing: Übungsblatt-CDS-Mathe1_Ana10_Lösungen.pdf
Processing: analysis1.pdf
Processing: Übungsblatt-CDS-Mathe1_Ana9_Lösungen.pdf
All PDFs processed successfully.


PDFs for LINEARE ALGEBRA 1 
PDFs for STOCHASTIK 1
PDFs for ANALYSIS 2
PDFs for LINEARE ALGEBRA 2
PDFs for ANALYSIS 3
PDFs for LINEARE ALGEBRA 3
PDFs for STOCHASTIK 3


### 1.3 Merge all Theory pages

In [1]:
from pathlib import Path

def merge_theory_pages(theory_folder: Path, output_file: Path):
    full_text = ""

    for md_path in sorted(theory_folder.glob("page_*.md")):
        page_content = md_path.read_text(encoding="utf-8")
        full_text += f"\n\n# --- PAGE {md_path.stem} ---\n\n"
        full_text += page_content

    output_file.write_text(full_text, encoding="utf-8")
    print(f"Merged markdown saved to: {output_file}")


In [2]:
merge_theory_pages(
    theory_folder=Path("data_processed/Analysis1/analysis1"),
    output_file=Path("data_processed/Analysis1/Theorie_analysis1_full.md")
)

Merged markdown saved to: data_processed/Analysis1/Theorie_analysis1_full.md


# 2) Chunking

### 2.1) Base64 Bilder entfernen bei Theorie

In [3]:
import re

def remove_base64_images(text: str) -> str:
    """
    Removes all base64-encoded images from OCR markdown.
    """
    pattern = r"!\[.*?\]\(data:image\/.*?;base64,.*?\)"
    clean_text = re.sub(pattern, "", text)
    return clean_text


In [4]:
raw_text = Path("data_processed/Analysis1/Theorie_analysis1_full.md").read_text(encoding="utf-8")
clean_text = remove_base64_images(raw_text)

Path("data_processed/Analysis1/Theorie_analysis1_clean.md").write_text(clean_text, encoding="utf-8")
print("Cleaned markdown saved.")


Cleaned markdown saved.


### 2.1 Theorie Clean Markdown laden und rekursive chunken

In [77]:
from langchain_community.document_loaders import TextLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter


# load all theory pages
loader = TextLoader(
    "data_processed/Analysis1/Theorie_analysis1_clean.md", 
    encoding="utf-8")
docs = loader.load()

theory_splitter = RecursiveCharacterTextSplitter(
    chunk_size=800,
    chunk_overlap=100,
    separators=["\n## ", "\n### ", "\n", " ", ""]
)

theory_chunks = theory_splitter.split_documents(docs)
print("Chunks:", len(theory_chunks))
# Ergebis 73 Chunks ohne Bilder anstatt 2250


Chunks: 108


Chunks abspeichern für Evaluierung

In [78]:
# save chunks as markdown file

from pathlib import Path

out_path = Path("data_processed/Analysis1/Theorie_analysis1_chunks.md")

with out_path.open("w", encoding="utf-8") as f:
    for i, chunk in enumerate(theory_chunks):
        f.write(f"\n\n## Chunk {i+1}\n")
        f.write("-" * 80 + "\n")
        f.write(chunk.page_content)
        f.write("\n\n")


In [79]:
# save chunks as text file

with open("data_processed/Analysis1/Theorie_analysis1_chunks.txt", "w", encoding="utf-8") as f:
    for i, chunk in enumerate(theory_chunks):
        f.write(f"\n\n=== CHUNK {i+1} ===\n")
        f.write(chunk.page_content)
        f.write("\n\n")


### 2.2.1 Metadaten für Theorie

In [105]:
theory_docs_with_meta = []

for i, chunk in enumerate(theory_chunks):
    new_meta = chunk.metadata.copy()

    new_meta.update({
        "source": "data_processed/Analysis1/Theorie_analysis1_clean.md",
        "pdf_name": "analysis1",      # <-- ORIGINAL THEORY PDF NAME
        "chunk_id": i + 1,
        "type": "theory"
    })

    theory_docs_with_meta.append(
        type(chunk)(page_content=chunk.page_content, metadata=new_meta)
    )


In [106]:
# Chunks mit Metadaten alles in JSON abspeichern

import json

output_path = Path("data_processed/Analysis1/Theorie_analysis1_chunks_with_meta.json")

json_list = []
for d in theory_docs_with_meta:
    json_list.append({
        "chunk_id": d.metadata["chunk_id"],
        "pdf_name": d.metadata["pdf_name"],
        "source": d.metadata["source"],
        "type": d.metadata["type"],
        "content": d.page_content
    })

with output_path.open("w", encoding="utf-8") as f:
    json.dump(json_list, f, indent=4, ensure_ascii=False)

print("Saved:", output_path)


Saved: data_processed/Analysis1/Theorie_analysis1_chunks_with_meta.json


In [107]:
len(theory_docs_with_meta), theory_docs_with_meta[0].metadata


(108,
 {'source': 'data_processed/Analysis1/Theorie_analysis1_clean.md',
  'pdf_name': 'analysis1',
  'chunk_id': 1,
  'type': 'theory'})

### 2.3 Aufgabenblätter laden 

In [115]:
from pathlib import Path

base_dir = Path("data_processed/Analysis1")

# Ordner finden
uebungsblatt_dirs = [
    p for p in base_dir.iterdir()
    if p.is_dir() and "Übungsblatt" in p.name
]

print("Gefundene Übungsblätter:", len(uebungsblatt_dirs))
for d in uebungsblatt_dirs:
    print(" -", d)


Gefundene Übungsblätter: 13
 - data_processed/Analysis1/Übungsblatt-CDS-Mathe1_Ana12_Lösungen
 - data_processed/Analysis1/Übungsblatt-CDS-Mathe1_Ana6_Lösungen
 - data_processed/Analysis1/Übungsblatt-CDS-Mathe1_Ana7_Lösungen
 - data_processed/Analysis1/Übungsblatt-CDS-Mathe1_Ana13_Lösungen
 - data_processed/Analysis1/Übungsblatt-CDS-Mathe1_Ana10_Lösungen
 - data_processed/Analysis1/Übungsblatt-CDS-Mathe1_Ana4_Lösungen
 - data_processed/Analysis1/Übungsblatt-CDS-Mathe1_Ana9_Lösungen
 - data_processed/Analysis1/Übungsblatt-CDS-Mathe1_Ana2_Lösungen
 - data_processed/Analysis1/Übungsblatt-CDS-Mathe1_Ana5_Lösungen
 - data_processed/Analysis1/Übungsblatt-CDS-Mathe1_Ana11_Lösungen
 - data_processed/Analysis1/Übungsblatt-CDS-Mathe1_Ana8_Lösungen
 - data_processed/Analysis1/Übungsblatt-CDS-Mathe1_Ana3_Lösungen
 - data_processed/Analysis1/Übungsblatt-CDS-Mathe1_Ana1_Lösungen-2


In [116]:
# alles Seiten der Übungsblätter einzeln sammeln
def collect_md_pages(folder: Path):
    pages = sorted(
        folder.glob("page_*.md"),
        key=lambda x: int(x.stem.split("_")[1])
    )
    return pages

# test: Ana 12
test_folder = uebungsblatt_dirs[0]
pages = collect_md_pages(test_folder)

print("Gefundene Seiten:", len(pages))
for p in pages[:10]:
    print(" -", p)


Gefundene Seiten: 14
 - data_processed/Analysis1/Übungsblatt-CDS-Mathe1_Ana12_Lösungen/page_1.md
 - data_processed/Analysis1/Übungsblatt-CDS-Mathe1_Ana12_Lösungen/page_2.md
 - data_processed/Analysis1/Übungsblatt-CDS-Mathe1_Ana12_Lösungen/page_3.md
 - data_processed/Analysis1/Übungsblatt-CDS-Mathe1_Ana12_Lösungen/page_4.md
 - data_processed/Analysis1/Übungsblatt-CDS-Mathe1_Ana12_Lösungen/page_5.md
 - data_processed/Analysis1/Übungsblatt-CDS-Mathe1_Ana12_Lösungen/page_6.md
 - data_processed/Analysis1/Übungsblatt-CDS-Mathe1_Ana12_Lösungen/page_7.md
 - data_processed/Analysis1/Übungsblatt-CDS-Mathe1_Ana12_Lösungen/page_8.md
 - data_processed/Analysis1/Übungsblatt-CDS-Mathe1_Ana12_Lösungen/page_9.md
 - data_processed/Analysis1/Übungsblatt-CDS-Mathe1_Ana12_Lösungen/page_10.md


In [118]:
# Seiten mergen 

def merge_md_pages(folder: Path) -> str:
    pages = collect_md_pages(folder)
    merged = ""

    for p in pages:
        text = p.read_text(encoding="utf-8").strip()
        if len(text) > 4:
            merged += f"\n\n# --- PAGE {p.stem} ---\n\n{text}"

    return merged.strip()


In [119]:
# alle Übungsblätter mergen und speichern

merged_dir = base_dir / "merged_exercises"
merged_dir.mkdir(exist_ok=True)

for folder in uebungsblatt_dirs:
    merged_text = merge_md_pages(folder)

    out_path = merged_dir / f"{folder.name}_full.md"
    out_path.write_text(merged_text, encoding="utf-8")

    print("Gespeichert:", out_path)

Gespeichert: data_processed/Analysis1/merged_exercises/Übungsblatt-CDS-Mathe1_Ana12_Lösungen_full.md
Gespeichert: data_processed/Analysis1/merged_exercises/Übungsblatt-CDS-Mathe1_Ana6_Lösungen_full.md
Gespeichert: data_processed/Analysis1/merged_exercises/Übungsblatt-CDS-Mathe1_Ana7_Lösungen_full.md
Gespeichert: data_processed/Analysis1/merged_exercises/Übungsblatt-CDS-Mathe1_Ana13_Lösungen_full.md
Gespeichert: data_processed/Analysis1/merged_exercises/Übungsblatt-CDS-Mathe1_Ana10_Lösungen_full.md
Gespeichert: data_processed/Analysis1/merged_exercises/Übungsblatt-CDS-Mathe1_Ana4_Lösungen_full.md
Gespeichert: data_processed/Analysis1/merged_exercises/Übungsblatt-CDS-Mathe1_Ana9_Lösungen_full.md
Gespeichert: data_processed/Analysis1/merged_exercises/Übungsblatt-CDS-Mathe1_Ana2_Lösungen_full.md
Gespeichert: data_processed/Analysis1/merged_exercises/Übungsblatt-CDS-Mathe1_Ana5_Lösungen_full.md
Gespeichert: data_processed/Analysis1/merged_exercises/Übungsblatt-CDS-Mathe1_Ana11_Lösungen_full

### 2.3.1 Base64 Bildern entfernen für merged Blätter

In [120]:
import re
from pathlib import Path

def remove_base64_images(text: str) -> str:
    """
    Entfernt Base64-Bilder aus Markdown:
    ...
    """
    pattern = r"!\[.*?\]\(data:image\/.*?;base64,.*?\)"
    return re.sub(pattern, "", text)


merged_dir = Path("data_processed/Analysis1/merged_exercises")

cleaned_dir = Path("data_processed/Analysis1/cleaned_exercises")
cleaned_dir.mkdir(exist_ok=True)

for file in merged_dir.glob("*_full.md"):
    raw = file.read_text(encoding="utf-8")
    cleaned = remove_base64_images(raw)

    out = cleaned_dir / file.name.replace("_full.md", "_clean.md")
    out.write_text(cleaned, encoding="utf-8")

    print("Cleaned:", out)


Cleaned: data_processed/Analysis1/cleaned_exercises/Übungsblatt-CDS-Mathe1_Ana12_Lösungen_clean.md
Cleaned: data_processed/Analysis1/cleaned_exercises/Übungsblatt-CDS-Mathe1_Ana9_Lösungen_clean.md
Cleaned: data_processed/Analysis1/cleaned_exercises/Übungsblatt-CDS-Mathe1_Ana13_Lösungen_clean.md
Cleaned: data_processed/Analysis1/cleaned_exercises/Übungsblatt-CDS-Mathe1_Ana8_Lösungen_clean.md
Cleaned: data_processed/Analysis1/cleaned_exercises/Übungsblatt-CDS-Mathe1_Ana10_Lösungen_clean.md
Cleaned: data_processed/Analysis1/cleaned_exercises/Übungsblatt-CDS-Mathe1_Ana11_Lösungen_clean.md
Cleaned: data_processed/Analysis1/cleaned_exercises/Übungsblatt-CDS-Mathe1_Ana6_Lösungen_clean.md
Cleaned: data_processed/Analysis1/cleaned_exercises/Übungsblatt-CDS-Mathe1_Ana3_Lösungen_clean.md
Cleaned: data_processed/Analysis1/cleaned_exercises/Übungsblatt-CDS-Mathe1_Ana7_Lösungen_clean.md
Cleaned: data_processed/Analysis1/cleaned_exercises/Übungsblatt-CDS-Mathe1_Ana1_Lösungen-2_clean.md
Cleaned: data_

### 2.3.2 Metadaten aus Dateiname 

In [122]:
from langchain.schema import Document

cleaned_dir = Path("data_processed/Analysis1/cleaned_exercises")

exercise_docs_raw = []

for file in cleaned_dir.glob("*_clean.md"):
    text = file.read_text(encoding="utf-8")

    exercise_docs_raw.append(
        Document(
            page_content=text,
            metadata={
                "source": str(file),
                "uebungsblatt": file.stem.split("_clean")[0],
                "type": "exercise_raw"
            }
        )
    )

print("Geladene gesamt:", len(exercise_docs_raw))
print("Beispiel-Metadaten:")
exercise_docs_raw[0].metadata


Geladene gesamt: 13
Beispiel-Metadaten:


{'source': 'data_processed/Analysis1/cleaned_exercises/Übungsblatt-CDS-Mathe1_Ana6_Lösungen_clean.md',
 'uebungsblatt': 'Übungsblatt-CDS-Mathe1_Ana6_Lösungen',
 'type': 'exercise_raw'}

### 2.3.3 Chunking nach Headern bzw. Aufgaben

In [134]:
from langchain_text_splitters import MarkdownHeaderTextSplitter

headers_to_split = [
    ("#", "task"),
    ("##", "subtask")
]

splitter = MarkdownHeaderTextSplitter(headers_to_split)

exercise_docs_with_meta = []   

for doc in exercise_docs_raw:
    chunks = splitter.split_text(doc.page_content)

    for i, c in enumerate(chunks):
        c.metadata.update({
            "chunk_id": i + 1,
            "type": "exercise",
            "uebungsblatt": doc.metadata["uebungsblatt"],
            "source": doc.metadata["source"]
        })

    exercise_docs_with_meta.extend(chunks)

print("Erzeugte Chunks:", len(exercise_docs_with_meta))
print("Beispiel:", exercise_docs_with_meta[0].metadata)


Erzeugte Chunks: 226
Beispiel: {'task': 'Übungsblatt Ana 6', 'subtask': '1. Funktionsgraphen verschieben', 'chunk_id': 1, 'type': 'exercise', 'uebungsblatt': 'Übungsblatt-CDS-Mathe1_Ana6_Lösungen', 'source': 'data_processed/Analysis1/cleaned_exercises/Übungsblatt-CDS-Mathe1_Ana6_Lösungen_clean.md'}


In [135]:
import json

out_json = Path("data_processed/Analysis1/Exercises_chunks_with_meta.json")

json.dump(
    [
        {"content": d.page_content, "metadata": d.metadata}
        for d in exercise_docs_with_meta
    ],
    open(out_json, "w", encoding="utf-8"),
    ensure_ascii=False, indent=2
)

print("Saved:", out_json)


Saved: data_processed/Analysis1/Exercises_chunks_with_meta.json


# 3) Embedding

In [131]:
# Ollama Embedding Modell https://ollama.com/search?c=embedding
from langchain_community.embeddings import OllamaEmbeddings

# Embedding Model initialisieren
embedder = OllamaEmbeddings(model="mxbai-embed-large")

print("Embedding model loaded.")
embedder

Embedding model loaded.


OllamaEmbeddings(base_url='http://localhost:11434', model='mxbai-embed-large', embed_instruction='passage: ', query_instruction='query: ', mirostat=None, mirostat_eta=None, mirostat_tau=None, num_ctx=None, num_gpu=None, num_thread=None, repeat_last_n=None, repeat_penalty=None, temperature=None, stop=None, tfs_z=None, top_k=None, top_p=None, show_progress=False, headers=None, model_kwargs=None)


### 3.1 Text Embedding



In [None]:
# Embeddings berechnen für Theorie

# theory_docs_with_meta = ...  # <-- Ergebniss-Variable vom Chunken mit Meta

theory_embeddings = [
    embedder.embed_query(d.page_content)
    for d in theory_docs_with_meta
]

print("Created embeddings for", len(theory_embeddings), "theory chunks.")


Created embeddings for 108 theory chunks.


In [136]:
# Embeddings berechnen für Aufgaben

# exercise_docs_with_meta = ...  # <-- Ergebniss-Variable vom Chunken mit Meta

exercise_embeddings = [
    embedder.embed_query(d.page_content)
    for d in exercise_docs_with_meta
]

print("Created embeddings for", len(exercise_embeddings), "exercise chunks.")


Created embeddings for 226 exercise chunks.


### 3.2 Bild Embedding

# 4) Vectorstore


In [25]:
from langchain_community.vectorstores import Chroma

# Vectorstore erstellen
theory_db = Chroma.from_documents(
    documents=theory_docs_with_meta,
    embedding=embedder,
    persist_directory="vectorstore/theory_local"
)

theory_db.persist()
print("Vectorstore saved at vectorstore/theory_local")


Vectorstore saved at vectorstore/theory_local


  theory_db.persist()


# 5) Retriever / Ranking

- Basis Retriever
- MMR 
- Context Compression
- 


### 5.1 MMR Retriever
Haupt Retriever Basis

In [35]:
mmr_retriever = theory_db.as_retriever(
    search_type="mmr",
    search_kwargs={
        "k": 4,
        "fetch_k": 40,
        "lambda_mult": 0.25
    }
)



### 5.2 Query Rewriter
Gemini/Mistral erzeugt bessere Queries

In [36]:
from langchain.retrievers.multi_query import MultiQueryRetriever
from langchain_google_genai import ChatGoogleGenerativeAI

llm_rewriter = ChatGoogleGenerativeAI(
    model="gemini-1.5-flash",
    google_api_key=GOOGLE_API_KEY,
    temperature=0.0
)

multi_query_retriever = MultiQueryRetriever.from_llm(
    retriever=mmr_retriever,
    llm=llm_rewriter
)


### 5.3 Kompression Retriever

In [37]:
compressor = EmbeddingsFilter(
    embeddings=embedder,
    similarity_threshold=0.72    # stärkere Relevanz
)

compression_retriever = ContextualCompressionRetriever(
    base_retriever=multi_query_retriever,
    base_compressor=compressor
)


In [39]:
# finaler Retriever
super_retriever = compression_retriever


### 5.4 Nur zum Testen: Score Ranking (Debugging)

In [41]:
query = "Was ist eine injektive Abbildung?"
docs_scores = theory_db.similarity_search_with_score(query, k=5)

for rank, (doc, score) in enumerate(docs_scores, start=1):
    print(f"\nRANK {rank} — SCORE {score:.4f}")
    print("Chunk:", doc.metadata["chunk_id"])
    print(doc.page_content[:250])
    print("-" * 50)



RANK 1 — SCORE 0.5050
Chunk: 9
Verketting : $\left(f_{2} \circ f_{1}\right)(x)=f_{2}\left(f_{1}(x)\right)$
$f_{2}$ nach $f_{1}$


# --- PAGE page_22 ---

Bsp: $\quad f_{1}: \mathbb{N} \rightarrow \mathbb{R} \quad f_{2}: \mathbb{R}^{+} \rightarrow \mathbb{R}^{+}$

$$
n \rightarrow 
--------------------------------------------------

RANK 2 — SCORE 0.5471
Chunk: 10
$\rightarrow$ Sujetutivität kann duch Verkeainern der Werte - auf Bildmenge erreicht werden

Bsp: $\quad f: N \rightarrow N \quad f(x)=x^{2}$
injetitiv, nicht sujeltiv (weile zB 3,5 wird nicht aingenommen)
$\circ f: N \rightarrow \xi-1 ; 1 \xi \quad 
--------------------------------------------------

RANK 3 — SCORE 0.6229
Chunk: 25
## Visualisierung

mittels Funktionsgraphen : in katesisches koordinalensystem trägt man Punkt $(x, f(x))$ ein :


$\rightarrow$ wegen eindeutiger Zuordnung : Parallele zu y - Achse schneidet f höchstens einmal
$\rightarrow$ schneidet Parallele zu $x
-----------------------------------------------

# 6) LLM

In [None]:
# LLM initialieren 

from langchain_google_genai import ChatGoogleGenerativeAI

llm = ChatGoogleGenerativeAI(
    model="gemini-1.5-flash",
    google_api_key=GOOGLE_API_KEY,
    temperature=0.1
)


### 6.2 Prompt definieren 

In [45]:
from langchain_core.prompts import PromptTemplate

rag_prompt = """
Du bist ein mathematischer Assistent für Analysis 1.

Nutze AUSSCHLIESSLICH den folgenden Kontext, um die Frage zu beantworten.
Erfinde nicht dazu. Wenn Information fehlt, sage:
'Im Kontext nicht vorhanden.'

Antworte kurz, präzise, mathematisch korrekt und klar strukturiert.

KONTEXT:
{context}

FRAGE:
{question}

ANTWORT:
"""

prompt = PromptTemplate.from_template(rag_prompt)


### 6.3 RAG Chain

In [46]:
from langchain_core.output_parsers import StrOutputParser
from langchain.chains import RunnableParallel, RunnablePassthrough

rag_chain = (
    RunnableParallel({
        "context": lambda x: "\n\n".join(
            d.page_content for d in super_retriever.get_relevant_documents(x["question"])
        ),
        "question": RunnablePassthrough(),
    })
    | prompt
    | llm
    | StrOutputParser()
)


ImportError: cannot import name 'RunnableParallel' from 'langchain.chains' (/opt/anaconda3/envs/math-qa/lib/python3.11/site-packages/langchain/chains/__init__.py)

### 6.4 Test

In [None]:
response = rag_chain.invoke({"question": "Was ist eine injektive Abbildung?"})
print(response)


In [None]:
# Debugging

query = "Was ist eine injektive Abbildung?"

docs = super_retriever.get_relevant_documents(query)

for i, d in enumerate(docs[:5]):
    print(f"\n----- DOC {i+1} (source page: {d.metadata.get('page_number', '?')}) -----")
    print(d.page_content[:500])


Retrying langchain_google_genai.chat_models._chat_with_retry.<locals>._chat_with_retry in 2.0 seconds as it raised NotFound: 404 models/gemini-1.5-flash is not found for API version v1beta, or is not supported for generateContent. Call ListModels to see the list of available models and their supported methods..


NotFound: 404 models/gemini-1.5-flash is not found for API version v1beta, or is not supported for generateContent. Call ListModels to see the list of available models and their supported methods.

rag chain als py document und dazu dann frontend anbinden