In [5]:
# Embedding

# Importing Libraries
from transformers import AutoTokenizer, AutoModel
import torch
import torch.nn.functional as F
from pathlib import Path

# Mean Pooling Function
# This function is used to create a single embedding for each sentence by averaging the embeddings of all its tokens. The attention mask makes sure that padding tokens don't affect the final result.
def mean_pooling(model_output, attention_mask):
    """Average token embeddings per sequence, ignoring masked-out positions.

    Args:
        model_output: Indexable model result whose element 0 holds the
            token-level embeddings (indexed here along dim 1 for pooling).
        attention_mask: Mask tensor with one entry per token position;
            positions with value 0 do not contribute to the average.

    Returns:
        Tensor with one pooled embedding per sequence (dim 1 reduced away).
    """
    token_embeddings = model_output[0]

    # Broadcast the mask across the embedding dimension so it can be
    # multiplied element-wise with the token embeddings.
    mask = attention_mask.unsqueeze(-1).expand(token_embeddings.shape).float()

    # Sum of the embeddings of the kept tokens, and how many tokens were kept.
    masked_sum = torch.sum(token_embeddings * mask, dim=1)
    token_count = torch.sum(mask, dim=1)

    # The clamp keeps a fully masked sequence from dividing by zero.
    return masked_sum / torch.clamp(token_count, min=1e-9)


# Collecting the markdown sources that will be embedded
base = Path("data_processed/Analysis1")
theory_file = base / "Theory_analysis1_full.md"
exercise_dir = base / "merged_exercises"

# The theory file comes first (when present), followed by the exercise files
# in sorted path order; the full text of each file is stored as one chunk.
chunks = []
if theory_file.exists():
    chunks.append(theory_file.read_text(encoding="utf-8"))
if exercise_dir.exists():
    chunks.extend(
        md_file.read_text(encoding="utf-8")
        for md_file in sorted(exercise_dir.glob("*.md"))
    )

print("Loaded", len(chunks), "chunks from markdown files")

# Failing early with a clear message: without any chunk there is nothing to
# tokenize, and the preview further down indexes chunks[0].
if not chunks:
    raise ValueError("No markdown chunks were loaded; nothing to embed")

# Loading the pretrained transformer model and tokenizer from HuggingFace
tokenizer = AutoTokenizer.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')
model = AutoModel.from_pretrained('sentence-transformers/all-MiniLM-L6-v2')

# Tokenizing the chunks: this gives us input IDs, attention masks, etc.
# NOTE: truncation=True cuts every chunk at the tokenizer's maximum length, so
# text beyond that limit does not contribute to the chunk's embedding.
encoded_input = tokenizer(chunks, padding=True, truncation=True, return_tensors='pt')

# Executing the model (no gradients needed for inference)
with torch.no_grad():
    model_output = model(**encoded_input)

# Applying mean pooling with the attention mask produced by the tokenizer
# (fixed: this previously referenced the undefined name `encodedInput`)
sentence_embeddings = mean_pooling(model_output, encoded_input['attention_mask'])

# Normalizing each embedding to unit L2 norm
sentence_embeddings = F.normalize(sentence_embeddings, p=2, dim=1)

# Using / inspecting the embeddings (to check)
print("Shape of embeddings tensor:", sentence_embeddings.shape)
print("Number of chunks:", len(chunks))

# Printing the first chunk and embedding (to test)
print("First chunk preview:")
print(chunks[0][:500], "...")
print("\nFirst embedding vector (first 5 values):")
print(sentence_embeddings[0][:5])

Loaded 13 chunks from markdown files
Shape of embeddings tensor: torch.Size([13, 384])
Number of chunks: 13
First chunk preview:
# --- PAGE page_1 ---

# -`sleq`j_rr ?1_10 Computational and Data Science BSc HS 2023 

## Lösungen

Mathematik 1

## 1. Aussagen über Ableitungen von trigonometrischen Funktionen

| Welche der folgenden Aussagen sind wahr und welche falsch? | wahr | falsch |
| :-- | :-- | :-- |
| a) Die Ableitungen der trigonometrischen Funktionen sind nur bezüglich des <br> Gradmass definiert. | 0 | $\bullet$ |
| b) Die Ableitungen der trigonometrischen Funktionen sind nur bezüglich des <br> Bogenmass definier ...

First embedding vector (first 5 values):
tensor([-0.0605,  0.0077, -0.0434, -0.0493, -0.0028])
