# Create Embeddings
Embeddings are useful in the experiment notebooks for tasks such as topic clustering. They are computed and saved in this dedicated notebook because the computation can take quite some time, depending on the hardware and the model used.

In [None]:
import pickle
from tqdm.notebook import tqdm
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.embeddings import OllamaEmbeddings

## Input/Output Setup
Define where to load *pickled* documents from and how to store outputs (document splits and embedding vectors).

In [None]:
# input
# Pickle file that the "Load Documents" step opens and unpickles.
DOCS_PICKLE_FILE: str = "reddit-docs.pickle"
# output
# File-name prefixes for the saved artifacts; the save steps append
# "-<CHUNK_SIZE>-<CHUNK_OVERLAP>.pickle" to each prefix.
SPLITS_PICKLE_PREFIX: str = "reddit-splits"
VECS_PICKLE_PREFIX: str = "reddit-vecs"

## Load Documents

In [None]:
# Restore the previously pickled documents from disk.
# NOTE: unpickling can execute arbitrary code, so only load pickle files that
# were produced locally by a trusted notebook.
with open(DOCS_PICKLE_FILE, mode="rb") as docs_file:
    docs = pickle.load(docs_file)

# Quick sanity check of how many documents were restored.
print(f"{len(docs)} documents")

## Split Documents
Split all documents into chunks small enough to fit into the context (i.e. the input) of a local LLM.

**Dependencies**: `CHUNK_SIZE` and `CHUNK_OVERLAP` depend on the LLM used for the embeddings (its context size) and may also affect the overall outcome, depending on the experiment setup. They should be chosen carefully and are worth experimenting with.

In [None]:
# Splitter parameters: passed to RecursiveCharacterTextSplitter as
# chunk_size / chunk_overlap and also embedded in the output file names.
# NOTE(review): presumably measured in characters (the splitter's default
# length function) rather than tokens — confirm.
CHUNK_SIZE: int = 1000
CHUNK_OVERLAP: int = 200

### Compute Splits

In [None]:
# Build the splitter from the chunk parameters configured above.
# add_start_index=True presumably makes the splitter record each chunk's
# start offset in its metadata (per the LangChain docs) — confirm if relied on.
text_splitter = RecursiveCharacterTextSplitter(
    add_start_index=True,
    chunk_overlap=CHUNK_OVERLAP,
    chunk_size=CHUNK_SIZE,
)

# Chunk every loaded document and report how many chunks were produced.
splits = text_splitter.split_documents(docs)
print(f"{len(splits)} splits")

### Save Splits

In [None]:
# Persist the document splits. The chunk parameters are part of the file name
# so that runs with different splitter settings end up in different files.
splits_filename = f"{SPLITS_PICKLE_PREFIX}-{CHUNK_SIZE}-{CHUNK_OVERLAP}.pickle"
with open(splits_filename, "wb") as file:
    pickle.dump(splits, file)
# Report success only after the with-block has flushed and closed the file,
# so a failing write/close is never preceded by a "Wrote ..." message.
print(f"Wrote splits to {splits_filename}")

## Embeddings
**Experiments**: Different LLMs (obviously).

### Setup LLM
Uses *Ollama*: https://ollama.com/

In [None]:
# Model name handed to OllamaEmbeddings(model=...) when computing embeddings.
# NOTE(review): presumably the model must already be available in the local
# Ollama instance — confirm.
OLLAMA_MODEL: str = "llama3"

### Compute Embedding Vectors
Computes embedding vectors for all document *splits*.

In [None]:
# Compute one embedding vector per document split using the configured
# Ollama model. vecs[i] is the embedding of splits[i].
embeddings = OllamaEmbeddings(model=OLLAMA_MODEL)
vecs = []
# A plain loop (rather than one bulk call) keeps the tqdm progress bar
# meaningful and leaves the vectors computed so far in `vecs` if the run is
# interrupted.
for split in tqdm(splits):
    # embed_documents expects a list of strings. Passing the Document object
    # itself would embed its string representation (including metadata)
    # instead of the chunk text, so hand over page_content explicitly.
    vecs.append(embeddings.embed_documents([split.page_content])[0])

# Guard against an empty split list, where vecs[0] would raise IndexError.
if vecs:
    print(f"embedding space dim: {len(vecs[0])}")
else:
    print("no splits to embed")

### Save Embedding Vectors

In [None]:
# Persist the embedding vectors, using the same chunk-parameter suffix as the
# saved splits so the two files can be matched up.
# NOTE(review): the file name does not include OLLAMA_MODEL, so vectors from
# different models computed with the same chunk settings are written to the
# same file — consider adding the model name once downstream readers can
# handle the new name.
vecs_filename = f"{VECS_PICKLE_PREFIX}-{CHUNK_SIZE}-{CHUNK_OVERLAP}.pickle"
with open(vecs_filename, "wb") as file:
    pickle.dump(vecs, file)
# Report success only after the with-block has flushed and closed the file,
# so a failing write/close is never preceded by a "Wrote ..." message.
print(f"Wrote embedding vecs for splits to {vecs_filename}")