In [None]:
import pickle
from tqdm.notebook import tqdm
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.embeddings import OllamaEmbeddings

# Pickle file from which the previously downloaded documents are loaded.
DOCS_PICKLE_FILE = "reddit-docs.pickle"
# Filename prefixes for the pickled outputs. The chunk size and chunk overlap
# are appended when the filenames are built, so runs with different chunking
# settings end up in separate files.
SPLITS_PICKLE_PREFIX = "reddit-splits"
VECS_PICKLE_PREFIX = "reddit-vecs"

Load previously downloaded docs.

In [None]:
# Read the previously downloaded documents back from their pickle file.
# NOTE: unpickling can execute arbitrary code, so only load files you
# produced yourself.
with open(DOCS_PICKLE_FILE, mode="rb") as docs_fh:
    docs = pickle.load(docs_fh)

print(len(docs), "documents")

Split all docs into chunks small enough to fit into the context (or input) of a local LLM.

In [None]:
# Chunking parameters; they are also baked into the output filenames so that
# results produced with different settings do not overwrite each other.
CHUNK_SIZE = 1000
CHUNK_OVERLAP = 200

# add_start_index=True asks the splitter to record where each chunk starts
# in its source document (see the langchain text-splitter docs).
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=CHUNK_SIZE, chunk_overlap=CHUNK_OVERLAP, add_start_index=True
)

splits = text_splitter.split_documents(docs)
print(len(splits), "splits")

# Persist the splits so later steps can reuse them without re-splitting.
splits_filename = "{}-{}-{}.pickle".format(
    SPLITS_PICKLE_PREFIX, CHUNK_SIZE, CHUNK_OVERLAP
)
with open(splits_filename, mode="wb") as splits_fh:
    pickle.dump(splits, splits_fh)
    print(f"Wrote splits to {splits_filename}")

In [None]:
# Embed every split with a local Ollama model and persist the vectors.
embeddings = OllamaEmbeddings(model="llama3")

# embed_documents() expects a list of strings, so pass each chunk's text
# (page_content) rather than the Document object itself; otherwise the
# Document's string form (text wrapper plus metadata) is what gets embedded.
# Embedding one split per call lets the tqdm bar advance once per split.
vecs = [
    embeddings.embed_documents([split.page_content])[0]
    for split in tqdm(splits)
]

# Guard the dimensionality report: with no splits, vecs[0] would raise
# IndexError.
if vecs:
    print(f"embedding space dim: {len(vecs[0])}")
else:
    print("No splits to embed; writing an empty vector list.")

# The filename carries the chunking parameters so the vectors can be matched
# to the splits file they were computed from.
vecs_filename = f"{VECS_PICKLE_PREFIX}-{CHUNK_SIZE}-{CHUNK_OVERLAP}.pickle"
with open(vecs_filename, "wb") as file:
    pickle.dump(vecs, file)
    print(f"Wrote embedding vecs for splits to {vecs_filename}")