### Load Datasets

In [None]:
from datasets import load_dataset

data = load_dataset("wikipedia", "20220301.simple", split="train[:10]")
data

In [83]:
from fastembed import TextEmbedding

from datasets import load_dataset

data = load_dataset("wikipedia", "20220301.simple", split="train[:10]")
embedding = TextEmbedding()

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


Fetching 9 files:   0%|          | 0/9 [00:00<?, ?it/s]

In [84]:
text = data[0]["text"]
embedding.embed(text)

<generator object TextEmbedding.embed at 0x7e557409b5b0>

### Tokenization

In [None]:
import tiktoken  # !pip install tiktoken

tokenizer = tiktoken.get_encoding("p50k_base")


# create the length function
def tiktoken_len(text):
    tokens = tokenizer.encode(text, disallowed_special=())
    return len(tokens)


tiktoken_len(
    "hello I am a chunk of text and using the tiktoken_len function "
    "we can find the length of this chunk of text in tokens"
)

### Split to Chunk

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=400,
    chunk_overlap=20,
    length_function=tiktoken_len,
    separators=["\n\n", "\n", " ", ""],
)

chunks = text_splitter.split_text(data[6]["text"])[:3]
chunks

### Embeddings

In [None]:
from langchain_community.embeddings.fastembed import FastEmbedEmbeddings

embed = FastEmbedEmbeddings()
texts = ["this is the first chunk of text", "then another second chunk of text is here"]

res = embed.embed_documents(texts)
len(res), len(res[0])

In [None]:
from langchain.embeddings.ollama import OllamaEmbeddings

embed = OllamaEmbeddings(model="nomic-embed-text")
texts = ["this is the first chunk of text", "then another second chunk of text is here"]

res = embed.embed_documents(texts)
len(res), len(res[0])

### Vector Store

In [None]:
import chromadb

chroma_client = chromadb.Client()
collection = chroma_client.create_collection(name="langchain-retrieval-augmentation",)

In [None]:
from tqdm.auto import tqdm
from uuid import uuid4

batch_limit = 1000

texts = []
metadatas = []

for i, record in enumerate(tqdm(data)):
    # first get metadata fields for this record
    metadata = {
        "wiki-id": str(record["id"]),
        "source": record["url"],
        "title": record["title"],
    }
    # now we create chunks from the record text
    record_texts = text_splitter.split_text(record["text"])
    # create individual metadata dicts for each chunk
    record_metadatas = [
        {"chunk": j, "text": text, **metadata} for j, text in enumerate(record_texts)
    ]
    # append these to current batches
    texts.extend(record_texts)
    metadatas.extend(record_metadatas)
    # if we have reached the batch_limit we can add texts
    if len(texts) >= batch_limit:
        ids = [str(uuid4()) for _ in range(len(texts))]
        embeds = embed.embed_documents(texts)
        collection.upsert(
            ids=ids,
            embeddings=embeds,
            metadatas=metadatas,
            documents=texts,
        )
        texts = []
        metadatas = []

In [None]:
import asyncio

async def embed_text(text):
    embeddings = await embed(text)
    return embeddings

# Example usage
async def main():
    text = "This is a sample text."
    embeddings = await embed_text(text)
    print(embeddings)

# Run the async function
await main()


In [None]:
import ollama
import chromadb

client = chromadb.Client()
# collection = client.create_collection(name="ollama")

# store each document in a vector embedding database
for i, record in enumerate(tqdm(data)):
    d = record["text"]
    response = ollama.embeddings(model="nomic-embed-text", prompt=d)
    embedding = response["embedding"]
    collection.add(ids=[str(i)], embeddings=[embedding], documents=[d])

In [None]:
import ollama
import chromadb

client = chromadb.HttpClient()
collection = client.create_collection(name="ollama")


# store each document in a vector embedding database
batch_size = 100  # adjust this based on your system's memory
for i in range(0, len(data), batch_size):
    batch = data[i : i + batch_size]
    ids = [str(j) for j in range(i, i + len(batch))]
    documents = [data[i]["text"] for i in range(i, i + len(batch))]
    embeddings = [
        ollama.embeddings(model="nomic-embed-text", prompt=d)["embedding"]
        for d in documents
    ]
    collection.add(ids=ids, embeddings=embeddings, documents=documents)

In [None]:
# list all collections
client.list_collections()

# get the collection
collection = client.get_collection("ollama")

# get the document by id
doc = collection.get("1")
doc

In [None]:
from datasets import load_dataset
docs = load_dataset(f"Cohere/wikipedia-22-12-simple-embeddings", split="train[:100]")