### Imports

In [2]:
from datasets import load_dataset
from langchain.text_splitter import RecursiveCharacterTextSplitter
from json import loads
from transformers import LlamaTokenizer
from langchain.vectorstores import Qdrant
from langchain.embeddings import HuggingFaceBgeEmbeddings
from qdrant_client import models, QdrantClient
import time
from datetime import timedelta

  from .autonotebook import tqdm as notebook_tqdm


### Disclaimer
This notebook is mainly for maintaining and testing the functions used to load the retrieval dumps. For the actual optimized loading, please use the scripts provided with this repository.

# Create retriever corpus

In [3]:
def len_func(example):
    """Return the number of whitespace-separated words in `example`."""
    words = example.split()
    return len(words)

In [4]:
# Text splitter used to cut documents into chunks
text_splitter = RecursiveCharacterTextSplitter(
    # Lengths are measured with len_func (whitespace-separated words),
    # so chunk_size is a word count, not a character count.
    chunk_size=200,
    chunk_overlap=0,
    length_function=len_func,
    is_separator_regex=False,
)

# Name of the Qdrant collection used throughout this notebook
col_name = "retriever_test"

# Tokenizer is not used in the cells visible here (line kept for reference):
#tokenizer = LlamaTokenizer.from_pretrained("../models/llama7b", device_map='cuda')
# Embedding model loaded from a local path and run on the CPU
embedding = HuggingFaceBgeEmbeddings(model_name="../models/retriever/bge-base-en-v1.5", model_kwargs={"device": "cpu"})

# Connect to the Qdrant server on localhost and wrap the collection as a LangChain vector store
client = QdrantClient(url="http://localhost:6333")
db = Qdrant(client, 
            collection_name=col_name,
            embeddings=embedding,
            )

## Testing the wiki dump

In [8]:
# Number of documents in the wikipedia corpus (the JSONL file holds one document per line).
# The file is opened in a `with` block so the handle is closed again after counting.
with open("../data/corpora/wiki/enwiki-dec2021/text-list-100-sec.jsonl", mode="r", encoding="utf-8") as corpus_file:
    num = sum(1 for _ in corpus_file)
num

33176581

In [20]:
# Optional reset: uncomment to drop an already existing collection first
# client.delete_collection(collection_name=col_name)

# Embed a dummy query once to learn the vector dimensionality of the embedding model
vector_size = len(embedding.embed_query("Test query"))

# Create the collection with cosine distance
collection_vectors_config = models.VectorParams(size=vector_size, distance=models.Distance.COSINE)
client.create_collection(collection_name=col_name, vectors_config=collection_vectors_config)

True

In [9]:
# Look up the collection and read how many points it currently holds
test_collection_info = client.get_collection(collection_name=col_name)
count = test_collection_info.points_count
count

0

In [10]:
# Query the collection through the LangChain retriever interface
retriever = db.as_retriever()
retriever.invoke("Trichocladus")

[]

In [15]:
# Get test sample: parse and show only the first line of the corpus.
# The encoding is given explicitly so the read does not depend on the platform default.
with open("../data/corpora/wiki/enwiki-dec2021/text-list-100-sec.jsonl", mode="r", encoding="utf-8") as f:
    first_line = next(f, None)  # None if the file is empty

if first_line is not None:
    doc = loads(first_line)
    print(doc)

{'id': '0', 'title': 'Trichocladus crinitus', 'section': '', 'text': ' Trichocladus crinitus is a species of the genus Trichocladus, in the family Hamamelidaceae. It is also called black witch-hazel.'}


## Load wiki dump

In [90]:
# Load wikipedia corpus (in steps of 1000 documents)
documents_count = 0
chunks_count = 0
start = time.time()
with open("../data/corpora/wiki/enwiki-dec2021/text-list-100-sec.jsonl", mode="r") as f:
    documents = []
    metadatas = []
    for line in f:
        
        json_line = loads(line)

        # Split document into chunks  
        documents.append(json_line["text"])
        metadatas.append({"title": json_line["title"], "section": json_line["section"]})  
        documents_count += 1

        if documents_count % 1_000 == 0:
            new_docs = text_splitter.create_documents(documents, metadatas)         
            chunks_count += len(new_docs)
            await db.aadd_documents(new_docs)
            # results = await db.aadd_documents(new_docs)
            # print(results)
            documents = []
            metadatas = []

        # Save retriever every 100_000 documents
        if documents_count % 1_000 == 0:
             elapsed = (time.time() - start)
             clock = str(timedelta(seconds=elapsed))
             print(f"Loaded {documents_count} documents, {chunks_count} chunks, {clock} elapsed")

        if documents_count == 4_000:
            break

    new_docs = text_splitter.create_documents(documents, metadatas)            
    chunks_count += len(new_docs)
    await db.aadd_documents(new_docs)
    

print("---------------------------------------------")
print()
elapsed = (time.time() - start)
clock = str(timedelta(seconds=elapsed))
print(f"Loaded {documents_count} documents, {chunks_count} chunks, {clock} elapsed")
print()
print("---------------------------------------------")    

Loaded 1000 documents, 1000 chunks, 0:00:03.381536 elapsed
Loaded 2000 documents, 2000 chunks, 0:00:06.598974 elapsed


CancelledError: 

## Testing the CC dump

In [8]:
# Reset: this cell drops the collection unconditionally before re-creating it
client.delete_collection(collection_name=col_name)

# Embed a dummy query once to learn the vector dimensionality of the embedding model
vector_size = len(embedding.embed_query("Test query"))

# Re-create the collection with cosine distance
cc_vectors_config = models.VectorParams(size=vector_size, distance=models.Distance.COSINE)
client.create_collection(collection_name=col_name, vectors_config=cc_vectors_config)

True

**Oscar Dataset**
en deduplicated

- Size of downloaded dataset files: 496.50 GB
- Size of the generated dataset: 1299.75 GB
- Total amount of disk used: 1796.24 GB

Total number of words in the dataset: 215,841,256,971

Total number of documents: 304,230,423

Needed space (1.2T)

RA-DIT: 100 words per example, 360M samples


In [14]:
# The fraction of the corpus (measured in words) that was loaded
TOTAL_WORDS = 215_841_256_971   # total number of words in the dataset
LOADED_SAMPLES = 110_000_000    # presumably the number of loaded chunks — TODO confirm
WORDS_PER_SAMPLE = 100          # presumably the chunk length in words — TODO confirm
frac = LOADED_SAMPLES*WORDS_PER_SAMPLE/TOTAL_WORDS

# The space approx. needed for the raw corpus part, scaled from 1200 for the full corpus
# (presumably GB — TODO confirm). Kept in a variable; it was previously computed and discarded.
raw_space = 1200*frac

frac

0.05096338000606593

In [5]:
# Stream the OSCAR "unshuffled_deduplicated_en" train split and draw one sample
# from a seeded shuffle
dataset = load_dataset('oscar', "unshuffled_deduplicated_en", split='train', streaming=True)
shuffled_dataset = dataset.shuffle(buffer_size=10_000, seed=2024)
sample_iterator = iter(shuffled_dataset)
sample = next(sample_iterator)

# Field names of a sample
keys = [key for key in sample.keys()]

keys

['id', 'text']

In [6]:
# Split the sampled document into chunks; its id is passed along as metadata
sample_texts = [sample["text"]]
sample_metadatas = [{"id": sample["id"]}]
text_splitter.create_documents(sample_texts, sample_metadatas)

[Document(page_content='3Ponds Farm is proud to partner with trainer Jan Uiterwyk, of Dutch Meadows Farms, to offer professional riding instruction to both children and adults, from beginning riders to serious, active competitors. For those who are interested in showing, we participate in a year-round show schedule. Instruction is available for Hunters, Jumpers, and Equitation. Throughout the year our 6 week Beginner Rider & Horsemanship classes, Summer Camps, and a variety of Clinics are also offered.\nJan has been an active member of the equestrian community for over forty years training with, and personally competing against some of the most legendary riders in the sport of show jumping today. His students and horses compete at many venues all over the US from local shows to the "A" circuit. Jan\'s riders have earned titles including, the #1 ranked child rider in the US, #1 Jumper USHJA State of Florida, and the National Champion for Marshall Sterling.', metadata={'id': 2415}),
 Doc

## Load the CC subsample dump

In [8]:
# Load cc corpus: streamed OSCAR split behind a seeded shuffle
dataset = load_dataset('oscar', "unshuffled_deduplicated_en", split='train', streaming=True)
shuffled_dataset = dataset.shuffle(buffer_size=10_000, seed=2024)

# The fixed seed makes this print the same first sample on every run
first_sample = next(iter(shuffled_dataset))
print(first_sample)

{'id': 2415, 'text': '3Ponds Farm is proud to partner with trainer Jan Uiterwyk, of Dutch Meadows Farms, to offer professional riding instruction to both children and adults, from beginning riders to serious, active competitors. For those who are interested in showing, we participate in a year-round show schedule. Instruction is available for Hunters, Jumpers, and Equitation. Throughout the year our 6 week Beginner Rider & Horsemanship classes, Summer Camps, and a variety of Clinics are also offered.\nJan has been an active member of the equestrian community for over forty years training with, and personally competing against some of the most legendary riders in the sport of show jumping today. His students and horses compete at many venues all over the US from local shows to the "A" circuit. Jan\'s riders have earned titles including, the #1 ranked child rider in the US, #1 Jumper USHJA State of Florida, and the National Champion for Marshall Sterling.\nMr. Uiterwyk is a founding memb

In [6]:
# Dry run of the common crawl (OSCAR) loading loop. Documents are streamed, split into
# chunks and buffered; once the buffer holds more than FLUSH_THRESHOLD chunks it is handed
# to `index_doc`. The loop stops as soon as at least MAX_CHUNKS chunks were handed over.
FLUSH_THRESHOLD = 10
MAX_CHUNKS = 100

documents_count = 0
chunks_count = 0
start = time.time()

def index_doc(docs, nr_documents):
    """Report one buffered batch of chunks (dry run: the batch is printed, not stored).

    Args:
        docs: The chunks of the current batch.
        nr_documents: Number of source documents the batch was built from.

    Reads the notebook-level `start`, `documents_count` and `chunks_count` for the
    running totals shown in the progress line.
    """
    process_start = time.time()
    # db.add_documents(docs)  # enable to actually write the batch to the collection
    print(docs)  # print the batch that was passed in, not the notebook-level buffer
    elapsed = (time.time() - start)
    clock = str(timedelta(seconds=elapsed))
    print(f"Loaded documents {nr_documents} (total: {documents_count}), with {len(docs)} chunks (total: {chunks_count}), {clock} elapsed (process: {time.time()-process_start}s)")


dataset = load_dataset('oscar', "unshuffled_deduplicated_en", split='train', streaming=True)
shuffled_dataset = iter(dataset.shuffle(buffer_size=10_000, seed=2024))

documents = []
chunk_nr = 0
doc_nr = 0

print("Loading documents...")
while chunks_count < MAX_CHUNKS:
    sample = next(shuffled_dataset)

    # Split document into chunks and add them to the buffer
    docs = text_splitter.create_documents([sample["text"]], [{"id": sample["id"]}])
    documents.extend(docs)
    documents_count += 1

    doc_nr += 1
    chunk_nr = len(documents)
    # This is the size of the whole buffer, not of the last document
    print("Nr of chunks in buffer", chunk_nr)

    if chunk_nr > FLUSH_THRESHOLD:
        # Count the batch first so that the total reported by index_doc includes it
        chunks_count += chunk_nr
        index_doc(documents, doc_nr)

        documents = []
        chunk_nr = 0
        doc_nr = 0

# chunks_count only changes right after a flush, so the buffer is empty when the loop ends
#db.add_documents(documents)
print(documents)

print("---------------------------------------------")
print()
elapsed = (time.time() - start)
clock = str(timedelta(seconds=elapsed))
print(f"Loaded {documents_count} documents, {chunks_count} chunks, {clock} elapsed")
print()
print("---------------------------------------------") 



Loading documents...
Nr of chunks in document 2
Nr of chunks in document 4
Nr of chunks in document 8
Nr of chunks in document 27
[Document(page_content='3Ponds Farm is proud to partner with trainer Jan Uiterwyk, of Dutch Meadows Farms, to offer professional riding instruction to both children and adults, from beginning riders to serious, active competitors. For those who are interested in showing, we participate in a year-round show schedule. Instruction is available for Hunters, Jumpers, and Equitation. Throughout the year our 6 week Beginner Rider & Horsemanship classes, Summer Camps, and a variety of Clinics are also offered.\nJan has been an active member of the equestrian community for over forty years training with, and personally competing against some of the most legendary riders in the sport of show jumping today. His students and horses compete at many venues all over the US from local shows to the "A" circuit. Jan\'s riders have earned titles including, the #1 ranked child 

In [18]:
# Number of indexed vectors in the collection named "retriever"
# NOTE(review): this is a hard-coded name, not `col_name` — confirm that this is intended
full_collection_info = client.get_collection(collection_name="retriever")
full_collection_info.indexed_vectors_count

30740568