In [84]:
from datasets import load_dataset
from langchain.text_splitter import RecursiveCharacterTextSplitter
from json import loads
from transformers import LlamaTokenizer
from langchain.vectorstores import Qdrant
from langchain.embeddings import HuggingFaceBgeEmbeddings
from qdrant_client import models, QdrantClient
import time
from datetime import timedelta

# Create retriever corpus

In [1]:
# Number of documents in the wikipedia corpus (the file holds one JSON record per line).
# The context manager closes the file handle as soon as counting is done instead of
# leaving it open until the garbage collector gets to it.
with open("./data/corpora/wiki/enwiki-dec2021/text-list-100-sec.jsonl", mode="r", encoding="utf-8") as f:
    num = sum(1 for _ in f)
num

33176581

In [3]:
def len_func(example):
    """Return the length of ``example`` measured in whitespace-separated tokens."""
    tokens = example.split()
    return len(tokens)

In [88]:
# Build the pieces of the retriever: text splitter, embedding model and vector store.

# Chunk length is measured with len_func (whitespace-separated tokens), so each
# chunk holds at most 200 such tokens; consecutive chunks do not overlap.
text_splitter = RecursiveCharacterTextSplitter(
    length_function=len_func,
    chunk_size=200,
    chunk_overlap=0,
    is_separator_regex=False,
)

# NOTE(review): the tokenizer is not referenced by any other visible cell — confirm
# whether it (and its device_map argument) is still needed.
tokenizer = LlamaTokenizer.from_pretrained("./models/llama7b", device_map='cuda')
embedding = HuggingFaceBgeEmbeddings(model_name="./models/retriever/bge-base-en-v1.5")

# Connect to the local Qdrant server and create the collection that backs the store.
client = QdrantClient(url="http://localhost:6333")
client.create_collection(
    collection_name="retriever",
    vectors_config=models.VectorParams(
        distance=models.Distance.COSINE,
        size=768,  # must equal the length of the vectors produced by `embedding`
    ),
)

# LangChain wrapper that embeds texts with `embedding` and stores them in the collection.
db = Qdrant(client, collection_name="retriever", embeddings=embedding)

In [87]:
# Optional: drop the "retriever" collection if it is left over from an earlier session.
# The cell that builds `db` calls create_collection with the same name, so run this
# cell *before* that one; running it afterwards leaves `db` pointing at a collection
# that no longer exists.
# NOTE(review): create_collection presumably fails when the collection already
# exists, which is why this cell is needed — confirm against the qdrant-client docs.
client.delete_collection(collection_name="retriever")

True

In [67]:
# Sanity check: the length of a query embedding must match the vector size
# configured for the Qdrant collection.
probe_vector = embedding.embed_query("hello iouerhfu")
len(probe_vector)

768

In [89]:
# Smoke-test the retrieval path with a term taken from the title of the corpus's
# first document.
retriever = db.as_retriever()
retriever.invoke("Trichocladus")

[]

In [54]:
# Get test sample: peek at the first record of the corpus to inspect its fields.
# Only one line is read, so the (very large) file is never loaded into memory.
# The encoding is given explicitly so decoding does not depend on the platform default.
with open("./data/corpora/wiki/enwiki-dec2021/text-list-100-sec.jsonl", mode="r", encoding="utf-8") as f:
    first_line = f.readline()

# readline() returns "" only when the file is empty; in that case there is nothing to show.
if first_line:
    doc = loads(first_line)
    print(doc)

{'id': '0', 'title': 'Trichocladus crinitus', 'section': '', 'text': ' Trichocladus crinitus is a species of the genus Trichocladus, in the family Hamamelidaceae. It is also called black witch-hazel.'}


In [90]:
# Load wikipedia corpus (in steps of 1000 documents)
documents_count = 0
chunks_count = 0
start = time.time()
with open("./data/corpora/wiki/enwiki-dec2021/text-list-100-sec.jsonl", mode="r") as f:
    documents = []
    metadatas = []
    for line in f:
        
        json_line = loads(line)

        # Split document into chunks  
        documents.append(json_line["text"])
        metadatas.append({"title": json_line["title"], "section": json_line["section"]})  
        documents_count += 1

        if documents_count % 1_000 == 0:
            new_docs = text_splitter.create_documents(documents, metadatas)         
            chunks_count += len(new_docs)
            await db.aadd_documents(new_docs)
            # results = await db.aadd_documents(new_docs)
            # print(results)
            documents = []
            metadatas = []

        # Save retriever every 100_000 documents
        if documents_count % 1_000 == 0:
             elapsed = (time.time() - start)
             clock = str(timedelta(seconds=elapsed))
             print(f"Loaded {documents_count} documents, {chunks_count} chunks, {clock} elapsed")

    new_docs = text_splitter.create_documents(documents, metadatas)            
    chunks_count += len(new_docs)
    await db.aadd_documents(new_docs)
    

print("---------------------------------------------")
print()
elapsed = (time.time() - start)
clock = str(timedelta(seconds=elapsed))
print(f"Loaded {documents_count} documents, {chunks_count} chunks, {clock} elapsed")
print()
print("---------------------------------------------")    

Loaded 1000 documents, 1000 chunks, 0:00:03.381536 elapsed
Loaded 2000 documents, 2000 chunks, 0:00:06.598974 elapsed


CancelledError: 

In [None]:
# Load cc corpus as a stream, so records are fetched lazily while iterating.
# NOTE(review): confirm that the dataset id 'OSCAR-2201' together with the config
# "unshuffled_deduplicated_en" resolves on the Hugging Face Hub — verify before running.
dataset = load_dataset(
    'OSCAR-2201',
    "unshuffled_deduplicated_en",
    split='train',
    streaming=True,
)

# Shuffled view of the stream with a fixed seed.
shuffled_dataset = dataset.shuffle(seed=2024, buffer_size=10_000)

# NOTE(review): the sample below is taken from `dataset`, not `shuffled_dataset` —
# confirm that peeking at the unshuffled stream is intended.
first_record = next(iter(dataset))
print(first_record)

In [2]:
# Skeleton for indexing the cc corpus: walk the shuffled stream and stop after
# 360_000_000 pages. `count` advances on every page so the loop always terminates;
# the previous `while count < ...: pass` never changed `count` and spun forever.
count = 0
for page in shuffled_dataset:
    if count >= 360_000_000:
        break
    # TODO: Split into documents
    # TODO: Pass into document retriever
    count += 1

KeyboardInterrupt: 