sudo docker pull qdrant/qdrant

sudo docker run -p 6333:6333 -p 6334:6334 \
    -v $(pwd)/qdrant_storage:/qdrant/storage:z \
    qdrant/qdrant

In [1]:
import os
from PyPDF2 import PdfReader
import numpy as np

def extract_text_from_pdf(pdf_path):
    """Return the text of every page of the PDF at ``pdf_path`` as one string.

    The file is opened with ``PdfReader``; the text extracted from each page
    is concatenated in page order with no separator between pages.
    """
    pdf = PdfReader(pdf_path)
    page_texts = [page.extract_text() for page in pdf.pages]
    return "".join(page_texts)

def extract_text_from_pdfs_in_directory(directory):
    """Extract the text of every PDF in ``directory`` into a sibling ``.txt`` file.

    For each entry of ``directory`` whose name ends in ``.pdf`` (matched
    case-insensitively, so ``REPORT.PDF`` is picked up as well), the text
    returned by ``extract_text_from_pdf`` is written to a file with the same
    base name and a ``.txt`` extension in the same directory. The output file
    is opened in ``"w"`` mode, so an existing file of that name is overwritten.
    Only the direct entries of ``directory`` are considered.

    Args:
        directory: Path of the directory to scan.

    Returns:
        None.
    """
    for filename in os.listdir(directory):
        # Compare on the lower-cased name so upper/mixed-case extensions
        # (".PDF", ".Pdf") are not silently skipped.
        if not filename.lower().endswith(".pdf"):
            continue
        pdf_path = os.path.join(directory, filename)
        extracted_text = extract_text_from_pdf(pdf_path)
        txt_filename = os.path.splitext(filename)[0] + ".txt"
        txt_filepath = os.path.join(directory, txt_filename)
        with open(txt_filepath, "w") as txt_file:
            txt_file.write(extracted_text)

# Directory containing the source PDF files (a relative path, so it is
# resolved against the notebook's working directory).
directory_path = "Docs/"

# Write a .txt file next to each PDF in that directory, holding the PDF's
# extracted text.
extract_text_from_pdfs_in_directory(directory_path)

In [2]:
import os
from nltk.tokenize import sent_tokenize

directory_path = "Docs"

# List all .txt files in the directory. os.listdir() returns entries in
# arbitrary order, so the names are sorted: the position of a sentence in
# all_sentences is later used as its point id in the vector store, and must
# therefore be the same every time this cell is re-run.
txt_files = sorted(file for file in os.listdir(directory_path) if file.endswith('.txt'))

# Sentences from all files, concatenated in (sorted) file order
all_sentences = []

# Read each text file, split it into sentences, and collect them
for txt_file in txt_files:
    file_path = os.path.join(directory_path, txt_file)
    with open(file_path, "r") as file:
        text = file.read()
    # Only `text` is needed from here on, so tokenizing happens after the
    # file has been closed.
    sentences = sent_tokenize(text)
    all_sentences.extend(sentences)

# Print the first few sentences as an example
print(all_sentences[:10])  # Print first 10 sentences


['The Claude 3 Model Family: Opus, Sonnet, Haiku\nAnthropic\nAbstract\nWe introduce Claude 3, a new family of large multimodal models – Claude 3 Opus , our\nmost capable offering, Claude 3 Sonnet , which provides a combination of skills and speed,\nandClaude 3 Haiku , our fastest and least expensive model.', 'All new models have vision\ncapabilities that enable them to process and analyze image data.', 'The Claude 3 family\ndemonstrates strong performance across benchmark evaluations and sets a new standard on\nmeasures of reasoning, math, and coding.', 'Claude 3 Opus achieves state-of-the-art results\non evaluations like GPQA [1], MMLU [2], MMMU [3] and many more.', 'Claude 3 Haiku\nperforms as well or better than Claude 2 [4] on most pure-text tasks, while Sonnet and\nOpus significantly outperform it.', 'Additionally, these models exhibit improved fluency in\nnon-English languages, making them more versatile for a global audience.', 'In this report,\nwe provide an in-depth analysis o

In [3]:
import ray
from fastembed import TextEmbedding
from typing import List
import numpy as np
import time

# Start the Ray runtime used by the worker actors below. With
# ignore_reinit_error=True, re-running this cell while Ray is already
# initialized does not raise an error.
ray.init(ignore_reinit_error=True)

@ray.remote
class EmbeddingWorker:
    """Ray actor that owns one ``BAAI/bge-base-en`` fastembed model.

    The model is created once in ``__init__`` and reused for every
    ``embed_documents`` call made on the same actor.
    """

    def __init__(self):
        self.embedding_model = TextEmbedding(model_name="BAAI/bge-base-en")

    def embed_documents(self, documents):
        """Embed each string in ``documents``.

        Args:
            documents: List of strings to embed.

        Returns:
            A list with one NumPy array per document, in input order. Each
            array wraps a single embedding vector in a one-element outer
            axis (the layout the original per-document call produced), so
            callers can keep unwrapping it with ``[0]``.
        """
        # Pass the whole chunk to the model in one call instead of one call
        # per document, so fastembed can batch the inputs itself (see the
        # `batch_size` parameter of `TextEmbedding.embed`).
        return [np.array([vector]) for vector in self.embedding_model.embed(documents)]

# Define the number of workers
num_workers = 4  # Adjust this according to your resources
documents = all_sentences

# Split documents into at most num_workers chunks. The chunk size is rounded
# UP: with floor division, a length that is not a multiple of num_workers
# yields one extra trailing chunk which zip() below would silently drop
# (leaving those documents without embeddings), and fewer documents than
# workers yields a chunk size of 0, which range() rejects.
chunk_size = max(1, (len(documents) + num_workers - 1) // num_workers)
document_chunks = [documents[i:i+chunk_size] for i in range(0, len(documents), chunk_size)]

# Start the workers
embedding_workers = [EmbeddingWorker.remote() for _ in range(num_workers)]

# Perform embedding generation in parallel (one chunk per worker)
start_time = time.time()
embedding_tasks = [worker.embed_documents.remote(chunk) for worker, chunk in zip(embedding_workers, document_chunks)]
embeddings = ray.get(embedding_tasks)
end_time = time.time()

# Flatten the per-worker result lists into a single list; ray.get returns
# results in task order, so document order is preserved.
embeddings = [embedding for sublist in embeddings for embedding in sublist]

print("Time taken to generate embeddings with Ray Distributed Computing:", end_time - start_time, "seconds")

# Shutdown Ray
ray.shutdown()
# Each worker result wraps its vector in a one-element outer axis; unwrap it
# so every entry is the bare embedding vector.
embeddings = [sublist[0] for sublist in embeddings]

# List positions are used as point ids downstream, so every document must
# have exactly one embedding.
assert len(embeddings) == len(documents), "embedding count does not match document count"

2024-03-09 15:28:00,552	INFO worker.py:1715 -- Started a local Ray instance. View the dashboard at [1m[32m127.0.0.1:8265 [39m[22m


Time taken to generate embeddings with Ray Distributed Computing: 195.92240643501282 seconds


In [4]:
from qdrant_client import QdrantClient
from qdrant_client.http.models import Distance, VectorParams
from qdrant_client.models import PointStruct

# Alternative kept for reference: a path-based client instead of a server
# connection (presumably Qdrant's local on-disk mode — confirm before use).
# client = QdrantClient(path="./DB")
# Connect to the Qdrant server on localhost:6333, the HTTP port published by
# the `docker run` command at the top of the notebook.
client = QdrantClient("localhost", port=6333)
collection_name = 'fastembed_collection'
# NOTE(review): recreate_collection presumably drops any existing collection
# of this name before creating it — confirm that wiping stored points on every
# run of this cell is intended.
# size=768 has to equal the length of the vectors uploaded into this
# collection; presumably the output dimension of BAAI/bge-base-en — verify if
# the embedding model is ever changed.
client.recreate_collection(
    collection_name=collection_name,
    vectors_config=VectorParams(size=768, distance=Distance.COSINE),
)

True

In [5]:
# Upload one point per entry of `embeddings`. The point id is the entry's
# position in that list; the search cell later uses the id to index into
# all_sentences, so `embeddings` must be in the same order as all_sentences.
# NOTE(review): the payload keys "color" / "rand_number" look like placeholder
# data — nothing in the visible notebook reads them; confirm they are needed.
client.upload_points(
    collection_name=collection_name,
    points=[
        PointStruct(
            id=idx,
            vector=vector.tolist(),
            payload={"color": "red", "rand_number": idx % 10}
        )
        for idx, vector in enumerate(embeddings)
    ]
)

In [10]:
import time
from fastembed import TextEmbedding
import ray 
import numpy as np

# Start the Ray runtime again (an earlier cell shut it down). With
# ignore_reinit_error=True, re-running this cell while Ray is already
# initialized does not raise an error.
ray.init(ignore_reinit_error=True)

@ray.remote
class EmbeddingWorker:
    """Ray actor that owns one ``BAAI/bge-base-en`` fastembed model for queries.

    The model is created once in ``__init__`` and reused for every
    ``embed_query`` call made on the same actor.
    """

    def __init__(self):
        self.embedding_model = TextEmbedding(model_name="BAAI/bge-base-en")

    def embed_query(self, documents):
        """Embed each query string in ``documents``.

        Args:
            documents: List of query strings to embed.

        Returns:
            A list with one NumPy array per query, in input order. Each array
            wraps a single embedding vector in a one-element outer axis (the
            layout the original per-query call produced), so callers can keep
            unwrapping it with ``[0]``.
        """
        # Pass the whole chunk to the model in one call instead of one call
        # per query, so fastembed can batch the inputs itself (see the
        # `batch_size` parameter of `TextEmbedding.embed`).
        return [np.array([vector]) for vector in self.embedding_model.embed(documents)]

# Define the number of workers
num_workers = 2  # Adjust this according to your resources
query = ["Can AI Models be hacked?","How to secure AI models?"]

# Split the queries into at most num_workers chunks. The chunk size is rounded
# UP: with floor division, a length that is not a multiple of num_workers
# yields an extra trailing chunk that zip() below would silently drop, and
# fewer queries than workers yields a chunk size of 0, which range() rejects.
chunk_size = max(1, (len(query) + num_workers - 1) // num_workers)
# Query-specific names are used from here on so this cell does not overwrite
# the document-side globals (`document_chunks`, `embeddings`) that the upload
# cell reads; re-running that cell after this one would otherwise upload the
# query vectors in place of the document vectors.
query_chunks = [query[i:i+chunk_size] for i in range(0, len(query), chunk_size)]

# Start the workers
embedding_workers = [EmbeddingWorker.remote() for _ in range(num_workers)]

# Perform embedding generation in parallel (one chunk per worker)
start_time = time.time()
embedding_tasks = [worker.embed_query.remote(chunk) for worker, chunk in zip(embedding_workers, query_chunks)]
query_embedding_chunks = ray.get(embedding_tasks)
end_time = time.time()

print("Time taken to generate embeddings with Ray Distributed Computing:", end_time - start_time, "seconds")

# Shutdown Ray
ray.shutdown()

# Flatten the per-worker result lists (ray.get keeps task order, so query
# order is preserved) and unwrap the one-element outer axis of each result so
# every entry is the bare embedding vector.
query_embeddings = [wrapped[0] for sublist in query_embedding_chunks for wrapped in sublist]

# Every query must have exactly one embedding.
assert len(query_embeddings) == len(query), "embedding count does not match query count"

2024-03-09 16:15:47,702	INFO worker.py:1715 -- Started a local Ray instance. View the dashboard at [1m[32m127.0.0.1:8265 [39m[22m


Time taken to generate embeddings with Ray Distributed Computing: 2.065190315246582 seconds


In [11]:
from typing import List
from qdrant_client import QdrantClient

# Connect to the Qdrant server on localhost:6333.
client = QdrantClient("localhost", port=6333)

collection_name = 'fastembed_collection'
for query_embedding in query_embeddings:
    # Convert to a plain list of Python floats, the same form the document
    # vectors were uploaded in (vector.tolist()); list(ndarray) would instead
    # produce a list of NumPy scalar objects.
    query_vector: List[float] = np.asarray(query_embedding).tolist()
    hits = client.search(
        collection_name=collection_name,
        query_vector=query_vector,
        limit=8
    )
    print(hits)

    # Iterate over what was actually returned: `limit` is only an upper
    # bound, and indexing a fixed range(8) raises IndexError when the
    # collection yields fewer hits.
    for hit in hits:
        # Point ids are positions in all_sentences (assigned at upload time).
        print(all_sentences[hit.id])
        print("-------------")
    print("=====================================")

[ScoredPoint(id=4153, version=194, score=0.8726248, payload={'color': 'red', 'rand_number': 3}, vector=None, shard_key=None), ScoredPoint(id=6577, version=308, score=0.8577993, payload={'color': 'red', 'rand_number': 7}, vector=None, shard_key=None), ScoredPoint(id=6974, version=326, score=0.8554286, payload={'color': 'red', 'rand_number': 4}, vector=None, shard_key=None), ScoredPoint(id=6507, version=305, score=0.8542422, payload={'color': 'red', 'rand_number': 7}, vector=None, shard_key=None), ScoredPoint(id=3234, version=152, score=0.8520849, payload={'color': 'red', 'rand_number': 4}, vector=None, shard_key=None), ScoredPoint(id=3249, version=152, score=0.85169685, payload={'color': 'red', 'rand_number': 9}, vector=None, shard_key=None), ScoredPoint(id=6921, version=326, score=0.85126793, payload={'color': 'red', 'rand_number': 1}, vector=None, shard_key=None), ScoredPoint(id=3172, version=149, score=0.85126793, payload={'color': 'red', 'rand_number': 2}, vector=None, shard_key=Non