In [1]:
# Install requirements
# drop the -q if you want to se the results
! pip install -qr requirements.txt

In [2]:
# Check the API KEYs
import os
from getpass import getpass

# Settings the notebook reads from `config`, mapped to the prompt shown when a
# value is not supplied by the .env file.
REQUIRED_SETTINGS = {
    'QDRANT_API_KEY': "What is your Qdrant API key?",
    'QDRANT_CLUSTER': "What is your Qdrant cluster?",
    'QDRANT_COLLECTION': "What is your Qdrant collection?",
}

config = {}

if os.path.exists('.env'):
    from dotenv import dotenv_values
    config = dict(dotenv_values(".env"))

# Ask only for what is still missing (get an API key in cloud.qdrant.io).
# This also covers a .env file that exists but lacks one of the settings,
# which would otherwise surface later as a KeyError.
for setting, prompt in REQUIRED_SETTINGS.items():
    if not config.get(setting):
        # getpass does not echo the typed value, so the API key does not end
        # up in the saved cell output.
        ask = getpass if setting == 'QDRANT_API_KEY' else input
        config[setting] = ask(prompt)

In [None]:
# Load the dataset
import pandas as pd

json_path = './dfs/data_cpp.json'

# Read the JSON file, replace missing fields with '' and cast every column to str
df = pd.read_json(json_path).fillna('').astype(str)

print("Shape:", df.shape)  # (rows, columns)
df.head()                  # Display the first five rows

Shape: (38595, 8)


Unnamed: 0,postId,postTypeId,title,body,tagName,creationDate,score,viewCount
0,25,1,How to use the C socket API in C++ on z/OS,I'm having issues getting the C sockets API to...,c++,20080801,176,16412
1,264,1,BerkeleyDB Concurrency,What's the optimal level of concurrency that ...,c++,20080801,38,2899
2,330,1,Should I use nested classes in this case?,I am working on a collection of classes used f...,c++,20080802,58,5019
3,601,1,Robust Random Number Generation,"I'm looking for a performant, reasonably robus...",c++,20080803,42,2145
4,609,1,Build for Windows NT 4.0 using Visual Studio 2...,An MFC application that I'm trying to migrate ...,c++,20080803,21,4505


In [5]:
from qdrant_client import QdrantClient
from sentence_transformers import SentenceTransformer

# Model used to turn the posts into vectors
model = SentenceTransformer('all-MiniLM-L6-v2')

# Client for the Qdrant cluster named in `config`
client = QdrantClient(
    url=config['QDRANT_CLUSTER'],
    https=True,
    api_key=config['QDRANT_API_KEY'],
)

# Text to embed: only the title and the body, separated by a space
documents = df['title'] + " " + df['body']

# Metadata stored next to each vector: one dict per row with these columns
payload_columns = [
    'title',
    'body',
    'postId',
    'postTypeId',
    'tagName',
    'creationDate',
    'score',
    'viewCount',
]
payload = df[payload_columns].to_dict(orient='records')

# Create embeddings (returned as a NumPy array because of convert_to_numpy)
texts = documents.tolist()
embeddings = model.encode(texts, convert_to_numpy=True)

In [6]:
# Create a collection
from qdrant_client.models import PointStruct, VectorParams, Distance

collection_name = config['QDRANT_COLLECTION']

try:
    client.create_collection(
        collection_name=collection_name,
        vectors_config=VectorParams(
            size=embeddings.shape[1],   # vector length = second axis of the embeddings
            distance=Distance.COSINE
        )
    )
except Exception as create_error:
    # Creation is expected to fail when the collection already exists, and
    # that case is fine to ignore. Any other failure (bad credentials, wrong
    # URL, network error, ...) must not be hidden, so confirm the collection
    # really is there before carrying on.
    try:
        client.get_collection(collection_name=collection_name)
    except Exception:
        # The collection cannot be fetched either: report the original error.
        raise create_error from None

In [7]:
# Add docs in Qdrant in batches
import math

def upsert_in_batch(client, collection_name, points, batch_size=1000):
    """Upload ``points`` to a collection in consecutive chunks.

    One ``client.upsert`` call is made per chunk, followed by a progress
    line printed to stdout.

    Args:
        client: Object exposing ``upsert(collection_name=..., points=...)``.
        collection_name: Name of the collection passed to every upsert call.
        points: Sequence of points; it must support ``len()`` and slicing.
        batch_size: Maximum number of points sent per upsert call.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        # 0 would divide by zero below, and a negative size would produce an
        # empty range, i.e. silently upload nothing.
        raise ValueError(
            f"batch_size must be a positive integer, got {batch_size!r}"
        )

    num_batches = math.ceil(len(points) / batch_size)

    for i in range(num_batches):
        start_idx = i * batch_size
        # Slicing clamps at the end of the sequence, so the last chunk is
        # simply shorter when len(points) is not a multiple of batch_size.
        chunk = points[start_idx:start_idx + batch_size]
        client.upsert(collection_name=collection_name, points=chunk)
        print(f'Adding batch {i} of {num_batches}...')

# One point per row: the row number is the id, the embedding is the vector
# and the record at the same position is the payload.
points = [
    PointStruct(
        id=row_id,
        vector=embeddings[row_id].tolist(),
        payload=payload[row_id],
    )
    for row_id in range(len(embeddings))
]

upsert_in_batch(
    client=client,
    collection_name=collection_name,
    points=points,
)

print("Data successfully ingested in batches")

Adding batch 0 of 39...
Adding batch 1 of 39...
Adding batch 2 of 39...
Adding batch 3 of 39...
Adding batch 4 of 39...
Adding batch 5 of 39...
Adding batch 6 of 39...
Adding batch 7 of 39...
Adding batch 8 of 39...
Adding batch 9 of 39...
Adding batch 10 of 39...
Adding batch 11 of 39...
Adding batch 12 of 39...
Adding batch 13 of 39...
Adding batch 14 of 39...
Adding batch 15 of 39...
Adding batch 16 of 39...
Adding batch 17 of 39...
Adding batch 18 of 39...
Adding batch 19 of 39...
Adding batch 20 of 39...
Adding batch 21 of 39...
Adding batch 22 of 39...
Adding batch 23 of 39...
Adding batch 24 of 39...
Adding batch 25 of 39...
Adding batch 26 of 39...
Adding batch 27 of 39...
Adding batch 28 of 39...
Adding batch 29 of 39...
Adding batch 30 of 39...
Adding batch 31 of 39...
Adding batch 32 of 39...
Adding batch 33 of 39...
Adding batch 34 of 39...
Adding batch 35 of 39...
Adding batch 36 of 39...
Adding batch 37 of 39...
Adding batch 38 of 39...
Data successfully ingested in batch