In [None]:
# Install requirements
# drop the -q if you want to se the results
! pip install -qr requirements.txt

In [34]:
# Check the API KEYs
import os
from getpass import getpass

config = {}

# Prefer the values stored in a local .env file when one is present
if os.path.exists('.env'):
    from dotenv import dotenv_values
    config = dict(dotenv_values(".env"))

# Ask for whatever is still missing or empty, so an incomplete .env does not
# surface as a KeyError in a later cell.
# The Qdrant API key (get one in cloud.qdrant.io) is read with getpass so the
# secret is not echoed into the saved cell output.
if not config.get('QDRANT_API_KEY'):
    config['QDRANT_API_KEY'] = getpass("What is your Qdrant API key?")
if not config.get('QDRANT_CLUSTER'):
    config['QDRANT_CLUSTER'] = input("What is your Qdrant cluster?")
if not config.get('QDRANT_COLLECTION'):
    config['QDRANT_COLLECTION'] = input("What is your Qdrant collection?")

In [36]:
# Load the dataset
import pandas

json_path = './dfs/data_cpp.json'

# Read the JSON file, replace missing values with empty strings,
# then cast every column to str (only strings are allowed downstream)
df = (
    pandas.read_json(json_path)
    .fillna('')
    .astype(str)
)

print("Shape:", df.shape)  # (rows, columns) of the loaded frame
df.head()                  # Preview the first five rows

Shape: (38595, 8)


Unnamed: 0,postId,postTypeId,title,body,tagName,creationDate,score,viewCount
0,25,1,How to use the C socket API in C++ on z/OS,I'm having issues getting the C sockets API to...,c++,20080801,176,16412
1,264,1,BerkeleyDB Concurrency,What's the optimal level of concurrency that ...,c++,20080801,38,2899
2,330,1,Should I use nested classes in this case?,I am working on a collection of classes used f...,c++,20080802,58,5019
3,601,1,Robust Random Number Generation,"I'm looking for a performant, reasonably robus...",c++,20080803,42,2145
4,609,1,Build for Windows NT 4.0 using Visual Studio 2...,An MFC application that I'm trying to migrate ...,c++,20080803,21,4505


In [None]:
# Ingest data on Qdrant
from qdrant_client import QdrantClient

# Connect to the cluster using the settings collected in `config`
client = QdrantClient(
    url=config['QDRANT_CLUSTER'],
    https=True,
    api_key=config['QDRANT_API_KEY']
)

# Prepare documents and payload
documents = df['title'] + " " + df['body']  # Index only title and body
payload = df[[                              # Use all you have as metadata
    'postId', 
    'postTypeId', 
    'tagName', 
    'creationDate', 
    'score', 
    'viewCount'
]].to_dict(orient='records')

# Create a collection
collection_name = config['QDRANT_COLLECTION']

# Create the collection only when it is missing, instead of attempting the
# creation and swallowing every exception: connection or authentication
# problems now show up here rather than being hidden.
# The vector configuration is taken from the client's fastembed settings so it
# matches the embeddings that client.add() stores in this collection.
if not client.collection_exists(collection_name=collection_name):
    client.create_collection(
        collection_name=collection_name,
        vectors_config=client.get_fastembed_vector_params()
    )

In [42]:
# If everything looks good, ingest it
texts = documents.tolist()  # Plain list of strings (the documents to index)

client.add(
    collection_name=collection_name,
    documents=texts,
    metadata=payload,       # One metadata record per document
)

print("Data successfully ingested")

100%|██████████| 77.7M/77.7M [00:03<00:00, 24.4MiB/s]


Data successfully ingested
