# Uploading Embeddings to Qdrant
In this notebook, we will upload the following embeddings to Qdrant:
- SBERT Book Metadata embeddings
- GMF User embeddings
- GMF Book embeddings

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import ast
import os
import dask.dataframe as dd

# Remove pandas' display limits so previews render every row and column.
for display_option in ('display.max_rows', 'display.max_columns'):
    pd.set_option(display_option, None)


## 1. Remap GMF Embeddings to original User and Item ids

In [2]:
# Load the two ID lookup tables (user map first, then item map).
user_id_map, item_id_map = (
    pd.read_csv(csv_path)
    for csv_path in ('data/user_id_map_reduced.csv', 'data/item_id_map_reduced.csv')
)

In [3]:
# Preview the first five rows of the user ID map.
user_id_map.head(n=5)

Unnamed: 0,original_userId,new_userId
0,cc8c48d41aea15e76fad1d9cb6efde7a,0
1,0c07498e94309381e4f79c9176f57462,1
2,37e4d1438f5918fd1400c12b49b80f61,2
3,8bab1ccd64657f45c5ce407d401ccdf6,3
4,0586f608c4a9dd85274f3dd03a3267f1,4


In [4]:
# Preview the first five rows of the item ID map.
item_id_map.head(n=5)

Unnamed: 0,original_itemId,new_itemId
0,22557272,0
1,11869272,1
2,370493,2
3,19543,3
4,3,4


In [5]:
# Open the GMF user embeddings as a Dask DataFrame and preview the first rows.
gmf_user_embeddings_df = dd.read_parquet(path="embeddings/gmf_user_embeddings.parquet")
gmf_user_embeddings_df.head(n=5)

Unnamed: 0,user_id,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31
0,0,-1.839949,-0.107026,-1.24743,1.23541,-2.745796,3.354668,1.215874,-0.187588,0.332815,1.62546,-1.902273,-0.46424,0.143594,0.419341,-1.652989,0.224399,1.73905,0.911732,-0.702986,-1.368686,-0.918555,-2.103576,-0.8704,-3.136705,0.554513,0.136379,1.170567,0.576086,-0.698404,1.002196,-1.492236,-0.387739
1,1,-1.699326,1.03807,-1.368538,1.604467,0.749374,0.457662,-2.472639,-2.245041,-0.428177,-0.663786,-1.366295,-0.751835,-0.991412,-1.874524,-1.275608,2.14652,0.150153,0.786096,0.701022,-0.126197,-1.296423,-2.885526,-0.077704,-3.164231,1.667563,-0.501934,-1.987554,0.989491,-0.06603,1.673684,-0.285737,-1.12189
2,2,-1.054894,-1.410094,-0.74357,-0.271791,-0.689568,-2.036283,1.566226,-3.174807,-0.035449,-0.23126,-0.721715,-0.452032,1.611229,1.231235,-0.733774,-0.648985,-0.071161,-1.841602,-0.615026,0.001302,-1.424239,-0.647386,0.144783,-2.833383,0.730584,0.678978,-0.217898,2.083225,0.484727,2.656081,0.280037,1.383757
3,3,0.231832,-2.166265,-1.151293,1.492978,-1.054405,1.843999,-1.826434,-2.092778,0.958185,-0.976558,-3.363012,0.994735,2.212548,0.243322,-2.262258,-0.014679,0.259985,-0.202961,0.922577,-3.733649,-0.847572,-0.460779,0.25178,-1.823008,2.216891,0.558714,1.727994,1.456882,0.315823,1.906732,-3.104737,-2.717363
4,4,-2.12236,0.273351,-0.320856,1.361271,-1.672432,2.319666,1.383571,-1.32211,-0.568443,1.173762,0.438749,-1.613617,-0.440702,0.093146,-2.409795,1.29218,0.472124,-0.362853,1.468371,-2.281219,-1.581069,-1.401221,-0.27782,0.216558,-1.59283,0.143684,-0.245232,1.359082,-1.920618,0.859559,0.820305,-1.195001


In [6]:
# Names of the per-dimension columns ('0' through '31') that make up one embedding.
embedding_cols = list(map(str, range(32)))


def combine_embeddings(row):
    """Collapse the embedding columns of a single row into one Python list."""
    return row[embedding_cols].tolist()


# Build the list-valued column row by row; `meta` tells Dask the name and
# dtype of the resulting series.
user_embedding_lists = gmf_user_embeddings_df.apply(
    combine_embeddings, axis=1, meta=('embedding', 'object')
)
gmf_user_embeddings_df['embedding'] = user_embedding_lists

# Keep only the identifier and the combined vector.
gmf_user_embeddings_final = gmf_user_embeddings_df[['user_id', 'embedding']]

# Preview the transformed frame.
gmf_user_embeddings_final.head(n=5)

Unnamed: 0,user_id,embedding
0,0,"[-1.8399492502212524, -0.10702560096979141, -1..."
1,1,"[-1.6993255615234375, 1.0380702018737793, -1.3..."
2,2,"[-1.054893970489502, -1.4100935459136963, -0.7..."
3,3,"[0.2318318486213684, -2.166264772415161, -1.15..."
4,4,"[-2.12235951423645, 0.27335119247436523, -0.32..."


In [7]:
# Merge user embeddings with user ID map
gmf_user_embeddings_final = gmf_user_embeddings_final.merge(
    user_id_map,
    left_on='user_id',
    right_on='new_userId',
    how='inner'
)

# Select and rename columns
gmf_user_embeddings_final = gmf_user_embeddings_final[['original_userId', 'embedding']]
gmf_user_embeddings_final = gmf_user_embeddings_final.rename(columns={'original_userId': 'user_id'})

# Display the head of the remapped user embeddings DataFrame
gmf_user_embeddings_final.head()

Unnamed: 0,user_id,embedding
0,cc8c48d41aea15e76fad1d9cb6efde7a,"[-1.8399492502212524, -0.10702560096979141, -1..."
1,0c07498e94309381e4f79c9176f57462,"[-1.6993255615234375, 1.0380702018737793, -1.3..."
2,37e4d1438f5918fd1400c12b49b80f61,"[-1.054893970489502, -1.4100935459136963, -0.7..."
3,8bab1ccd64657f45c5ce407d401ccdf6,"[0.2318318486213684, -2.166264772415161, -1.15..."
4,0586f608c4a9dd85274f3dd03a3267f1,"[-2.12235951423645, 0.27335119247436523, -0.32..."


In [8]:
# Open the GMF book embeddings as a Dask DataFrame and preview the first rows.
gmf_book_embeddings_df = dd.read_parquet(path="embeddings/gmf_book_embeddings.parquet")
gmf_book_embeddings_df.head(n=5)

Unnamed: 0,item_id,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31
0,0,1.187312,1.296599,0.489951,0.942673,-0.745626,0.092537,-0.240039,0.408664,0.455195,-0.135315,0.37836,0.176712,-0.542278,-1.139771,0.742209,0.040432,-0.375338,-0.578799,0.56505,-0.554557,-0.248407,-0.422547,0.44317,0.348404,-0.310314,-0.800455,-0.931527,-0.396947,0.016292,-0.955298,0.307498,0.962002
1,1,-1.497686,1.300688,-0.859403,0.655968,-1.736125,0.714726,0.900011,-0.187634,0.062437,-0.420363,1.267667,-0.321999,-0.100276,-1.159548,0.097108,0.535662,-0.590675,0.505428,0.199959,-0.795945,-0.534631,-0.782391,-0.470678,-0.191889,0.359508,-0.270147,0.372783,-0.324263,0.677709,-1.034807,0.02958,0.970921
2,2,-0.52129,1.178262,0.38003,1.476034,-0.086451,0.832165,-0.508152,1.122721,-0.096842,-0.258327,0.393842,0.73254,-0.680057,0.396424,0.491598,-0.861893,0.240101,0.207462,-0.073415,-1.162048,-1.229987,0.494536,0.202322,0.309359,-1.331804,0.439654,0.363794,0.22645,-0.481517,1.141233,0.148095,-0.16804
3,3,-0.682116,1.271976,0.968041,1.016192,-0.948475,1.073638,-0.468036,1.261153,-0.059184,0.40729,0.141075,0.510334,-0.732128,0.589459,0.68546,-0.896256,0.329483,-0.246165,-0.09991,-0.880761,-1.218686,0.514843,-0.066397,1.219768,-1.382662,-0.072943,0.566564,0.10141,-0.539726,1.317127,0.8247,-0.313159
4,4,0.127419,1.261809,0.346063,0.622713,0.978303,0.48373,0.169683,1.327286,0.062631,-0.812786,0.046545,0.719478,-0.42076,0.244159,0.536063,0.621266,0.38158,-0.625833,0.419863,-0.622441,0.144758,0.263645,-0.336533,-0.448636,-1.099236,-0.355271,0.156408,-0.661411,-0.724047,0.865045,0.776408,0.972261


In [9]:
# The book embeddings use the same column layout ('0' to '31') as the user
# embeddings, so reuse `embedding_cols` and `combine_embeddings` from the
# user-embedding cell above instead of defining identical copies here.

# Apply the function row-wise to create the 'embedding' column
# meta specifies the output column name and data type for Dask
gmf_book_embeddings_df['embedding'] = gmf_book_embeddings_df.apply(
    combine_embeddings,
    axis=1,
    meta=('embedding', 'object')
)

# Select the item_id and the new embedding column, dropping the old ones
gmf_book_embeddings_final = gmf_book_embeddings_df[['item_id', 'embedding']]

# Display the head of the transformed DataFrame
gmf_book_embeddings_final.head()

Unnamed: 0,item_id,embedding
0,0,"[1.1873117685317993, 1.2965991497039795, 0.489..."
1,1,"[-1.4976861476898193, 1.3006882667541504, -0.8..."
2,2,"[-0.5212898850440979, 1.1782617568969727, 0.38..."
3,3,"[-0.6821157932281494, 1.2719764709472656, 0.96..."
4,4,"[0.12741948664188385, 1.2618093490600586, 0.34..."


In [10]:
# Merge book embeddings with item ID map
gmf_book_embeddings_final = gmf_book_embeddings_final.merge(
    item_id_map,
    left_on='item_id',
    right_on='new_itemId',
    how='inner'
)

# Select and rename columns
gmf_book_embeddings_final = gmf_book_embeddings_final[['original_itemId', 'embedding']]
gmf_book_embeddings_final = gmf_book_embeddings_final.rename(columns={'original_itemId': 'item_id'})

# Display the head of the remapped book embeddings DataFrame
gmf_book_embeddings_final.head()

Unnamed: 0,item_id,embedding
0,22557272,"[1.1873117685317993, 1.2965991497039795, 0.489..."
1,11869272,"[-1.4976861476898193, 1.3006882667541504, -0.8..."
2,370493,"[-0.5212898850440979, 1.1782617568969727, 0.38..."
3,19543,"[-0.6821157932281494, 1.2719764709472656, 0.96..."
4,3,"[0.12741948664188385, 1.2618093490600586, 0.34..."


In [11]:
# Open the SBERT book-metadata embeddings as a Dask DataFrame and preview the first rows.
sbert_embeddings_df = dd.read_parquet(path="embeddings/sbert_embeddings.parquet")
sbert_embeddings_df.head(n=5)

Unnamed: 0,book_id,text,embedding
0,6066819,Title: Best Friends Forever | Genres: coming-o...,"[-0.043174773, 0.013365388, 0.03437024, 0.0240..."
1,89375,Title: 90 Minutes in Heaven: A True Story of D...,"[0.051977597, 0.08296822, -0.041392915, -0.043..."
2,11731782,"Title: Collide (Collide, #1) | Genres: contemp...","[-0.02058969, -0.10596351, 0.12436018, 0.03277..."
3,54270,"Title: Mein Kampf | Genres: art, biography, hi...","[0.038011063, 0.030201998, -0.11067172, -0.051..."
4,38568,Title: A Quick Bite (Argeneau #1) | Genres: co...,"[-0.047273964, -0.056135908, -0.0028051443, 0...."


## 2. Setting Up Qdrant

In [12]:
from qdrant_client import QdrantClient

# Connect to Qdrant. The endpoint can be overridden with the QDRANT_URL
# environment variable; without it the local default is used, as before.
client = QdrantClient(url=os.environ.get("QDRANT_URL", "http://localhost:6333"))

In [13]:
from qdrant_client.models import Distance, VectorParams

# One entry per collection: vector size and the distance used for search.
collection_configs = {
    "sbert_embeddings": VectorParams(size=384, distance=Distance.COSINE),
    "gmf_user_embeddings": VectorParams(size=32, distance=Distance.DOT),
    "gmf_book_embeddings": VectorParams(size=32, distance=Distance.DOT),
}

# Create only the collections that are missing, so re-running this cell
# against a server that already has them does not fail.
# NOTE(review): `collection_exists` requires a reasonably recent
# qdrant-client — confirm the installed version provides it.
for name, vectors_config in collection_configs.items():
    if client.collection_exists(name):
        print(f"Collection '{name}' already exists; leaving it unchanged.")
        continue
    client.create_collection(collection_name=name, vectors_config=vectors_config)

True

In [14]:
# Show the column layout of each frame that is about to be uploaded.
frames_by_label = (
    ("gmf_book", gmf_book_embeddings_final),
    ("gmf_user", gmf_user_embeddings_final),
    ("sbert_embeddings", sbert_embeddings_df),
)
for label, frame in frames_by_label:
    print(label, frame.columns)

gmf_book Index(['item_id', 'embedding'], dtype='object')
gmf_user Index(['user_id', 'embedding'], dtype='object')
sbert_embeddings Index(['book_id', 'text', 'embedding'], dtype='object')


In [15]:
from qdrant_client.models import PointStruct
import uuid

batch_size = 500 # Define batch size for uploads


def _as_vector(embedding):
    """Return `embedding` as a plain Python list.

    Values exposing `.tolist()` (e.g. NumPy arrays) are converted through it;
    anything else is copied with `list()`.
    """
    if hasattr(embedding, "tolist"):
        return embedding.tolist()
    return list(embedding)


def _string_point_id(raw_id):
    """Turn a string identifier into a UUID string usable as a Qdrant point ID.

    Qdrant point IDs must be unsigned integers or UUIDs. A string that already
    parses as a UUID (such as a 32-character hex hash) is returned in canonical
    hyphenated form, which denotes the same UUID. Any other string is mapped to
    a deterministic UUIDv5, so uploading the same ID again targets the same point.
    """
    try:
        return str(uuid.UUID(raw_id))
    except ValueError:
        return str(uuid.uuid5(uuid.NAMESPACE_OID, raw_id))


def _book_ids(raw_id, source_label):
    """Return `(point_id, payload_id)` for a book identifier.

    Integer-convertible IDs are used directly for both values. Otherwise the
    original string is kept as the payload ID and a UUID derived from it is
    used as the point ID, because an arbitrary string is not a valid point ID.
    """
    try:
        point_id = int(raw_id)
    except ValueError:
        print(f"Warning: Could not convert ID '{raw_id}' to int for {source_label}. Using a UUID derived from the string as the point ID.")
        payload_id = str(raw_id)
        return _string_point_id(payload_id), payload_id
    return point_id, point_id


def _user_point(record):
    """Build the point for one GMF user record (a dict with 'user_id' and 'embedding')."""
    user_id_val = str(record['user_id'])  # user IDs are hash strings, not integers
    return PointStruct(
        id=_string_point_id(user_id_val),
        vector=_as_vector(record['embedding']),
        payload={"user_id": user_id_val},  # keep the original ID string in the payload
    )


def _book_point_builder(id_column, source_label, include_text=False):
    """Return a function that builds the point for one book record.

    id_column: key of the record holding the book identifier.
    source_label: text used in the warning when an ID is not an integer.
    include_text: when True, also store the record's 'text' (or "" if the
        record has no such key) in the payload.
    """
    def make_point(record):
        point_id, payload_id = _book_ids(record[id_column], source_label)
        payload = {id_column: payload_id}
        if include_text:
            payload["text"] = record.get("text", "")
        return PointStruct(
            id=point_id,
            vector=_as_vector(record['embedding']),
            payload=payload,
        )
    return make_point


def _build_points(computed_df, make_point):
    """Build one point per row of an in-memory (pandas) DataFrame."""
    return [make_point(record) for record in computed_df.to_dict("records")]


def _upsert_in_batches(collection_name, points, noun):
    """Upsert `points` into `collection_name` in chunks of `batch_size`, reporting progress."""
    print(f"Upserting {len(points)} {noun} points in batches of {batch_size}...")
    for start in range(0, len(points), batch_size):
        client.upsert(
            collection_name=collection_name,
            points=points[start:start + batch_size],
            wait=True,
        )
    print(f"Uploaded {len(points)} {noun} points.")
    print("-" * 30)


# --- Upload GMF User Embeddings ---
print("Uploading GMF User Embeddings...")
# Compute the Dask DataFrame
computed_user_df = gmf_user_embeddings_final.compute()
user_points_to_upload = _build_points(computed_user_df, _user_point)
_upsert_in_batches("gmf_user_embeddings", user_points_to_upload, "GMF user")


# --- Upload GMF Book Embeddings ---
print("Uploading GMF Book Embeddings...")
# Compute the Dask DataFrame
computed_book_df = gmf_book_embeddings_final.compute()
correct_book_id_column_gmf = 'item_id' # Based on previous output
book_points_to_upload = _build_points(
    computed_book_df,
    _book_point_builder(correct_book_id_column_gmf, "GMF book embedding"),
)
_upsert_in_batches("gmf_book_embeddings", book_points_to_upload, "GMF book")


# --- Upload SBERT Book Embeddings ---
print("Uploading SBERT Book Embeddings...")
# Compute the Dask DataFrame
computed_sbert_df = sbert_embeddings_df.compute()
correct_book_id_column_sbert = 'book_id' # Based on previous output
sbert_points_to_upload = _build_points(
    computed_sbert_df,
    _book_point_builder(correct_book_id_column_sbert, "SBERT embedding", include_text=True),
)
_upsert_in_batches("sbert_embeddings", sbert_points_to_upload, "SBERT book")

print("All uploads complete.")

Uploading GMF User Embeddings...
Upserting 205242 GMF user points in batches of 500...
Uploaded 205242 GMF user points.
------------------------------
Uploading GMF Book Embeddings...
Upserting 17663 GMF book points in batches of 500...
Uploaded 17663 GMF book points.
------------------------------
Uploading SBERT Book Embeddings...
Upserting 17235 SBERT book points in batches of 500...
Uploaded 17235 SBERT book points.
------------------------------
All uploads complete.
