# Product Quantization

In [10]:
import pandas as pd

In [11]:
# Load the prepared job postings; later cells read its 'Data' and 'Vector' columns
df = pd.read_json("job_postings_prepped.json")

In [12]:
# Data Exploration: show the first rows (the info()/shape checks are left disabled)
# print(df.info())
print(df.head())
# print(df.shape)

# Data Analysis: length of the first row's 'Vector' entry
print("Dimension of the Vector: ", len(df['Vector'][0]))


                                                Data  \
0  Licensed Insurance Agent While many industries...   
1  Sales Manager Are dynamic creative marketing p...   
2  Model Risk Auditor Join Us Model Risk Auditor ...   
3  Business Manager Business ManagerFirst Baptist...   
4  NY Studio Assistant YOU COULD BE ONE OF THE MA...   

                                              Vector  
0  [2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...  
1  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...  
2  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...  
3  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...  
4  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...  
Dimension of the Vector:  105248


## Initial Memory Size

In [13]:
vector_dimension = len(df['Vector'][0])
# deep=True inspects the Python objects stored in object columns, so measure
# once and reuse the result instead of scanning the dataframe twice.
memory_bytes = df.memory_usage(deep=True)
vector_size = memory_bytes['Vector']  # bytes used by the 'Vector' column
print("Vector Dimension: ", vector_dimension)
# Despite the label, the table below lists the index and every column in MB
print("Vector Storage Size (MB):")
print( memory_bytes / 1024**2 )

Vector Dimension:  105248
Vector Storage Size (MB):
Index         0.625137
Data         86.682539
Vector    13651.414261
dtype: float64


# Product Quantization 
Product Quantization (PQ) is a method to reduce the memory usage of a set of high-dimensional vectors.
The idea is to split each vector into M sub-vectors and to learn a separate sub-codebook of Ks centroids for each sub-space.
Each sub-vector is then quantized to the index of its nearest centroid in the corresponding sub-codebook.
The M centroid indices are then concatenated to form the final PQ code of the vector.


In [14]:
# Import Libraries
import nanopq 
import numpy as np

## Preprocessing the data
The required data is preprocessed once and stored in a file. Later cells load that file instead of repeating the preprocessing, which avoids unnecessary preprocessing time.

In [17]:
# Inspect how a single posting's vector is stored in the dataframe
first_vector = df['Vector'][0]
print("Vector Type: ", type(first_vector))

# Stack the per-row lists into one float32 matrix
vector_rows = df['Vector'].values.tolist()
X = np.array(vector_rows, dtype=np.float32)

# Persist the matrix so it can be reloaded with np.load instead of being rebuilt
np.save("PQ/X.npy", X)

print("X Shape: ", X.shape)
print("X Vector Type: ", type(X))

Vector Type:  <class 'list'>


X Shape:  (15885, 105248)
X Vector Type:  <class 'numpy.ndarray'>


In [22]:
import pickle

def train_fit_pq(M, Ks, X):
    """Train a product quantizer on ``X`` and encode ``X`` with it.

    :param M: number of sub-spaces handed to ``nanopq.PQ``; the vector
        dimension must be divisible by M
    :param Ks: number of centroids per sub-space handed to ``nanopq.PQ``
    :param X: vectors used both to fit the encoder and as the input to encode
    :return: tuple ``(X_code, pq)`` - the encoded vectors and the fitted encoder
    """
    quantizer = nanopq.PQ(M=M, Ks=Ks)

    # Learn the codebooks from X, then compress those same vectors
    quantizer.fit(vecs=X)
    encoded = quantizer.encode(X)
    return encoded, quantizer

def _save_pq(name, code, pq):
    """Persist one trained PQ configuration under ``PQ/``.

    :param name: configuration name such as "pq_8_128"; used as the file stem
    :param code: encoded vectors, written to ``PQ/<name>_code.npy``
    :param pq: fitted encoder, pickled to ``PQ/<name>.pkl``
    """
    np.save("PQ/{}_code.npy".format(name), code)
    # The context manager guarantees the pickle file is flushed and closed
    with open("PQ/{}.pkl".format(name), "wb") as pkl_file:
        pickle.dump(pq, pkl_file)


# Train every (M, Ks) combination and save each one as soon as it is ready, so
# an interrupted run keeps the configurations that already finished.
trained_pq = {}
for n_subspaces in (8, 16, 32):
    print("Encoding the vectors with {} sub-spaces....".format(n_subspaces))
    for n_centroids in (8, 16, 32, 64, 128):
        config_name = "pq_{}_{}".format(n_subspaces, n_centroids)
        trained_pq[config_name] = train_fit_pq(M=n_subspaces, Ks=n_centroids, X=X)
        print("Saving the PQ objects for {}....".format(config_name))
        _save_pq(config_name, *trained_pq[config_name])

# Bind the individual names that the rest of the notebook refers to
# ===============================================================
pq_8_8_code, pq_8_8 = trained_pq["pq_8_8"]
pq_8_16_code, pq_8_16 = trained_pq["pq_8_16"]
pq_8_32_code, pq_8_32 = trained_pq["pq_8_32"]
pq_8_64_code, pq_8_64 = trained_pq["pq_8_64"]
pq_8_128_code, pq_8_128 = trained_pq["pq_8_128"]
# ===============================================================
pq_16_8_code, pq_16_8 = trained_pq["pq_16_8"]
pq_16_16_code, pq_16_16 = trained_pq["pq_16_16"]
pq_16_32_code, pq_16_32 = trained_pq["pq_16_32"]
pq_16_64_code, pq_16_64 = trained_pq["pq_16_64"]
pq_16_128_code, pq_16_128 = trained_pq["pq_16_128"]
# ===============================================================
pq_32_8_code, pq_32_8 = trained_pq["pq_32_8"]
pq_32_16_code, pq_32_16 = trained_pq["pq_32_16"]
pq_32_32_code, pq_32_32 = trained_pq["pq_32_32"]
pq_32_64_code, pq_32_64 = trained_pq["pq_32_64"]
pq_32_128_code, pq_32_128 = trained_pq["pq_32_128"]

# ===============================================================

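# NOTE(review): the M=64 and M=128 runs below are presumably disabled because the
# vector dimension (105248) is not divisible by them (105248 % 64 == 32,
# 105248 % 128 == 32), and PQ needs the dimension to be divisible by M - confirm.
# print("Encoding the vectors with 64 sub-spaces....")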
# pq_64_8_code, pq_64_8 = train_fit_pq(M=64, Ks=8, X=X)
# pq_64_16_code, pq_64_16 = train_fit_pq(M=64, Ks=16, X=X)
# pq_64_32_code, pq_64_32 = train_fit_pq(M=64, Ks=32, X=X)
# pq_64_64_code, pq_64_64 = train_fit_pq(M=64, Ks=64, X=X)
# pq_64_128_code, pq_64_128 = train_fit_pq(M=64, Ks=128, X=X)

# np.save("PQ/pq_64_8_code.npy", pq_64_8_code)
# np.save("PQ/pq_64_16_code.npy", pq_64_16_code)
# np.save("PQ/pq_64_32_code.npy", pq_64_32_code)
# np.save("PQ/pq_64_64_code.npy", pq_64_64_code)
# np.save("PQ/pq_64_128_code.npy", pq_64_128_code)

# pickle.dump(pq_64_8, open("PQ/pq_64_8.pkl", "wb"))
# pickle.dump(pq_64_16, open("PQ/pq_64_16.pkl", "wb"))
# pickle.dump(pq_64_32, open("PQ/pq_64_32.pkl", "wb"))
# pickle.dump(pq_64_64, open("PQ/pq_64_64.pkl", "wb"))
# pickle.dump(pq_64_128, open("PQ/pq_64_128.pkl", "wb"))

# ===============================================================

# print("Encoding the vectors with 128 sub-spaces....")
# pq_128_8_code, pq_128_8 = train_fit_pq(M=128, Ks=8, X=X)
# pq_128_16_code, pq_128_16 = train_fit_pq(M=128, Ks=16, X=X)
# pq_128_32_code, pq_128_32 = train_fit_pq(M=128, Ks=32, X=X)
# pq_128_64_code, pq_128_64 = train_fit_pq(M=128, Ks=64, X=X)
# pq_128_128_code, pq_128_128 = train_fit_pq(M=128, Ks=128, X=X)

# np.save("PQ/pq_128_8_code.npy", pq_128_8_code)
# np.save("PQ/pq_128_16_code.npy", pq_128_16_code)
# np.save("PQ/pq_128_32_code.npy", pq_128_32_code)
# np.save("PQ/pq_128_64_code.npy", pq_128_64_code)
# np.save("PQ/pq_128_128_code.npy", pq_128_128_code)

# pickle.dump(pq_128_8, open("PQ/pq_128_8.pkl", "wb"))
# pickle.dump(pq_128_16, open("PQ/pq_128_16.pkl", "wb"))
# pickle.dump(pq_128_32, open("PQ/pq_128_32.pkl", "wb"))
# pickle.dump(pq_128_64, open("PQ/pq_128_64.pkl", "wb"))
# pickle.dump(pq_128_128, open("PQ/pq_128_128.pkl", "wb"))



Encoding the vectors with 8 sub-spaces....
M: 8, Ks: 8, metric : <class 'numpy.uint8'>, code_dtype: l2
iter: 20, seed: 123
Training the subspace: 0 / 8


KeyboardInterrupt: 

In [None]:
# Create folder for results in PQ Search Results directory
import os
import pathlib

# PQ configurations to prepare: sub-space counts (M) and centroid counts (K)
Ms = [8, 16, 32]
Ks = [8, 16, 32, 64, 128]

# One results folder per (M, K) pair; parents/exist_ok make the cell re-runnable
for M, K in [(m, k) for m in Ms for k in Ks]:
    dirname = "PQ/PQ Search Results/pq_{}_{}_results".format(M, K)
    results_dir = pathlib.Path(dirname)
    results_dir.mkdir(parents=True, exist_ok=True)


# Preloading
The data is preloaded from the file to prevent unnecessary preprocessing time when running the code

In [1]:
import numpy as np
import pandas as pd
import pickle
from sklearn.feature_extraction.text import CountVectorizer

def _load_pickle(path):
    """Unpickle the object stored at ``path`` and close the file afterwards."""
    # NOTE(security): pickle.load can execute arbitrary code; only load files
    # produced by this project.
    with open(path, "rb") as pkl_file:
        return pickle.load(pkl_file)


def _load_pq(name):
    """Load one PQ configuration saved under ``PQ/``.

    :param name: configuration name such as "pq_8_128"
    :return: tuple ``(pq, codes)`` read from ``PQ/<name>.pkl`` and ``PQ/<name>_code.npy``
    """
    return _load_pickle("PQ/{}.pkl".format(name)), np.load("PQ/{}_code.npy".format(name))


# Load data
print("Loading df data....")
df = pd.read_json("job_postings_prepped.json")

# Load the already-fitted Count Vectorizer used to vectorize queries at runtime
print("Loading Count Vectorizer....")
vectorizer = _load_pickle("vectorizer.pkl")

# Load the X vector from the file
print("Loading X vector....")
X = np.load("PQ/X.npy")

# Load the PQ objects and their codes from the files
print("Loading pq_8 objects....")
pq_8_8, pq_8_8_code = _load_pq("pq_8_8")
pq_8_16, pq_8_16_code = _load_pq("pq_8_16")
pq_8_32, pq_8_32_code = _load_pq("pq_8_32")
pq_8_64, pq_8_64_code = _load_pq("pq_8_64")
pq_8_128, pq_8_128_code = _load_pq("pq_8_128")

# ===============================================================
print("Loading pq_16 objects....")
pq_16_8, pq_16_8_code = _load_pq("pq_16_8")
pq_16_16, pq_16_16_code = _load_pq("pq_16_16")
pq_16_32, pq_16_32_code = _load_pq("pq_16_32")
pq_16_64, pq_16_64_code = _load_pq("pq_16_64")
pq_16_128, pq_16_128_code = _load_pq("pq_16_128")

# ===============================================================
print("Loading pq_32 objects....")
pq_32_8, pq_32_8_code = _load_pq("pq_32_8")
pq_32_16, pq_32_16_code = _load_pq("pq_32_16")
pq_32_32, pq_32_32_code = _load_pq("pq_32_32")
pq_32_64, pq_32_64_code = _load_pq("pq_32_64")
pq_32_128, pq_32_128_code = _load_pq("pq_32_128")

# ===============================================================

Loading df data....
Loading Count Vectorizer....
Loading X vector....


https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


Loading pq_8 objects....
Loading pq_16 objects....
Loading pq_32 objects....


## Space Analysis

In [2]:

def get_space_analysis(orig_vector, pq_code_vector, name):
    """Print how much memory the PQ codes save compared to the original vectors.

    :param orig_vector: array holding the original vectors (uses ``nbytes`` and row 0)
    :param pq_code_vector: array holding the PQ codes (uses ``nbytes`` and row 0)
    :param name: label of the PQ configuration, printed as-is
    :return: None; the report is written to stdout
    """
    # Sizes are reported in MB (bytes / 1024**2)
    original_mb = orig_vector.nbytes / 1024**2
    compressed_mb = pq_code_vector.nbytes / 1024**2
    saved_percent = (1 - compressed_mb / original_mb) * 100

    print("PQ Code Name: ", name)
    print("Vector Dimension: ", len(orig_vector[0]))
    print("PQ Code Vector Dimension: ", len(pq_code_vector[0]))
    print("Original Vector Size (MB): ", original_mb)
    print("PQ Code Vector Size (MB): ", compressed_mb)
    print("Compression Ratio: ", original_mb / compressed_mb, "x")
    print("Space Reduction: ", saved_percent, "%\n")
    
# Report the footprint of the Ks=128 codes for each number of sub-spaces
for pq_name, pq_codes in (
    ("pq_8_128", pq_8_128_code),
    ("pq_16_128", pq_16_128_code),
    ("pq_32_128", pq_32_128_code),
):
    get_space_analysis(X, pq_codes, pq_name)


PQ Code Name:  pq_8_128
Vector Dimension:  105248
PQ Code Vector Dimension:  8
Original Vector Size (MB):  6377.6568603515625
PQ Code Vector Size (MB):  0.12119293212890625
Compression Ratio:  52624.0 x
Space Reduction:  99.9980997263606 %

PQ Code Name:  pq_16_128
Vector Dimension:  105248
PQ Code Vector Dimension:  16
Original Vector Size (MB):  6377.6568603515625
PQ Code Vector Size (MB):  0.2423858642578125
Compression Ratio:  26312.0 x
Space Reduction:  99.99619945272119 %

PQ Code Name:  pq_32_128
Vector Dimension:  105248
PQ Code Vector Dimension:  32
Original Vector Size (MB):  6377.6568603515625
PQ Code Vector Size (MB):  0.484771728515625
Compression Ratio:  13156.0 x
Space Reduction:  99.99239890544239 %



# Similarity Search

### Using Asymmetric Distance

Runtime function to compute similarity search using asymmetric distance.

In [37]:
import time

def compute_similarity(query_text, pq_code, pq):
    """
    Compute the asymmetric distances between a query text and the PQ codes
    :param query_text: the query text; vectorized with the notebook-level ``vectorizer``
    :param pq_code: the PQ codes to compare the query against
    :param pq: the pq encoder that builds the query's distance table
    :return: tuple (distance, end) - the result of ``adist`` and the
             ``time.time()`` stamp taken right after it was computed
    """
    print("Computing similarity for: {}...".format(query_text))

    # Vectorize the single query and flatten it into a 1-D float32 vector
    query_matrix = vectorizer.transform([query_text]).toarray()
    query_vector = query_matrix.reshape(-1).astype(np.float32)

    # Build the distance table for the query, then look up every code in it
    distances = pq.dtable(query_vector).adist(pq_code)
    finished_at = time.time()
    return distances, finished_at

def get_query_res_ranking(df, adist_result):
    """Rank the rows of ``df`` by their distance to the query.

    :param df: dataframe of postings; it is copied, not modified
    :param adist_result: one distance per row of ``df``
    :return: tuple (results_df, end) - a copy of ``df`` with a 'Distance'
             column sorted ascending, and the ``time.time()`` stamp taken
             after sorting and printing
    """
    print("Getting Query Results Ranking....")

    # assign() works on a copy, so the caller's dataframe gets no new column
    ranked = df.assign(Distance=adist_result).sort_values(by='Distance')
    print("Top 5 Results:\n", ranked.head())

    finished_at = time.time()
    return ranked, finished_at

def save_to_csv_similarity_results(df, query, pq_setting):
    """Write ranked search results (without the 'Vector' column) to a CSV file.

    The file is written to
    ``PQ/PQ Search Results/<pq_setting>_results/<query>.csv`` where ``<query>``
    is the query stripped of surrounding whitespace with the remaining spaces
    replaced by underscores. The target directory is created when missing.

    :param df: results dataframe containing a 'Vector' column; left unmodified
    :param query: query text used to build the file name
    :param pq_setting: PQ configuration name, e.g. "pq_8_128"
    :return: ``time.time()`` stamp taken right after the file was written
    """
    import os  # local import so the function works regardless of cell order

    print("Saving {} Results to CSV....".format(pq_setting))

    # Parse query to strip spaces and replace spaces with underscores
    query = query.strip().replace(" ", "_")

    out_dir = "PQ/PQ Search Results/{}_results".format(pq_setting)
    os.makedirs(out_dir, exist_ok=True)

    # drop() without inplace returns a new frame, so the caller's dataframe
    # keeps its 'Vector' column and the function can be called again on it.
    df.drop(columns=['Vector']).to_csv("{}/{}.csv".format(out_dir, query), index=False)
    end = time.time()
    print("{} Results Saved to CSV!\n".format(pq_setting))
    return end

# # Print the top 5 results
# def print_top_5_results(df, idx):
#     print("Top 5 Results:")
#     print(df.iloc[idx.argsort()[:5]])
    
# # print all the results with the similarity score
# def print_all_results(df, idx):
#     print("All Results:")
#     print(df.iloc[idx.argsort()])
    
def search_using_query(query, pq_setting, pq_code, pq, df):
    """Run one query end to end and print how long each stage took.

    Stages: compute the distances, rank the postings, save the ranking to CSV.

    :param query: query text
    :param pq_setting: PQ configuration name, forwarded to the CSV writer
    :param pq_code: PQ codes to search in
    :param pq: pq encoder matching ``pq_code``
    :param df: dataframe of postings to rank
    :return: None; the timing report is written to stdout
    """
    started_at = time.time()

    # Each stage hands back the time.time() stamp at which it finished
    distances, similarity_done = compute_similarity(query, pq_code, pq)
    ranked_df, ranking_done = get_query_res_ranking(df, distances)
    csv_done = save_to_csv_similarity_results(ranked_df, query, pq_setting)

    finished_at = time.time()

    # Per-stage durations are the gaps between consecutive stamps
    similarity_secs = similarity_done - started_at
    ranking_secs = ranking_done - similarity_done
    csv_secs = csv_done - ranking_done

    print("===============================================================")
    print("Total Time Taken:\n")
    print("Start Time: ", started_at)
    print("Compute Similarity Time: ", similarity_secs)
    print("Get Ranking Time: ", ranking_secs)
    print("Save to CSV Time: ", csv_secs)
    print("Basic Time Taken: ", finished_at - started_at)
    print("True Computed Time Taken: ", similarity_secs + ranking_secs + csv_secs)
    
    
    
    
# =============================================================== Test Bench ===============================================================

# Smoke test: run a single query through the pq_8_128 configuration
query = "Hey "
# Step-by-step equivalent kept for debugging. NOTE(review): these lines are stale -
# compute_similarity returns two values and get_query_res_ranking returns a tuple.
# dist, s_t, e_t = compute_similarity(query, pq_8_128_code, pq_8_128)
# result_df = get_query_res_ranking(df, dist)
# save_to_csv_similarity_results(result_df, query, "pq_8_128")

search_using_query(query, "pq_8_128", pq_8_128_code, pq_8_128, df)


Computing similarity for: Hey ...
Getting Query Results Ranking....
Top 5 Results:
                                                     Data  \
11370  Account Manager The Bachrach Group collaborati...   
13367  IAM Ping Federate Consultant Location: Irving ...   
6872             Manager new location. looking positions   
1469   Performance Test Engineer Performance Tetser W...   
1473   Multiple Open Positions Inquiry open positions...   

                                                  Vector  Distance  
11370  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...  9.561901  
13367  [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...  9.561901  
6872   [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...  9.561901  
1469   [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...  9.561901  
1473   [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...  9.561901  
Saving pq_8_128 Results to CSV....
pq_8_128 Results Saved to CSV!

Total Time Taken:

Start Time:  1697001996.722618
Compute Similarity Time:

#### Query Dict Processing

In [16]:
# Testing of Query Dict Processing, TODO: refactor into a function and leave the vector processing to the function caller
# Import Libraries
from sklearn.feature_extraction.text import CountVectorizer

# Defining queries
# NOTE(review): 'atheletes' in Q9 looks misspelled and may not match any
# vocabulary term - confirm whether that is intended.
query_dict = {'Q1': "resilient investment banker", 
              'Q2': "2 years experience product manager", 
              'Q3': "10 years risk analyst problem solver", 
              'Q4': "tax analyst for big company", 
              'Q5': "software engineer for google or amazon", 
              'Q6': "video editor for advertisements with 5 year experience",
              'Q7': "full time senior head nurse position",
              'Q8': "after school math and science tutor",
              'Q9': "dietitian for professional atheletes",
              'Q10': "costume designer and makeup artist"}

# Create a dataframe from the query dictionary (index = 'Q1'..'Q10')
query_df = pd.DataFrame.from_dict(query_dict, orient='index', columns=['Query'])

# Count Vectorize the queries and store the vectors as float32 in the dataframe
# NOTE(review): this rebinds the notebook-level `vectorizer`; it presumably has to
# yield the same vocabulary as the one used to build df['Vector'] - verify.
vectorizer = CountVectorizer()
# Fit the df['Data'] into the vectorizer to get the same dimension
vectorizer.fit(df['Data'])

# Transform the queries into vectors
query_df['Vector'] = [vectorizer.transform([query]).toarray()[0].astype(np.float32) for query in query_df['Query'].values]

# # Create the Matrix
# matrix = vectorizer.fit_transform(list(query_df["Query"].array)).toarray()

# # Add into new column in df
# query_df["Vector"] = [row.tolist() for row in matrix]

# # Encode the vectors 
# # query_df['PQ_8_64'] = pq_8_64.encode(query_df['Vector'].values.tolist())

# The index holds string labels, so take the last row by position with .iloc;
# a plain [-1] relies on a deprecated positional fallback of Series indexing.
print("Vector Dimension: ", len(query_df['Vector'].iloc[-1]))
query_df.head()

Vector Dimension:  105248


Unnamed: 0,Query,Vector
Q1,resilient investment banker,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
Q2,2 years experience product manager,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
Q3,10 years risk analyst problem solver,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
Q4,tax analyst for big company,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
Q5,software engineer for google or amazon,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [15]:
# Working with one query
import numpy as np

# Compute Similarity between Query and the Compressed Vector
def compute_similarity(query_row, pq_code, pq):
    """Compute the asymmetric distances between one prepared query and the PQ codes.

    Note: this notebook defines another ``compute_similarity`` that takes the raw
    query text; whichever cell ran last is the one in effect.

    :param query_row: mapping/row with a 'Query' entry (printed) and a 'Vector'
        entry (the already vectorized query)
    :param pq_code: the PQ codes to compare the query against
    :param pq: the pq encoder that builds the query's distance table
    :return: the result of ``adist`` for ``pq_code``
    """
    print("Computing Similarity for Query:\n", query_row['Query'])
    table = pq.dtable(query=query_row['Vector'])
    return table.adist(codes=pq_code)

# Print the top 5 results
def print_top_5_results(df, idx):
    """Print the five rows of ``df`` whose entries in ``idx`` are smallest.

    :param df: dataframe to select rows from (by position)
    :param idx: array with one value per row; rows are ordered by ``idx.argsort()``
    """
    print("Top 5 Results:")
    ascending_order = idx.argsort()
    top_five = ascending_order[:5]
    print(df.iloc[top_five])
    
# print all the results with the similarity score
def print_all_results(df, idx):
    """Print every row of ``df`` ordered by ascending value in ``idx``.

    :param df: dataframe to select rows from (by position)
    :param idx: array with one value per row; rows are ordered by ``idx.argsort()``
    """
    print("All Results:")
    ascending_order = idx.argsort()
    print(df.iloc[ascending_order])
    

# Save as csv the results
def save_to_csv_results(df, idx, query):
    """Write the rows of ``df``, ordered by ascending ``idx``, to ``PQ/Results/<query>.csv``.

    :param df: dataframe to select rows from (by position)
    :param idx: array with one value per row; rows are ordered by ``idx.argsort()``
    :param query: text used verbatim as the file name (without extension)
    """
    print("Saving Results to CSV....")
    ranked = df.iloc[idx.argsort()]
    target = "PQ/Results/" + query + ".csv"
    ranked.to_csv(target, index=False)
    print("Results Saved to CSV!")

# Run the first prepared query against the pq_8_64 codes.
# NOTE(review): needs `query_df` from the "Query Dict Processing" cell; the
# captured NameError shows this cell was run before that one.
query_row = query_df.iloc[0]
idx = compute_similarity(query_row, pq_8_64_code, pq_8_64)
print("Similarity: ", idx)

# Optional follow-ups, disabled: show or persist the ranked postings
# print_top_5_results(df, idx)
# print_all_results(df, idx)

# save_to_csv_results(df, idx, query_row['Query'])


NameError: name 'query_df' is not defined

In [None]:
# Write result into csv file


Query:
 10    Commercial Property Manager/Senior Property Ma...
Name: Data, dtype: object
Top 5 results:
 [   10 13400  4324 13332 12157]
Top 5 Similar Data
 10       Commercial Property Manager/Senior Property Ma...
13401    Director Property Management Job Summary: Seek...
4324     Commercial Broker Property Manager Commercial ...
13333    Assistant Property Manager (CA) PURE Property ...
12158    Assistant Property Manager Assistant Property ...
Name: Data, dtype: object


## Calculating Performance
With the PQ Codes we can calculate the performance of the search.
We use the recall@k metric to calculate the performance.

In [None]:
# Import Libraries
from sklearn.metrics import precision_score

# NOTE(review): this cell does not run on a fresh kernel - X_test and X_code are
# not defined in any visible cell (see the captured NameError) - TODO confirm
# where they were meant to come from.
n_queries = 100
# Draw n_queries distinct row positions of X_test (no seed is set here)
query_indices = np.random.choice(len(X_test), n_queries, replace=False)
# print("Query Indices: ", query_indices)

# Look up the posting text whose stored vector equals row 19 of X_test
query_vec = X_test[19]
query = df['Data'][df['Vector'].apply(lambda x: np.array_equal(x, query_vec))]
# query_vectors = X_test[query_indices].astype(np.float32)
query_vectors = X_test[query_indices].astype(np.float32)
print("Query Vectors: ", query_vectors.shape)

# Compute performance metrics
precision = []
recall = []  # never filled in this cell

for query_vec in query_vectors:
    # NOTE(review): both compute_similarity definitions in this notebook take
    # three arguments; this two-argument call presumably targets an older
    # version - verify.
    indices = compute_similarity(query_vec, X_code)
    ground_truth = df['Data'].iloc[indices[:5]].values
    print("Ground Truth: ", ground_truth)
    
    # Calculate precision
    # NOTE(review): top5_results is built with the same expression as
    # ground_truth, so the score compares the results with themselves.
    top5_results = df['Data'].iloc[indices[:5]].values
    print("Top 5 Results: ", top5_results)
    p = precision_score(ground_truth, top5_results, average='micro')
    precision.append(p)


print("Precision: ", np.mean(precision))
    


NameError: name 'X_test' is not defined

Query Vector:  [0. 0. 0. ... 0. 0. 0.] (32155,)


Query Data:  109    sunnydayzsoon StackieRobinsn BALENCIAGA You do...
Name: Data, dtype: object
Top 5 closest vectors:  [  4 508 247 522 697]
Actual Data:  4      Pup_Dior_ Happy Valentines Day  You are so gor...
508                  Little chimmy with the perfume Dior
247    Like a Lily in the mire a beautiful flower blo...
522    Louis Vuitton clothing has just made new Night...
697    and I still havent gotten a Junho and Eunwoo i...
Name: Data, dtype: object
