# Product Quantization

In [83]:
import pandas as pd

In [84]:
# Data exploration

# Load data
df = pd.read_excel("Polarity.xlsx")
df.head()

Unnamed: 0,Polarity,Data
0,4,__TiffanyAndCo Cousinnnn the return coming soon
1,4,at the balenciaga thinking about my friends fa...
2,0,TiffanyAndCo bracelet I bought in Milan in Oct...
3,2,QueenMoniB personifier channel_gibbs eccentric...
4,4,Pup_Dior_ Happy Valentines Day You are so gor...


## Data Preprocessing

In [85]:
# Import Libraries for Data Processing
# Stopwords removal
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')

# Tokenization
from nltk.tokenize import word_tokenize

# Vectorization
from sklearn.feature_extraction.text import CountVectorizer

[nltk_data] Downloading package stopwords to /Users/chris/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [86]:
stop_words = stopwords.words('english')
stop_words.append('nan')
stop_words.append('removed')
stop_words.remove('not')
stop_words.remove('no')
stop_words[:10]

df['Cleaned_Data'] = df['Data'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))
df.head()

Unnamed: 0,Polarity,Data,Cleaned_Data
0,4,__TiffanyAndCo Cousinnnn the return coming soon,__TiffanyAndCo Cousinnnn return coming soon
1,4,at the balenciaga thinking about my friends fa...,balenciaga thinking friends family guy good ti...
2,0,TiffanyAndCo bracelet I bought in Milan in Oct...,TiffanyAndCo bracelet I bought Milan Oct sent ...
3,2,QueenMoniB personifier channel_gibbs eccentric...,QueenMoniB personifier channel_gibbs eccentric...
4,4,Pup_Dior_ Happy Valentines Day You are so gor...,Pup_Dior_ Happy Valentines Day You gorgeous


In [87]:
# Tokenization


# Check if Cleaned_Data is already tokenized else tokenize it
if not isinstance(df['Cleaned_Data'][0], list):
    # Tokenize
    print("Tokenizing")
    df['Cleaned_Data'] = df['Cleaned_Data'].apply(lambda x: word_tokenize(x))
    print("Tokenization done")
else:
    print("Data already tokenized")
    
print(df['Cleaned_Data'].head())
# Print and show largest token 
print("Largest token: ", max(df['Cleaned_Data'], key=len))



Tokenizing
Tokenization done
0    [__TiffanyAndCo, Cousinnnn, return, coming, soon]
1    [balenciaga, thinking, friends, family, guy, g...
2    [TiffanyAndCo, bracelet, I, bought, Milan, Oct...
3    [QueenMoniB, personifier, channel_gibbs, eccen...
4    [Pup_Dior_, Happy, Valentines, Day, You, gorge...
Name: Cleaned_Data, dtype: object
Largest token:  ['Kei_xD_', 'AM', 'Loyzaga_xD_', 'AA_xD_', 'AALIYAHANN_xD_', 'Abbie', 'Pura_xD_', 'Abby', 'Asi_xD_', 'Abby', 'Bordeos_xD_', 'Abby', 'Buza_xD_', 'Abby', 'Madriaga_xD_', 'ABC', 'VIII_xD_', 'Abegail', 'Alexandra', 'Zape_xD_', 'Abey', 'Dg_xD_', 'Abi', 'Asi_xD_', 'Abi', 'Kassem_xD_', 'Abi', 'Paet_xD_', 'Abigael', 'Audrey', 'Antonio_xD_', 'Abigail', 'B_xD_', 'Abigail', 'Balinas_xD_', 'Abigail', 'Christine', 'Tolomia_xD_', 'Abigail', 'Dado_xD_', 'Abilla', 'Pauline_xD_', 'Abizeyp_xD_', 'Abrielle', 'Gonzalvo_xD_', 'Abygail', 'Dado_xD_', 'Acel', 'Fuentes_xD_', 'Acent', 'Pearl', 'Morada_xD_', 'AD_xD_', 'Addison_xD_', 'Ade', 'Babiera_xD_', 'Aeriel', 

In [88]:
# Vectorization
# Initialize CountVectorizer
vectorizer = CountVectorizer()

# Fit and transform
X = vectorizer.fit_transform(df['Cleaned_Data'].apply(lambda x: ' '.join(x)))

# Add the vectors to the dataframe in a new column
df['Vector'] = [x for x in X.toarray()]

# Print the vocabulary size
print("Vocabulary size: ", len(vectorizer.vocabulary_))
print("Vector Dimension: ", len(df['Vector'][0]))
df

Vocabulary size:  32155
Vector Dimension:  32155


Unnamed: 0,Polarity,Data,Cleaned_Data,Vector
0,4,__TiffanyAndCo Cousinnnn the return coming soon,"[__TiffanyAndCo, Cousinnnn, return, coming, soon]","[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,4,at the balenciaga thinking about my friends fa...,"[balenciaga, thinking, friends, family, guy, g...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,0,TiffanyAndCo bracelet I bought in Milan in Oct...,"[TiffanyAndCo, bracelet, I, bought, Milan, Oct...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,2,QueenMoniB personifier channel_gibbs eccentric...,"[QueenMoniB, personifier, channel_gibbs, eccen...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,4,Pup_Dior_ Happy Valentines Day You are so gor...,"[Pup_Dior_, Happy, Valentines, Day, You, gorge...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
...,...,...,...,...
4295,2,Tag timestamp \n\nLouis Vuitton pastel camo s...,"[Tag, timestamp, Louis, Vuitton, pastel, camo,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4296,2,What is the best place to buy a Rolex at in F...,"[What, best, place, buy, Rolex, Florida, Not, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4297,2,Suggest a Book Can you recommend me books abo...,"[Suggest, Book, Can, recommend, books, Greek, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4298,2,Chloe wears Louis Vuitton in HUNGER Magazine,"[Chloe, wears, Louis, Vuitton, HUNGER, Magazine]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


# Product Quantization 
Product Quatization is a method to reduce the memory usage of the codebook.
The idea is to split the codebook into sub-codebooks and quantize the data into sub-vectors.
The sub-vectors are then quantized into sub-codebooks.
The sub-codebook indices are then concatenated to form the final codebook index.


In [103]:
import nanopq 
import numpy as np
from sklearn.model_selection import train_test_split

# Load data of the vector column
X = np.array(df['Vector'].values.tolist(), dtype=np.float32)
print("X Type" , type(X), X[0])

# Split the data into training and testing
X_train, X_test = train_test_split(X, test_size=0.2, random_state=42)

print("X_Train Type" , type(X_train), X_train[0])
# Initialize the PQ encoder (M=5 sub-spaces, 256 centroids per sub-space)
pq = nanopq.PQ(M=5, Ks=256) # M=5 as vector dimension must be divisible by M

# Fit the training data
pq.fit(vecs=X_train)

# Encode the vectors
X_code = pq.encode(X_test)


X Type <class 'numpy.ndarray'> [0. 0. 0. ... 0. 0. 0.]
X_Train Type <class 'numpy.ndarray'> [0. 0. 0. ... 0. 0. 0.]
M: 5, Ks: 256, metric : <class 'numpy.uint8'>, code_dtype: l2
iter: 20, seed: 123
Training the subspace: 0 / 5




Training the subspace: 1 / 5
Training the subspace: 2 / 5
Training the subspace: 3 / 5
Training the subspace: 4 / 5
Encoding the subspace: 0 / 5
Encoding the subspace: 1 / 5
Encoding the subspace: 2 / 5
Encoding the subspace: 3 / 5
Encoding the subspace: 4 / 5


In [96]:
# Save the X_code to a file
np.save("pq_code", X_code)

X_code.shape

(4300, 5)

# Similarity Search

In [97]:
import numpy as np
# Load the X_code from the file
X_code = np.load("pq_code.npy")

# Print the shape of the X_code
print(X_code.shape)

(4300, 5)


### Using asymmetric Distance

In [187]:
# Choose a random vector as query
query_vec = X_test[10]
query = df['Data'][df['Vector'].apply(lambda x: np.array_equal(x, query_vec))]

# query = "Rolex ROLEX"
# query_vec = vectorizer.transform([query]).toarray()[0]
query_vec = np.array(query_vec, dtype=np.float32)
print("Query: ", query)
print("Query Vector: ", query_vec)


# # Encode the query vector into PQ-codes (Sub-vectors)
# query_code = pq.encode(query_vec.reshape(1, -1))
# print("Query code: ", query_code)
# print("Query code shape: ", query_code.shape)



Query:  Rolex ROLEX
Query Vector:  [0. 0. 0. ... 0. 0. 0.]


In [188]:
# Compute distance table between the query and the vectors
def compute_similarity(query_vec, X_code):
    dt = pq.dtable(query_vec)
    dist = dt.adist(codes=X_code)
    # print("Distance table shape: ", dist.shape)
    # print("Distance table: ", dist)

    indices = np.argsort(dist)
    return indices

idx = compute_similarity(query_vec, X_code)
# Print the top 5 results
top5 = idx[:5]
print("Query:\n", query)
print("Top 5 results:\n", top5)
print("Top 5 Similar Data\n", df['Data'].iloc[top5])


Query:
 Rolex ROLEX
Top 5 results:
 [395 575 781 252 701]
Top 5 Similar Data
 395    JohnSullysfb DailyCaller Or fashion tips to Ra...
575    With elonmusk fails and Tech Companies falling...
781    when i get da celine amiri bbsimon dior fit do...
252    Cardi ate when she said those balenciagas the ...
701    womens gold rolex watches used gold  rolex wat...
Name: Data, dtype: object


## Calculating Performance
With the PQ Codes we can calculate the performance of the search.
We use the recall@k metric to calculate the performance.

In [177]:
# Import Libraries
from sklearn.metrics import precision_score

n_queries = 100
query_indices = np.random.choice(len(X_test), n_queries, replace=False)
# print("Query Indices: ", query_indices)

query_vec = X_test[19]
query = df['Data'][df['Vector'].apply(lambda x: np.array_equal(x, query_vec))]
# query_vectors = X_test[query_indices].astype(np.float32)
query_vectors = X_test[query_indices].astype(np.float32)
print("Query Vectors: ", query_vectors.shape)

# Compute performance metrics
precision = []
recall = []

for query_vec in query_vectors:
    indices = compute_similarity(query_vec, X_code)
    ground_truth = df['Data'].iloc[indices[:5]].values
    print("Ground Truth: ", ground_truth)
    
    # Calculate precision
    top5_results = df['Data'].iloc[indices[:5]].values
    print("Top 5 Results: ", top5_results)
    p = precision_score(ground_truth, top5_results, average='micro')
    precision.append(p)


print("Precision: ", np.mean(precision))
    


Query Vectors:  (100, 32155)
Ground Truth:  ['wanna try some more lash extensions' 'BALENCIAGA Kanye was right'
 'amfipolis hermes_z I guess you need to learn mathematics immediately '
 'Rolex Explorer Exposed to seawater  Check to know if it was Restored r  via YouTube'
 'Everybody in the NBA and NFL seem like they got that one whiteLight blue Louis Vuitton Letterman']
Top 5 Results:  ['wanna try some more lash extensions' 'BALENCIAGA Kanye was right'
 'amfipolis hermes_z I guess you need to learn mathematics immediately '
 'Rolex Explorer Exposed to seawater  Check to know if it was Restored r  via YouTube'
 'Everybody in the NBA and NFL seem like they got that one whiteLight blue Louis Vuitton Letterman']
Ground Truth:  ['fascnate Balenciaga  shoe'
 'Stunning now stunning forever Crafted to stand the test of time our rings become family heirlooms for future generations that can be passed down and cherished for years to come Discover more  AboutLove TiffanyAndCo '
 'Dior Savauge has 

In [172]:
print("ground_truth: ", ground_truth)

Indices:  [718  27 828 332 290 248 555  35 487 512 148 633 454 361 564 299 573 725
 115  33 502 277  40 513 362 186 195 507 599 257 329 449 105 605 527 395
 575 781 252 701 797  73 500 351 164 455 429 598  17  14 623 795 342  19
  32 473 503 550 842 420 380 431 393 515 301 453  56 852 821 122 462  12
 547 377 525 291 120 759 496 531 592 524 298 385 601 569 778 568 176 334
 657 505 144  97 499 300 692 590 643 363 670 327 216 785 775 106 859 608
  43 471 225 664 684 677 201 244 179  29 671 729 182 218 577 191 430 825
 371 372 740 697 712 672 119 559 844 780 129 635 412 831 723 331 381 296
 360 353 822 188 365  62 406 578 328 528 532 779 574 184 815 238 234 295
 674 774 737 710 523 557 160 358 855  60 243 714 784 481   8 485 108 458
  64 355 540 658 247 819 162 558 520 682 717 282  54 483 765 655  87 522
 667 497 700 777 611 810 459 367 378 107 719 262 508 811 390 836 166 253
 190 101 457 848 552 322 183 838 389 631  46 443 118  92 415 137 773 304
 165 273 426 435 292 173  67 839 617 688 

In [None]:
# Load the X_code from the file
# X_code = np.load("pq_code.npy")

# Test Query with asymmetric distance
# query = "I am not happy with the product LV"
# query_vec = vectorizer.transform([query]).toarray()
# # Get query vector to be of same dimension as the vectors in the dataset
# query_vec = np.pad(query_vec, (0, 1), 'constant', constant_values=(0, 0))
query = X_test[4]
query_vec = np.array(query, dtype=np.float32)

print("Query Vector: ", query_vec, query_vec.shape)
print("Query Data: ", df['Data'][df['Vector'].apply(lambda x: np.array_equal(x, query_vec))])


# Compute the distance between the query and the vectors
distance_table = pq.dtable(query_vec)
dists = distance_table.adist(codes=X_code)

# Get the top 5 closest vectors
top5 = np.argsort(dists)[:5]
print("Top 5 closest vectors: ", top5)
print("Actual Data: ", df['Data'][top5])



Query Vector:  [0. 0. 0. ... 0. 0. 0.] (32155,)
Query Data:  109    sunnydayzsoon StackieRobinsn BALENCIAGA You do...
Name: Data, dtype: object
Top 5 closest vectors:  [  4 508 247 522 697]
Actual Data:  4      Pup_Dior_ Happy Valentines Day  You are so gor...
508                  Little chimmy with the perfume Dior
247    Like a Lily in the mire a beautiful flower blo...
522    Louis Vuitton clothing has just made new Night...
697    and I still havent gotten a Junho and Eunwoo i...
Name: Data, dtype: object
