# Product Quantization

In [109]:
import pandas as pd

In [110]:
# Data exploration

# Load data
df = pd.read_excel("Polarity.xlsx")
df.head()

Unnamed: 0,Polarity,Data
0,4,__TiffanyAndCo Cousinnnn the return coming soon
1,4,at the balenciaga thinking about my friends fa...
2,0,TiffanyAndCo bracelet I bought in Milan in Oct...
3,2,QueenMoniB personifier channel_gibbs eccentric...
4,4,Pup_Dior_ Happy Valentines Day You are so gor...


## Data Preprocessing

In [111]:
# Import Libraries for Data Processing
# Stopwords removal
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')

# Tokenization
from nltk.tokenize import word_tokenize

# Vectorization
from sklearn.feature_extraction.text import CountVectorizer

[nltk_data] Downloading package stopwords to /Users/chris/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [112]:
stop_words = stopwords.words('english')
stop_words.append('nan')
stop_words.append('removed')
stop_words.remove('not')
stop_words.remove('no')
stop_words[:10]

df['Cleaned_Data'] = df['Data'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop_words)]))
df.head()

Unnamed: 0,Polarity,Data,Cleaned_Data
0,4,__TiffanyAndCo Cousinnnn the return coming soon,__TiffanyAndCo Cousinnnn return coming soon
1,4,at the balenciaga thinking about my friends fa...,balenciaga thinking friends family guy good ti...
2,0,TiffanyAndCo bracelet I bought in Milan in Oct...,TiffanyAndCo bracelet I bought Milan Oct sent ...
3,2,QueenMoniB personifier channel_gibbs eccentric...,QueenMoniB personifier channel_gibbs eccentric...
4,4,Pup_Dior_ Happy Valentines Day You are so gor...,Pup_Dior_ Happy Valentines Day You gorgeous


In [113]:
# Tokenization


# Check if Cleaned_Data is already tokenized else tokenize it
if not isinstance(df['Cleaned_Data'][0], list):
    # Tokenize
    print("Tokenizing")
    df['Cleaned_Data'] = df['Cleaned_Data'].apply(lambda x: word_tokenize(x))
    print("Tokenization done")
else:
    print("Data already tokenized")
    
print(df['Cleaned_Data'].head())
# Print and show largest token 
print("Largest token: ", max(df['Cleaned_Data'], key=len))



Tokenizing
Tokenization done
0    [__TiffanyAndCo, Cousinnnn, return, coming, soon]
1    [balenciaga, thinking, friends, family, guy, g...
2    [TiffanyAndCo, bracelet, I, bought, Milan, Oct...
3    [QueenMoniB, personifier, channel_gibbs, eccen...
4    [Pup_Dior_, Happy, Valentines, Day, You, gorge...
Name: Cleaned_Data, dtype: object
Largest token:  ['Kei_xD_', 'AM', 'Loyzaga_xD_', 'AA_xD_', 'AALIYAHANN_xD_', 'Abbie', 'Pura_xD_', 'Abby', 'Asi_xD_', 'Abby', 'Bordeos_xD_', 'Abby', 'Buza_xD_', 'Abby', 'Madriaga_xD_', 'ABC', 'VIII_xD_', 'Abegail', 'Alexandra', 'Zape_xD_', 'Abey', 'Dg_xD_', 'Abi', 'Asi_xD_', 'Abi', 'Kassem_xD_', 'Abi', 'Paet_xD_', 'Abigael', 'Audrey', 'Antonio_xD_', 'Abigail', 'B_xD_', 'Abigail', 'Balinas_xD_', 'Abigail', 'Christine', 'Tolomia_xD_', 'Abigail', 'Dado_xD_', 'Abilla', 'Pauline_xD_', 'Abizeyp_xD_', 'Abrielle', 'Gonzalvo_xD_', 'Abygail', 'Dado_xD_', 'Acel', 'Fuentes_xD_', 'Acent', 'Pearl', 'Morada_xD_', 'AD_xD_', 'Addison_xD_', 'Ade', 'Babiera_xD_', 'Aeriel', 

In [114]:
# Vectorization
# Initialize CountVectorizer
vectorizer = CountVectorizer()

# Fit and transform
X = vectorizer.fit_transform(df['Cleaned_Data'].apply(lambda x: ' '.join(x)))

# Add the vectors to the dataframe in a new column
df['Vector'] = [x for x in X.toarray()]

# Print the vocabulary size
print("Vocabulary size: ", len(vectorizer.vocabulary_))
print("Vector Dimension: ", len(df['Vector'][0]))
df

Vocabulary size:  32155
Vector Dimension:  32155


Unnamed: 0,Polarity,Data,Cleaned_Data,Vector
0,4,__TiffanyAndCo Cousinnnn the return coming soon,"[__TiffanyAndCo, Cousinnnn, return, coming, soon]","[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,4,at the balenciaga thinking about my friends fa...,"[balenciaga, thinking, friends, family, guy, g...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,0,TiffanyAndCo bracelet I bought in Milan in Oct...,"[TiffanyAndCo, bracelet, I, bought, Milan, Oct...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,2,QueenMoniB personifier channel_gibbs eccentric...,"[QueenMoniB, personifier, channel_gibbs, eccen...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,4,Pup_Dior_ Happy Valentines Day You are so gor...,"[Pup_Dior_, Happy, Valentines, Day, You, gorge...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
...,...,...,...,...
4295,2,Tag timestamp \n\nLouis Vuitton pastel camo s...,"[Tag, timestamp, Louis, Vuitton, pastel, camo,...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4296,2,What is the best place to buy a Rolex at in F...,"[What, best, place, buy, Rolex, Florida, Not, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4297,2,Suggest a Book Can you recommend me books abo...,"[Suggest, Book, Can, recommend, books, Greek, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4298,2,Chloe wears Louis Vuitton in HUNGER Magazine,"[Chloe, wears, Louis, Vuitton, HUNGER, Magazine]","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."


# Product Quantization 
Product Quatization is a method to reduce the memory usage of the codebook.
The idea is to split the codebook into sub-codebooks and quantize the data into sub-vectors.
The sub-vectors are then quantized into sub-codebooks.
The sub-codebook indices are then concatenated to form the final codebook index.


In [115]:
import nanopq 
import numpy as np
from sklearn.model_selection import train_test_split

# Load data of the vector column
X = np.array(df['Vector'].values.tolist(), dtype=np.float32)
print("X Type" , type(X), X[0])

# Split the data into training and testing
X_train, X_test = train_test_split(X, test_size=0.2, random_state=42)

print("X_Train Type" , type(X_train), X_train[0])
# Initialize the PQ encoder (M=5 sub-spaces, 256 centroids per sub-space)
pq = nanopq.PQ(M=5, Ks=256) # M=5 as vector dimension must be divisible by M

# Fit the training data
pq.fit(vecs=X_train)

# Encode the vectors
X_code = pq.encode(X)


X Type <class 'numpy.ndarray'> [0. 0. 0. ... 0. 0. 0.]
X_Train Type <class 'numpy.ndarray'> [0. 0. 0. ... 0. 0. 0.]
M: 5, Ks: 256, metric : <class 'numpy.uint8'>, code_dtype: l2
iter: 20, seed: 123
Training the subspace: 0 / 5




Training the subspace: 1 / 5
Training the subspace: 2 / 5
Training the subspace: 3 / 5
Training the subspace: 4 / 5
Encoding the subspace: 0 / 5
Encoding the subspace: 1 / 5
Encoding the subspace: 2 / 5
Encoding the subspace: 3 / 5
Encoding the subspace: 4 / 5


In [116]:
# Save the X_code to a file
np.save("pq_code", X_code)

X_code.shape

(4300, 5)

In [137]:
# Load the X_code from the file
# X_code = np.load("pq_code.npy")

# Test Query with asymmetric distance
# query = "I am not happy with the product LV"
# query_vec = vectorizer.transform([query]).toarray()
# # Get query vector to be of same dimension as the vectors in the dataset
# query_vec = np.pad(query_vec, (0, 1), 'constant', constant_values=(0, 0))
query = X_test[4]
query_vec = np.array(query, dtype=np.float32)

print("Query Vector: ", query_vec, query_vec.shape)
print("Query Data: ", df['Data'][df['Vector'].apply(lambda x: np.array_equal(x, query_vec))])


# Compute the distance between the query and the vectors
distance_table = pq.dtable(query_vec)
dists = distance_table.adist(codes=X_code)

# Get the top 5 closest vectors
top5 = np.argsort(dists)[:5]
print("Top 5 closest vectors: ", top5)
print("Actual Data: ", df['Data'][top5])



Query Vector:  [0. 0. 0. ... 0. 0. 0.] (32155,)
Query Data:  109    sunnydayzsoon StackieRobinsn BALENCIAGA You do...
Name: Data, dtype: object
Top 5 closest vectors:  [ 109  989 2361 1357 2648]
Actual Data:  109     sunnydayzsoon StackieRobinsn BALENCIAGA You do...
989                     Balenciaga WTF ARE YOU GUYS DOING
2361    removed FS USA KAPITAL BALENCIAGA BOUND STUSSY...
1357    KimKardashian Umm your family is worth billion...
2648             Balenciaga X Crocs Wellington Boots Pink
Name: Data, dtype: object
