Product Quantization

In [1]:
# Load Libraries

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cluster import KMeans
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import normalize
import numpy as np

In [2]:
# Data Exploration 

df = pd.read_csv('Polarity.csv')
df.head()

Unnamed: 0,Polarity,Data
0,4,__TiffanyAndCo Cousinnnn the return coming soon
1,4,at the balenciaga thinking about my friends fa...
2,0,TiffanyAndCo bracelet I bought in Milan in Oct...
3,2,QueenMoniB personifier channel_gibbs eccentric...
4,4,Pup_Dior_ Happy Valentines Day You are so gor...


In [3]:
# Data Cleaning
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords

# Tokenize and clean up
def tokenize_and_clean(text):
    # Tokenize by sentence, then by lowercase word
    tokens = [word.lower() for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    # Remove stopwords
    stop = stopwords.words('english')
    tokens = [token for token in tokens if token not in stop]
    # Remove words less than three letters
    tokens = [word for word in tokens if len(word) >= 3]
    # Remove remaining tokens that are not alphabetic
    tokens = [word for word in tokens if word.isalpha()]
    return tokens


df = pd.read_csv('Polarity.csv')
df.head()

# Apply tokenize_and_clean to Polarity DataFrame
# df['Tokens'] = df['Data'].map(tokenize_and_clean)

# Apply stopwords removal to Polarity DataFrame
stop = stopwords.words('english')
df['Data'] = df['Data'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))

df.head()


Unnamed: 0,Polarity,Data
0,4,__TiffanyAndCo Cousinnnn return coming soon
1,4,balenciaga thinking friends family guy good ti...
2,0,TiffanyAndCo bracelet I bought Milan Oct sent ...
3,2,QueenMoniB personifier channel_gibbs eccentric...
4,4,Pup_Dior_ Happy Valentines Day You gorgeous


In [4]:
# Converting the text data into vectors using CountVectorizer
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(df['Data'])

# Show the shape of the vectorized data
X.shape

(4300, 32159)

In [5]:
# Split the matrix of numerical features into subvectors using product quantization
n_subvectors = 8
subvector_size = X.shape[1] // n_subvectors
subvectors = np.array([X[:, i*subvector_size:(i+1)*subvector_size].toarray() for i in range(n_subvectors)])

# Train a k-means clustering model on each subvector to obtain a codebook of subvector centroids
n_clusters =64
codebooks = []
for subvector in subvectors:
    kmeans = KMeans(n_clusters=n_clusters, n_init=1, random_state=0).fit(subvector)
    codebooks.append(kmeans.cluster_centers_)

# Encode each subvector using the corresponding codebook to obtain a vector of subvector indices
subvector_indices = []
for i, subvector in enumerate(subvectors):
    nn = NearestNeighbors(n_neighbors=1, algorithm='brute', metric='euclidean').fit(codebooks[i])
    _, indices = nn.kneighbors(subvector)
    subvector_indices.append(indices.flatten())

print('Subvector indices shape:', np.array(subvector_indices).shape)
# show the values of the subvector indices
subvector_indices

Subvector indices shape: (8, 4300)


[array([ 0, 37,  0, ...,  0,  0,  0]),
 array([ 6,  6,  6, ...,  6,  6, 43]),
 array([24, 24, 24, ..., 24, 24, 24]),
 array([0, 0, 0, ..., 0, 0, 0]),
 array([ 0,  0,  0, ...,  0,  0, 50]),
 array([0, 0, 0, ..., 0, 0, 0]),
 array([0, 0, 0, ..., 0, 0, 0]),
 array([ 8,  8,  8, ...,  8, 44,  8])]

In [6]:
# Concatenate the subvector indices to obtain the final product quantization code for each document
pq_codes = np.concatenate(subvector_indices, axis=0).reshape(-1, n_subvectors)
print("Encoded Vector Code Size:", pq_codes.shape)
pq_codes

Encoded Vector Code Size: (4300, 8)


array([[ 0, 37,  0, ...,  0, 37,  0],
       [ 0, 37,  0, ...,  0,  0,  0],
       [37,  0,  0, ...,  0,  0,  0],
       ...,
       [ 8,  8,  8, ...,  8,  8,  8],
       [ 8,  8, 44, ...,  8,  8, 44],
       [ 8,  8,  8, ...,  8, 44,  8]])

In [7]:
# Normalize the product quantization codes to unit length
pq_codes_norm = normalize(pq_codes)

# print the shape of the normalized codes
print("Normalized Encoded Vector Code Shape:", pq_codes_norm.shape)
pq_codes

Normalized Encoded Vector Code Shape: (4300, 8)


array([[ 0, 37,  0, ...,  0, 37,  0],
       [ 0, 37,  0, ...,  0,  0,  0],
       [37,  0,  0, ...,  0,  0,  0],
       ...,
       [ 8,  8,  8, ...,  8,  8,  8],
       [ 8,  8, 44, ...,  8,  8, 44],
       [ 8,  8,  8, ...,  8, 44,  8]])

In [8]:
# Save the product quantization codes to a file
np.savetxt('pq_codes.txt', pq_codes, fmt='%.8f')



In [9]:
# Create a NearestNeighbors object with PQ codes
nbrs = NearestNeighbors(n_neighbors=5, algorithm='brute', metric='euclidean')
nbrs.fit(pq_codes_norm)

# Query for similar vectors
query_vector = np.random.rand(pq_codes.shape[1])
distances, indices = nbrs.kneighbors([query_vector])

# Print the Query and top 5 similar vectors to the query from the dataset
# print("Query Vector:", query_vector)
print("Query Text:", df['Data'][indices[0][0]])
# print("Top 5 similar vectors to the query from the dataset:", indices[0][1:])
print("Top 5 similar texts to the query from the dataset:")
for i in indices[0][1:]:
    print(df['Data'][i])
    print("=====================================")
    
    

Query Text: Unique D Sculpture Size xxin sale miamiartbasel artcollector luxuryhomes artgallery popart popartist gardaniart artdealers interiordesign artlovers chanel hpmkt singulart hermes instaart instadaily dailyart luxury
Top 5 similar texts to the query from the dataset:
Just context I high school well senior I junior I knew wealthy wear Rolex school wealthy I told cool watch I liked I Not much discussion saying something cool I spotted Discussion Saw something cool today One friends high school wearing Rolex Explorer II black dial
tamachan_ BLACKPINK TiffanyAndCo This huge ROSÉ 로제 TIFFANYXROSÉ TIFFANYANDCO
darrengrimes_ What name Hermes Aphrodites child
Check listing I added Poshmark closet Polo Ralph Lauren striped lace back piece shopmycloset poshmarkapp
