Product Quantization

In [3]:
# Load Libraries

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cluster import KMeans
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import normalize
import numpy as np

In [4]:
# Data Exploration 

df = pd.read_csv('Polarity.csv')
df.head()

Unnamed: 0,Polarity,Data
0,4,__TiffanyAndCo Cousinnnn the return coming soon
1,4,at the balenciaga thinking about my friends fa...
2,0,TiffanyAndCo bracelet I bought in Milan in Oct...
3,2,QueenMoniB personifier channel_gibbs eccentric...
4,4,Pup_Dior_ Happy Valentines Day You are so gor...


In [None]:
# Data Preprocessing



In [8]:
# Converting the text data into vectors using CountVectorizer
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(df['Data'])

# SHow the shape of the vectorized data
X.shape

(4300, 32167)

In [10]:
# Split the matrix of numerical features into subvectors using product quantization
n_subvectors = 4
subvector_size = X.shape[1] // n_subvectors
subvectors = np.array([X[:, i*subvector_size:(i+1)*subvector_size].toarray() for i in range(n_subvectors)])

# Train a k-means clustering model on each subvector to obtain a codebook of subvector centroids
n_clusters = 256
codebooks = []
for subvector in subvectors:
    kmeans = KMeans(n_clusters=n_clusters, n_init=1, random_state=0).fit(subvector)
    codebooks.append(kmeans.cluster_centers_)

# Encode each subvector using the corresponding codebook to obtain a vector of subvector indices
subvector_indices = []
for i, subvector in enumerate(subvectors):
    nn = NearestNeighbors(n_neighbors=1, algorithm='brute', metric='euclidean').fit(codebooks[i])
    _, indices = nn.kneighbors(subvector)
    subvector_indices.append(indices.flatten())



In [12]:
# Concatenate the subvector indices to obtain the final product quantization code for each document
pq_codes = np.concatenate(subvector_indices, axis=0).reshape(-1, n_subvectors)
pq_codes.shape

(4300, 4)

In [23]:
# Normalize the product quantization codes to unit length
pq_codes_norm = normalize(pq_codes)
pq_codes

array([[ 61, 126,  61,  88],
       [ 61,  88, 126,  61],
       [ 61, 126, 253,  61],
       ...,
       [131,  69,  58, 172],
       [ 69,  69,  69, 172],
       [ 58, 212, 159, 170]])

In [22]:
# from scipy.sparse import csr_matrix

# # Compute the L2 distance between the query vector and each document vector
# query = 'some text to search for'
# query_vec = vectorizer.transform([query])
# query_subvectors = np.array([query_vec[:, i*subvector_size:(i+1)*subvector_size].toarray() for i in range(n_subvectors)])
# query_indices = []
# for i, subvector in enumerate(query_subvectors):
#     nn = NearestNeighbors(n_neighbors=1, algorithm='brute', metric='euclidean').fit(codebooks[i])
#     _, indices = nn.kneighbors(subvector)
#     query_indices.append(indices.flatten())
# query_pq_code_norm = normalize(np.concatenate(query_indices, axis=0).reshape(-1, n_subvectors))

# # Compute the L2 distance between the query vector and each document vector
# distances = np.linalg.norm(csr_matrix(pq_codes_norm) - np.tile(csr_matrix(query_pq_code_norm), (pq_codes_norm.shape[0], 1)), axis=1)


ValueError: inconsistent shapes