In [41]:
import numpy as np
from scipy.cluster.vq import kmeans2, vq
from scipy.spatial.distance import cdist

In [42]:
def PQ_train(vectors, M, k):
    s = int(vectors.shape[1] / M)                      # Dimension (or length) of a segment.
    codebook = np.empty((M, k, s), np.float32)         
        
    for m in range(M):
        sub_vectors = vectors[:, m*s:(m+1)*s]          # Sub-vectors for segment m.
        codebook[m], label = kmeans2(sub_vectors, k)   # Run k-means clustering for each segment.
        
    return codebook 

In [43]:
def PQ_encode(vectors, codebook):
    M, k, s = codebook.shape
    PQ_code = np.empty((vectors.shape[0], M), np.uint8)
    
    for m in range(M):
        sub_vectors = vectors[:, m*s:(m+1)*s]           # Sub-vectors for segment m.
        centroid_ids, _ = vq(sub_vectors, codebook[m])  # vq returns the nearest centroid Ids.
        PQ_code[:, m] = centroid_ids                    # Assign centroid Ids to PQ_code.
        
    return PQ_code

In [55]:
def PQ_search(query_vector, codebook, PQ_code):
    M, k, s = codebook.shape
    
    distance_table = np.empty((M, k), np.float32)    # Shape is (M, k)    
        
    for m in range(M):
        query_segment = query_vector[m*s:(m+1)*s]    # Query vector for segment m.
        distance_table[m] = cdist([query_segment], codebook[m], "sqeuclidean")[0]
    
    N, M = PQ_code.shape
    distance_table = distance_table.T               # Transpose the distance table to shape (k, M)
    distances = np.zeros((N, )).astype(np.float32)

    for n in range(N):                              # For each PQ Code, lookup the partial distances.
        for m in range(M):
            distances[n] += distance_table[PQ_code[n][m]][m] # Sum the partial distances from all the segments.
            
    return distance_table, distances

In [61]:
# Test case

M = 8                     # Number of segments
k = 256                   # Number of centroids per segment
vector_dim = 128          # Dimension (length) of a vector
total_vectors = 10000   # Number of database vectors

# Generate random vectors
np.random.seed(2022)
vectors = np.random.random((total_vectors, vector_dim)).astype(np.float32)   # Database vectors
q = np.random.random((vector_dim, )).astype(np.float32)                      # Query vector

# Train, encode and search with Product Quantization
codebook = PQ_train(vectors, M, k)
PQ_code = PQ_encode(vectors, codebook)
distance_table, distances = PQ_search(q, codebook, PQ_code)

distance_table




array([[2.125426  , 2.2285783 , 1.7902957 , ..., 2.5514967 , 2.2632618 ,
        1.752617  ],
       [2.4390829 , 1.620167  , 1.7928649 , ..., 1.1864169 , 1.7381495 ,
        1.2336261 ],
       [1.3950148 , 1.8986391 , 1.2117358 , ..., 1.9527086 , 2.7055287 ,
        2.8457036 ],
       ...,
       [2.4354773 , 3.154084  , 1.1651919 , ..., 2.6248562 , 2.2503722 ,
        1.076196  ],
       [0.8399424 , 1.4560368 , 1.701608  , ..., 2.8262062 , 2.3308375 ,
        1.3183129 ],
       [2.0058787 , 0.98286813, 1.3074644 , ..., 2.6367848 , 1.6629539 ,
        1.7150615 ]], dtype=float32)

In [59]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cluster import KMeans
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import normalize
import numpy as np
  
df = pd.read_csv('Polarity.csv')
df.head()
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(df['Data'])
n_subvectors = 4
subvector_size = X.shape[1] // n_subvectors
subvectors = np.array([X[:, i*subvector_size:(i+1)*subvector_size].toarray() for i in range(n_subvectors)])

# Train a k-means clustering model on each subvector to obtain a codebook of subvector centroids
n_clusters = 256
codebooks = []
for subvector in subvectors:
    kmeans = KMeans(n_clusters=n_clusters, n_init=1, random_state=0).fit(subvector)
    codebooks.append(kmeans.cluster_centers_)

# Encode each subvector using the corresponding codebook to obtain a vector of subvector indices
subvector_indices = []
for i, subvector in enumerate(subvectors):
    nn = NearestNeighbors(n_neighbors=1, algorithm='brute', metric='euclidean').fit(codebooks[i])
    _, indices = nn.kneighbors(subvector)
    subvector_indices.append(indices.flatten())
pq_codes = np.concatenate(subvector_indices, axis=0).reshape(-1, n_subvectors)
# pq_codes.shape
pq_codes_norm = normalize(pq_codes)
# pq_codes

codebook = PQ_train(vectors, M, k)
PQ_code = PQ_encode(vectors, codebook)
distance_table, distances = PQ_search(q, codebook, PQ_code)

array([[ 61,  21,  61,  88],
       [ 61,  88, 126,  61],
       [ 21, 126,  21,  61],
       ...,
       [131,  69,  58, 172],
       [ 69,  69,  69, 172],
       [ 58, 212, 159, 170]], dtype=int64)

In [60]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.cluster import KMeans
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import normalize
import numpy as np
  
df = pd.read_csv('Polarity.csv')
df.head()
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(df['Data'])
# Preparing the data to fit_tranformation
listofData  = list(df["Data"].array)

# Create the Matrix
matrix = vectorizer.fit_transform(listofData)
matrix = matrix.toarray()

# Add into new column in df
df["Vector"] = [row.tolist() for row in matrix]
df


Unnamed: 0,Polarity,Data,Vector
0,4,__TiffanyAndCo Cousinnnn the return coming soon,"[0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,4,at the balenciaga thinking about my friends fa...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,0,TiffanyAndCo bracelet I bought in Milan in Oct...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
3,2,QueenMoniB personifier channel_gibbs eccentric...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4,4,Pup_Dior_ Happy Valentines Day You are so gor...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
...,...,...,...
4295,2,Tag timestamp \n\nLouis Vuitton pastel camo s...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4296,2,What is the best place to buy a Rolex at in F...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4297,2,Suggest a Book Can you recommend me books abo...,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
4298,2,Chloe wears Louis Vuitton in HUNGER Magazine,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
