In [2]:
import torch
from vector_quantize_pytorch import VectorQuantize
from sklearn.decomposition import PCA
from transformers import AutoTokenizer, AutoModel
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
import tqdm
import numpy as np
from loguru import logger
from sentence_transformers import SentenceTransformer, util

In [3]:
logger.remove()
logger.add("log.txt")

1

In [4]:
DEVICE = "cuda:1"

## Load models for vector generation

In [5]:
# roberta_tokenizer = AutoTokenizer.from_pretrained("allenai/biomed_roberta_base")
# roberta_model = AutoModel.from_pretrained("allenai/biomed_roberta_base").to(DEVICE)

# tokenizer = AutoTokenizer.from_pretrained('deepset/sentence_bert')
# model = AutoModel.from_pretrained('deepset/sentence_bert').to(DEVICE)
model = SentenceTransformer('all-mpnet-base-v2').to(DEVICE)

In [6]:
# #	ratio 	mse 	mean_diff 	diff_of_means
# # 0.978979 	0.070866 	0.228269 	0.228269
def _embed(term, device):
    inputs = roberta_tokenizer(term, padding=True, truncation=True, return_tensors="pt")
    inputs['input_ids'] = inputs['input_ids'].to(device)
    inputs['attention_mask'] = inputs['attention_mask'].to(device)
    outputs = roberta_model(**inputs).pooler_output.cpu().detach().numpy()
    del inputs
    
    return outputs

def _embed2(term, device):
    sentence_rep = model.to(device).encode(term)
    return sentence_rep

def embed(term):
    try:
        outputs = _embed2(term, DEVICE)
    except RuntimeError as e:
        print("Switching to CPU! Reason: " + str(e))
        outputs = _embed2(term, "cpu")
    return outputs

## Load data

In [12]:
list_of_dataframes = []

for df in tqdm.tqdm(pd.read_json("../data/msmarco-triplets/msmarco-triplets.jsonl", chunksize=20000, lines=True), total=25):
    list_of_dataframes.append(df)
    
data = pd.concat(list_of_dataframes)

  8%|██████▋                                                                             | 2/25 [00:03<00:41,  1.81s/it]


UnicodeDecodeError: 'utf-8' codec can't decode byte 0xb9 in position 1894: invalid start byte

In [None]:
data['pos'] = data['pos'].apply(lambda x: x[0])
data['neg'] = data['neg'].apply(lambda x: x[0])

In [None]:
data['query'].values[0]

In [12]:
train_data, test_data = train_test_split(data, test_size=0.1, random_state=42)
print(train_data.shape)
print(test_data.shape)

(449265, 3)
(49919, 3)


## Measure cosine similarity differences between pos and neg

In [13]:
metrics = []
def calculate_metrics(method, data, name):
    pos_similarity = []
    neg_similarity = []

    queue = []
    for idx, sample in tqdm.tqdm(data.iterrows(), total=len(data)):
        queue.extend(list(sample.values))
        if idx%100==0:
            logger.info(f"{idx/len(data)*100:.1f}% ({idx})")
            
        if len(queue)<128:
            continue
            
        embeddings = method(queue).tolist()
        while len(embeddings) != 0:
            query, pos, neg = embeddings.pop(0), embeddings.pop(0), embeddings.pop(0)
            pos_similarity.append(cosine_similarity([query], [pos])[0][0])
            neg_similarity.append(cosine_similarity([query], [neg])[0][0])

        queue = []

    pos_similarity = np.array(pos_similarity)
    neg_similarity = np.array(neg_similarity)
    
    ratio = np.sum(pos_similarity>neg_similarity)/len(pos_similarity)
    mse = np.mean((pos_similarity-neg_similarity)**2)
    mean_diff = pos_similarity.mean()-neg_similarity.mean()
    diff_of_means = (pos_similarity-neg_similarity).mean()
    
    return pd.DataFrame.from_dict([{
                        "ratio": ratio,
                        "mse": mse,
                        "mean_diff": mean_diff,
                        "diff_of_means": diff_of_means,
                        "method": name
                    }])

In [16]:
metrics.append(calculate_metrics(embed, test_data, "CLS vector"))

100%|████████████████████████████████████████████████████████████████████████████| 49919/49919 [06:21<00:00, 130.76it/s]


In [17]:
metrics[0]

Unnamed: 0,ratio,mse,mean_diff,diff_of_means,method
0,0.965237,0.072999,0.227337,0.227337,CLS vector


## PCA

In [43]:
lda_metrics = []
embeddings = []

train_data = data.sample(1000, random_state=42)

for i in tqdm.trange(0, train_data.shape[0], 32):
    embeddings.extend(embed(

In [40]:
dim = 64
res_dim = 8

# for idx in range(0, train.shape

In [42]:
for

Unnamed: 0,query,pos,neg
15773,what does dsm stand for,"In psychology and psychiatry, DSM stands for t...","military mk, what does mark stand for in milit..."
35290,what is social competence in early childhood,Social competence in early childhood is as a s...,The Early Years Learning Framework (EYLF) is d...
358404,stonefield construction phone number,A: The phone number for Stonefield Constructio...,The Building Official reviews applications for...
381806,what book did adam smith write,"Adam Smith FRSA was a Scottish economist, phil...",Was Adam Smith laissez-faire? What did it mean...
244560,how long can you take naltrexone for,You and your doctor have to decide this. Most ...,"Naloxone, also known as Narcan among other nam..."
...,...,...,...
178265,what happens in the first stage of seed germin...,Stage One - Hydration and Metabolism. In the f...,Germination is the process in which the seeds ...
168368,aberdeen airport code,Aberdeen International Airport (Scottish Gaeli...,Welcome to USZip.com. USZip.com USZip.com is a...
314866,how old till baby can sleep on stomach,You may find yourself wondering if your baby c...,Third Month Baby Milestones: Sleep. Your 3-mon...
472372,what are the sisters last names in in the time...,In the Time of the Butterflies Characters. BÃ©...,The nine brightest stars of the Pleiades are n...


## Quantize vectors

In [18]:
metrics = []

In [19]:
start_dim = 768
dim = 64
codebook_size = 256
codebook_dim = 8

def train_embeddings(term):
    embedding = torch.from_numpy(embed(term))
    reshaped = embedding.view(-1, start_dim//dim, dim)
    _, ind, loss = vq_trainable(reshaped)
    loss.backward()
    optim.step()
    res = vq_trainable.project_in(reshaped)
    return res.view(len(embedding), -1)

def test_embeddings(term):
    embedding = torch.from_numpy(embed(term))
    reshaped = embedding.view(-1, start_dim//dim, dim)
    res = vq_trainable.project_in(reshaped)
    return res.view(len(embedding), -1)

for dim, codebook_dim in [[64,4],
                          [32,2],
                          [128,8]
                         ]:
    for codebook_size in [32, 64, 128, 256, 512, 1024]:
        for i in range(3):
            vq_trainable = VectorQuantize(
                dim = dim,
                codebook_size = codebook_size,
                codebook_dim=codebook_dim,
                use_cosine_sim=True
            )

            optim = torch.optim.Adam(vq_trainable.parameters(), lr=3e-4)

            calculate_metrics(train_embeddings, train_data.sample(1000), f"")
            metrics.append(calculate_metrics(test_embeddings, test_data.sample(1000), f"{dim}x{codebook_dim} ({codebook_size}) proper"))

100%|██████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:07<00:00, 137.21it/s]
100%|██████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:07<00:00, 136.88it/s]
100%|██████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:07<00:00, 139.56it/s]
 17%|█████████████▌                                                                 | 171/1000 [00:00<00:04, 188.08it/s]


KeyboardInterrupt: 

In [None]:
start_dim = 768
dim = 64
codebook_size = 512
codebook_dim = 4

def train_embeddings(term):
    embedding = torch.from_numpy(embed(term))
    reshaped = embedding.view(-1, start_dim//dim, dim)
    _, ind, loss = vq_trainable(reshaped)
    loss.backward()
    optim.step()
    res = vq_trainable.project_in(reshaped)
    return res.view(len(embedding), -1)

def test_embeddings(term):
    embedding = torch.from_numpy(embed(term))
    reshaped = embedding.view(-1, start_dim//dim, dim)
    res = vq_trainable.project_in(reshaped)
    return res.view(len(embedding), -1)

for i in range(1):
    vq_trainable = VectorQuantize(
        dim = dim,
        codebook_size = codebook_size,
        codebook_dim=codebook_dim,
        use_cosine_sim=True
    )

    optim = torch.optim.Adam(vq_trainable.parameters(), lr=3e-4)

    calculate_metrics(train_embeddings, data.sample(10000, random_state=42), f"")
    metrics.append(calculate_metrics(test_embeddings, data.sample(100000, random_state=42), f"{dim}x{codebook_dim} ({codebook_size}) trained 3e-4"))

100%|████████████████████████████████████████████████████████████████████████████| 10000/10000 [01:14<00:00, 134.67it/s]
100%|██████████████████████████████████████████████████████████████████████████| 100000/100000 [12:26<00:00, 133.89it/s]

In [1]:
avg = pd.concat(metrics).groupby("method").mean()
avg["compression"] = avg.index.map(lambda x: float(x.split("x")[0])/float(x.split("x")[1].split()[0]))

NameError: name 'pd' is not defined

In [1]:
avg.sort_values("ratio")

NameError: name 'avg' is not defined