In [1]:
from comet_ml import Experiment
import torch
from sklearn.decomposition import PCA
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
import tqdm
import numpy as np
from loguru import logger
import json
import glob

In [3]:
import warnings
warnings.filterwarnings("ignore")

In [3]:
from torch import nn
import torch.nn.functional as F

In [4]:
logger.remove()
logger.add("log.txt")

1

In [5]:
DEVICE = "cuda:1"

## Load models for vector generation

## Load data

### Read vectors

In [3]:
embeddings = np.load("data/vectors.npy")

In [4]:
indices = np.arange(embeddings.shape[0]//3)

train_idx, test_idx = train_test_split(indices, test_size=0.3, random_state=42)

train_data = [embeddings[train_idx*3], embeddings[train_idx*3+1], embeddings[train_idx*3+2]]
test_data = [embeddings[test_idx*3], embeddings[test_idx*3+1], embeddings[test_idx*3+2]]

print(train_data[0].shape)
print(test_data[0].shape)

(69982, 768)
(29993, 768)


## Measure cosine similarity differences between pos and neg

In [7]:
def calculate_metrics(method, data, name, show_pbar=True):
    orig_shape = data[0].shape[1]
    pos_similarity = []
    neg_similarity = []

    queue = []
    anchors = method(data[0])
    positives = method(data[1])
    negatives = method(data[2])
   
    for i in tqdm.trange(len(anchors), disable=not show_pbar):
        query, pos, neg = anchors[i], positives[i], negatives[i]
        pos_similarity.append(cosine_similarity([query], [pos])[0][0])
        neg_similarity.append(cosine_similarity([query], [neg])[0][0])

    pos_similarity = np.array(pos_similarity)
    neg_similarity = np.array(neg_similarity)
    
    ratio = np.sum(pos_similarity>neg_similarity)/len(pos_similarity)
    mean_diff = pos_similarity.mean()-neg_similarity.mean()
    mean_pos_sim = pos_similarity.mean()
    
    reduction_rate = orig_shape/len(anchors[0])
    return pd.DataFrame.from_dict([{
                        "ratio": ratio,
                        "mean_pos_sim": mean_pos_sim,
                        "mean_diff": mean_diff,
                        "method": name,
                        "reduction_rate": reduction_rate
                    }])

## Benchmarks

In [9]:
def validate_method(get_method, test_method):
    res = []
    for input_dim in tqdm.notebook.tqdm([8, 16, 32, 64, 128, 768]):
        for output_dim in tqdm.notebook.tqdm([2, 4, 8, 16, 32, 64, 128]):
            if input_dim <= output_dim:
                continue

            model = get_method(input_dim, output_dim)
            res.append(calculate_metrics(test_method(model, input_dim), test_data, f"PCA ({input_dim}x{output_dim})", show_pbar=False))
    return res

def aggregate_metrics(directory="results_biobert"):
    dframes = []
    for method in glob.glob(f"{directory}/*.csv"):
        if "total.csv" not in method:
            dframes.append(pd.read_csv(method))
    aggregated = pd.concat(dframes)
    aggregated = aggregated.sort_values(["reduction_rate", "ratio"], ascending=[False, True])
    aggregated.to_csv(f"{directory}/total.csv")
    return aggregated

def save_metrics(metrics, fname):
    cmetrics = pd.concat(metrics)

    max_ratio = cmetrics.groupby("reduction_rate").max()["ratio"]
    cmetrics = cmetrics[cmetrics.apply(lambda x: x["ratio"]==max_ratio[x["reduction_rate"]], axis=1)]
    cmetrics = cmetrics.sort_values("reduction_rate", ascending=False)
    cmetrics.to_csv(f"results_biobert/{fname}.csv")

    return cmetrics

### Baseline

In [44]:
metrics = []
metrics.append(calculate_metrics(lambda x: x, test_data, "CLS vector"))

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████| 29993/29993 [00:09<00:00, 3279.73it/s]


In [16]:
save_metrics(metrics, "baseline")

Unnamed: 0,ratio,mean_pos_sim,mean_diff,method,reduction_rate
0,0.7137,0.461492,0.078354,CLS vector,1.0


### Averaging

In [18]:
metrics = []

In [19]:
def average_vec(dim):
    def _inner(embeddings):
        x = embeddings.reshape(-1, dim)
        x = x.mean(axis=1).reshape(-1, 1)
        x = x.reshape(len(embeddings), -1)
        x = x/np.sqrt(np.sum(x**2))
        return x
    return _inner

In [20]:
for rate in [2, 4, 8, 12, 16, 24, 32, 48, 64, 96, 128, 192, 384]:
    method = average_vec(rate)
    metrics.append(calculate_metrics(method, test_data, f"Average parts ({rate})"))

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 29993/29993 [00:10<00:00, 2913.21it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 29993/29993 [00:10<00:00, 2936.89it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 29993/29993 [00:10<00:00, 2995.77it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 29993/29993 [00:10<00:00, 2994.69it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████

In [21]:
res = save_metrics(metrics, "averaging")
aggregate_metrics()

Unnamed: 0.1,Unnamed: 0,ratio,mean_pos_sim,mean_diff,method,reduction_rate
0,0,0.527723,0.759369,0.031587,Average parts (384),384.0
1,0,0.548961,0.558271,0.055304,Average parts (192),192.0
2,0,0.564165,0.462989,0.063265,Average parts (128),128.0
3,0,0.579102,0.451062,0.070158,Average parts (96),96.0
4,0,0.599573,0.460109,0.075445,Average parts (64),64.0
5,0,0.610876,0.440423,0.076571,Average parts (48),48.0
6,0,0.630747,0.446489,0.079458,Average parts (32),32.0
7,0,0.644317,0.445943,0.078452,Average parts (24),24.0
8,0,0.661854,0.451641,0.078587,Average parts (16),16.0
9,0,0.671557,0.456626,0.078192,Average parts (12),12.0


### Summing

In [66]:
metrics = []
def average_vec(dim):
    def _inner(embeddings):
        embeddings = embeddings/embeddings.sum(axis=1)[:, None]
        x = embeddings.reshape(-1, dim)
        x = x.sum(axis=1).reshape(-1, 1)
        x = x.reshape(len(embeddings), -1)
        return x
    return _inner

In [64]:
for rate in [2, 4, 8, 12, 16, 24, 32, 48, 64, 96, 128, 192, 384]:
    method = average_vec(rate)
    metrics.append(calculate_metrics(method, test_data, f"Sum parts ({rate})"))

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 29993/29993 [00:10<00:00, 2838.00it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 29993/29993 [00:10<00:00, 2833.88it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 29993/29993 [00:10<00:00, 2840.09it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 29993/29993 [00:10<00:00, 2841.37it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 2999

In [65]:
res = save_metrics(metrics, "sums")
aggregate_metrics()

Unnamed: 0.2,Unnamed: 0,ratio,mean_pos_sim,mean_diff,method,reduction_rate,Unnamed: 0.1
0,0,0.000000,1.000000,0.000000,Autoencoder (768x1),768.000000,0.0
0,0,0.141166,0.730871,0.135898,Whitening (768x1),768.000000,
0,0,0.143567,0.732938,0.137099,PCA (768x1),768.000000,
0,0,0.163105,0.399593,0.023772,UMAP (768x1),768.000000,
0,0,0.525589,0.190101,0.066344,Sum parts (384),384.000000,
...,...,...,...,...,...,...,...
12,0,0.964125,0.733570,0.229238,Average parts (2),2.000000,
19,0,0.965759,0.732612,0.226916,Whitening (64x32),2.000000,
15,0,0.965992,0.735349,0.226656,PCA (64x32),2.000000,
20,0,0.967392,0.734172,0.227783,Whitening (96x64),1.500000,


### Generic autoencoder

In [103]:
DEVICE = "cpu"

class Autoencoder(nn.Module):
    def __init__(self, input_dim, output_dim):
        super().__init__()
        self.encoder = nn.Sequential(
                    nn.Linear(input_dim, 64),
                    nn.ReLU(),
                    nn.Linear(64, output_dim),
            )
        self.decoder = nn.Sequential(
                    nn.Linear(output_dim, 64),
                    nn.ReLU(),
                    nn.Linear(64, input_dim)
            )
    
    def forward(self, x):
        x = self.encoder(x)
        x = F.normalize(x)
        x = self.decoder(x)
        x = F.normalize(x)
        return x
    
    def encode(self, x):
        x = self.encoder(x)
        x = F.normalize(x)
        return x
    
def test_autoenc(model, dim):
    def _inner(embeddings):
        x = embeddings.reshape(-1, dim)
        x = torch.from_numpy(x).to(torch.float32).to(DEVICE)
        x = model.encode(x).cpu().detach().numpy()
        x = x.reshape(len(embeddings), -1)
        return x
    return _inner

In [104]:
# data = torch.tensor(train_data).reshape(-1, 768).to(DEVICE)
# train_loader = torch.utils.data.DataLoader(data.to(torch.float32), batch_size=32, shuffle=True)

batch_size = 32

hinge_margin = torch.tensor(0)
gamma = 1
def var_loss(x):
    var = torch.sqrt(x.var(0)+1e-6)
    loss = torch.maximum(hinge_margin, gamma-var).mean()
    return loss

def train_autoenc(model, dim, criterion, epochs=1, params={}):
    optim = torch.optim.Adam(model.parameters(), 3e-4)
    
    for epoch in range(1, epochs+1):
        with tqdm.notebook.tqdm(train_loader) as t:
            total_loss = []
            
            for step, batch in enumerate(t):
                x = batch.reshape(-1, dim)
                batch_loss = []
                for idx in range(0, len(x), batch_size):
                    minibatch = x[idx:idx+batch_size]

                    optim.zero_grad()
                    
                    encoded = model.encoder(minibatch)
                    encoded = F.normalize(encoded)
                    y_hat = model.decoder(encoded)
                    y_hat = F.normalize(y_hat)
                    
                    # y_hat = model(minibatch)
                    loss = criterion(minibatch, y_hat)
                    loss += var_loss(encoded)
                    
                    loss.backward()
                    optim.step()

In [105]:
metrics = []

In [113]:
dim=32
output_dim = 8
lr = 3e-4

# for input_dim in tqdm.notebook.tqdm([8, 12, 16, 32, 64, 96, 128, 192, 384, 768]):
#     for output_dim in tqdm.notebook.tqdm([2, 4, 6, 8, 12, 16, 32, 64, 96, 128]):
for (input_dim, output_dim) in [[768, 2],
                                [768, 4],
                                [768, 6],
                                [768, 8],
                                [768, 12],
                                [768, 16],
                                [128, 4],
                                [768, 32],
                                [128, 8],
                                [768, 64],
                                [768, 96],
                                [96, 16],
                                [128, 32],
                                [96, 32],
                                [32, 16]]:
    if input_dim <= output_dim:
        continue
    if input_dim%output_dim != 0:
        continue

    params = {"Input dim": input_dim, "Output dim": output_dim, "LR": lr}

    model = Autoencoder(input_dim, output_dim).to(DEVICE)

    criterion_ = nn.CosineSimilarity()
    criterion = lambda x, y: -criterion_(x, y).mean()

    train_autoenc(model, input_dim, criterion, epochs=4, params=params)
    metrics.append(calculate_metrics(test_autoenc(model, input_dim), test_data, f"Autoencoder ({input_dim}x{output_dim})", False))
    logger.info(metrics[-1])

  0%|          | 0/6561 [00:00<?, ?it/s]

  0%|          | 0/6561 [00:00<?, ?it/s]

  0%|          | 0/6561 [00:00<?, ?it/s]

  0%|          | 0/6561 [00:00<?, ?it/s]

  0%|          | 0/6561 [00:00<?, ?it/s]

  0%|          | 0/6561 [00:00<?, ?it/s]

  0%|          | 0/6561 [00:00<?, ?it/s]

  0%|          | 0/6561 [00:00<?, ?it/s]

  0%|          | 0/6561 [00:00<?, ?it/s]

  0%|          | 0/6561 [00:00<?, ?it/s]

  0%|          | 0/6561 [00:00<?, ?it/s]

  0%|          | 0/6561 [00:00<?, ?it/s]

  0%|          | 0/6561 [00:00<?, ?it/s]

  0%|          | 0/6561 [00:00<?, ?it/s]

  0%|          | 0/6561 [00:00<?, ?it/s]

  0%|          | 0/6561 [00:00<?, ?it/s]

  0%|          | 0/6561 [00:00<?, ?it/s]

  0%|          | 0/6561 [00:00<?, ?it/s]

  0%|          | 0/6561 [00:00<?, ?it/s]

  0%|          | 0/6561 [00:00<?, ?it/s]

  0%|          | 0/6561 [00:00<?, ?it/s]

  0%|          | 0/6561 [00:00<?, ?it/s]

  0%|          | 0/6561 [00:00<?, ?it/s]

  0%|          | 0/6561 [00:00<?, ?it/s]

  0%|          | 0/6561 [00:00<?, ?it/s]

  0%|          | 0/6561 [00:00<?, ?it/s]

  0%|          | 0/6561 [00:00<?, ?it/s]

  0%|          | 0/6561 [00:00<?, ?it/s]

  0%|          | 0/6561 [00:00<?, ?it/s]

  0%|          | 0/6561 [00:00<?, ?it/s]

  0%|          | 0/6561 [00:00<?, ?it/s]

  0%|          | 0/6561 [00:00<?, ?it/s]

  0%|          | 0/6561 [00:00<?, ?it/s]

  0%|          | 0/6561 [00:00<?, ?it/s]

  0%|          | 0/6561 [00:00<?, ?it/s]

  0%|          | 0/6561 [00:00<?, ?it/s]

  0%|          | 0/6561 [00:00<?, ?it/s]

  0%|          | 0/6561 [00:00<?, ?it/s]

  0%|          | 0/6561 [00:00<?, ?it/s]

  0%|          | 0/6561 [00:00<?, ?it/s]

  0%|          | 0/6561 [00:00<?, ?it/s]

  0%|          | 0/6561 [00:00<?, ?it/s]

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



  0%|          | 0/6561 [00:00<?, ?it/s]

  0%|          | 0/6561 [00:00<?, ?it/s]

  0%|          | 0/6561 [00:00<?, ?it/s]

  0%|          | 0/6561 [00:00<?, ?it/s]

  0%|          | 0/6561 [00:00<?, ?it/s]

  0%|          | 0/6561 [00:00<?, ?it/s]

  0%|          | 0/6561 [00:00<?, ?it/s]

  0%|          | 0/6561 [00:00<?, ?it/s]

IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)



  0%|          | 0/6561 [00:00<?, ?it/s]

  0%|          | 0/6561 [00:00<?, ?it/s]

  0%|          | 0/6561 [00:00<?, ?it/s]

In [116]:
save_metrics(metrics, "autoencoder.csv")
aggregate_metrics()

Unnamed: 0.1,Unnamed: 0,ratio,mean_pos_sim,mean_diff,method,reduction_rate
0,0,0.010569,-0.096723,-0.011736,Whitening (768x1),768.0
0,0,0.097456,-0.329110,-0.032274,PCA (768x1),768.0
0,0,0.506585,0.087603,0.005736,Autoencoder (768x2),384.0
1,0,0.517421,0.261078,0.013379,Whitening (768x2),384.0
1,0,0.523122,0.950220,0.005821,PCA (384x1),384.0
...,...,...,...,...,...,...
14,0,0.705998,0.482862,0.072484,Siamese network (32x16),2.0
12,0,0.709599,0.460446,0.078406,Average parts (2),2.0
15,0,0.709766,0.466430,0.078096,Whitening (64x32),2.0
15,0,0.710599,0.469082,0.076467,PCA (16x8),2.0


In [115]:
metrics

[      ratio  mean_pos_sim  mean_diff               method  reduction_rate
 0  0.497116      0.106156   0.001288  Autoencoder (768x2)           384.0,
       ratio  mean_pos_sim  mean_diff               method  reduction_rate
 0  0.533258      0.459347   0.029397  Autoencoder (768x4)           192.0,
       ratio  mean_pos_sim  mean_diff               method  reduction_rate
 0  0.506585      0.087603   0.005736  Autoencoder (768x2)           384.0,
       ratio  mean_pos_sim  mean_diff               method  reduction_rate
 0  0.534225      0.458236   0.027082  Autoencoder (768x4)           192.0,
       ratio  mean_pos_sim  mean_diff               method  reduction_rate
 0  0.555796      0.553255   0.037143  Autoencoder (768x6)           128.0,
     ratio  mean_pos_sim  mean_diff               method  reduction_rate
 0  0.5728      0.495247   0.043684  Autoencoder (768x8)            96.0,
       ratio  mean_pos_sim  mean_diff                method  reduction_rate
 0  0.594272      0.51

In [389]:
cmetrics = pd.concat(metrics)

max_ratio = cmetrics.groupby("reduction_rate").max()["ratio"]
cmetrics = cmetrics[cmetrics.apply(lambda x: x["ratio"]==max_ratio[x["reduction_rate"]], axis=1)]
cmetrics = cmetrics.sort_values("reduction_rate", ascending=False)
# cmetrics.to_csv("results/autoencoder.csv")

cmetrics

ValueError: No objects to concatenate

### PCA

In [43]:
metrics = []

In [44]:
from sklearn.decomposition import PCA

In [45]:
def test_pca(model, input_dim):
    def _inner(embeddings):
        x = embeddings.reshape(-1, input_dim)
        x = model.transform(x)
        x = x.reshape(len(embeddings), -1)
        x = x/np.sqrt(np.sum(x**2))
        return x
    return _inner

def get_pca(input_dim, output_dim):
    model = PCA(output_dim)
    # data = train_data[0].reshape(-1, input_dim)
    data = np.concatenate(train_data).reshape(-1, input_dim)
    model.fit(data)
    return model

In [358]:
input_dim = 64
output_dim = 8

model = get_pca(input_dim, output_dim)
calculate_metrics(test_pca(model, input_dim), test_data, f"PCA ({input_dim}x{output_dim})")

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 29993/29993 [00:10<00:00, 2855.68it/s]


Unnamed: 0,ratio,mean_pos_sim,mean_diff,method,reduction_rate
0,0.947754,0.738982,0.224128,PCA (64x8),8.0


In [46]:
for input_dim in tqdm.notebook.tqdm([8, 12, 16, 32, 64, 96, 128, 192, 384, 768]):
    for output_dim in tqdm.notebook.tqdm([1, 2, 4, 6, 8, 12, 16, 32, 64, 96, 128]):
        if input_dim <= output_dim:
            continue
        if input_dim%output_dim != 0:
            continue
            
        model = get_pca(input_dim, output_dim)
        metrics.append(calculate_metrics(test_pca(model, input_dim), test_data, f"PCA ({input_dim}x{output_dim})", show_pbar=False))

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/11 [00:00<?, ?it/s]

  0%|          | 0/11 [00:00<?, ?it/s]

  0%|          | 0/11 [00:00<?, ?it/s]

  0%|          | 0/11 [00:00<?, ?it/s]

  0%|          | 0/11 [00:00<?, ?it/s]

  0%|          | 0/11 [00:00<?, ?it/s]

  0%|          | 0/11 [00:00<?, ?it/s]

  0%|          | 0/11 [00:00<?, ?it/s]

  0%|          | 0/11 [00:00<?, ?it/s]

  0%|          | 0/11 [00:00<?, ?it/s]

In [47]:
save_metrics(metrics, "PCA")
aggregate_metrics()

Unnamed: 0.1,Unnamed: 0,ratio,mean_pos_sim,mean_diff,method,reduction_rate
0,0,0.097456,-0.32911,-0.032274,PCA (768x1),768.0
1,0,0.523122,0.95022,0.005821,PCA (384x1),384.0
0,0,0.527723,0.759369,0.031587,Average parts (384),384.0
1,0,0.548961,0.558271,0.055304,Average parts (192),192.0
2,0,0.557663,0.842964,0.019809,PCA (192x1),192.0
2,0,0.564165,0.462989,0.063265,Average parts (128),128.0
3,0,0.566132,0.711833,0.036871,PCA (128x1),128.0
3,0,0.579102,0.451062,0.070158,Average parts (96),96.0
4,0,0.581969,0.692384,0.041328,PCA (96x1),96.0
4,0,0.599573,0.460109,0.075445,Average parts (64),64.0


#### Save the models

In [371]:
import pickle

In [370]:
model = get_pca(64, 4)

In [372]:
with open("models/PCA_64x4.pickle", "wb") as f:
    pickle.dump(model, f)

### UMAP

In [19]:
metrics = []

In [20]:
from cuml.manifold import UMAP

In [None]:
def get_umap(input_dim, output_dim):
    model = UMAP(n_components=output_dim)
    data = train_data[0][:5000].reshape(-1, input_dim)
    model.fit(data)
    return model

In [368]:
model = get_umap(128, 16)

In [None]:
for input_dim in tqdm.notebook.tqdm([8, 12, 16, 32, 64, 96, 128, 192, 384, 768]):
    for output_dim in tqdm.notebook.tqdm([1, 2, 4, 6, 8, 12, 16, 32, 64, 96, 128]):
        if input_dim <= output_dim:
            continue
        if input_dim%output_dim != 0:
            continue
            
        model = get_umap(input_dim, output_dim)
        metrics.append(calculate_metrics(test_pca(model, input_dim), test_data, f"UMAP ({input_dim}x{output_dim})", show_pbar=False))

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/11 [00:00<?, ?it/s]

  0%|          | 0/11 [00:00<?, ?it/s]

  0%|          | 0/11 [00:00<?, ?it/s]

  0%|          | 0/11 [00:00<?, ?it/s]

  0%|          | 0/11 [00:00<?, ?it/s]

  0%|          | 0/11 [00:00<?, ?it/s]

  0%|          | 0/11 [00:00<?, ?it/s]

  0%|          | 0/11 [00:00<?, ?it/s]

In [111]:
save_metrics(metrics, "UMAP")
aggregate_metrics()

Unnamed: 0.2,Unnamed: 0,ratio,mean_pos_sim,mean_diff,method,reduction_rate,Unnamed: 0.1
0,0,0.000000,1.000000,0.000000,Autoencoder (768x1),768.000000,0.0
0,0,0.141166,0.730871,0.135898,Whitening (768x1),768.000000,
0,0,0.143567,0.732938,0.137099,PCA (768x1),768.000000,
0,0,0.163105,0.399593,0.023772,UMAP (768x1),768.000000,
1,0,0.561331,0.662625,0.068469,UMAP (768x2),384.000000,
...,...,...,...,...,...,...,...
19,0,0.965759,0.732612,0.226916,Whitening (64x32),2.000000,
15,0,0.965992,0.735349,0.226656,PCA (64x32),2.000000,
20,0,0.967392,0.734172,0.227783,Whitening (96x64),1.500000,
21,0,0.967126,0.733070,0.228542,Whitening (128x96),1.333333,


### LocallyLinearEmbedding

In [118]:
from sklearn.manifold import LocallyLinearEmbedding

In [119]:
batch_size = 512
def test_lle(model, input_dim):
    def _inner(embeddings):
        res = []
        x = embeddings.reshape(-1, input_dim)
        
        for i in range(0, len(x), batch_size):
            res.append(model.transform(x[i:i+batch_size]))

        x = np.concatenate(res)
        x = x.reshape(len(embeddings), -1)
        return x
    return _inner

def get_lle(input_dim, output_dim):
    model = LocallyLinearEmbedding(n_components=output_dim, n_jobs=20)
    data = train_data[0].reshape(-1, input_dim)[:5000]
    model.fit(data)
    return model

In [121]:
metrics = []

In [124]:
# for input_dim in tqdm.notebook.tqdm([8, 16, 32, 64, 128, 768][::-1]):
#     for output_dim in tqdm.notebook.tqdm([2, 4, 8, 16, 32, 64, 128]):

# if input_dim <= output_dim:
#     continue

input_dim = 768
output_dim = 6

model = get_lle(input_dim, output_dim)
metrics.append(calculate_metrics(test_lle(model, input_dim), test_data, f"LocallyLinearEmbedding ({input_dim}x{output_dim})", show_pbar=False))

In [125]:
metrics

[     ratio  mean_pos_sim  mean_diff                          method  \
 0  0.54256      0.478783   0.050243  LocallyLinearEmbedding (768x4)   
 
    reduction_rate  
 0           192.0  ,
       ratio  mean_pos_sim  mean_diff                          method  \
 0  0.546561      0.472823   0.053497  LocallyLinearEmbedding (768x6)   
 
    reduction_rate  
 0           128.0  ]

In [66]:
cmetrics = pd.concat(metrics)

max_ratio = cmetrics.groupby("reduction_rate").max()["ratio"]
cmetrics = cmetrics[cmetrics.apply(lambda x: x["ratio"]==max_ratio[x["reduction_rate"]], axis=1)]
cmetrics = cmetrics.sort_values("reduction_rate", ascending=False)
# cmetrics.to_csv("results/LocallyLinear.csv")

cmetrics

Unnamed: 0,ratio,mean_pos_sim,mean_diff,method,reduction_rate
0,0.624612,0.796312,0.109099,LocallyLinearEmbedding (768x2),384.0
0,0.65802,0.819372,0.113057,LocallyLinearEmbedding (768x4),192.0
0,0.67309,0.826621,0.110064,LocallyLinearEmbedding (768x8),96.0
0,0.622545,0.653948,0.111262,LocallyLinearEmbedding (128x2),64.0
0,0.688461,0.812073,0.126374,LocallyLinearEmbedding (768x16),48.0
0,0.737639,0.451451,0.179626,LocallyLinearEmbedding (64x2),32.0
0,0.707932,0.776378,0.152379,LocallyLinearEmbedding (768x32),24.0
0,0.803888,0.439338,0.183319,LocallyLinearEmbedding (32x2),16.0
0,0.720935,0.741271,0.17213,LocallyLinearEmbedding (768x64),12.0
0,0.852099,0.424947,0.181779,LocallyLinearEmbedding (32x4),8.0


### Whitening Sentence Representations 

In [5]:
def compute_kernel_bias(input_dim):
    """
    y = (x + bias).dot(kernel)
    """
    vecs = train_data[0].reshape(-1, input_dim)
    # vecs = np.concatenate(vecs, axis=0)
    mu = vecs.mean(axis=0, keepdims=True)
    cov = np.cov(vecs.T)
    u, s, vh = np.linalg.svd(cov)
    W = np.dot(u, np.diag(1/np.sqrt(s)))
    return W, -mu

def transform_and_normalize(vecs, kernel, bias):
    if not (kernel is None or bias is None):
        vecs = (vecs + bias).dot(kernel)
    normalized=vecs / (vecs**2).sum(axis=1, keepdims=True)**0.5
    return normalized

def test_whitening(kernel, bias,input_dim):
    def _inner(embeddings):
        x = embeddings.reshape(-1, input_dim)
        x = transform_and_normalize(x,kernel=kernel,bias=bias)
        x = x.reshape(len(embeddings), -1)
        return x
    return _inner

In [6]:
metrics = []

In [7]:
# for input_dim in tqdm.notebook.tqdm([8, 12, 16, 32, 64, 96, 128, 192, 384, 768]):
#     for output_dim in tqdm.notebook.tqdm([1, 2, 4, 6, 8, 12, 16, 32, 64, 96, 128]):
for (input_dim, output_dim) in tqdm.notebook.tqdm([[768, 1],
                                [768, 2],
                                [768, 4],
                                [768, 6],
                                [768, 8],
                                [768, 12],
                                [768, 16],
                                [384, 12],
                                [768, 32],
                                [128, 8],
                                [768, 64],
                                [768, 96],
                                [768, 128],
                                [384, 96],
                                [384, 128],
                                [64, 32]]):

        if input_dim <= output_dim:
            continue
        if input_dim%output_dim != 0:
            continueoutput_dim
        kernel, bias = compute_kernel_bias(input_dim)
        kernel = kernel[:, :output_dim]
        metrics.append(calculate_metrics(test_whitening(kernel,bias,input_dim), test_data, f"Whitening ({input_dim}x{output_dim})", False))

 93%|████████████████████████████████████████████████████████████████████████████████████████████████▉       | 27966/29993 [00:09<00:00, 3097.27it/s]


KeyboardInterrupt: 

In [101]:
metrics

[      ratio  mean_pos_sim  mean_diff             method  reduction_rate
 0  0.010569     -0.096723  -0.011736  Whitening (768x1)           768.0,
       ratio  mean_pos_sim  mean_diff             method  reduction_rate
 0  0.517421      0.261078   0.013379  Whitening (768x2)           384.0,
       ratio  mean_pos_sim  mean_diff             method  reduction_rate
 0  0.547961      0.451754   0.035201  Whitening (768x4)           192.0,
       ratio  mean_pos_sim  mean_diff             method  reduction_rate
 0  0.580869      0.501798   0.054193  Whitening (768x6)           128.0,
       ratio  mean_pos_sim  mean_diff             method  reduction_rate
 0  0.600874      0.513576   0.066501  Whitening (768x8)            96.0,
       ratio  mean_pos_sim  mean_diff              method  reduction_rate
 0  0.631981      0.532373    0.07835  Whitening (768x12)            64.0,
       ratio  mean_pos_sim  mean_diff              method  reduction_rate
 0  0.654653      0.528519   0.085065  Whi

In [102]:
save_metrics(metrics, "whitening")
aggregate_metrics()

Unnamed: 0.1,Unnamed: 0,ratio,mean_pos_sim,mean_diff,method,reduction_rate
0,0,0.010569,-0.096723,-0.011736,Whitening (768x1),768.0
0,0,0.097456,-0.329110,-0.032274,PCA (768x1),768.0
1,0,0.517421,0.261078,0.013379,Whitening (768x2),384.0
1,0,0.523122,0.950220,0.005821,PCA (384x1),384.0
0,0,0.527723,0.759369,0.031587,Average parts (384),384.0
...,...,...,...,...,...,...
14,0,0.705998,0.482862,0.072484,Siamese network (32x16),2.0
12,0,0.709599,0.460446,0.078406,Average parts (2),2.0
15,0,0.709766,0.466430,0.078096,Whitening (64x32),2.0
15,0,0.710599,0.469082,0.076467,PCA (16x8),2.0


### Siamese network

In [75]:
data = torch.tensor(train_data).permute(1,0,2).to(DEVICE)
triplet_loader = torch.utils.data.DataLoader(data.to(torch.float32), batch_size=32, shuffle=True)


class EmbeddingHead(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super().__init__()
        self.encoder = nn.Sequential(
                    nn.Linear(input_dim, output_dim),
            )
    
    def forward(self, x):
        x = self.encoder(x)
        x = F.normalize(x)
        return x

def get_embedding_model(input_dim, hidden_dim, output_dim):
    encoder = EmbeddingHead(input_dim, hidden_dim, output_dim).to(DEVICE)
    return encoder

def test_embedding(model, dim):
    def _inner(embeddings):
        x = embeddings.reshape(-1, dim)
        x = torch.from_numpy(x).to(torch.float32)
        x = model(x).cpu().detach().numpy()
        x = x.reshape(len(embeddings), -1)
        return x
    return _inner

In [76]:
batch_size = 32

def train_siamese(model, dim, epochs=1, params=None):
    optim = torch.optim.Adam(model.parameters(), params["LR"])
    
    # experiment = Experiment(
    # api_key="sQnInDpH3eOHuFvstNS3jG5jb",
    # project_name="vector-compression",
    # workspace="wwydmanski",
    # display_summary_level=0,
    # auto_metric_logging=False
    # )
    # experiment.add_tag("biobert-siamese-network")
    # experiment.log_parameters(params)
    
    for epoch in tqdm.notebook.trange(1, epochs+1):
        total_loss = []
        model.train()
        for step, sample in enumerate(triplet_loader):
            sample = sample.reshape(-1, 3, dim)
            anchor, pos, neg = sample[:, 0], sample[:, 1], sample[:, 2]

            batch_loss = []
            for idx in range(0, len(anchor), batch_size):
                anchor_ = anchor[idx:idx+batch_size]
                pos_ = pos[idx:idx+batch_size]
                neg_ = neg[idx:idx+batch_size]

                optim.zero_grad()

                anchor_emb = model(anchor_)
                pos_emb = model(pos_)
                neg_emb = model(neg_)

                # loss = (1-(anchor_emb*pos_emb).sum(axis=1))**2 
                # loss += (anchor_emb*neg_emb).sum(axis=1)**2 
                # loss += (3-torch.norm(anchor_emb, dim=1)-torch.norm(pos_emb, dim=1)-torch.norm(neg_emb, dim=1))**2
                # loss = loss.mean()
                loss = F.triplet_margin_with_distance_loss(anchor_emb, pos_emb, neg_emb, 
                                                           distance_function=lambda a,b: 1-F.cosine_similarity(a, b), margin=0.1)
                loss += var_loss(anchor_emb)

                loss.backward()
                optim.step()

                batch_loss.append(loss.item())

            # experiment.log_metric("batch_loss_", np.mean(batch_loss), step=step, epoch=epoch)

            # experiment.log_metric("loss_", np.mean(total_loss), step=step, epoch=epoch)
        
#         model.eval()
#         res = calculate_metrics(test_embedding(model, dim), train_data, f"Siamese network", False)
#         logger.info(res)
#         experiment.log_metric("Train accuracy", res["ratio"].values[0], step=epoch*step)
#         experiment.log_metric("Train mean positive similarity", res["mean_pos_sim"].values[0], step=epoch*step)
#         experiment.log_metric("Train mean difference", res["mean_diff"].values[0], step=epoch*step)
        
#         res = calculate_metrics(test_embedding(model, dim), test_data, f"Siamese network", False)
#         logger.info(res)
#         experiment.log_metric("Test accuracy", res["ratio"].values[0], step=epoch*step)
#         experiment.log_metric("Test mean positive similarity", res["mean_pos_sim"].values[0], step=epoch*step)
#         experiment.log_metric("Test mean difference", res["mean_diff"].values[0], step=epoch*step)
        
#     experiment.log_parameter("Reduction rate", res["reduction_rate"].values[0])
#     experiment.end()

In [77]:
metrics = []

In [55]:
input_dim = 64
output_dim = 8
lr = 3e-4
hidden_dim = 64

params = {"Input dim": input_dim, "Output dim": output_dim, "LR": lr, "Hidden dim": hidden_dim}
model = get_embedding_model(input_dim, hidden_dim, output_dim)

train_siamese(model, input_dim, epochs=10, params=params)
metrics.append(calculate_metrics(test_embedding(model, input_dim), test_data, f"Siamese network ({input_dim}x{output_dim})", False))
logger.info(metrics[-1])
metrics[-1]

COMET INFO: Experiment is live on comet.ml https://www.comet.ml/wwydmanski/vector-compression/ab34483b271644be99412d584415d40c



  0%|          | 0/10 [00:00<?, ?it/s]

COMET ERROR: Error sending a notification, make sure you have opted-in for notifications
COMET INFO: Uploading metrics, params, and assets to Comet before program termination (may take several seconds)
COMET INFO: The Python SDK has 3600 seconds to finish before aborting...


Unnamed: 0,ratio,mean_pos_sim,mean_diff,method,reduction_rate
0,0.679759,0.482061,0.069513,Siamese network (64x8),8.0


In [78]:
dim=32
output_dim = 8
lr = 3e-4
input_dim = 768

# for input_dim in tqdm.notebook.tqdm([16, 32, 64, 96, 128, 192, 384, 768]):
#     for output_dim in tqdm.notebook.tqdm([2, 4, 6, 8, 12, 16, 32, 64, 96, 128]):
#         if input_dim <= output_dim:
#             continue
#         if input_dim%output_dim != 0:
#             continue
for shape in [[768, 2],
                [768, 4],
                [768, 6],
                [768, 8],
                [768, 12],
                [768, 16],
                [128, 4],
                [768, 32],
                [128, 8],
                [768, 64],
                [768, 96],
                [96, 16],
                [128, 32],
                [96, 32],
                [32, 16]]:
        input_dim = shape[0]
        output_dim = shape[1]
        params = {"Input dim": input_dim, "Output dim": output_dim, "LR": lr}
        logger.info(params)

        model = get_embedding_model(input_dim, hidden_dim, output_dim)

        train_siamese(model, input_dim, epochs=10, params=params)
        metrics.append(calculate_metrics(test_embedding(model, input_dim), test_data, f"Siamese network ({input_dim}x{output_dim})", False))        
        logger.info(metrics[-1])
        
        torch.save(model.state_dict(), f"models/siamese_{shape[0]}x{shape[1]}.pt")

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

In [70]:
metrics[-4:]

[      ratio  mean_pos_sim  mean_diff                   method  reduction_rate
 0  0.682059      0.477976   0.067408  Siamese network (96x16)             6.0,
       ratio  mean_pos_sim  mean_diff                    method  reduction_rate
 0  0.670556      0.434534   0.066193  Siamese network (128x32)             4.0,
       ratio  mean_pos_sim  mean_diff                   method  reduction_rate
 0  0.701197      0.470158    0.07258  Siamese network (96x32)             3.0,
       ratio  mean_pos_sim  mean_diff                   method  reduction_rate
 0  0.705798      0.481885   0.072594  Siamese network (32x16)             2.0]

In [66]:
save_metrics(metrics, "siamese")
aggregate_metrics()

Unnamed: 0.1,Unnamed: 0,ratio,mean_pos_sim,mean_diff,method,reduction_rate
0,0,0.010569,-0.096723,-0.011736,Whitening (768x1),768.0
0,0,0.097456,-0.329110,-0.032274,PCA (768x1),768.0
1,0,0.517421,0.261078,0.013379,Whitening (768x2),384.0
1,0,0.523122,0.950220,0.005821,PCA (384x1),384.0
0,0,0.527723,0.759369,0.031587,Average parts (384),384.0
...,...,...,...,...,...,...
14,0,0.705998,0.482862,0.072484,Siamese network (32x16),2.0
12,0,0.709599,0.460446,0.078406,Average parts (2),2.0
15,0,0.710599,0.469082,0.076467,PCA (16x8),2.0
15,0,0.718668,0.459841,0.085242,Whitening (192x96),2.0


In [303]:
cmetrics = pd.concat(metrics)
cmetrics[cmetrics["reduction_rate"]==2]

Unnamed: 0,ratio,mean_pos_sim,mean_diff,method,reduction_rate
0,0.957357,0.830561,0.114643,Siamese network (32x16) [hidden: 64],2.0
0,0.955323,0.798326,0.150121,Siamese network (64x32) [hidden: 64],2.0
0,0.946054,0.811501,0.148327,Siamese network (128x64) [hidden: 64],2.0
0,0.955956,0.81263,0.120257,Siamese network (32x16) [hidden: 128],2.0
0,0.957357,0.781193,0.154783,Siamese network (64x32) [hidden: 128],2.0
0,0.951122,0.817491,0.142804,Siamese network (128x64) [hidden: 128],2.0
0,0.925616,0.692671,0.265935,Siamese network (768x384) [hidden: 64],2.0
0,0.894775,0.671352,0.280919,Siamese network (768x384) [hidden: 64],2.0


In [352]:
save_metrics(metrics, "siamese")
aggregate_metrics()

Unnamed: 0.1,Unnamed: 0,ratio,mean_pos_sim,mean_diff,method,reduction_rate
0,0,0.550879,0.668035,0.040766,UMAP (768x2),384.0
0,0,0.552829,0.385381,0.106327,PCA (768x2),384.0
0,0,0.584403,0.562329,0.183465,Average parts (384),384.0
0,0,0.610883,0.810334,0.124641,Whitening (768x2),384.0
0,0,0.625553,0.839529,0.093303,LocallyLinearEmbedding (768x2),384.0
...,...,...,...,...,...,...
12,0,0.934006,0.676887,0.184324,UMAP (32x16),2.0
4,0,0.958501,0.987163,0.010953,Autoencoder,2.0
13,0,0.960758,0.750421,0.205391,Siamese network (96x48),2.0
12,0,0.964714,0.738850,0.224645,Whitening (128x64),2.0


## Similarity embedding framework

In [5]:
import sef_dr

# SEF PCA

In [6]:
def test_sef(model, input_dim):
    def _inner(embeddings):
        x = embeddings.reshape(-1, input_dim)
        x = model.transform(x)
        x = x.reshape(len(embeddings), -1)
        return x
    return _inner

def get_pca(input_dim, output_dim):
    model = PCA(output_dim)
    data = train_data[0].reshape(-1, input_dim)
    model.fit(data)
    return model

In [None]:
metrics=[]
for input_dim in [64, 128, 768]:
    for target_dim in [32, 64, 128]:
        for output_dim in [32, 64, 128]:
            if input_dim <= output_dim or output_dim >= target_dim or target_dim>=input_dim:
                continue
            print(f"input {input_dim} output {output_dim} target {target_dim}")
            
            data = torch.tensor(train_data[0]).reshape(-1, input_dim)
            model = get_pca(input_dim, target_dim)
            model.fit(data)
            target_data = np.float32(model.transform(data))
            print("model built")
            proj = sef_dr.LinearSEF(input_dimensionality=input_dim, output_dimensionality=output_dim)
            print("-")
            loss = proj.fit(data=data, target_data=target_data, target='copy', epochs=50, batch_size=128, verbose=True, learning_rate=0.001, regularizer_weight=0.001)
            print("trianed")
            metrics.append(calculate_metrics(test_sef(proj,input_dim), test_data, f"SEF PCA ({input_dim}x{output_dim}|{target_dim})"))

input 128 output 32 target 64
model built
-


In [None]:
cmetrics = pd.concat(metrics)

max_ratio = cmetrics.groupby("reduction_rate").max()["ratio"]
cmetrics = cmetrics[cmetrics.apply(lambda x: x["ratio"]==max_ratio[x["reduction_rate"]], axis=1)]
cmetrics = cmetrics.sort_values("reduction_rate", ascending=False)
cmetrics.to_csv("results/SEF_PCA.csv")
cmetrics

# SEF Whitening


In [None]:
metrics=[]
for input_dim in [64, 128, 768]:
    for target_dim in [32, 64, 128]:
        for output_dim in [32, 64, 128]:
            if input_dim <= output_dim or output_dim >= target_dim or target_dim>=input_dim:
                continue
            print(f"input {input_dim} output {output_dim} target {target_dim}")
            data=train_data[0].reshape(-1, input_dim)
            # data = torch.tensor(train_data[0]).reshape(-1, input_dim)
            kernel, bias = compute_kernel_bias(input_dim)
            kernel = kernel[:, :target_dim]
            target_data = transform_and_normalize(data,kernel=kernel,bias=bias)
            
            print("model built")
            proj = sef_dr.LinearSEF(input_dimensionality=input_dim, output_dimensionality=output_dim)
            loss = proj.fit(data=data, target_data=target_data, target='copy', epochs=50, batch_size=128, verbose=True, learning_rate=0.001, regularizer_weight=0.001)
            print("trianed")
            metrics.append(calculate_metrics(test_sef(proj,input_dim), test_data, f"SEF Whitening({input_dim}x{output_dim}|{target_dim})"))

In [None]:
cmetrics = pd.concat(metrics)

max_ratio = cmetrics.groupby("reduction_rate").max()["ratio"]
cmetrics = cmetrics[cmetrics.apply(lambda x: x["ratio"]==max_ratio[x["reduction_rate"]], axis=1)]
cmetrics = cmetrics.sort_values("reduction_rate", ascending=False)
cmetrics.to_csv("results/SEF_Whitening.csv")


In [None]:
cmetrics

# SEF Averaging

In [None]:
metrics=[]
for input_dim in [64, 128, 768]:
    for target_rate in [4, 8, 16, 32, 64, 128]:
        for output_dim in [32, 64, 128]:
            target_dim=input_dim/target_rate
            if input_dim <= output_dim or output_dim >= target_dim or target_dim>=input_dim:
                continue
            print(f"input {input_dim} output {output_dim} target {target_rate}")
            target_data = train_data[0].reshape(-1, target_rate)
            target_data.mean(axis=1).reshape(-1, 1)
            target_data = data.reshape(len(train_data[0]), -1)
            print(target_data.shape)
            print("model built")
            data=train_data[0].reshape(-1, input_dim)
            proj = sef_dr.LinearSEF(input_dimensionality=input_dim, output_dimensionality=output_dim)
            loss = proj.fit(data=data, target_data=target_data, target='copy', epochs=50, batch_size=128, verbose=True, learning_rate=0.001, regularizer_weight=0.001)
            print("trianed")
            metrics.append(calculate_metrics(test_sef(proj,input_dim), test_data, f"SEF Averaging({input_dim}x{output_dim}|{target_rate})"))

In [None]:
cmetrics = pd.concat(metrics)

max_ratio = cmetrics.groupby("reduction_rate").max()["ratio"]
cmetrics = cmetrics[cmetrics.apply(lambda x: x["ratio"]==max_ratio[x["reduction_rate"]], axis=1)]
cmetrics = cmetrics.sort_values("reduction_rate", ascending=False)
cmetrics.to_csv("results/SEF_Averaging.csv")
cmetrics