In [27]:
import pickle
import torch
from sklearn.decomposition import PCA
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
import tqdm
import numpy as np
from loguru import logger
import json
import glob

In [3]:
import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): this also hides deprecation and numerical warnings raised by
# the libraries used below.
warnings.filterwarnings("ignore")

In [4]:
from torch import nn
import torch.nn.functional as F

In [6]:
# Drop the handlers loguru currently has registered, then send log records to
# a file instead (presumably to keep log output out of the notebook).
logger.remove()
# As the last expression of the cell, the value returned by logger.add() is
# what the notebook displays as the cell output.
logger.add("log2.txt")

2

In [1]:
# Device identifier string. Not referenced by the cells in this part of the
# notebook; presumably consumed by torch code elsewhere (TODO confirm).
DEVICE = "cuda:0"

## Load models for vector generation

## Load data

### Read vectors

In [7]:
# Precomputed embedding matrix, loaded from disk; later cells index it by row.
embeddings = np.load("data/vectors.npy")

In [8]:
# Rows of `embeddings` are consumed in consecutive groups of three
# (rows 3*i, 3*i+1, 3*i+2), so the split is done on group indices: every group
# ends up entirely in train or entirely in test.
# NOTE(review): the floor division silently drops trailing rows if the row
# count is not a multiple of three.
indices = np.arange(embeddings.shape[0]//3)

# Fixed random_state keeps the 70/30 split reproducible across runs.
train_idx, test_idx = train_test_split(indices, test_size=0.3, random_state=42)

# Each list holds three row-aligned arrays: element k contains row 3*i + k of
# every selected group i.
# NOTE(review): presumably [anchor, positive, negative], matching how
# calculate_metrics reads data[0], data[1], data[2] — confirm against the step
# that produced data/vectors.npy.
train_data = [embeddings[train_idx*3], embeddings[train_idx*3+1], embeddings[train_idx*3+2]]
test_data = [embeddings[test_idx*3], embeddings[test_idx*3+1], embeddings[test_idx*3+2]]

print(train_data[0].shape)
print(test_data[0].shape)

(69982, 768)
(29993, 768)


## Measure cosine similarity differences between positive and negative pairs

In [9]:
def _rowwise_cosine(left, right):
    """Cosine similarity between each row of ``left`` and the same row of ``right``.

    Both arguments are 2-D float arrays of identical shape. A row pair in which
    either vector has zero norm yields 0.0 rather than NaN.
    """
    dots = np.einsum("ij,ij->i", left, right)
    norms = np.linalg.norm(left, axis=1) * np.linalg.norm(right, axis=1)
    # `where` skips the zero-norm rows, which keep the 0.0 they were initialised with.
    return np.divide(dots, norms, out=np.zeros_like(dots), where=norms != 0)


def _as_float_matrix(values):
    """Return ``values`` as a NumPy array, promoting non-float dtypes to float64."""
    matrix = np.asarray(values)
    if not np.issubdtype(matrix.dtype, np.floating):
        matrix = matrix.astype(np.float64)
    return matrix


def calculate_metrics(method, data, name, show_pbar=True):
    """Score a vector transformation on (anchor, positive, negative) triplets.

    ``method`` is applied separately to ``data[0]`` (anchors), ``data[1]``
    (positives) and ``data[2]`` (negatives). For every row the cosine
    similarity anchor/positive and anchor/negative is computed.

    Parameters
    ----------
    method : callable
        Maps a 2-D array of vectors to a 2-D array with the same number of rows.
    data : sequence of three 2-D arrays
        Row-aligned anchors, positives and negatives.
    name : str
        Label stored in the ``method`` column of the result.
    show_pbar : bool, default True
        Show a tqdm progress bar. The bar advances once per processed chunk of
        rows, not once per row.

    Returns
    -------
    pandas.DataFrame
        A single row with the columns ``ratio`` (share of rows where the
        positive is more similar to the anchor than the negative),
        ``mean_pos_sim``, ``mean_diff`` (mean positive similarity minus mean
        negative similarity), ``method`` and ``reduction_rate`` (input width
        divided by transformed width).

    Raises
    ------
    ValueError
        If the transformed arrays are not 2-D, differ in shape, or are empty.
    """
    orig_shape = data[0].shape[1]

    anchors = _as_float_matrix(method(data[0]))
    positives = _as_float_matrix(method(data[1]))
    negatives = _as_float_matrix(method(data[2]))

    if anchors.ndim != 2 or positives.shape != anchors.shape or negatives.shape != anchors.shape:
        raise ValueError(
            "method() must return 2-D arrays of identical shape, got "
            f"{anchors.shape}, {positives.shape} and {negatives.shape}"
        )
    n_rows = anchors.shape[0]
    if n_rows == 0:
        raise ValueError("calculate_metrics needs at least one triplet")

    # Work in row chunks: bounds the temporaries created by the norm
    # computation and gives the progress bar something to count.
    chunk_rows = 8192
    starts = range(0, n_rows, chunk_rows)
    if show_pbar:
        starts = tqdm.tqdm(starts)

    pos_parts = []
    neg_parts = []
    for start in starts:
        rows = slice(start, start + chunk_rows)
        pos_parts.append(_rowwise_cosine(anchors[rows], positives[rows]))
        neg_parts.append(_rowwise_cosine(anchors[rows], negatives[rows]))

    pos_similarity = np.concatenate(pos_parts)
    neg_similarity = np.concatenate(neg_parts)

    ratio = np.sum(pos_similarity > neg_similarity) / n_rows
    mean_pos_sim = pos_similarity.mean()
    mean_diff = mean_pos_sim - neg_similarity.mean()

    reduction_rate = orig_shape / anchors.shape[1]
    return pd.DataFrame([{
        "ratio": ratio,
        "mean_pos_sim": mean_pos_sim,
        "mean_diff": mean_diff,
        "method": name,
        "reduction_rate": reduction_rate,
    }])

## Prepare models for final evaluation

### Load benchmark results

In [14]:
# Benchmark summary table, indexed by method name.
# NOTE(review): the CSV also carries stray "Unnamed: 0" / "Unnamed: 0.1"
# columns (visible in the table displayed further down) — presumably index
# columns written by earlier to_csv calls; confirm and drop them at the source.
bench_results = pd.read_csv("results/total.csv", index_col="method")

In [24]:
# Best `ratio` reached at each reduction rate, as {reduction_rate: max ratio}.
# The column is selected before aggregating: the first positional parameter of
# GroupBy.max() is `numeric_only`, not a list of columns.
top_ratios = bench_results.groupby("reduction_rate")["ratio"].max().to_dict()

In [26]:
# Rows whose ratio equals the best ratio recorded for their reduction rate.
# Vectorised: map every row's reduction rate to its best ratio and compare the
# two columns, instead of a Python-level lookup per row. A rate missing from
# `top_ratios` maps to NaN and is simply not selected.
bench_results[bench_results["ratio"] == bench_results["reduction_rate"].map(top_ratios)]

Unnamed: 0_level_0,Unnamed: 0,Unnamed: 0.1,ratio,mean_pos_sim,mean_diff,reduction_rate
method,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
PCA (768x1),0,0,0.143567,0.732938,0.137099,768.0
Siamese network (768x2) [hidden: 128],0,0,0.664555,0.909573,0.1487,384.0
Siamese network (768x4) [hidden: 64],1,0,0.756276,0.836421,0.212844,192.0
PCA (128x1),3,0,0.743307,0.702645,0.197961,128.0
Siamese network (768x8) [hidden: 128],2,0,0.822292,0.771496,0.251995,96.0
PCA (64x1),5,0,0.823392,0.705868,0.22448,64.0
Siamese network (768x16) [hidden: 128],4,0,0.871403,0.728268,0.269258,48.0
Average parts (32),6,0,0.88424,0.731807,0.232372,32.0
Average parts (24),7,0,0.903611,0.73449,0.231183,24.0
PCA (64x4),9,0,0.923015,0.734515,0.219317,16.0


### Prepare PCA models

In [31]:
from sklearn.decomposition import PCA

In [29]:
# Quick look at the dimensions of the full embedding matrix.
embeddings.shape

(299925, 768)

In [30]:
def get_pca(input_dim, output_dim, data=None, random_state=42):
    """Fit a PCA that reduces ``input_dim``-wide vectors to ``output_dim`` components.

    Every row of ``data`` is cut into consecutive pieces of width ``input_dim``
    and the PCA is fitted on all pieces.

    Parameters
    ----------
    input_dim : int
        Width of the pieces fed to the PCA. Must divide the row width of
        ``data``; otherwise the reshape would build pieces that straddle two
        different rows.
    output_dim : int
        Number of principal components to keep.
    data : array-like, optional
        Vectors to fit on. Defaults to the notebook-level ``embeddings`` array.
    random_state : int or None, default 42
        Seed forwarded to ``PCA`` so that refitting gives the same model when
        a randomized solver is selected.

    Returns
    -------
    sklearn.decomposition.PCA
        The fitted model.

    Raises
    ------
    ValueError
        If ``input_dim`` is not positive or does not divide the row width.
    """
    if data is None:
        # Default: fit on the full embedding matrix, as the notebook does.
        data = embeddings
    data = np.asarray(data)

    row_width = data.shape[-1]
    if input_dim <= 0 or row_width % input_dim != 0:
        raise ValueError(
            f"input_dim={input_dim} must be a positive divisor of the row width {row_width}"
        )

    model = PCA(output_dim, random_state=random_state)
    model.fit(data.reshape(-1, input_dim))
    return model

In [32]:
# Fit one PCA per [input_dim, output_dim] pair and pickle each fitted model to
# models/PCA_<input_dim>x<output_dim>.pickle.
# NOTE(review): the models/ directory must already exist — open() does not
# create it.
for shape in [[64, 32],
              [384, 128],
              [768, 128],
              [384, 96],
              [64, 4]]:
    # Printed so the cell output shows which configuration is being fitted.
    print(shape)
    model = get_pca(shape[0], shape[1])
    with open(f"models/PCA_{shape[0]}x{shape[1]}.pickle", "wb") as f:
        pickle.dump(model, f)
          

[64, 32]
[384, 128]
[768, 128]
[384, 96]
[64, 4]
