In [7]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import pandas as pd
import numpy as np
import torch
from tqdm import tqdm

In [8]:
df = pd.read_csv("data/finegrained/train.csv")

In [9]:
checkpoint = 'coarsegrained_labels_based_retriever/checkpoint-7500'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

In [10]:
sentences = df["text"].tolist()

In [11]:
all_embeddings = []
for sentence in tqdm(sentences, leave = False):
    with torch.no_grad():
        inputs = tokenizer(sentence, return_tensors='pt', truncation=True, padding = 'max_length', max_length = 25)
        outputs = model(**inputs, output_hidden_states = True)
        last_layer_hidden_states = outputs.hidden_states[-1]
        all_embeddings.append(last_layer_hidden_states)
all_embeddings = np.array([embedding.numpy() for embedding in all_embeddings])
embeddings_means = np.mean(all_embeddings, axis=2, keepdims=True).squeeze()

                                                   

In [12]:
from sklearn.neighbors import NearestNeighbors

neigh = NearestNeighbors(n_neighbors=10)
neigh.fit(embeddings_means)
NearestNeighbors(n_neighbors=5)

In [13]:
index = 20
results = neigh.kneighbors(embeddings_means[index:index + 1])
indices = results[1][0]
print('actual sentence: ', sentences[index])
print('------------------------')
print('similar sentences: ')
for i in indices:
    print(sentences[i])
    print('---')

actual sentence:  The student didn't want to participate in the beach cleanup. Later, the student's advisor says "I can't believe she doesn't believe in climate change".
------------------------
similar sentences: 
The student didn't want to participate in the beach cleanup. Later, the student's advisor says "I can't believe she doesn't believe in climate change".
---
The student didn't announce his figure as the beach cleanup. Afterwards, by student's spokesperson expressed " I can't believe she doesn'of interest in climate change ".
---
The student didn't choose to participate in the beach protest. Later, another student's advisor responded " we can'a believe she doesn'f graduate in odor reduction ".
---
The student didn't want anyone participate on the environmental cleanup. Consequently, the girl's advisor saying "... can'help pretend she doesn't believe in climate management ".
---
The student didn're answer to give into the beach cleanup. Later, the student's advisor said " peopl