### Checking GPU and Clearing Cache

In [81]:
import torch

# Clear GPU memory
torch.cuda.empty_cache()

In [82]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

Fri May  5 02:41:11 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.85.12    Driver Version: 525.85.12    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   59C    P0    38W /  70W |   3709MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [83]:
!pwd

/content


### Data Loading and Pre-Processing

In [None]:
!pip install datasets
from datasets import load_dataset

In [7]:
dataset = load_dataset('allenai/scirepeval','cite_prediction_new', cache_dir='/content')



  0%|          | 0/2 [00:00<?, ?it/s]

In [8]:
dataset

DatasetDict({
    train: Dataset({
        features: ['query', 'pos', 'neg'],
        num_rows: 6197963
    })
    validation: Dataset({
        features: ['query', 'pos', 'neg'],
        num_rows: 176430
    })
})

In [None]:
dataset

DatasetDict({
    train: Dataset({
        features: ['query', 'pos', 'neg'],
        num_rows: 6197963
    })
    validation: Dataset({
        features: ['query', 'pos', 'neg'],
        num_rows: 176430
    })
})

In [None]:
dataset['train'][0]

In [None]:
import tensorflow as tf

tf.config.experimental.list_physical_devices('GPU')

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [None]:
!pip install transformers

In [None]:
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import AutoTokenizer, AutoModel
from datasets import DatasetDict

In [None]:
model_name = "allenai/scibert_scivocab_uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

In [None]:
class ResearchPaperDataset(Dataset):
    def __init__(self, queries, pos_examples, neg_examples):
        self.queries = queries
        self.pos_examples = pos_examples
        self.neg_examples = neg_examples
    
    def __len__(self):
        return len(self.queries)
    
    def __getitem__(self, idx):
        query = self.queries[idx]
        pos_example = self.pos_examples[idx]
        neg_example = self.neg_examples[idx]
        return query, pos_example, neg_example

In [9]:
queries_train = []
pos_train = []
neg_train = []

# Define the batch size
batch_size = 100

# Iterate over the dataset in batches
for i in range(0, len(dataset["train"]), batch_size):
    batch = dataset["train"][i:i+batch_size]
    queries_train.extend(batch['query'])
    pos_train.extend(batch['pos'])
    neg_train.extend(batch['neg'])

# Print the number of queries
print(len(queries_train))


6197963


In [26]:
del queries_train
del pos_train
del neg_train

In [10]:
queries_val = []
pos_val = []
neg_val = []

# Define the batch size
batch_size = 100

# Iterate over the dataset in batches
for i in range(0, len(dataset["validation"]), batch_size):
    batch = dataset["validation"][i:i+batch_size]
    queries_val.extend(batch['query'])
    pos_val.extend(batch['pos'])
    neg_val.extend(batch['neg'])

# Print the number of queries
print(len(queries_val))


176430


In [None]:
len(queries_val)

176430

In [11]:
q_v = []

for item in queries_val:
    q_v.append(item['title'] + ' ' + item['abstract'])

In [12]:
with open('q_v.txt', 'w') as file:
    for string in q_v:
        file.write(repr(string) + '\n')


In [None]:
q_v=[]
with open('q_v.txt', 'r') as file:
    q_v = file.readlines()


In [None]:
len(q_v)

176430

In [13]:
p_v = []

for item in pos_val:
  p_v.append(item['title']+ ' '+ item['abstract'])

In [14]:
with open('p_v.txt', 'w') as file:
    for string in p_v:
        file.write(repr(string) + '\n')


In [None]:
p_v=[]
with open('p_v.txt', 'r') as file:
    p_v = file.readlines()


In [15]:
len(p_v)

176430

In [None]:
lines[-1]

"'Economy of exploiting heat from low-temperature geothermal sources using a heat pump The article describes the economy of exploiting heat from low-temperature geothermal sources for high-temperature heating of buildings using a heat pump. For the exploitation of low-temperature geothermal sources, a two stage heat pump with a heat exchanger was planned. The pump consists of two single stage heat pumps which use different refrigerants at each stage. At stage 1, the calculation of the heat pump is conducted with refrigerant R407c; at stage 2 of the heat pump, the refrigerant R600a is used. The main operational characteristics of a two stage heat pump are presented in the form of diagrams. For the exploitation of heat from geothermal water with a temperature of 45 °C, a profitability evaluation of the investment in the heat pump was carried out, using the method of the net present value. In the research, also the coefficient of profitability and the period of time in which the investmen

In [16]:
n_v = []

for item in neg_val:
  n_v.append(item['title']+ ' '+ item['abstract'])

In [17]:
with open('n_v.txt', 'w') as file:
    for string in n_v:
        file.write(repr(string) + '\n')


In [None]:
n_v=[]
with open('n_v.txt', 'r') as file:
    n_v = file.readlines()


In [None]:
len(n_v)

176430

In [18]:
q_t = []

for item in queries_train:
  if item['abstract'] is None:
    q_t.append(item['title'])
  else:
    q_t.append(item['title']+ ' '+ item['abstract'])

In [19]:
with open('q_t.txt', 'w') as file:
    for string in q_t:
        file.write(repr(string) + '\n')

In [None]:
q_t=[]
with open('q_t.txt', 'r') as file:
    q_t = file.readlines()


In [None]:
len(q_t)

6197963

In [20]:
p_t = []

for item in pos_train:
  if item['abstract'] is None:
    p_t.append(item['title'])
  else:
    p_t.append(item['title']+ ' '+ item['abstract'])

In [21]:
with open('p_t.txt', 'w') as file:
    for string in p_t:
        file.write(repr(string) + '\n')


In [None]:
p_t=[]
with open('p_t.txt', 'r') as file:
    p_t = file.readlines()


In [None]:
len(p_t)

6197963

In [22]:
n_t = []

for item in neg_train:
  if item['abstract'] is None:
    n_t.append(item['title'])
  else:
    n_t.append(item['title']+ ' '+ item['abstract'])

In [23]:
with open('n_t.txt', 'w') as file:
    for string in n_t:
        file.write(repr(string) + '\n')


In [None]:
n_t=[]
with open('n_t.txt', 'r') as file:
    n_t = file.readlines()


In [25]:
del dataset

In [None]:
len(n_t)

6197963

In [None]:
len(n_t)

6197963

In [None]:
import shutil

total, used, free = shutil.disk_usage("/")
print(f"Total: {total} bytes")
print(f"Used: {used} bytes")
print(f"Free: {free} bytes")


Total: 199672631296 bytes
Used: 123796434944 bytes
Free: 75859419136 bytes


In [None]:
!pwd

/content


In [None]:
!ls

n_t.txt  n_v.txt  p_t.txt  p_v.txt  q_t.txt  q_v.txt  sample_data


In [None]:
from google.colab import files

file_path = '/content/q_t.txt'

# Trigger the download
files.download(file_path)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
from google.colab import files

file_path = '/content/q_v.txt'

# Trigger the download
files.download(file_path)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
from google.colab import files

file_path = '/content/p_t.txt'

# Trigger the download
files.download(file_path)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
from google.colab import files

file_path = '/content/n_t.txt'

# Trigger the download
files.download(file_path)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [7]:
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.28.1-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m47.0 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.3-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m68.5 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, transformers
Successfully installed tokenizers-0.13.3 transformers-4.28.1


### Model- fine tuning SCI-BERT


In [1]:
import torch
import torch.nn as nn
from transformers import AutoModel, AutoTokenizer

# Set device to use GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Define the SCI-BERT model and tokenizer
model_name = "allenai/scibert_scivocab_uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name).to(device)

# Define the triplet margin loss function
class TripletMarginLoss(nn.Module):
    def __init__(self, margin=1.0):
        super().__init__()
        self.margin = margin

    def forward(self, query, pos, neg):
        pos_dist = torch.sqrt(torch.sum((query - pos) ** 2, dim=1))
        neg_dist = torch.sqrt(torch.sum((query - neg) ** 2, dim=1))
        loss = torch.mean(torch.clamp(pos_dist - neg_dist + self.margin, min=0))
        return loss

triplet_loss = TripletMarginLoss().to(device)

# Define the optimizer
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-6)

# Define the training loop
def train_model(epochs, batch_size, query_train, pos_train, neg_train, query_val, pos_val, neg_val):
    for epoch in range(epochs):
        model.train()
        train_loss = 0
        for i in tqdm(range(0, len(query_train), batch_size)):
            queries = query_train[i:i+batch_size]
            pos_papers = pos_train[i:i+batch_size]
            neg_papers = neg_train[i:i+batch_size]

            # Tokenize the input and convert to tensors
            inputs = tokenizer(queries + pos_papers + neg_papers, padding=True, truncation=True, max_length=512, return_tensors="pt").to(device)
            query_ids, pos_ids, neg_ids = torch.split(inputs["input_ids"], [len(queries), len(pos_papers), len(neg_papers)])
            query_attn, pos_attn, neg_attn = torch.split(inputs["attention_mask"], [len(queries), len(pos_papers), len(neg_papers)])
            query_emb = model(input_ids=query_ids, attention_mask=query_attn)[0][:, 0, :]
            pos_emb = model(input_ids=pos_ids, attention_mask=pos_attn)[0][:, 0, :]
            neg_emb = model(input_ids=neg_ids, attention_mask=neg_attn)[0][:, 0, :]

            # Compute the loss and update the model parameters
            optimizer.zero_grad()
            loss = triplet_loss(query_emb, pos_emb, neg_emb)
            loss.backward()
            optimizer.step()

            train_loss += loss.item()

        # Evaluate the model on the validation set
        model.eval()
        val_loss = 0
        with torch.no_grad():
            for i in tqdm(range(0, len(query_val), batch_size)):
                queries = query_val[i:i+batch_size]
                pos_papers = pos_val[i:i+batch_size]
                neg_papers = neg_val[i:i+batch_size]

                # Tokenize the input and convert to tensors
                inputs = tokenizer(queries + pos_papers + neg_papers, padding=True, truncation=True, max_length=512, return_tensors="pt").to(device)
                query_ids, pos_ids, neg_ids = torch.split(inputs["input_ids"], [len(queries), len(pos_papers), len(neg_papers)])
                query_attn, pos_attn, neg_attn = torch.split(inputs["input_ids"], [len(queries), len(pos_papers), len(neg_papers)])
                query_attn, pos_attn, neg_attn = torch.split(inputs["attention_mask"], [len(queries), len(pos_papers), len(neg_papers)])
                query_emb = model(input_ids=query_ids, attention_mask=query_attn)[0][:, 0, :]
                pos_emb = model(input_ids=pos_ids, attention_mask=pos_attn)[0][:, 0, :]
                neg_emb = model(input_ids=neg_ids, attention_mask=neg_attn)[0][:, 0, :]
                loss = triplet_loss(query_emb, pos_emb, neg_emb)

                val_loss += loss.item()

        # Print the average loss for the epoch
        print(f"Epoch {epoch+1} - Train Loss: {train_loss/len(query_train)}, Val Loss: {val_loss/len(query_val)}")


Some weights of the model checkpoint at allenai/scibert_scivocab_uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [6]:
from tqdm import tqdm

In [2]:
n_v=[]
with open('n_v.txt', 'r') as file:
    n_v = file.readlines()

p_v=[]
with open('p_v.txt', 'r') as file:
    p_v = file.readlines()


q_v=[]
with open('q_v.txt', 'r') as file:
    q_v = file.readlines()


In [3]:
n_t=[]
with open('n_t.txt', 'r') as file:
    n_t = file.readlines()

p_t=[]
with open('p_t.txt', 'r') as file:
    p_t = file.readlines()


q_t=[]
with open('q_t.txt', 'r') as file:
    q_t = file.readlines()

In [4]:
len(q_v)

176430

In [11]:
epochs = 2
batch_size = 4

train_model(epochs, batch_size, q_t[:80000], p_t[:80000], n_t[:80000], q_v[:8000], p_v[:8000], n_v[:8000])

100%|██████████| 20000/20000 [5:30:43<00:00,  1.01it/s]
100%|██████████| 2000/2000 [11:46<00:00,  2.83it/s]


Epoch 1 - Train Loss: 0.054051678788661954, Val Loss: 0.06307736660540104


100%|██████████| 20000/20000 [5:28:42<00:00,  1.01it/s]
100%|██████████| 2000/2000 [11:46<00:00,  2.83it/s]

Epoch 2 - Train Loss: 0.029059681575000287, Val Loss: 0.06922645157575608





In [12]:
torch.save(model, '/content/model.pth')

In [15]:
train_model(epochs, batch_size, q_t[80000:200000], p_t[80000:200000], n_t[80000:200000], q_v[8000:20000], p_v[8000:20000], n_v[8000:20000])

100%|██████████| 30000/30000 [8:15:21<00:00,  1.01it/s]
100%|██████████| 3000/3000 [17:13<00:00,  2.90it/s]


Epoch 1 - Train Loss: 0.051437678973873455, Val Loss: 0.061308435837427774


100%|██████████| 30000/30000 [8:12:27<00:00,  1.02it/s]
100%|██████████| 3000/3000 [17:13<00:00,  2.90it/s]

Epoch 2 - Train Loss: 0.026615897726019223, Val Loss: 0.06932951959967613





In [19]:
torch.save(model, '/content/model3.pth')

In [20]:
model.config

BertConfig {
  "_name_or_path": "allenai/scibert_scivocab_uncased",
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.28.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 31090
}

### Evaluation

In [23]:
dataset = load_dataset('allenai/scirepeval','search', split='evaluation', cache_dir='/content')

Downloading and preparing dataset scirepeval/search (download: 8.98 GiB, generated: 7.47 GiB, post-processed: Unknown size, total: 16.46 GiB) to /content/allenai___scirepeval/search/1.1.0/05b6b341c1750891a5539df5b3eb892babd18e8f299bfaba4735b87e20f6cf83...
{'train': 'https://ai2-s2-research-public.s3.us-west-2.amazonaws.com/scirepeval/train/search/train.jsonl', 'val': 'https://ai2-s2-research-public.s3.us-west-2.amazonaws.com/scirepeval/train/search/val.jsonl', 'test': 'https://ai2-s2-research-public.s3.us-west-2.amazonaws.com/scirepeval/test/search/meta.jsonl'}


Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/8.29G [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.31G [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/42.7M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating evaluation split:   0%|          | 0/2637 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/399878 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/67363 [00:00<?, ? examples/s]

Dataset scirepeval downloaded and prepared to /content/allenai___scirepeval/search/1.1.0/05b6b341c1750891a5539df5b3eb892babd18e8f299bfaba4735b87e20f6cf83. Subsequent calls will reuse this data.


In [101]:
dataset

Dataset({
    features: ['query', 'doc_id', 'candidates'],
    num_rows: 2637
})

In [24]:
papers = dataset['candidates']

In [25]:
papers_dict ={}
ground_truth = []
for i in range(len(papers)):
  temp = []
  for j in range(10):
    papers_dict[papers[i][j]['corpus_id']] = papers[i][j]['title']+ ' ' + papers[i][j]['abstract']
    temp.append(papers[i][j]['corpus_id'])
  ground_truth.append(temp)

In [26]:
new_dataset = [{'id': key, 'abstract': value} for key, value in papers_dict.items()]

In [None]:
!pip install transformers
!pip install nltk

In [28]:
import nltk
from tqdm import tqdm
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [89]:
import torch
from transformers import AutoTokenizer, AutoModel
from nltk.corpus import stopwords

# Load the pretrained SciBERT model and tokenizer
model = torch.load('/content/model3.pth')
tokenizer = AutoTokenizer.from_pretrained('allenai/scibert_scivocab_uncased')
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

stop_words = set(stopwords.words('english'))

# Generate embeddings for each abstract
embeddings = []
for i in tqdm(range(len(new_dataset))):
    abstract = new_dataset[i]['abstract']
    # Tokenize the abstract
    tokens = tokenizer.tokenize(abstract)
    # Remove stop words
    tokens_without_stopwords = [token for token in tokens if token.lower() not in stop_words]
    # Encode the abstract
    encoded_abstract = tokenizer.encode_plus(' '.join(tokens_without_stopwords),
                                             add_special_tokens=True,
                                             max_length=512,
                                             truncation=True,
                                             padding='max_length',
                                             return_tensors='pt').to(device)
    # Generate SciBERT embedding for the abstract
    with torch.no_grad():
        outputs = model(encoded_abstract.input_ids)
        embedding = outputs.last_hidden_state[:, 0, :].cpu().numpy()  # Extract embedding from the first token [CLS]
    embeddings.append(embedding[0])

100%|██████████| 26048/26048 [16:28<00:00, 26.35it/s]


In [90]:
len_pap = len(dataset['candidates'])

In [91]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

In [92]:
embeddings_array = np.array(embeddings).reshape(len(embeddings), -1)

In [93]:
recommended_papers = []
for i in tqdm(range(len_pap)):
  query = dataset['query'][i]

  # Preprocess the query in the same way as the documents
  query_tokens = tokenizer.tokenize(query)
  query_tokens_without_stopwords = [token for token in query_tokens if token.lower() not in stop_words]
  encoded_query = tokenizer.encode_plus(' '.join(query_tokens_without_stopwords),
                                        add_special_tokens=True,
                                        max_length=512,
                                        truncation=True,
                                        padding='max_length',
                                        return_tensors='pt')

  # Generate SciBERT embedding for the query
  with torch.no_grad():
      query_input_ids = encoded_query.input_ids.to(device)
      query_embedding = model(query_input_ids).last_hidden_state[:, 0, :].cpu().numpy()

  # Calculate cosine similarity between the query embedding and document embeddings
  query_embedding_tensor = torch.tensor(query_embedding).to(device)
  similarity_scores = cosine_similarity(query_embedding_tensor.cpu().numpy(), embeddings_array)
  # print(len(query_embedding_tensor.cpu().numpy()))
  # print(len(embeddings_array))
  # break;

  # Get the indices of the top 10 similar documents
  top_indices = np.argsort(similarity_scores, axis=1)[:, -100:][0][::-1]

  # Retrieve the IDs of the top similar documents
  top_documents = [new_dataset[i]['id'] for i in top_indices]
  temp = []
  for doc_id in top_documents:
    temp.append(doc_id)
  recommended_papers.append(temp)

100%|██████████| 2637/2637 [05:51<00:00,  7.50it/s]


In [94]:
len(recommended_papers)

2637

In [100]:
import numpy as np

def calculate_dcg(scores):
    positions = np.arange(1, len(scores) + 1)
    discounts = np.log2(positions + 1)
    return np.sum(scores / discounts)

def calculate_ndcg(ground_truth, recommended_documents, k):
    ndcg_scores = []
    for gt_docs, rec_docs in zip(ground_truth, recommended_documents):
        relevance_scores = np.zeros(k)
        for i, doc_id in enumerate(rec_docs[:k]):
            if doc_id in gt_docs:
                relevance_scores[i] = 1
        ideal_dcg = calculate_dcg(sorted(relevance_scores, reverse=True))
        dcg = calculate_dcg(relevance_scores)
        ndcg = dcg / ideal_dcg if ideal_dcg > 0 else 0.5 
        ndcg_scores.append(ndcg)
    return np.mean(ndcg_scores)


k = 5 

ndcg = calculate_ndcg(ground_truth, recommended_papers, k)
print(f"NDCG@{k}: {ndcg:.4f}")


NDCG@5: 0.5577


In [106]:
query = new_dataset[0]['abstract']

In [109]:
query_tokens = tokenizer.tokenize(query)
query_tokens_without_stopwords = [token for token in query_tokens if token.lower() not in stop_words]
encoded_query = tokenizer.encode_plus(' '.join(query_tokens_without_stopwords),
                                      add_special_tokens=True,
                                      max_length=512,
                                      truncation=True,
                                      padding='max_length',
                                      return_tensors='pt')

# Generate SciBERT embedding for the query
with torch.no_grad():
    query_input_ids = encoded_query.input_ids.to(device)
    query_embedding = model(query_input_ids).last_hidden_state[:, 0, :].cpu().numpy()

# Calculate cosine similarity between the query embedding and document embeddings
query_embedding_tensor = torch.tensor(query_embedding).to(device)
similarity_scores = cosine_similarity(query_embedding_tensor.cpu().numpy(), embeddings_array)
# print(len(query_embedding_tensor.cpu().numpy()))
# print(len(embeddings_array))
# break;

# Get the indices of the top 10 similar documents
top_indices = np.argsort(similarity_scores, axis=1)[:, -10:][0][::-1]

# Retrieve the IDs of the top similar documents
top_documents = [new_dataset[i]['id'] for i in top_indices]
temp = []
for doc_id in top_documents:
  temp.append(doc_id)
  # print(papers_dict[doc_id])
print(temp)

[80054354, 208446849, 220150649, 165053669, 80387586, 51930068, 76745999, 54150229, 4763889, 190866823]


In [None]:
papers_dict