In [1]:
import torch

# Clear GPU memory
torch.cuda.empty_cache()

In [2]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

Fri May  5 02:43:49 2023       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.85.12    Driver Version: 525.85.12    CUDA Version: 12.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla T4            Off  | 00000000:00:04.0 Off |                    0 |
| N/A   53C    P8    10W /  70W |      0MiB / 15360MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [3]:
!pip install datasets
from datasets import load_dataset

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [4]:
dataset = load_dataset('allenai/scirepeval','search', split='evaluation')



In [5]:
papers = dataset['candidates']

In [6]:
papers_dict ={}
ground_truth = []
for i in range(len(papers)):
  temp = []
  for j in range(10):
    papers_dict[papers[i][j]['corpus_id']] = papers[i][j]['title']+ papers[i][j]['abstract']
    temp.append(papers[i][j]['corpus_id'])
  ground_truth.append(temp)

In [7]:
new_dataset = [{'id': key, 'abstract': value} for key, value in papers_dict.items()]

In [8]:
!pip install transformers
!pip install nltk

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [9]:
import nltk
from tqdm import tqdm

In [10]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [11]:
import torch
from transformers import AutoTokenizer, AutoModel
from nltk.corpus import stopwords

# Load the pretrained SciBERT model and tokenizer
model_name = 'allenai/specter'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

stop_words = set(stopwords.words('english'))

# Generate embeddings for each abstract
embeddings = []
for i in tqdm(range(len(new_dataset))):
    abstract = new_dataset[i]['abstract']
    # Tokenize the abstract
    tokens = tokenizer.tokenize(abstract)
    # Remove stop words
    tokens_without_stopwords = [token for token in tokens if token.lower() not in stop_words]
    # Encode the abstract
    encoded_abstract = tokenizer.encode_plus(' '.join(tokens_without_stopwords),
                                             add_special_tokens=True,
                                             max_length=512,
                                             truncation=True,
                                             padding='max_length',
                                             return_tensors='pt').to(device)
    # Generate SciBERT embedding for the abstract
    with torch.no_grad():
        outputs = model(encoded_abstract.input_ids)
        embedding = outputs.last_hidden_state[:, 0, :].cpu().numpy()  # Extract embedding from the first token [CLS]
    embeddings.append(embedding)

100%|██████████| 26048/26048 [16:11<00:00, 26.82it/s]


In [12]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Convert the embeddings list to a numpy array and reshape it
embeddings_array = np.array(embeddings).reshape(len(embeddings), -1)

In [13]:
len_pap = len(dataset['candidates'])

In [14]:
recommended_papers = []
for i in tqdm(range(len_pap)):
  query = dataset['query'][i]

  # Preprocess the query in the same way as the documents
  query_tokens = tokenizer.tokenize(query)
  query_tokens_without_stopwords = [token for token in query_tokens if token.lower() not in stop_words]
  encoded_query = tokenizer.encode_plus(' '.join(query_tokens_without_stopwords),
                                        add_special_tokens=True,
                                        max_length=512,
                                        truncation=True,
                                        padding='max_length',
                                        return_tensors='pt')

  # Generate SciBERT embedding for the query
  with torch.no_grad():
      query_input_ids = encoded_query.input_ids.to(device)
      query_embedding = model(query_input_ids).last_hidden_state[:, 0, :].cpu().numpy()

  # Calculate cosine similarity between the query embedding and document embeddings
  query_embedding_tensor = torch.tensor(query_embedding).to(device)
  similarity_scores = cosine_similarity(query_embedding_tensor.cpu().numpy(), embeddings_array)
  # print(len(query_embedding_tensor.cpu().numpy()))
  # print(len(embeddings_array))
  # break;

  # Get the indices of the top 10 similar documents
  top_indices = np.argsort(similarity_scores, axis=1)[:, -10000:][0][::-1]

  # Retrieve the IDs of the top similar documents
  top_documents = [new_dataset[i]['id'] for i in top_indices]
  temp = []
  for doc_id in top_documents:
    temp.append(doc_id)
  recommended_papers.append(temp)

100%|██████████| 2637/2637 [07:22<00:00,  5.96it/s]


In [18]:
import numpy as np

def calculate_dcg(scores):
    positions = np.arange(1, len(scores) + 1)
    discounts = np.log2(positions + 1)
    return np.sum(scores / discounts)

def calculate_ndcg(ground_truth, recommended_documents, k):
    ndcg_scores = []
    for gt_docs, rec_docs in zip(ground_truth, recommended_documents):
        relevance_scores = np.zeros(k)
        for i, doc_id in enumerate(rec_docs[:k]):
            if doc_id in gt_docs:
                relevance_scores[i] = 1 # Assign a relevance score of 1 for relevant documents
        ideal_dcg = calculate_dcg(sorted(relevance_scores, reverse=True))
        dcg = calculate_dcg(relevance_scores)
        ndcg = dcg / ideal_dcg if ideal_dcg > 0 else 0.5  # Handle division by zero
        ndcg_scores.append(ndcg)
    return np.mean(ndcg_scores)


k = 5  # Consider top k recommended documents for NDCG calculation

ndcg = calculate_ndcg(ground_truth, recommended_papers, k)
print(f"NDCG@{k}: {ndcg:.4f}")


NDCG@5: 0.5345


In [16]:
len(embeddings_array[0])

768

In [17]:
len(query_embedding_tensor.cpu().numpy()[0])

768