### Data Loading and Pre-Processing

In [None]:
!pip install datasets
from datasets import load_dataset

In [None]:
dataset = load_dataset('allenai/scirepeval','search', split='evaluation')

In [None]:
dataset

In [None]:
dataset['evaluation']

In [3]:
papers = dataset['candidates']

In [43]:
papers[0][0]

{'doc_id': '80054354',
 'title': 'Abstract P1 10 04: Elacestrant, a novel oral selective estrogen receptor degrader (SERD) decreases tumoral 18F FES uptake in a phase 1 study of ER+ HER2 advanced breast cancer patients',
 'abstract': 'Background: The estrogen receptor (ER) expressed in approximately 70% of breast cancers, can be mapped using 16a [18F]Fluoro 17b estradiol (FES) PET, a non invasive molecular whole body imaging tool capable of assessing ER target engagement by endocrine therapy. Elacestrant (RAD1901) is a novel, non steroidal oral SERD that has demonstrated single agent activity in heavily pre treated patients with ER+ advanced breast cancer (Bardia et al. J Clin Oncol 35, 2017: suppl; abstr 1014) The primary objective of this study was the visualization and quantification of residual ER binding with FES PET after treatment with elacestrant. Methods: In the RAD1901 106 phase 1 study (NCT02650817) patients with advanced ER+/HER2 breast cancer were treated with elacestrant 

In [None]:
lis_1 = []

In [4]:
papers_dict ={}
ground_truth = []
for i in range(len(papers)):
  temp = []
  for j in range(10):
    papers_dict[papers[i][j]['corpus_id']] = papers[i][j]['title']+ papers[i][j]['abstract']
    temp.append(papers[i][j]['corpus_id'])
  ground_truth.append(temp)

In [5]:
len(papers_dict)

26048

In [None]:
len(ground_truth[324])

10

In [6]:
new_dataset = [{'id': key, 'abstract': value} for key, value in papers_dict.items()]

### Importing Sci-Bert

In [None]:
new_dataset[0]

In [None]:
!pip install transformers

In [None]:
!pip install nltk

In [9]:
import nltk
from tqdm import tqdm

In [10]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [58]:
import torch
from transformers import AutoTokenizer, AutoModel
from nltk.corpus import stopwords

# Load the pretrained SciBERT model and tokenizer
model_name = 'allenai/scibert_scivocab_cased'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = model.to(device)

stop_words = set(stopwords.words('english'))

# Generate embeddings for each abstract
embeddings = []
for i in tqdm(range(len(new_dataset))):
    abstract = new_dataset[i]['abstract']
    # Tokenize the abstract
    tokens = tokenizer.tokenize(abstract)
    # Remove stop words
    tokens_without_stopwords = [token for token in tokens if token.lower() not in stop_words]
    # Encode the abstract
    encoded_abstract = tokenizer.encode_plus(' '.join(tokens_without_stopwords),
                                             add_special_tokens=True,
                                             max_length=512,
                                             truncation=True,
                                             padding='max_length',
                                             return_tensors='pt').to(device)
    # Generate SciBERT embedding for the abstract
    with torch.no_grad():
        outputs = model(encoded_abstract.input_ids)
        embedding = outputs.last_hidden_state[:, 0, :].cpu().numpy()  # Extract embedding from the first token [CLS]
    embeddings.append(embedding[0])

Downloading (…)lve/main/config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/222k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/442M [00:00<?, ?B/s]

Some weights of the model checkpoint at allenai/scibert_scivocab_cased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
100%|██████████| 26048/26048 [16:44<00:00, 25.93it/s]


In [59]:
len(embeddings)

26048

In [60]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Convert the embeddings list to a numpy array and reshape it
embeddings_array = np.array(embeddings).reshape(len(embeddings), -1)

In [44]:
papers_dict[80054354]

'Abstract P1 10 04: Elacestrant, a novel oral selective estrogen receptor degrader (SERD) decreases tumoral 18F FES uptake in a phase 1 study of ER+ HER2 advanced breast cancer patientsBackground: The estrogen receptor (ER) expressed in approximately 70% of breast cancers, can be mapped using 16a [18F]Fluoro 17b estradiol (FES) PET, a non invasive molecular whole body imaging tool capable of assessing ER target engagement by endocrine therapy. Elacestrant (RAD1901) is a novel, non steroidal oral SERD that has demonstrated single agent activity in heavily pre treated patients with ER+ advanced breast cancer (Bardia et al. J Clin Oncol 35, 2017: suppl; abstr 1014) The primary objective of this study was the visualization and quantification of residual ER binding with FES PET after treatment with elacestrant. Methods: In the RAD1901 106 phase 1 study (NCT02650817) patients with advanced ER+/HER2 breast cancer were treated with elacestrant at two dose levels (200 mg or 400 mg qd orally) fo

In [32]:
dataset['query'][0]

'fes serd'

In [33]:
ground_truth[0]

[80054354,
 208446849,
 220150649,
 80387586,
 76745999,
 4763889,
 165053669,
 54150229,
 51930068,
 55702260]

In [61]:
len_pap = len(dataset['candidates'])

In [79]:
recommended_papers = []
for i in tqdm(range(len_pap)):
  query = dataset['query'][i]

  # Preprocess the query in the same way as the documents
  query_tokens = tokenizer.tokenize(query)
  query_tokens_without_stopwords = [token for token in query_tokens if token.lower() not in stop_words]
  encoded_query = tokenizer.encode_plus(' '.join(query_tokens_without_stopwords),
                                        add_special_tokens=True,
                                        max_length=512,
                                        truncation=True,
                                        padding='max_length',
                                        return_tensors='pt')

  # Generate SciBERT embedding for the query
  with torch.no_grad():
      query_input_ids = encoded_query.input_ids.to(device)
      query_embedding = model(query_input_ids).last_hidden_state[:, 0, :].cpu().numpy()

  # Calculate cosine similarity between the query embedding and document embeddings
  query_embedding_tensor = torch.tensor(query_embedding).to(device)
  similarity_scores = cosine_similarity(query_embedding_tensor.cpu().numpy(), embeddings_array)
  # print(len(query_embedding_tensor.cpu().numpy()))
  # print(len(embeddings_array))
  # break;

  # Get the indices of the top 10 similar documents
  top_indices = np.argsort(similarity_scores, axis=1)[:, -100:][0][::-1]

  # Retrieve the IDs of the top similar documents
  top_documents = [new_dataset[i]['id'] for i in top_indices]
  temp = []
  for doc_id in top_documents:
    temp.append(doc_id)
  recommended_papers.append(temp)

100%|██████████| 2637/2637 [06:05<00:00,  7.21it/s]


In [39]:
print(ground_truth[0])

[80054354, 208446849, 220150649, 80387586, 76745999, 4763889, 165053669, 54150229, 51930068, 55702260]


In [84]:
import numpy as np

def calculate_dcg(scores):
    positions = np.arange(1, len(scores) + 1)
    discounts = np.log2(positions + 1)
    return np.sum(scores / discounts)

def calculate_ndcg(ground_truth, recommended_documents, k):
    ndcg_scores = []
    for gt_docs, rec_docs in zip(ground_truth, recommended_documents):
        relevance_scores = np.zeros(k)
        for i, doc_id in enumerate(rec_docs[:k]):
            if doc_id in gt_docs:
                relevance_scores[i] = 1 # Assign a relevance score of 1 for relevant documents
        ideal_dcg = calculate_dcg(sorted(relevance_scores, reverse=True))
        dcg = calculate_dcg(relevance_scores)
        ndcg = dcg / ideal_dcg if ideal_dcg > 0 else 0.5 
        ndcg_scores.append(ndcg)
    return np.mean(ndcg_scores)


k = 5  # Consider top k recommended documents for NDCG calculation

ndcg = calculate_ndcg(ground_truth, recommended_papers, k)
print(f"NDCG@{k}: {ndcg:.4f}")


NDCG@5: 0.5329


In [81]:
query = new_dataset[0]['abstract']

In [82]:
query_tokens = tokenizer.tokenize(query)
query_tokens_without_stopwords = [token for token in query_tokens if token.lower() not in stop_words]
encoded_query = tokenizer.encode_plus(' '.join(query_tokens_without_stopwords),
                                      add_special_tokens=True,
                                      max_length=512,
                                      truncation=True,
                                      padding='max_length',
                                      return_tensors='pt')

# Generate SciBERT embedding for the query
with torch.no_grad():
    query_input_ids = encoded_query.input_ids.to(device)
    query_embedding = model(query_input_ids).last_hidden_state[:, 0, :].cpu().numpy()

# Calculate cosine similarity between the query embedding and document embeddings
query_embedding_tensor = torch.tensor(query_embedding).to(device)
similarity_scores = cosine_similarity(query_embedding_tensor.cpu().numpy(), embeddings_array)
# print(len(query_embedding_tensor.cpu().numpy()))
# print(len(embeddings_array))
# break;

# Get the indices of the top 10 similar documents
top_indices = np.argsort(similarity_scores, axis=1)[:, -10:][0][::-1]

# Retrieve the IDs of the top similar documents
top_documents = [new_dataset[i]['id'] for i in top_indices]
temp = []
for doc_id in top_documents:
  temp.append(doc_id)
print(temp)

[80054354, 26333575, 195081459, 210222137, 201194791, 135172862, 201640412, 53144236, 206558037, 204068812]


In [83]:
new_dataset[0]

{'id': 80054354,
 'abstract': 'Abstract P1 10 04: Elacestrant, a novel oral selective estrogen receptor degrader (SERD) decreases tumoral 18F FES uptake in a phase 1 study of ER+ HER2 advanced breast cancer patientsBackground: The estrogen receptor (ER) expressed in approximately 70% of breast cancers, can be mapped using 16a [18F]Fluoro 17b estradiol (FES) PET, a non invasive molecular whole body imaging tool capable of assessing ER target engagement by endocrine therapy. Elacestrant (RAD1901) is a novel, non steroidal oral SERD that has demonstrated single agent activity in heavily pre treated patients with ER+ advanced breast cancer (Bardia et al. J Clin Oncol 35, 2017: suppl; abstr 1014) The primary objective of this study was the visualization and quantification of residual ER binding with FES PET after treatment with elacestrant. Methods: In the RAD1901 106 phase 1 study (NCT02650817) patients with advanced ER+/HER2 breast cancer were treated with elacestrant at two dose levels (