In [1]:
from transformers import AutoTokenizer, AutoModel
import torch
!pip install pypdf
from pypdf import PdfReader
import re
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Load a different biomedical model (e.g., PubMedBERT) and tokenizer
model_name = "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext"  # Change model name here
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

# Parse PDF document and split into sentences
def parse_document(file_path):
    reader = PdfReader(file_path)
    text = ""
    for page in reader.pages:
        text += page.extract_text() + " "
    sentences = re.split(r'(?<=[.!?]) +', text)
    return sentences

# Get CLS token embedding for each sentence
def get_sentence_embedding(sentence):
    inputs = tokenizer(sentence, return_tensors="pt", truncation=True, padding=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    cls_embedding = outputs.last_hidden_state[:, 0, :]  # CLS token embedding
    return cls_embedding.squeeze().numpy()

# Read PDF and generate embeddings for each sentence
file_path = "/content/sample.pdf"  # Replace with the path to your PDF file
sentences = parse_document(file_path)
embeddings = [get_sentence_embedding(sentence) for sentence in sentences]

# Display results
print("Total sentences:", len(sentences))
print("Embedding of the first sentence:", embeddings[0])
print("Embedding shape:", embeddings[0].shape)  # Should match the model's embedding size

# Define the query and get its embedding
query = "How is the patient’s sensory function across various lumbar and sacral segments?"  # Replace this with your actual query sentence
query_embedding = get_sentence_embedding(query)  # Ensure query_embedding has the correct shape

# Calculate cosine similarities
similarities = cosine_similarity([query_embedding], embeddings)[0]

# Define minimum and maximum thresholds
initial_threshold = 0.5
max_threshold = 10
step = 0.01  # Increment step for threshold adjustment

# Initialize variables to store the best response and threshold
best_response = None
best_threshold = initial_threshold

threshold = initial_threshold
while threshold <= max_threshold:
    response = None  # Reset response for each threshold

    # Check if there's any sentence with similarity above the current threshold
    for i, sim in enumerate(similarities):
        if sim > threshold:
            response = f"{sentences[i]}"
            break  # Found a response, so break the inner loop

    # If a response was found, store it as the current best and increase the threshold
    if response:
        best_response = response
        best_threshold = threshold
        threshold += step  # Try the next higher threshold
    else:
        # No response found at the current threshold, stop the loop
        break

# Output the best result found at the highest valid threshold
if best_response:
    print(f"Best response: '{best_response}' at threshold: {best_threshold}")
else:
    print("No similar sentences found.")


Collecting pypdf
  Downloading pypdf-5.1.0-py3-none-any.whl.metadata (7.2 kB)
Downloading pypdf-5.1.0-py3-none-any.whl (297 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/298.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m297.0/298.0 kB[0m [31m9.5 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m298.0/298.0 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pypdf
Successfully installed pypdf-5.1.0


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/226k [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Total sentences: 8
Embedding of the first sentence: [-9.09056142e-03  8.41654837e-02  2.29816288e-01 -2.10193709e-01
 -1.80265188e-01  5.84000045e-05 -5.49236596e-01  2.05591461e-03
 -2.09314913e-01  3.80445831e-03  5.55196926e-02  3.18031311e-01
 -1.10812627e-01  1.89011559e-01 -2.24293679e-01 -7.23012686e-02
  1.39138639e-01 -5.76233938e-02 -3.61397654e-01 -9.63993147e-02
  1.02593802e-01 -5.71879931e-02 -6.16579764e-02 -2.71619081e-01
  2.02910066e-01  5.14497876e-01 -2.54747510e-01 -1.65383220e-01
  5.79821840e-02 -1.85002759e-01 -1.36026889e-01  4.75427479e-01
 -8.20349157e-02  7.72282898e-01 -8.53292048e-02  1.40661806e-01
 -1.50799245e-01 -2.78619118e-02  2.16313422e-01 -2.45520726e-01
 -6.41424134e-02  2.04683214e-01 -2.95758933e-01  2.21501812e-01
 -1.65235743e-01  3.18321675e-01 -1.62162632e-01 -1.48581043e-01
 -4.72304113e-02 -2.32667863e-01 -1.23179518e-01 -2.14521021e-01
 -2.44366556e-01 -3.02342266e-01 -3.98977101e-01  1.87820988e-04
 -2.92282999e-01 -3.47490370e-01  4.58