In [2]:
!pip install -U sentence-transformers
!pip install rank_bm25
!pip install faiss-cpu

Collecting sentence-transformers
  Downloading sentence_transformers-4.0.2-py3-none-any.whl.metadata (13 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.11.0->sentence-transformers)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_

Collecting rank_bm25
  Downloading rank_bm25-0.2.2-py3-none-any.whl.metadata (3.2 kB)
Downloading rank_bm25-0.2.2-py3-none-any.whl (8.6 kB)
Installing collected packages: rank_bm25
Successfully installed rank_bm25-0.2.2
Collecting faiss-cpu
  Downloading faiss_cpu-1.10.0-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.4 kB)
Downloading faiss_cpu-1.10.0-cp311-cp311-manylinux_2_28_x86_64.whl (30.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m30.7/30.7 MB[0m [31m17.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faiss-cpu
Successfully installed faiss-cpu-1.10.0


In [1]:
import requests
from sentence_transformers import SentenceTransformer
import numpy as np
import faiss
from rank_bm25 import BM25Okapi

In [2]:
# Updated API Endpoint for Semantic Scholar
SEMANTIC_SCHOLAR_BASE_URL = "https://api.semanticscholar.org/graph/v1/paper/search/bulk"
HEADERS = {"User-Agent": "DeepCite/1.0"}

# Function to fetch papers from Semantic Scholar API based on a query
def fetch_papers_by_keywords(keywords, fields="title,abstract,url,year,citationCount,authors", limit=1000):
    papers = []
    offset = 0
    while len(papers) < limit:
        params = {
            "query": keywords,
            "fields": fields,
            "limit": min(limit - len(papers), 100),  # Fetch in batches of 100
            "offset": offset
        }
        response = requests.get(SEMANTIC_SCHOLAR_BASE_URL, headers=HEADERS, params=params)
        if response.status_code == 200:
            data = response.json()
            new_papers = data.get("data", [])
            papers.extend(new_papers)
            offset += 100  # Move the offset to get the next set of papers
        else:
            print(f"Failed to fetch data, status code: {response.status_code}")
            break
    return papers[:limit]

In [3]:
# Initialize the SentenceTransformer model
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

def get_embeddings(papers):
    """Generate embeddings for the list of papers using SentenceTransformer."""
    titles_and_abstracts = []
    for paper in papers:
        title = paper.get('title', '')  # Default to empty string if title is missing
        abstract = paper.get('abstract', '')  # Default to empty string if abstract is missing
        title = title if title else ''  # Ensure title is a string
        abstract = abstract if abstract else ''  # Ensure abstract is a string
        titles_and_abstracts.append(title + " " + abstract)
    embeddings = model.encode(titles_and_abstracts, convert_to_tensor=True)
    return embeddings

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.51k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/314 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [4]:
# Function to create and store embeddings in FAISS index
def create_faiss_index(embeddings):
    """Create and store embeddings in FAISS index."""
    # Convert embeddings to a NumPy array for FAISS
    embeddings_np = np.array(embeddings.cpu().detach().numpy()).astype('float32')

    # Create the FAISS index (using L2 distance, Euclidean)
    index = faiss.IndexFlatL2(embeddings_np.shape[1])
    index.add(embeddings_np)  # Add embeddings to the index
    return index

In [5]:
def get_bm25_scores(query, papers):
    """Compute BM25 similarity scores between query and papers."""
    # Preprocess documents: tokenize title + abstract
    tokenized_corpus = [
    ((paper.get('title') or '') + ' ' + (paper.get('abstract') or '')).lower().split()
    for paper in papers]
    bm25 = BM25Okapi(tokenized_corpus)

    tokenized_query = query.lower().split()
    scores = bm25.get_scores(tokenized_query)

    return scores

In [6]:
def search_query(query, faiss_index, papers, top_k=20):
    """Search query in FAISS index and retrieve semantic + BM25 scores."""
    # FAISS
    query_embedding = model.encode([query], convert_to_tensor=True)
    query_embedding_np = np.array(query_embedding.cpu().detach().numpy()).astype('float32')
    distances, indices = faiss_index.search(query_embedding_np, top_k)

    recommended_papers = [papers[i] for i in indices[0]]
    faiss_dists = distances[0]

    # BM25
    bm25_scores = get_bm25_scores(query, recommended_papers)

    return recommended_papers, faiss_dists, bm25_scores

In [7]:
# Function to rank papers by citation count and similarity
def rank_by_citations_and_similarity(papers, distances):
    """Rank papers by citation count and similarity distance."""
    papers_with_distance = list(zip(papers, distances))

    # Sort by citation count first (descending), then by similarity (ascending)
    papers_with_distance.sort(key=lambda x: (-x[0]['citationCount'], x[1]))

    return papers_with_distance

In [8]:
def rank_by_weighted_score_hybrid(papers, faiss_dists, bm25_scores, weights=(0.5, 0.3, 0.2), hybrid_weights=(0.6, 0.4)):
    """Rank papers with hybrid relevance (FAISS + BM25) and citation/recency."""
    relevance_weight, citation_weight, recency_weight = weights
    w_faiss, w_bm25 = hybrid_weights

    # Normalize FAISS distances into similarity
    max_dist = max(faiss_dists) + 1e-5
    faiss_sims = [1 - (d / max_dist) for d in faiss_dists]

    # Normalize BM25 scores
    max_bm25 = max(bm25_scores) + 1e-5
    bm25_sims = [s / max_bm25 for s in bm25_scores]

    # Normalize citation and recency
    max_citation = max(p['citationCount'] for p in papers) + 1e-5
    max_year = max(p['year'] for p in papers) + 1e-5
    min_year = min(p['year'] for p in papers)

    ranked = []
    for i, paper in enumerate(papers):
        combined_relevance = (w_faiss * faiss_sims[i]) + (w_bm25 * bm25_sims[i])
        norm_citation = paper['citationCount'] / max_citation
        norm_recency = (paper['year'] - min_year) / (max_year - min_year)

        final_score = (
            relevance_weight * combined_relevance +
            citation_weight * norm_citation +
            recency_weight * norm_recency
        )

        ranked.append({
            "Title": paper['title'],
            "DOI": paper.get('url', "N/A"),
            "Citation Count": paper['citationCount'],
            "Year": paper['year'],
            "FAISS Similarity": round(faiss_sims[i], 4),
            "BM25 Similarity": round(bm25_sims[i], 4),
            "Combined Relevance": round(combined_relevance, 4),
            "Final Score": round(final_score, 4)
        })

    ranked.sort(key=lambda x: -x['Final Score'])
    return ranked

In [9]:
# Main function to fetch, rank, and return relevant papers based on a query
def get_ranked_papers(query, weights=(0.5, 0.3, 0.2), hybrid_weights=(0.6, 0.4)):
    papers = fetch_papers_by_keywords(query, limit=1000)
    if not papers:
        return "No relevant papers found."

    embeddings = get_embeddings(papers)
    faiss_index = create_faiss_index(embeddings)

    recommended_papers, faiss_dists, bm25_scores = search_query(query, faiss_index, papers, top_k=20)

    ranked_papers = rank_by_weighted_score_hybrid(recommended_papers, faiss_dists, bm25_scores, weights, hybrid_weights)
    return ranked_papers

In [17]:
# Example usage
query = "Supervised Machine Learning in Healthcare"
ranked_papers = get_ranked_papers(query)

# Output the ranked papers in a clean user-friendly format
print("\nTop Recommended Papers:\n")
for i, paper in enumerate(ranked_papers[:10], 1):
    print(f"{i}. {paper['Title']}")
    print(f"   DOI: {paper['DOI']}")
    print(f"   Citations: {paper['Citation Count']} | Year: {paper['Year']}")
    print(f"   Relevance Score: {round(paper['Final Score'] * 100, 2)}%")
    print()

Searching for papers on: 'Supervised Machine Learning in Healthcare'
Found 25 papers. Creating embeddings...
Searching and ranking papers...

Top Recommended Papers:

1. A Comprehensive Review on Machine Learning in Healthcare Industry: Classification, Restrictions, Opportunities and Challenges
   DOI: https://www.semanticscholar.org/paper/ac2cffc4b9f96bae24809d738777ae897094ae33
   Citations: 129 | Year: 2023
   Relevance Score: 68.02%

2. Machine Learning in Healthcare
   DOI: https://www.semanticscholar.org/paper/b50d99925701a88ce998323af1307b92a5b87258
   Citations: 170 | Year: 2021
   Relevance Score: 63.97%

3. Artificial Intelligence and Machine Learning in Healthcare
   DOI: https://www.semanticscholar.org/paper/276b72329076a2aedb552f310bb5bbd5168a9a0f
   Citations: 16 | Year: 2023
   Relevance Score: 50.98%

4. Demystifying Supervised Learning in Healthcare 4.0: A New Reality of Transforming Diagnostic Medicine
   DOI: https://www.semanticscholar.org/paper/4f67cc883f007614fbd4

In [15]:
import requests
from sentence_transformers import SentenceTransformer
import numpy as np
import faiss
from rank_bm25 import BM25Okapi
import time

# Updated API Endpoint for Semantic Scholar
SEMANTIC_SCHOLAR_BASE_URL = "https://api.semanticscholar.org/graph/v1/paper/search"
HEADERS = {
    "User-Agent": "DeepCite/1.0",
}

# Function to fetch papers from Semantic Scholar API based on a query
def fetch_papers_by_keywords(keywords, fields="title,abstract,url,year,citationCount,authors", limit=100):
    papers = []
    offset = 0
    batch_size = 25  # Smaller batch size to avoid rate limiting

    while len(papers) < limit:
        params = {
            "query": keywords,
            "fields": fields,
            "limit": min(batch_size, limit - len(papers)),
            "offset": offset
        }

        try:
            response = requests.get(SEMANTIC_SCHOLAR_BASE_URL, headers=HEADERS, params=params)

            if response.status_code == 200:
                data = response.json()
                new_papers = data.get("data", [])

                # Break if no new papers or fewer papers than requested (end of results)
                if not new_papers:
                    break

                papers.extend(new_papers)
                offset += len(new_papers)

                # Add delay between requests to avoid rate limiting
                time.sleep(1)

            elif response.status_code == 429:
                print(f"Rate limit exceeded. Waiting 30 seconds before retry...")
                time.sleep(30)  # Wait longer if rate limited
                continue
            else:
                print(f"Failed to fetch data, status code: {response.status_code}")
                break

        except Exception as e:
            print(f"Error fetching papers: {str(e)}")
            break

        # Check if we've reached all available papers
        if len(papers) >= offset:
            break

    return papers[:limit]

# Initialize the SentenceTransformer model
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

def get_embeddings(papers):
    """Generate embeddings for the list of papers using SentenceTransformer."""
    if not papers:
        return None

    titles_and_abstracts = []
    for paper in papers:
        title = paper.get('title', '')  # Default to empty string if title is missing
        abstract = paper.get('abstract', '')  # Default to empty string if abstract is missing
        title = title if title else ''  # Ensure title is a string
        abstract = abstract if abstract else ''  # Ensure abstract is a string
        titles_and_abstracts.append(title + " " + abstract)

    embeddings = model.encode(titles_and_abstracts, convert_to_tensor=True)
    return embeddings

# Function to create and store embeddings in FAISS index
def create_faiss_index(embeddings):
    """Create and store embeddings in FAISS index."""
    if embeddings is None:
        return None

    # Convert embeddings to a NumPy array for FAISS
    embeddings_np = np.array(embeddings.cpu().detach().numpy()).astype('float32')

    # Create the FAISS index (using L2 distance, Euclidean)
    index = faiss.IndexFlatL2(embeddings_np.shape[1])
    index.add(embeddings_np)  # Add embeddings to the index
    return index

def get_bm25_scores(query, papers):
    """Compute BM25 similarity scores between query and papers."""
    if not papers:
        return []

    # Preprocess documents: tokenize title + abstract
    tokenized_corpus = [
        ((paper.get('title') or '') + ' ' + (paper.get('abstract') or '')).lower().split()
        for paper in papers]
    bm25 = BM25Okapi(tokenized_corpus)

    tokenized_query = query.lower().split()
    scores = bm25.get_scores(tokenized_query)

    return scores

def search_query(query, faiss_index, papers, top_k=20):
    """Search query in FAISS index and retrieve semantic + BM25 scores."""
    if not papers or faiss_index is None:
        return [], [], []

    # Limit top_k to the number of available papers
    top_k = min(top_k, len(papers))
    if top_k == 0:
        return [], [], []

    # FAISS
    query_embedding = model.encode([query], convert_to_tensor=True)
    query_embedding_np = np.array(query_embedding.cpu().detach().numpy()).astype('float32')
    distances, indices = faiss_index.search(query_embedding_np, top_k)

    # Make sure indices are unique to avoid duplicates
    unique_indices = []
    seen = set()
    for idx in indices[0]:
        if idx not in seen and idx < len(papers):
            seen.add(idx)
            unique_indices.append(idx)

    recommended_papers = [papers[i] for i in unique_indices]

    # Get corresponding distances for the unique indices
    faiss_dists = [distances[0][list(indices[0]).index(i)] for i in unique_indices]

    # BM25
    bm25_scores = get_bm25_scores(query, recommended_papers)

    return recommended_papers, faiss_dists, bm25_scores

def rank_by_weighted_score_hybrid(papers, faiss_dists, bm25_scores, weights=(0.5, 0.3, 0.2), hybrid_weights=(0.6, 0.4)):
    """Rank papers with hybrid relevance (FAISS + BM25) and citation/recency."""
    if not papers:
        return []

    relevance_weight, citation_weight, recency_weight = weights
    w_faiss, w_bm25 = hybrid_weights

    # Normalize FAISS distances into similarity
    max_dist = max(faiss_dists) if faiss_dists else 1
    max_dist = max_dist + 1e-5  # Avoid division by zero
    faiss_sims = [1 - (d / max_dist) for d in faiss_dists]

    # Normalize BM25 scores
    if isinstance(bm25_scores, np.ndarray):
        max_bm25 = np.max(bm25_scores) if bm25_scores.size > 0 else 1
        max_bm25 = max_bm25 + 1e-5  # Avoid division by zero
        bm25_sims = [float(s) / max_bm25 for s in bm25_scores]
    else:
        max_bm25 = max(bm25_scores) if bm25_scores else 1
        max_bm25 = max_bm25 + 1e-5  # Avoid division by zero
        bm25_sims = [s / max_bm25 for s in bm25_scores]

    # Fill in any missing scores if lengths don't match
    while len(faiss_sims) < len(papers):
        faiss_sims.append(0)
    while len(bm25_sims) < len(papers):
        bm25_sims.append(0)

    # Find max values for normalization, with safeguards
    max_citation = max((p.get('citationCount', 0) for p in papers), default=1) + 1e-5

    # Extract years with default for missing values
    years = [p.get('year', 2000) for p in papers]
    years = [y for y in years if y is not None]  # Filter out None values

    if not years:  # If all years are None
        max_year = 2023
        min_year = 2000
    else:
        max_year = max(years) + 1e-5
        min_year = min(years)

    # Ensure we don't divide by zero
    year_range = max(max_year - min_year, 1e-5)

    # Track paper IDs to avoid duplicates
    seen_papers = set()
    ranked = []

    for i, paper in enumerate(papers):
        # Skip duplicates based on paper ID or URL
        paper_id = paper.get('url', '') or paper.get('title', '')
        if paper_id in seen_papers:
            continue
        seen_papers.add(paper_id)

        # Get values with defaults
        citation_count = paper.get('citationCount', 0) or 0
        year = paper.get('year', min_year) or min_year

        # Ensure index is in range
        idx = min(i, len(faiss_sims)-1)

        combined_relevance = (w_faiss * faiss_sims[idx]) + (w_bm25 * bm25_sims[idx])
        norm_citation = citation_count / max_citation
        norm_recency = (year - min_year) / year_range

        final_score = (
            relevance_weight * combined_relevance +
            citation_weight * norm_citation +
            recency_weight * norm_recency
        )

        ranked.append({
            "Title": paper.get('title', 'Untitled'),
            "DOI": paper.get('url', "N/A"),
            "Citation Count": citation_count,
            "Year": year,
            "FAISS Similarity": round(faiss_sims[idx], 4),
            "BM25 Similarity": round(bm25_sims[idx], 4),
            "Combined Relevance": round(combined_relevance, 4),
            "Final Score": round(final_score, 4)
        })

    # Sort by final score and ensure uniqueness
    ranked.sort(key=lambda x: -x['Final Score'])

    # Remove any potential duplicates that might have slipped through
    unique_ranked = []
    seen_titles = set()

    for paper in ranked:
        if paper['Title'] not in seen_titles:
            seen_titles.add(paper['Title'])
            unique_ranked.append(paper)

    return unique_ranked

# Main function to fetch, rank, and return relevant papers based on a query
def get_ranked_papers(query, weights=(0.5, 0.3, 0.2), hybrid_weights=(0.6, 0.4), limit=100):
    print(f"Searching for papers on: '{query}'")
    papers = fetch_papers_by_keywords(query, limit=limit)

    if not papers:
        return "No relevant papers found."

    print(f"Found {len(papers)} papers. Creating embeddings...")

    embeddings = get_embeddings(papers)
    faiss_index = create_faiss_index(embeddings)

    print("Searching and ranking papers...")
    recommended_papers, faiss_dists, bm25_scores = search_query(query, faiss_index, papers, top_k=min(20, len(papers)))

    if not recommended_papers:
        return "No relevant papers could be ranked."

    ranked_papers = rank_by_weighted_score_hybrid(recommended_papers, faiss_dists, bm25_scores, weights, hybrid_weights)
    return ranked_papers

# Example usage
def run_query(query, limit=100):
    ranked_papers = get_ranked_papers(query, limit=limit)

    if isinstance(ranked_papers, str):
        print(ranked_papers)
        return

    # Output the ranked papers in a clean user-friendly format
    print("\nTop Recommended Papers:\n")
    for i, paper in enumerate(ranked_papers[:10], 1):
        print(f"{i}. {paper['Title']}")
        print(f"   DOI: {paper['DOI']}")
        print(f"   Citations: {paper['Citation Count']} | Year: {paper['Year']}")
        print(f"   Relevance Score: {round(paper['Final Score'] * 100, 2)}%")
        print()

    return ranked_papers

# If you want to run this directly:
if __name__ == "__main__":
    query = "Apple Silicon Architecture"
    # Lower the limit to avoid rate limiting during testing
    run_query(query, limit=50)

Searching for papers on: 'Apple Silicon Architecture'
Found 25 papers. Creating embeddings...
Searching and ranking papers...

Top Recommended Papers:

1. Genetic architecture and genomic predictive ability of apple quantitative traits across environments
   DOI: https://www.semanticscholar.org/paper/5a6983c5868baf7e7f03270be78b14f9b282d438
   Citations: 29 | Year: 2022
   Relevance Score: 65.16%

2. Drone-Based Apple Detection: Finding the Depth of Apples Using YOLOv7 Architecture with Multi-Head Attention Mechanism
   DOI: https://www.semanticscholar.org/paper/bebac95732db1431e3fa661fa4a621ec0c105dea
   Citations: 23 | Year: 2023
   Relevance Score: 57.59%

3. Apple Silicon Performance in Scientific Computing
   DOI: https://www.semanticscholar.org/paper/46cdcbad74f229151b2300221b16d626a2aa28aa
   Citations: 12 | Year: 2022
   Relevance Score: 56.71%

4. Branch Different - Spectre Attacks on Apple Silicon
   DOI: https://www.semanticscholar.org/paper/0a268e4a42d298689e4e26391093d381c

In [16]:
import requests
from sentence_transformers import SentenceTransformer
import numpy as np
import faiss
from rank_bm25 import BM25Okapi
import time

# Updated API Endpoint for Semantic Scholar
SEMANTIC_SCHOLAR_BASE_URL = "https://api.semanticscholar.org/graph/v1/paper/search"
HEADERS = {
    "User-Agent": "DeepCite/1.0",
    # Add your API key here if you have one
    # "x-api-key": "YOUR_API_KEY"
}

# Function to fetch papers from Semantic Scholar API based on a query
def fetch_papers_by_keywords(keywords, fields="title,abstract,url,year,citationCount,authors", limit=100):
    papers = []
    offset = 0
    batch_size = 25  # Smaller batch size to avoid rate limiting

    while len(papers) < limit:
        params = {
            "query": keywords,
            "fields": fields,
            "limit": min(batch_size, limit - len(papers)),
            "offset": offset
        }

        try:
            response = requests.get(SEMANTIC_SCHOLAR_BASE_URL, headers=HEADERS, params=params)

            if response.status_code == 200:
                data = response.json()
                new_papers = data.get("data", [])

                # Break if no new papers or fewer papers than requested (end of results)
                if not new_papers:
                    break

                papers.extend(new_papers)
                offset += len(new_papers)

                # Add delay between requests to avoid rate limiting
                time.sleep(1)

            elif response.status_code == 429:
                print(f"Rate limit exceeded. Waiting 30 seconds before retry...")
                time.sleep(30)  # Wait longer if rate limited
                continue
            else:
                print(f"Failed to fetch data, status code: {response.status_code}")
                break

        except Exception as e:
            print(f"Error fetching papers: {str(e)}")
            break

        # Check if we've reached all available papers
        if len(papers) >= offset:
            break

    return papers[:limit]

# Initialize the SentenceTransformer model
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

def get_embeddings(papers):
    """Generate embeddings for the list of papers using SentenceTransformer."""
    if not papers:
        return None

    titles_and_abstracts = []
    for paper in papers:
        title = paper.get('title', '')  # Default to empty string if title is missing
        abstract = paper.get('abstract', '')  # Default to empty string if abstract is missing
        title = title if title else ''  # Ensure title is a string
        abstract = abstract if abstract else ''  # Ensure abstract is a string
        titles_and_abstracts.append(title + " " + abstract)

    embeddings = model.encode(titles_and_abstracts, convert_to_tensor=True)
    return embeddings

# Function to create and store embeddings in FAISS index
def create_faiss_index(embeddings):
    """Create and store embeddings in FAISS index."""
    if embeddings is None:
        return None

    # Convert embeddings to a NumPy array for FAISS
    embeddings_np = np.array(embeddings.cpu().detach().numpy()).astype('float32')

    # Create the FAISS index (using L2 distance, Euclidean)
    index = faiss.IndexFlatL2(embeddings_np.shape[1])
    index.add(embeddings_np)  # Add embeddings to the index
    return index

def get_bm25_scores(query, papers):
    """Compute BM25 similarity scores between query and papers."""
    if not papers:
        return []

    # Preprocess documents: tokenize title + abstract
    tokenized_corpus = [
        ((paper.get('title') or '') + ' ' + (paper.get('abstract') or '')).lower().split()
        for paper in papers]
    bm25 = BM25Okapi(tokenized_corpus)

    tokenized_query = query.lower().split()
    scores = bm25.get_scores(tokenized_query)

    return scores

def search_query(query, faiss_index, papers, top_k=20):
    """Search query in FAISS index and retrieve semantic + BM25 scores."""
    if not papers or faiss_index is None:
        return [], [], []

    # Limit top_k to the number of available papers
    top_k = min(top_k, len(papers))
    if top_k == 0:
        return [], [], []

    # FAISS
    query_embedding = model.encode([query], convert_to_tensor=True)
    query_embedding_np = np.array(query_embedding.cpu().detach().numpy()).astype('float32')
    distances, indices = faiss_index.search(query_embedding_np, top_k)

    # Make sure indices are unique to avoid duplicates
    unique_indices = []
    seen = set()
    for idx in indices[0]:
        if idx not in seen and idx < len(papers):
            seen.add(idx)
            unique_indices.append(idx)

    recommended_papers = [papers[i] for i in unique_indices]

    # Get corresponding distances for the unique indices
    faiss_dists = [distances[0][list(indices[0]).index(i)] for i in unique_indices]

    # BM25
    bm25_scores = get_bm25_scores(query, recommended_papers)

    return recommended_papers, faiss_dists, bm25_scores

def rank_by_weighted_score_hybrid(papers, faiss_dists, bm25_scores, weights=(0.5, 0.3, 0.2), hybrid_weights=(0.6, 0.4)):
    """Rank papers with hybrid relevance (FAISS + BM25) and citation/recency."""
    if not papers:
        return []

    relevance_weight, citation_weight, recency_weight = weights
    w_faiss, w_bm25 = hybrid_weights

    # Normalize FAISS distances into similarity
    max_dist = max(faiss_dists) if faiss_dists else 1
    max_dist = max_dist + 1e-5  # Avoid division by zero
    faiss_sims = [1 - (d / max_dist) for d in faiss_dists]

    # Normalize BM25 scores
    if isinstance(bm25_scores, np.ndarray):
        max_bm25 = np.max(bm25_scores) if bm25_scores.size > 0 else 1
        max_bm25 = max_bm25 + 1e-5  # Avoid division by zero
        bm25_sims = [float(s) / max_bm25 for s in bm25_scores]
    else:
        max_bm25 = max(bm25_scores) if bm25_scores else 1
        max_bm25 = max_bm25 + 1e-5  # Avoid division by zero
        bm25_sims = [s / max_bm25 for s in bm25_scores]

    # Fill in any missing scores if lengths don't match
    while len(faiss_sims) < len(papers):
        faiss_sims.append(0)
    while len(bm25_sims) < len(papers):
        bm25_sims.append(0)

    # Find max values for normalization, with safeguards
    max_citation = max((p.get('citationCount', 0) for p in papers), default=1) + 1e-5

    # Extract years with default for missing values
    years = [p.get('year', 2000) for p in papers]
    years = [y for y in years if y is not None]  # Filter out None values

    if not years:  # If all years are None
        max_year = 2023
        min_year = 2000
    else:
        max_year = max(years) + 1e-5
        min_year = min(years)

    # Ensure we don't divide by zero
    year_range = max(max_year - min_year, 1e-5)

    # Track paper IDs to avoid duplicates, we create a set here
    seen_papers = set()
    ranked = []

    for i, paper in enumerate(papers):
        # Skip duplicates based on paper ID or URL
        paper_id = paper.get('url', '') or paper.get('title', '')
        if paper_id in seen_papers:
            continue
        seen_papers.add(paper_id)

        # Get values with defaults
        citation_count = paper.get('citationCount', 0) or 0
        year = paper.get('year', min_year) or min_year

        # Ensure index is in range
        idx = min(i, len(faiss_sims)-1)

        combined_relevance = (w_faiss * faiss_sims[idx]) + (w_bm25 * bm25_sims[idx])
        norm_citation = citation_count / max_citation
        norm_recency = (year - min_year) / year_range

        final_score = (
            relevance_weight * combined_relevance +
            citation_weight * norm_citation +
            recency_weight * norm_recency
        )

        ranked.append({
            "Title": paper.get('title', 'Untitled'),
            "DOI": paper.get('url', "N/A"),
            "Citation Count": citation_count,
            "Year": year,
            "FAISS Similarity": round(faiss_sims[idx], 4),
            "BM25 Similarity": round(bm25_sims[idx], 4),
            "Combined Relevance": round(combined_relevance, 4),
            "Final Score": round(final_score, 4)
        })

    ranked.sort(key=lambda x: -x['Final Score'])

    unique_ranked = []
    seen_titles = set()

    for paper in ranked:
        if paper['Title'] not in seen_titles:
            seen_titles.add(paper['Title'])
            unique_ranked.append(paper)

    return unique_ranked


def get_ranked_papers(query, weights=(0.5, 0.3, 0.2), hybrid_weights=(0.6, 0.4), limit=100):
    print(f"Searching for papers on: '{query}'")
    papers = fetch_papers_by_keywords(query, limit=limit)

    if not papers:
        return "No relevant papers found."

    print(f"Found {len(papers)} papers. Creating embeddings...")

    embeddings = get_embeddings(papers)
    faiss_index = create_faiss_index(embeddings)

    print("Searching and ranking papers...")
    recommended_papers, faiss_dists, bm25_scores = search_query(query, faiss_index, papers, top_k=min(20, len(papers)))

    if not recommended_papers:
        return "No relevant papers could be ranked."

    ranked_papers = rank_by_weighted_score_hybrid(recommended_papers, faiss_dists, bm25_scores, weights, hybrid_weights)
    return ranked_papers


def run_query(query, limit=100):
    ranked_papers = get_ranked_papers(query, limit=limit)

    if isinstance(ranked_papers, str):
        print(ranked_papers)
        return

    # Output the ranked papers in a clean user-friendly format
    print("\nTop Recommended Papers:\n")
    for i, paper in enumerate(ranked_papers[:10], 1):
        print(f"{i}. {paper['Title']}")
        print(f"   DOI: {paper['DOI']}")
        print(f"   Citations: {paper['Citation Count']} | Year: {paper['Year']}")
        print(f"   Relevance Score: {round(paper['Final Score'] * 100, 2)}%")
        print()

    return ranked_papers

if __name__ == "__main__":
    query = "Database Caching"
    run_query(query, limit=50)

Searching for papers on: 'Database Caching'
Found 25 papers. Creating embeddings...
Searching and ranking papers...

Top Recommended Papers:

1. Middle-tier database caching for e-business
   DOI: https://www.semanticscholar.org/paper/7f222372aaf853213d84a4577c54bdfe25420c9e
   Citations: 184 | Year: 2002
   Relevance Score: 60.4%

2. An analysis of database caching policies
   DOI: https://www.semanticscholar.org/paper/98c204c84f4debfacb4f7e4a6d2722e104cdc6bb
   Citations: 1 | Year: 2016
   Relevance Score: 58.17%

3. MTCache: transparent mid-tier database caching in SQL server
   DOI: https://www.semanticscholar.org/paper/2f6ac431e923495be42d3e2018908e30fd3f7b41
   Citations: 136 | Year: 2004
   Relevance Score: 57.13%

4. CacheOptimizer: helping developers configure caching frameworks for hibernate-based database-centric web applications
   DOI: https://www.semanticscholar.org/paper/8037e72f3d67abe3810971e71417da45bb5a1753
   Citations: 63 | Year: 2016
   Relevance Score: 53.18%

5.