In [20]:
import requests
from sentence_transformers import SentenceTransformer
import numpy as np
import faiss
from rank_bm25 import BM25Okapi

In [None]:
# Updated API Endpoint for Semantic Scholar
SEMANTIC_SCHOLAR_BASE_URL = "https://api.semanticscholar.org/graph/v1/paper/search/bulk"
HEADERS = {"User-Agent": "DeepCite/1.0"}

# Function to fetch papers from Semantic Scholar API based on a query
def fetch_papers_by_keywords(keywords, fields="title,abstract,url,year,citationCount,authors", limit=1000):
    papers = []
    offset = 0
    while len(papers) < limit:
        params = {
            "query": keywords,
            "fields": fields,
            "limit": min(limit - len(papers), 100),  # Fetch in batches of 100
            "offset": offset
        }
        response = requests.get(SEMANTIC_SCHOLAR_BASE_URL, headers=HEADERS, params=params)
        if response.status_code == 200:
            data = response.json()
            new_papers = data.get("data", [])
            papers.extend(new_papers)
            offset += 100  # Move the offset to get the next set of papers
        else:
            print(f"Failed to fetch data, status code: {response.status_code}")
            break
    return papers[:limit]

In [31]:
# Initialize the SentenceTransformer model
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

def get_embeddings(papers):
    """Generate embeddings for the list of papers using SentenceTransformer."""
    titles_and_abstracts = []
    for paper in papers:
        title = paper.get('title', '')  # Default to empty string if title is missing
        abstract = paper.get('abstract', '')  # Default to empty string if abstract is missing
        title = title if title else ''  # Ensure title is a string
        abstract = abstract if abstract else ''  # Ensure abstract is a string
        titles_and_abstracts.append(title + " " + abstract)
    embeddings = model.encode(titles_and_abstracts, convert_to_tensor=True)
    return embeddings

In [32]:
# Function to create and store embeddings in FAISS index
def create_faiss_index(embeddings):
    """Create and store embeddings in FAISS index."""
    # Convert embeddings to a NumPy array for FAISS
    embeddings_np = np.array(embeddings.cpu().detach().numpy()).astype('float32')
    
    # Create the FAISS index (using L2 distance, Euclidean)
    index = faiss.IndexFlatL2(embeddings_np.shape[1])
    index.add(embeddings_np)  # Add embeddings to the index
    return index

In [33]:
def get_bm25_scores(query, papers):
    """Compute BM25 similarity scores between query and papers."""
    # Preprocess documents: tokenize title + abstract
    tokenized_corpus = [
    ((paper.get('title') or '') + ' ' + (paper.get('abstract') or '')).lower().split()
    for paper in papers]
    bm25 = BM25Okapi(tokenized_corpus)

    tokenized_query = query.lower().split()
    scores = bm25.get_scores(tokenized_query)
    
    return scores

In [34]:
def search_query(query, faiss_index, papers, top_k=20):
    """Search query in FAISS index and retrieve semantic + BM25 scores."""
    # FAISS
    query_embedding = model.encode([query], convert_to_tensor=True)
    query_embedding_np = np.array(query_embedding.cpu().detach().numpy()).astype('float32')
    distances, indices = faiss_index.search(query_embedding_np, top_k)

    recommended_papers = [papers[i] for i in indices[0]]
    faiss_dists = distances[0]

    # BM25
    bm25_scores = get_bm25_scores(query, recommended_papers)

    return recommended_papers, faiss_dists, bm25_scores

In [35]:
# Function to rank papers by citation count and similarity
def rank_by_citations_and_similarity(papers, distances):
    """Rank papers by citation count and similarity distance."""
    papers_with_distance = list(zip(papers, distances))
    
    # Sort by citation count first (descending), then by similarity (ascending)
    papers_with_distance.sort(key=lambda x: (-x[0]['citationCount'], x[1]))
    
    return papers_with_distance

In [36]:
def rank_by_weighted_score_hybrid(papers, faiss_dists, bm25_scores, weights=(0.5, 0.3, 0.2), hybrid_weights=(0.6, 0.4)):
    """Rank papers with hybrid relevance (FAISS + BM25) and citation/recency."""
    relevance_weight, citation_weight, recency_weight = weights
    w_faiss, w_bm25 = hybrid_weights

    # Normalize FAISS distances into similarity
    max_dist = max(faiss_dists) + 1e-5
    faiss_sims = [1 - (d / max_dist) for d in faiss_dists]

    # Normalize BM25 scores
    max_bm25 = max(bm25_scores) + 1e-5
    bm25_sims = [s / max_bm25 for s in bm25_scores]

    # Normalize citation and recency
    max_citation = max(p['citationCount'] for p in papers) + 1e-5
    max_year = max(p['year'] for p in papers) + 1e-5
    min_year = min(p['year'] for p in papers)

    ranked = []
    for i, paper in enumerate(papers):
        combined_relevance = (w_faiss * faiss_sims[i]) + (w_bm25 * bm25_sims[i])
        norm_citation = paper['citationCount'] / max_citation
        norm_recency = (paper['year'] - min_year) / (max_year - min_year)

        final_score = (
            relevance_weight * combined_relevance +
            citation_weight * norm_citation +
            recency_weight * norm_recency
        )

        ranked.append({
            "Title": paper['title'],
            "DOI": paper.get('url', "N/A"),
            "Citation Count": paper['citationCount'],
            "Year": paper['year'],
            "FAISS Similarity": round(faiss_sims[i], 4),
            "BM25 Similarity": round(bm25_sims[i], 4),
            "Combined Relevance": round(combined_relevance, 4),
            "Final Score": round(final_score, 4)
        })

    ranked.sort(key=lambda x: -x['Final Score'])
    return ranked

In [37]:
# Main function to fetch, rank, and return relevant papers based on a query
def get_ranked_papers(query, weights=(0.5, 0.3, 0.2), hybrid_weights=(0.6, 0.4)):
    papers = fetch_papers_by_keywords(query, limit=1000)
    if not papers:
        return "No relevant papers found."

    embeddings = get_embeddings(papers)
    faiss_index = create_faiss_index(embeddings)

    recommended_papers, faiss_dists, bm25_scores = search_query(query, faiss_index, papers, top_k=20)

    ranked_papers = rank_by_weighted_score_hybrid(recommended_papers, faiss_dists, bm25_scores, weights, hybrid_weights)
    return ranked_papers

In [38]:
# Example usage
query = "Supervised Machine Learning in Healthcare"
ranked_papers = get_ranked_papers(query)

# Output the ranked papers
for paper in ranked_papers:
    print(paper)

{'Title': 'Supervised machine learning tools: a tutorial for clinicians', 'DOI': 'https://www.semanticscholar.org/paper/64e76a22692fa6f5761a70adbc58f44e9078520e', 'Citation Count': 98, 'Year': 2020, 'FAISS Similarity': np.float32(0.1252), 'BM25 Similarity': np.float64(0.8175), 'Combined Relevance': np.float64(0.4021), 'Final Score': np.float64(0.6122)}
{'Title': 'Machine Learning in Healthcare Data Analysis: A Survey', 'DOI': 'https://www.semanticscholar.org/paper/1d8a78f3f740a01905e925aeb7937df010dfa1df', 'Citation Count': 70, 'Year': 2019, 'FAISS Similarity': np.float32(0.3404), 'BM25 Similarity': np.float64(0.7024), 'Combined Relevance': np.float64(0.4852), 'Final Score': np.float64(0.5458)}
{'Title': 'APPLICATION OF MACHINE LEARNING IN HEALTHCARE', 'DOI': 'https://www.semanticscholar.org/paper/3561b270bbe783369b25bf048edc739e339340d2', 'Citation Count': 1, 'Year': 2024, 'FAISS Similarity': np.float32(0.5975), 'BM25 Similarity': np.float64(0.7833), 'Combined Relevance': np.float64(0