In [1]:
import requests
# Updated API Endpoint for Semantic Scholar
SEMANTIC_SCHOLAR_BASE_URL = "https://api.semanticscholar.org/graph/v1/paper/search/bulk"
HEADERS = {"User-Agent": "DeepCite/1.0"}

# Function to fetch papers from Semantic Scholar API based on a query
def fetch_papers_by_keywords(keywords, fields="title,abstract,url,year,citationCount", limit=1000):
    papers = []
    offset = 0
    while len(papers) < limit:
        params = {
            "query": keywords,
            "fields": fields,
            "limit": min(limit - len(papers), 100),  # Fetch in batches of 100
            "offset": offset
        }
        response = requests.get(SEMANTIC_SCHOLAR_BASE_URL, headers=HEADERS, params=params)
        if response.status_code == 200:
            data = response.json()
            new_papers = data.get("data", [])
            papers.extend(new_papers)
            offset += 100  # Move the offset to get the next set of papers
        else:
            print(f"Failed to fetch data, status code: {response.status_code}")
            break
    return papers[:limit]

In [2]:
from sentence_transformers import SentenceTransformer
import numpy as np

# Initialize the SentenceTransformer model
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

def get_embeddings(papers):
    """Generate embeddings for the list of papers using SentenceTransformer."""
    titles_and_abstracts = []
    for paper in papers:
        title = paper.get('title', '')  # Default to empty string if title is missing
        abstract = paper.get('abstract', '')  # Default to empty string if abstract is missing
        title = title if title else ''  # Ensure title is a string
        abstract = abstract if abstract else ''  # Ensure abstract is a string
        titles_and_abstracts.append(title + " " + abstract)
    embeddings = model.encode(titles_and_abstracts, convert_to_tensor=True)
    return embeddings

modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


config_sentence_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/3.51k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/314 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [3]:
import faiss
# Function to create and store embeddings in FAISS index
def create_faiss_index(embeddings):
    """Create and store embeddings in FAISS index."""
    # Convert embeddings to a NumPy array for FAISS
    embeddings_np = np.array(embeddings.cpu().detach().numpy()).astype('float32')
    
    # Create the FAISS index (using L2 distance, Euclidean)
    index = faiss.IndexFlatL2(embeddings_np.shape[1])
    index.add(embeddings_np)  # Add embeddings to the index
    return index

In [4]:
# Function to search a query in FAISS index and retrieve the most relevant papers
def search_query(query, faiss_index, papers, top_k=5):
    """Search query in FAISS index and retrieve the most relevant papers."""
    # Generate the embedding for the user query
    query_embedding = model.encode([query], convert_to_tensor=True)
    query_embedding_np = np.array(query_embedding.cpu().detach().numpy()).astype('float32')
    
    # Perform the search in the FAISS index
    distances, indices = faiss_index.search(query_embedding_np, top_k)
    
    # Retrieve the top K papers from the indices
    recommended_papers = [papers[i] for i in indices[0]]
    return recommended_papers, distances[0]

In [5]:
# Function to rank papers by citation count and similarity
def rank_by_citations_and_similarity(papers, distances):
    """Rank papers by citation count and similarity distance."""
    papers_with_distance = list(zip(papers, distances))
    
    # Sort by citation count first (descending), then by similarity (ascending)
    papers_with_distance.sort(key=lambda x: (-x[0]['citationCount'], x[1]))
    
    return papers_with_distance

In [9]:
def rank_by_weighted_score(papers, distances, weights=(0.5, 0.3, 0.2)):
    """Rank papers using a weighted score: relevance, citations, and recency."""
    relevance_weight, citation_weight, recency_weight = weights
    
    # Convert FAISS distances to similarity (smaller distance = higher similarity)
    max_dist = max(distances) + 1e-5  # avoid division by zero
    similarity_scores = [1 - (d / max_dist) for d in distances]

    # Normalize citation counts and years
    max_citation = max(paper['citationCount'] for paper in papers) + 1e-5
    max_year = max(paper['year'] for paper in papers) + 1e-5
    min_year = min(paper['year'] for paper in papers)

    ranked_list = []
    for paper, sim in zip(papers, similarity_scores):
        norm_citation = paper['citationCount'] / max_citation
        norm_recency = (paper['year'] - min_year) / (max_year - min_year)

        final_score = (
            relevance_weight * sim +
            citation_weight * norm_citation +
            recency_weight * norm_recency
        )

        ranked_list.append({
            "Title": paper['title'],
            "DOI": paper.get('url', "N/A"),
            "Citation Count": paper['citationCount'],
            "Year": paper['year'],
            "Similarity Score": round(sim, 4),
            "Final Score": round(final_score, 4)
        })

    # Sort by final weighted score in descending order
    ranked_list.sort(key=lambda x: -x['Final Score'])

    return ranked_list


In [10]:
# Main function to fetch, rank, and return relevant papers based on a query
def get_ranked_papers(query, weights=(0.5, 0.3, 0.2)):
    """Main function to fetch, rank, and return relevant papers based on query."""
    # Step 1: Fetch papers from Semantic Scholar API
    papers = fetch_papers_by_keywords(query, limit=1000)
    if not papers:
        return "No relevant papers found."
    
    # Step 2: Generate embeddings for the fetched papers (using title and abstract)
    embeddings = get_embeddings(papers)
    
    # Step 3: Create FAISS index with the embeddings
    faiss_index = create_faiss_index(embeddings)
    
    # Step 4: Search the query in the FAISS index to get top-k relevant papers
    recommended_papers, distances = search_query(query, faiss_index, papers, top_k=10)
    
    # Step 5: Rank papers based on citation count and similarity
    # ranked_papers = rank_by_citations_and_similarity(recommended_papers, distances)
    ranked_papers = rank_by_weighted_score(recommended_papers, distances, weights=weights)
    
    # # Step 6: Format the result for output
    # ranked_results = []
    # for paper, dist in ranked_papers:
    #     ranked_results.append({
    #         "Title": paper['title'],
    #         "DOI": paper.get('url', "N/A"),  # Semantic Scholar doesn't always return DOI
    #         "Citation Count": paper['citationCount'],
    #         "Year": paper['year'],
    #         "Similarity Distance": dist
    #     })
    
    return ranked_papers

In [11]:
# Example usage
query = "Supervised Machine Learning in Healthcare"
ranked_papers = get_ranked_papers(query)

# Output the ranked papers
for paper in ranked_papers:
    print(paper)

{'Title': 'APPLICATION OF MACHINE LEARNING IN HEALTHCARE', 'DOI': 'https://www.semanticscholar.org/paper/3561b270bbe783369b25bf048edc739e339340d2', 'Citation Count': 1, 'Year': 2024, 'Similarity Score': np.float32(0.5399), 'Final Score': np.float32(0.4731)}
{'Title': 'Application of Machine Learning in Healthcare: An Analysis', 'DOI': 'https://www.semanticscholar.org/paper/4b4eb05c5ec977105fa95f90cb44baa6a4f7aa60', 'Citation Count': 5, 'Year': 2022, 'Similarity Score': np.float32(0.4429), 'Final Score': np.float32(0.3569)}
{'Title': 'Supervised machine learning tools: a tutorial for clinicians', 'DOI': 'https://www.semanticscholar.org/paper/64e76a22692fa6f5761a70adbc58f44e9078520e', 'Citation Count': 97, 'Year': 2020, 'Similarity Score': np.float32(0.0), 'Final Score': np.float32(0.34)}
{'Title': 'Machine Learning in Healthcare Data Analysis: A Survey', 'DOI': 'https://www.semanticscholar.org/paper/1d8a78f3f740a01905e925aeb7937df010dfa1df', 'Citation Count': 70, 'Year': 2019, 'Similari