In [1]:
import requests
# Semantic Scholar Graph API bulk paper-search endpoint (queried by fetch_papers_by_keywords)
SEMANTIC_SCHOLAR_BASE_URL = "https://api.semanticscholar.org/graph/v1/paper/search/bulk"
# Headers sent with every request; the User-Agent identifies this client to the API
HEADERS = {"User-Agent": "DeepCite/1.0"}

# Fetch papers from the Semantic Scholar bulk-search endpoint for a keyword query
def fetch_papers_by_keywords(keywords, fields="title,abstract,url,year,citationCount", limit=1000):
    """Fetch up to ``limit`` papers matching ``keywords`` from Semantic Scholar.

    The bulk-search endpoint pages through results with a continuation
    ``token`` (it has no ``offset`` parameter), so each follow-up request
    passes back the token returned by the previous page.

    Args:
        keywords: Query string sent as the ``query`` parameter.
        fields: Comma-separated list of paper fields to request.
        limit: Maximum number of papers to return.

    Returns:
        A list of at most ``limit`` paper dicts, in the order the API returned
        them. The list is empty when nothing matched or the first request
        failed; if a later page fails, the papers collected so far are returned.

    Raises:
        Exceptions raised by ``requests`` (connection errors, timeouts) are
        not caught here and propagate to the caller.
    """
    papers = []
    token = None
    while len(papers) < limit:
        params = {"query": keywords, "fields": fields}
        if token:
            params["token"] = token
        # A timeout keeps a stalled connection from hanging the notebook forever.
        response = requests.get(SEMANTIC_SCHOLAR_BASE_URL, headers=HEADERS, params=params, timeout=30)
        if response.status_code != 200:
            print(f"Failed to fetch data, status code: {response.status_code}")
            break
        data = response.json()
        new_papers = data.get("data") or []
        if not new_papers:
            # No (more) results: stop instead of re-requesting forever.
            break
        papers.extend(new_papers)
        token = data.get("token")
        if not token:
            # The API omits the token once the last page has been served.
            break
    return papers[:limit]

In [14]:
from sentence_transformers import SentenceTransformer
import numpy as np

# Load the sentence-embedding model once at notebook level; get_embeddings and
# search_query both encode text through this shared instance, so paper and
# query vectors come from the same model.
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

def get_embeddings(papers):
    """Encode every paper's title and abstract into one embedding each.

    A missing, ``None`` or empty title/abstract is treated as an empty string.
    Title and abstract are joined with a single space and the resulting texts
    are encoded in one call to the shared ``model`` with
    ``convert_to_tensor=True``; that call's result is returned unchanged.
    """
    texts = [
        (entry.get('title') or '') + " " + (entry.get('abstract') or '')
        for entry in papers
    ]
    return model.encode(texts, convert_to_tensor=True)

In [15]:
import faiss
# Build a flat L2 FAISS index from a batch of embeddings
def create_faiss_index(embeddings):
    """Return a ``faiss.IndexFlatL2`` populated with ``embeddings``.

    ``embeddings`` is moved to the CPU, detached and converted to a float32
    NumPy array before being added. The index dimensionality is taken from
    the array's second axis.
    """
    host_tensor = embeddings.cpu().detach()
    vectors = np.array(host_tensor.numpy()).astype('float32')

    # Exact (brute-force) index using Euclidean distance
    dimension = vectors.shape[1]
    flat_index = faiss.IndexFlatL2(dimension)
    flat_index.add(vectors)
    return flat_index

In [16]:
# Search the FAISS index for a query and return the closest papers
def search_query(query, faiss_index, papers, top_k=5):
    """Return the papers nearest to ``query`` and their distances.

    Args:
        query: Text to embed with the shared ``model``.
        faiss_index: Index to search; row ``i`` of the index must correspond
            to ``papers[i]``.
        papers: Sequence of paper records in the same order as the index rows.
        top_k: Maximum number of neighbours to request.

    Returns:
        ``(recommended_papers, distances)`` where both have the same length
        (at most ``top_k``) and ``distances[j]`` belongs to
        ``recommended_papers[j]``.
    """
    # Embed the query the same way the papers were embedded
    query_embedding = model.encode([query], convert_to_tensor=True)
    query_embedding_np = np.array(query_embedding.cpu().detach().numpy()).astype('float32')

    distances, indices = faiss_index.search(query_embedding_np, top_k)

    # FAISS fills unused result slots with index -1 when the index holds fewer
    # than top_k vectors; papers[-1] would silently return the last paper, so
    # those padded slots are dropped from both outputs.
    found = indices[0] >= 0
    recommended_papers = [papers[i] for i in indices[0][found]]
    return recommended_papers, distances[0][found]

In [17]:
# Rank papers by citation count, using similarity distance as the tie-breaker
def rank_by_citations_and_similarity(papers, distances):
    """Pair each paper with its distance and sort the pairs.

    Pairs are ordered by citation count (highest first); papers with equal
    citation counts are ordered by distance (smallest first). A paper whose
    ``citationCount`` is missing or ``None`` is ranked as having 0 citations
    instead of raising.

    Args:
        papers: Sequence of paper dicts.
        distances: Sequence of distances, aligned with ``papers``.

    Returns:
        A new list of ``(paper, distance)`` tuples in ranked order.
    """
    def sort_key(pair):
        paper, distance = pair
        citations = paper.get('citationCount') or 0
        return (-citations, distance)

    return sorted(zip(papers, distances), key=sort_key)

In [18]:
# Main entry point: fetch, embed, index, search and rank papers for a query
def get_ranked_papers(query, top_k=10, limit=1000):
    """Fetch papers for ``query`` and return the most relevant ones, ranked.

    Pipeline: fetch candidates from Semantic Scholar, embed their titles and
    abstracts, index the embeddings with FAISS, retrieve the nearest
    neighbours of the query, then rank those by citation count and distance.

    Args:
        query: Search text; used both as the API query and as the text that
            is embedded for the similarity search.
        top_k: Number of nearest papers to retrieve from the index.
        limit: Maximum number of candidate papers to fetch.

    Returns:
        A list of dicts with the keys ``"Title"``, ``"DOI"``,
        ``"Citation Count"``, ``"Year"`` and ``"Similarity Distance"``.
        When no papers were fetched, the string
        ``"No relevant papers found."`` is returned instead (kept for
        backward compatibility — callers should check for it before
        iterating).
    """
    # Step 1: fetch candidate papers
    papers = fetch_papers_by_keywords(query, limit=limit)
    if not papers:
        return "No relevant papers found."

    # Steps 2-3: embed titles/abstracts and index the embeddings
    embeddings = get_embeddings(papers)
    faiss_index = create_faiss_index(embeddings)

    # Step 4: nearest neighbours of the query
    recommended_papers, distances = search_query(query, faiss_index, papers, top_k=top_k)

    # Step 5: rank by citation count, then similarity
    ranked_papers = rank_by_citations_and_similarity(recommended_papers, distances)

    # Step 6: format the output. Fields are read with .get so a record that
    # lacks one of them yields None rather than raising KeyError.
    return [
        {
            "Title": paper.get('title'),
            "DOI": paper.get('url', "N/A"),  # holds the paper URL; a DOI is not among the requested fields
            "Citation Count": paper.get('citationCount'),
            "Year": paper.get('year'),
            "Similarity Distance": dist,
        }
        for paper, dist in ranked_papers
    ]

In [19]:
# Example usage
query = "Supervised machine learning use in healthcare"
ranked_papers = get_ranked_papers(query)

# Output the ranked papers. get_ranked_papers returns a plain message string
# when nothing was fetched; looping over that string would print it one
# character per line, so it is printed as a whole instead.
if isinstance(ranked_papers, str):
    print(ranked_papers)
else:
    for paper in ranked_papers:
        print(paper)

{'Title': 'A Comprehensive Review on Machine Learning in Healthcare Industry: Classification, Restrictions, Opportunities and Challenges', 'DOI': 'https://www.semanticscholar.org/paper/ac2cffc4b9f96bae24809d738777ae897094ae33', 'Citation Count': 116, 'Year': 2023, 'Similarity Distance': np.float32(18.02309)}
{'Title': 'A collaborative empirical analysis on machine learning based disease prediction in health care system', 'DOI': 'https://www.semanticscholar.org/paper/b564b65d08fba33dfbc4a55d3a250cb008fb706b', 'Citation Count': 17, 'Year': 2023, 'Similarity Distance': np.float32(16.87191)}
{'Title': 'Artificial Intelligence and Machine Learning in Healthcare', 'DOI': 'https://www.semanticscholar.org/paper/276b72329076a2aedb552f310bb5bbd5168a9a0f', 'Citation Count': 15, 'Year': 2023, 'Similarity Distance': np.float32(12.856625)}
{'Title': 'Machine learning applied to healthcare: a conceptual review', 'DOI': 'https://www.semanticscholar.org/paper/40620a51a19bb0cb61a27f920f3f9235c0b44b6b', 