In [1]:
!pip install requests pandas tqdm
!pip install sentence_transformers
!pip install --upgrade ipywidgets
!pip install --upgrade numpy
!pip install faiss-cpu

Collecting pandas
  Downloading pandas-2.2.3-cp313-cp313-win_amd64.whl.metadata (19 kB)
Collecting tqdm
  Downloading tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Collecting numpy>=1.26.0 (from pandas)
  Downloading numpy-2.2.3-cp313-cp313-win_amd64.whl.metadata (60 kB)
Collecting pytz>=2020.1 (from pandas)
  Downloading pytz-2025.1-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas)
  Downloading tzdata-2025.1-py2.py3-none-any.whl.metadata (1.4 kB)
Downloading pandas-2.2.3-cp313-cp313-win_amd64.whl (11.5 MB)
   ---------------------------------------- 0.0/11.5 MB ? eta -:--:--
   --- ------------------------------------ 1.0/11.5 MB 4.7 MB/s eta 0:00:03
   ------- -------------------------------- 2.1/11.5 MB 4.9 MB/s eta 0:00:02
   ---------- ----------------------------- 3.1/11.5 MB 5.2 MB/s eta 0:00:02
   -------------- ------------------------- 4.2/11.5 MB 5.3 MB/s eta 0:00:02
   --------------- ------------------------ 4.5/11.5 MB 4.4 MB/s eta 0:00:02
   

In [None]:
import requests
from sentence_transformers import SentenceTransformer
import numpy as np
import faiss

In [1]:
import requests

CROSSREF_BASE_URL = "https://api.crossref.org/works"
HEADERS = {"User-Agent": "Mozilla/5.0"}

def fetch_crossref_metadata(title, rows=1000):
    """Fetch metadata from CrossRef API using the paper title."""
    params = {"query.title": title, "rows": rows}  # Increase rows to 1000
    response = requests.get(CROSSREF_BASE_URL, params=params, headers=HEADERS)
    
    if response.status_code == 200:
        data = response.json()
        if "message" in data and "items" in data["message"]:
            papers = data["message"]["items"]
            result = []
            for paper in papers:
                paper_title = paper.get("title", ["Unknown"])[0]
                result.append({
                    "title": paper_title,
                    "doi": paper.get("DOI", "N/A"),
                    "citation_count": paper.get("is-referenced-by-count", 0),
                    "year": paper.get("published-print", {}).get("date-parts", [[None]])[0][0]
                })
            return result  # Return the list of papers

    return None  # No valid papers found

In [2]:
from sentence_transformers import SentenceTransformer
import numpy as np

# Load a pre-trained model (you can try other models like SPECTER for academic papers)
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

def get_embeddings(papers):
    """Generate embeddings for the list of papers using SentenceTransformer."""
    titles = [paper['title'] for paper in papers]
    embeddings = model.encode(titles, convert_to_tensor=True)
    return embeddings

In [4]:
import faiss

def create_faiss_index(embeddings):
    """Create and store embeddings in FAISS index."""
    # Convert embeddings to a NumPy array for FAISS
    embeddings_np = np.array(embeddings.cpu().detach().numpy()).astype('float32')
    
    # Create the FAISS index
    index = faiss.IndexFlatL2(embeddings_np.shape[1])  # L2 distance (Euclidean)
    index.add(embeddings_np)  # Add embeddings to the index
    return index

In [5]:
def search_query(query, faiss_index, papers, top_k=5):
    """Search query in FAISS index and retrieve the most relevant papers."""
    # Generate the embedding for the user query
    query_embedding = model.encode([query], convert_to_tensor=True)
    query_embedding_np = np.array(query_embedding.cpu().detach().numpy()).astype('float32')
    
    # Perform the search in the FAISS index
    distances, indices = faiss_index.search(query_embedding_np, top_k)
    
    # Retrieve the top K papers from the indices
    recommended_papers = [papers[i] for i in indices[0]]
    return recommended_papers, distances[0]

In [6]:
def rank_by_citations_and_similarity(recommended_papers, distances):
    """Rank papers by citation count and semantic similarity."""
    ranked_papers = sorted(zip(recommended_papers, distances),
                           key=lambda x: (x[0]['citation_count'], -x[1]), reverse=True)
    return ranked_papers

In [7]:
def get_ranked_papers(query):
    """Main function to fetch, rank, and return relevant papers based on query."""
    # Step 1: Fetch papers from CrossRef API
    papers = fetch_crossref_metadata(query, rows=1000)
    if not papers:
        return "No relevant papers found."
    
    # Step 2: Generate embeddings for the fetched papers
    embeddings = get_embeddings(papers)
    
    # Step 3: Create FAISS index with the embeddings
    faiss_index = create_faiss_index(embeddings)
    
    # Step 4: Search the query in the FAISS index to get top-k relevant papers
    recommended_papers, distances = search_query(query, faiss_index, papers, top_k=10)
    
    # Step 5: Rank papers based on citation count and similarity
    ranked_papers = rank_by_citations_and_similarity(recommended_papers, distances)
    
    # Step 6: Format the result for output
    ranked_results = []
    for paper, dist in ranked_papers:
        ranked_results.append({
            "Title": paper['title'],
            "DOI": paper['doi'],
            "Citation Count": paper['citation_count'],
            "Year": paper['year'],
            "Similarity Distance": dist
        })
    
    return ranked_results

In [8]:
# Example usage
query = "Supervised Machine Learning usage in Healthcare"
ranked_papers = get_ranked_papers(query)
for paper in ranked_papers:
    print(f"Title: {paper['Title']}, DOI: {paper['DOI']}, Citation Count: {paper['Citation Count']}, Year: {paper['Year']}, Similarity Distance: {paper['Similarity Distance']}")

Title: A Study of Machine Learning in Healthcare, DOI: 10.1109/compsac.2017.164, Citation Count: 133, Year: 2017, Similarity Distance: 7.934821128845215
Title: Appropriate use of machine learning in healthcare, DOI: 10.1016/j.ibmed.2021.100041, Citation Count: 13, Year: 2021, Similarity Distance: 7.723365783691406
Title: Application of Machine Learning Techniques in Healthcare, DOI: 10.4018/978-1-5225-9902-9.ch015, Citation Count: 12, Year: 2020, Similarity Distance: 8.307032585144043
Title: Use of Machine Learning in Healthcare, DOI: 10.1201/9781003322597-12, Citation Count: 0, Year: 2022, Similarity Distance: 7.713277816772461
Title: Use of Machine Learning in Healthcare, DOI: 10.1002/9781119769293.ch13, Citation Count: 0, Year: 2022, Similarity Distance: 7.713277816772461
Title: A Survey of Machine Learning in Healthcare, DOI: 10.1201/9781003241409-1, Citation Count: 0, Year: 2022, Similarity Distance: 7.787750244140625
Title: Exploring the Use of Machine Learning in Healthcare, DOI

In [10]:
# Fetch papers from CrossRef (this step is independent of ranking)
papers = fetch_crossref_metadata(query, rows=1000)

# Print the top 10 papers fetched from CrossRef
top_10_papers_from_crossref = papers[:10]
for paper in top_10_papers_from_crossref:
    print(f"Title: {paper['title']}, DOI: {paper['doi']}, Citation Count: {paper['citation_count']}, Year: {paper['year']}")

Title: Supervised Machine Learning Chatbots for Perinatal Mental Healthcare, DOI: 10.1109/ichci51889.2020.00086, Citation Count: 20, Year: 2020
Title: Supervised Machine Learning Techniques for Power Consumption Usage Level Prediction, DOI: 10.1109/csecs60003.2023.10428312, Citation Count: 0, Year: 2023
Title: Prognosis of Supervised Machine Learning Algorithms in Healthcare Sector, DOI: 10.1109/rteict52294.2021.9573665, Citation Count: 1, Year: 2021
Title: Machine Learning – Supervised Learning, DOI: 10.1201/9780429326813-6, Citation Count: 0, Year: 2019
Title: Automatic emotion recognition in healthcare data using supervised machine learning, DOI: 10.7717/peerj-cs.751, Citation Count: 14, Year: None
Title: Supervised learning (machine learning), DOI: 10.53347/rid-56096, Citation Count: 0, Year: None
Title: Supervised Machine Learning, DOI: 10.1007/978-981-97-0217-6_8, Citation Count: 0, Year: 2024
Title: Supervised machine learning, DOI: 10.1007/978-3-662-67882-4_6, Citation Count: 1