In [1]:
!pip install requests pandas tqdm





In [2]:
import requests
import json

# CrossRef API Endpoint (No API key needed)
CROSSREF_BASE_URL = "https://api.crossref.org/works"

# Semantic Scholar API Endpoint
SEMANTIC_SCHOLAR_BASE_URL = "https://api.semanticscholar.org/graph/v1/paper"
HEADERS = {"User-Agent": "DeepCite/1.0"}  # Add a user-agent for API requests


In [7]:
def fetch_crossref_metadata(title):
    """Fetch metadata from CrossRef API using the paper title with better filtering."""
    params = {"query.title": title, "rows": 5}  # Fetch top 5 matches
    response = requests.get(CROSSREF_BASE_URL, params=params, headers=HEADERS)
    
    if response.status_code == 200:
        data = response.json()
        if "message" in data and "items" in data["message"]:
            papers = data["message"]["items"]
            best_match = None

            for paper in papers:
                paper_title = paper.get("title", ["Unknown"])[0]

                # Check if the retrieved title closely matches the given title (case insensitive)
                if paper_title.lower() == title.lower():
                    return {
                        "title": paper_title,
                        "doi": paper.get("DOI", "N/A"),
                        "citation_count": paper.get("is-referenced-by-count", 0),
                        "year": paper.get("published-print", {}).get("date-parts", [[None]])[0][0]
                    }

                # Store the best match with the highest citation count
                if not best_match or paper.get("is-referenced-by-count", 0) > best_match["citation_count"]:
                    best_match = {
                        "title": paper_title,
                        "doi": paper.get("DOI", "N/A"),
                        "citation_count": paper.get("is-referenced-by-count", 0),
                        "year": paper.get("published-print", {}).get("date-parts", [[None]])[0][0]
                    }

            return best_match  # Return the most cited match if no exact title match is found

    return None  # No valid paper found

In [8]:
paper_title = "Attention Is All You Need"
metadata = fetch_crossref_metadata(paper_title)
print(metadata)

{'title': 'Attention Is All You Need In Speech Separation', 'doi': '10.1109/icassp39728.2021.9413901', 'citation_count': 282, 'year': 2021}


In [5]:
def fetch_semantic_scholar_data(doi):
    """Fetch citation metadata from Semantic Scholar using DOI."""
    url = f"{SEMANTIC_SCHOLAR_BASE_URL}/{doi}?fields=title,externalIds,citationCount,influentialCitationCount,citations,references"
    response = requests.get(url, headers=HEADERS)

    if response.status_code == 200:
        data = response.json()
        return {
            "title": data.get("title", "Unknown"),
            "citation_count": data.get("citationCount", 0),
            "influential_citation_count": data.get("influentialCitationCount", 0),
            "inlinks": [c.get("paperId") for c in data.get("citations", [])],
            "outlinks": [r.get("paperId") for r in data.get("references", [])]
        }
    return None  # No valid data found


In [9]:
import requests

def fetch_semantic_scholar_data(paper_id):
    """Fetch citation metadata from Semantic Scholar using a DOI."""
    base_url = f"https://api.semanticscholar.org/v1/paper/{paper_id}"
    
    response = requests.get(base_url)
    
    if response.status_code == 200:
        data = response.json()
        
        return {
            "title": data.get("title", "Unknown"),
            "doi": data.get("doi", "N/A"),
            "citation_count": data.get("citationCount", 0),
            "year": data.get("year", "Unknown"),
            "url": data.get("url", "N/A")
        }
    
    else:
        print(f"Error: {response.status_code}, {response.text}")  # Debugging info
        return None


In [11]:
# doi = "10.48550/arXiv.1706.03762"
paper_id = "arXiv:1706.03762"

citation_data = fetch_semantic_scholar_data(paper_id)
print(citation_data)

{'title': 'Attention is All you Need', 'doi': None, 'citation_count': 0, 'year': 2017, 'url': 'https://www.semanticscholar.org/paper/204e3073870fae3d05bcbc2f6a8e263d9b72e776'}


In [None]:
import json

def save_to_json(data, filename="paper_metadata.json"):
    """Save paper metadata to a JSON file."""
    with open(filename, "w") as f:
        json.dump(data, f, indent=4)

# Example usage
save_to_json(metadata, "crossref_metadata.json")
save_to_json(citation_data, "semantic_scholar_data.json")