# Scrape Google Scholar to get a list of all citations of one source

To run the code replace "base_url" and "filename".
- "base_url" is the url of the google scholar page you want to scrape.

*Note:* It should start with "https://scholar.google.com/scholar?start=0" . You get this url by going to page 2 and then back to page 1 by clicking on the "previous" button.
- "filename" is the name of the json file you want to save the data to.


Disclaimer:
- Google Scholar does not like being scraped and may block you after a while, so don't run this more often than needed in one day. If you are blocked, try a VPN or wait some time.

In [2]:
import requests
from bs4 import BeautifulSoup
import re 
import json
import numpy as np
import time
import pandas as pd


In [3]:
def get_nextpage(soup):
    """Get the next page URL from the bottom navigation bar.

    Args:
        soup: Parsed results page (BeautifulSoup object).

    Returns:
        The absolute URL of the next results page, or None when the page has
        no ``gs_n`` navigation bar, the bar has no links, the last link has no
        bold label containing "Next", or the link has no ``href``.
    """
    bottom_nav = soup.find("div", id='gs_n')
    if bottom_nav is None:
        # No navigation bar on this page, so there is nothing to follow.
        return None

    nav_links = bottom_nav.find_all('a')
    if not nav_links:
        return None

    # The "next" control is expected to be the last link in the bar.
    last_link = nav_links[-1]
    label = last_link.find("b")
    # Only the literal label "Next" is recognised, so a localized interface
    # will not match here.
    if label is None or "Next" not in label.get_text():
        return None

    part_url = last_link.get('href')
    if not part_url:
        return None
    return "https://scholar.google.com" + part_url


In [4]:
def get_nextpage_url(url):
    """Get the next page URL by adding 10 to the page offset in the URL.

    The ``start=<n>`` query parameter is incremented when present, wherever
    it appears in the query string. If there is no ``start`` parameter, the
    first number found anywhere in the URL is incremented instead (the
    historical behaviour of this function).

    Args:
        url: URL of the current results page.

    Returns:
        The URL with the offset increased by 10, or the unchanged ``url`` when
        it contains no number at all. The result (or an error message) is also
        printed.
    """
    # Target the explicit offset parameter first so that other numeric
    # parameters (e.g. "as_sdt=2005" or the "cites" id) are never modified
    # by accident when they happen to precede "start=".
    match = re.search(r'(?<=[?&]start=)\d+', url) or re.search(r'\d+', url)

    if match is None:
        # If no number is found in the URL, return the original URL
        print('Errooor: could not find a number in the URL')
        return url

    new_number = int(match.group()) + 10
    # Splice by position so that exactly the matched number is replaced.
    updated_url = url[:match.start()] + str(new_number) + url[match.end():]
    print(updated_url)
    return updated_url

In [5]:
def find_last_year(input_string):
    """Return the last standalone four-digit number in the text.

    Args:
        input_string: Text to scan.

    Returns:
        The last run of exactly four digits delimited by word boundaries,
        converted to int, or None if the text contains no such run.
    """
    last_hit = None
    # Walk every match and keep only the final one.
    for last_hit in re.finditer(r'\b\d{4}\b', input_string):
        pass

    if last_hit is None:
        return None
    return int(last_hit.group())

In [6]:
import requests
from bs4 import BeautifulSoup
import time
def _parse_citation_count(citation):
    """Return the citation count shown for one result entry, or None."""
    cited_by_link = citation.find(
        'a', string=lambda text: text and text.startswith("Cited by"))
    if cited_by_link is None:
        # The English label is absent when the page is requested in another
        # interface language (e.g. "hl=nl").
        # NOTE(review): this fallback assumes the citation-count link is the
        # only link in an entry whose href carries a "cites=" parameter --
        # confirm against a live results page.
        cited_by_link = citation.find('a', href=re.compile(r'[?&]cites='))
    if cited_by_link is None:
        return None

    # Join all digit groups so that separators inside the number do not
    # break the integer conversion.
    digits = re.findall(r'\d+', cited_by_link.text)
    return int("".join(digits)) if digits else None


def _parse_citation(citation):
    """Convert one result entry (a ``div`` tag) into a plain record dict."""
    title_tag = citation.find('h3', class_='gs_rt')
    title = title_tag.text.strip() if title_tag is not None else ''

    # Get authors: only names rendered as links inside the "gs_a" line are
    # collected; names appearing as plain text are not included.
    authors_tag = citation.find('div', class_='gs_a')
    authors = ''
    metadata = None
    if authors_tag is not None:
        authors = ", ".join(link.text for link in authors_tag.find_all('a'))

        # Get metadata: the last piece of the author line that is not a link.
        non_link_parts = [part for part in authors_tag.contents
                          if getattr(part, 'name', None) != 'a']
        if non_link_parts:
            last_part = non_link_parts[-1]
            # Store a plain str so the record does not keep a reference to
            # the parsed page; nested tags are flattened to their text.
            metadata = (str(last_part) if isinstance(last_part, str)
                        else last_part.get_text())

    year = find_last_year(metadata) if metadata is not None else None

    # Get link to the paper PDF, if the entry offers one.
    link_paper = None
    pdf_box = citation.find('div', class_='gs_or_ggsm')
    if pdf_box is not None:
        pdf_anchor = pdf_box.find('a')
        if pdf_anchor is not None:
            link_paper = pdf_anchor.get('href')

    return {
        'title': title,
        'authors': authors,
        'year': year,
        'NumberOfCitations': _parse_citation_count(citation),
        'paper_link': link_paper,
        'metadata': metadata,
    }


def get_citations_per_page(base_url, timeout=30):
    """Get all citations from the current page and return the next page URL.

    Args:
        base_url: URL of the results page to download.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A tuple ``(citation_list, next_page_url)``. ``citation_list`` holds
        one dict per result entry with the keys ``title``, ``authors``,
        ``year``, ``NumberOfCitations``, ``paper_link`` and ``metadata``;
        fields that cannot be found on the page are None (or an empty string
        for ``title``/``authors``). ``next_page_url`` comes from
        ``get_nextpage_url(base_url)``. If the request fails or the status
        code is not 200, an error is printed and ``(None, None)`` is returned.
    """
    try:
        # Without a timeout a stalled connection would block forever.
        response = requests.get(base_url, timeout=timeout)
    except requests.RequestException as exc:
        print(f"Error: {exc}")
        return None, None

    if response.status_code != 200:
        print(f"Error: {response.status_code}")
        return None, None

    soup = BeautifulSoup(response.text, 'html.parser')
    citation_list = [
        _parse_citation(citation)
        for citation in soup.find_all('div', class_='gs_r gs_or gs_scl')
    ]
    next_page_url = get_nextpage_url(base_url)
    return citation_list, next_page_url
    
    
def get_citations_multipage(start_url, delay=1):
    """Get all citations from all pages, following pages until one is invalid.

    Args:
        start_url: URL of the first results page.
        delay: Seconds to sleep between two page requests.

    Returns:
        List of citation dicts collected from every page that was processed
        before the loop stopped. The loop stops when a page fails to load,
        a page yields no citations, or the next-page URL is identical to the
        current one.
    """
    citation_list = []

    page_url = start_url
    page_idx = 0
    while True:
        page_idx += 1
        print("Processing page:", page_idx, "Total citations:", len(citation_list))
        page_citations, next_page_url = get_citations_per_page(page_url)

        # get_citations_per_page returns (None, None) on failure; an empty
        # list means the page held no further results.
        if next_page_url is None or not page_citations:
            print("Quiting No more data")
            return citation_list

        citation_list.extend(page_citations)

        if next_page_url == page_url:
            # get_nextpage_url hands back its input unchanged when it cannot
            # find a number to increment; continuing would re-download the
            # same page forever.
            print("Quiting Next page URL did not change")
            return citation_list

        page_url = next_page_url
        time.sleep(delay)
        
        
    
def store_citation_list(citation_list, filename):
    """Index the citations by position and save them as a JSON file.

    Args:
        citation_list: List of citation records (JSON-serialisable dicts).
        filename: Output path. When it has no file extension, ".json" is
            appended so the file matches the ``filename + ".json"`` name the
            rest of the notebook uses; a path that already has an extension
            is used unchanged.

    Returns:
        Dict mapping each record's list index (int) to the record. In the
        written file the keys become strings, as JSON object keys always are.
    """
    import os

    citation_dict = dict(enumerate(citation_list))

    if not os.path.splitext(filename)[1]:
        filename += ".json"

    with open(filename, 'w', encoding='utf-8') as f:
        json.dump(citation_dict, f)

    return citation_dict


### 1. Extract all citations into a list, then store it as JSON

In [7]:
# Alternative start URL (different "cites" id), kept for reference:
# base_url = "https://scholar.google.com/scholar?start=0&hl=nl&as_sdt=2005&cites=10005508259730753239&scipsc="
base_url = "https://scholar.google.com/scholar?start=0&hl=nl&as_sdt=2005&cites=14566886582099332396&scipsc="
filename = "citations_replication_failure"

# Walk through all result pages, then save the collected records as JSON.
citation_list = get_citations_multipage(base_url)
citation_dict = store_citation_list(citation_list, filename)

# Set to False to skip echoing every scraped record.
print_citations = True
if print_citations and citation_list:
    for citation_i in citation_list:
        for k, v in citation_i.items():
            print(f'{k} : {v}')
        print("----------")

Processing page: 1 Total citations: 0
https://scholar.google.com/scholar?start=10&hl=nl&as_sdt=2005&cites=14566886582099332396&scipsc=
Processing page: 2 Total citations: 10
https://scholar.google.com/scholar?start=20&hl=nl&as_sdt=2005&cites=14566886582099332396&scipsc=
Processing page: 3 Total citations: 20
https://scholar.google.com/scholar?start=30&hl=nl&as_sdt=2005&cites=14566886582099332396&scipsc=
Processing page: 4 Total citations: 30
https://scholar.google.com/scholar?start=40&hl=nl&as_sdt=2005&cites=14566886582099332396&scipsc=
Processing page: 5 Total citations: 40
https://scholar.google.com/scholar?start=50&hl=nl&as_sdt=2005&cites=14566886582099332396&scipsc=
Processing page: 6 Total citations: 50
https://scholar.google.com/scholar?start=60&hl=nl&as_sdt=2005&cites=14566886582099332396&scipsc=
Processing page: 7 Total citations: 60
https://scholar.google.com/scholar?start=70&hl=nl&as_sdt=2005&cites=14566886582099332396&scipsc=
Processing page: 8 Total citations: 70
https://sc

### 2. Convert the JSON to an Excel file

In [8]:
# Load the citation list from file  -- Uncomment if you want to load the citation list from file
# filename2 = filename + ".json"
# with open(filename2, 'r') as f:
#     citation_dict = json.load(f)

In [9]:
def json_to_excel(json_data, excel_file_path):
    """Write the citation records to an Excel workbook.

    Args:
        json_data: Dict mapping a row key to a record dict; each record
            becomes one row and its keys become the columns.
        excel_file_path: Output path. ".xlsx" is appended unless the path
            already ends with that extension.
    """
    # Check the actual suffix: a name that merely contains "xlsx" somewhere
    # (e.g. "my_xlsx_export") must still receive the extension.
    if not excel_file_path.endswith(".xlsx"):
        excel_file_path += ".xlsx"

    # Convert the JSON data to a DataFrame (one row per top-level key)
    df = pd.DataFrame.from_dict(json_data, orient='index')

    # Write the DataFrame to an Excel file; the row keys are not written
    df.to_excel(excel_file_path, index=False)


# Export the scraped citations next to the JSON file, as "<filename>.xlsx".
excel_file_path = filename + ".xlsx"  # Provide the desired output Excel file path
json_to_excel(citation_dict, excel_file_path)

### 3. Get abstracts from arXiv and dl.acm.org, add them to the JSON, and store as Excel

In [10]:
def extract_number_from_url(url):
    """Return the first "/<digits>.<digits>" identifier found in the URL.

    Args:
        url: URL to scan.

    Returns:
        The "<digits>.<digits>" text that follows a slash (a trailing ".pdf"
        is not part of the result), or None if the URL has no such token.
    """
    found = re.search(r'/(\d+\.\d+)(?:\.pdf)?', url)
    return found.group(1) if found else None


def get_abs_arxiv(arx_url, timeout=30):
    """Download a page and return the text of its abstract block.

    Args:
        arx_url: URL of the abstract page to download.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The text of the ``blockquote`` with class "abstract mathjax", with
        the "Abstract:" label and newlines removed (the last such block wins
        if there are several), or None when the page cannot be loaded or
        holds no such block.
    """
    abstract_text = None
    try:
        # Without a timeout a stalled connection would block forever.
        response = requests.get(arx_url, timeout=timeout)
    except requests.RequestException:
        # One unreachable page should not abort a long batch of lookups.
        response = None

    if response is not None and response.status_code == 200:
        soup = BeautifulSoup(response.text, 'html.parser')
        for abstract in soup.find_all('blockquote', class_='abstract mathjax'):
            abstract_text = abstract.text.replace("Abstract:", "").replace("\n", "")
    else:
        print("URL COULDN'T BE LOADED")

    # Same diagnostic as get_abs_acm, so missing abstracts are visible
    # for both sources.
    if abstract_text is None:
        print("could not load abstract for:", arx_url)
    return abstract_text


def get_abs_acm(acm_url, timeout=30):
    """Download a page and return the text of its abstract section.

    Args:
        acm_url: URL of the article page to download.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The text of the ``div`` with class "abstractSection abstractInFull",
        with newlines removed (the last such section wins if there are
        several), or None when the page cannot be loaded or holds no such
        section.
    """
    abstract_text = None
    try:
        # Without a timeout a stalled connection would block forever.
        response = requests.get(acm_url, timeout=timeout)
    except requests.RequestException:
        # One unreachable page should not abort a long batch of lookups.
        response = None

    if response is not None and response.status_code == 200:
        soup = BeautifulSoup(response.text, 'html.parser')
        for abstract in soup.find_all('div', class_='abstractSection abstractInFull'):
            abstract_text = abstract.text.replace("\n", "")
    else:
        print("URL COULDN'T BE LOADED")

    if abstract_text is None:
        print("could not load abstract for:", acm_url)
    return abstract_text


def add_abstracts_to_citation_dict(citation_dict):
    """Add the abstracts to the citation_dict.

    Every record receives an "abstract" key. An abstract is looked up only
    for records whose "paper_link" contains "arxiv" or "dl.acm"; all other
    records, and records whose lookup fails, get None.

    Args:
        citation_dict: Dict mapping a key to a citation record (dict). The
            records are modified in place.

    Returns:
        The same ``citation_dict`` object that was passed in.
    """
    found_count = 0
    for cit in citation_dict.values():
        # Records without a "paper_link" key are treated like records
        # without a link.
        paperlink = cit.get("paper_link")
        abstract_text = None
        url = None

        if paperlink:
            if "arxiv" in paperlink:
                print("Arxiv link found")
                number = extract_number_from_url(paperlink)
                if number is None:
                    # No "<digits>.<digits>" id in the link, so there is no
                    # abstract page to request.
                    print("could not parse arXiv id from:", paperlink)
                else:
                    url = "https://arxiv.org/abs/" + str(number)
                    abstract_text = get_abs_arxiv(url)
            elif "dl.acm" in paperlink:
                print("ACM link found")
                # NOTE(review): assumes the last path segment is the DOI
                # suffix under the 10.1145 prefix -- confirm for other
                # link shapes.
                number = paperlink.split("/")[-1]
                url = "https://dl.acm.org/doi/10.1145/" + str(number)
                abstract_text = get_abs_acm(url)

            if url is not None:
                time.sleep(0.2)  # sleep to not overload the server
                print("URL:", url, " \n Abstract:", abstract_text)

        # Count abstracts actually retrieved, not just links attempted.
        if abstract_text is not None:
            found_count += 1
        cit["abstract"] = abstract_text

    print("Number of found abstracts:", found_count)
    return citation_dict

In [11]:
# Look up an abstract for every citation whose paper link points to arXiv or
# dl.acm; each record gets an "abstract" entry (None when nothing was found).
citation_dict = add_abstracts_to_citation_dict(citation_dict)

Arxiv link found
URL: https://arxiv.org/abs/2111.15366  
 Abstract: There is a tendency across different subfields in AI to valorize a small collection of influential benchmarks. These benchmarks operate as stand-ins for a range of anointed common problems that are frequently framed as foundational milestones on the path towards flexible and generalizable AI systems. State-of-the-art performance on these benchmarks is widely understood as indicative of progress towards these long-term goals. In this position paper, we explore the limits of such benchmarks in order to reveal the construct validity issues in their framing as the functionally "general" broad measures of progress they are set up to be.    
Arxiv link found
URL: https://arxiv.org/abs/1707.09861  
 Abstract: In this paper we show that reporting a single performance score is insufficient to compare non-deterministic approaches. We demonstrate for common sequence tagging tasks that the seed value for the random number generato

In [22]:
# Store the citations, now including abstracts, as a JSON and an Excel file
# under a new base name.
filename2 = filename + "_with_abstracts"
with open(filename2+ ".json", 'w') as f:
    json.dump(citation_dict, f)

# filename2 has no extension here; json_to_excel appends ".xlsx" itself.
json_to_excel(citation_dict, filename2)