# For one source, scrape Google Scholar to get a list of all its citations

To run the code, replace "base_url" and "filename":
- "base_url" is the url of the google scholar page you want to scrape.

*Note:* It should start with "https://scholar.google.com/scholar?start=0". You can get this URL by going to page 2 of the results and then back to page 1 by clicking the "previous" button.
- "filename" is the name of the json file you want to save the data to.


Disclaimer:
- Scholar does not like being scraped and may block you after a while, so don't run this more often than necessary in a single day. If you are blocked, try a VPN or wait some time.

In [1]:
import requests
from bs4 import BeautifulSoup
import re 
import json
import numpy as np
import time
import pandas as pd


In [2]:
def get_nextpage_url(url, semantic_schol = False):
    """Return the URL of the results page that follows ``url``.

    For Google Scholar URLs (the default) the ``start=<offset>`` query
    parameter is increased by 10. If the URL has no ``start`` parameter, the
    first number found anywhere in the URL is increased instead. For Semantic
    Scholar URLs the ``page=<n>`` parameter is increased by 1. The resulting
    URL is printed as well as returned.

    Args:
        url: URL of the page that was just scraped.
        semantic_schol: Pass ``True`` when ``url`` is a Semantic Scholar URL.

    Returns:
        The URL of the next page. A Google Scholar URL that contains no
        number at all is returned unchanged after an error is printed.

    Raises:
        ValueError: If ``semantic_schol`` is True and ``url`` contains no
            ``page=<number>`` parameter.
    """
    if semantic_schol is True:
        if url.count("page=") != 1:
            print("The url does not contain exactly one time the substring: 'page='")
            print("url_split", url.split("page="))

        # Match only the digits after "page=" so that any parameters that
        # follow the page number are preserved instead of breaking int().
        page_matches = list(re.finditer(r'page=(\d+)', url))
        if not page_matches:
            raise ValueError(
                f"could not find a 'page=<number>' parameter in the URL: {url}"
            )
        page_match = page_matches[-1]
        next_page = int(page_match.group(1)) + 1
        new_url = url[:page_match.start(1)] + str(next_page) + url[page_match.end(1):]
        print(new_url)
        return new_url

    # Prefer the explicit "start" parameter: the first number in the URL may
    # belong to something else (e.g. a "cites=<id>" parameter).
    number_match = re.search(r'[?&]start=(\d+)', url) or re.search(r'(\d+)', url)
    if number_match is None:
        # If no number is found in the URL, return the original URL
        print('Errooor: could not find a number in the URL')
        return url

    new_number = int(number_match.group(1)) + 10
    updated_url = url[:number_match.start(1)] + str(new_number) + url[number_match.end(1):]
    print(updated_url)
    return updated_url

In [6]:
def store_citation_list(citation_list, filename):
    """Save the citations to ``filename`` as JSON, keyed by list position.

    Args:
        citation_list: Sequence of JSON-serialisable citation entries.
        filename: Path of the JSON file to (over)write.

    Returns:
        The dict that was written: each entry of ``citation_list`` stored
        under its integer index.
    """
    # Index -> entry mapping; json.dump turns the integer keys into strings.
    indexed_citations = dict(enumerate(citation_list))

    with open(filename, 'w') as out_file:
        json.dump(indexed_citations, out_file)

    return indexed_citations

def json_to_excel(json_data, excel_file_path):
    """Write a dict of records to an Excel workbook, one row per record.

    Args:
        json_data: Mapping of row key -> record dict. The keys are only used
            to build the rows and are not written to the file.
        excel_file_path: Target path; ".xlsx" is appended when the path does
            not already end with it.
    """
    # Check the actual suffix: a substring test would skip adding the
    # extension for names that merely contain "xlsx" (e.g. "xlsx_export").
    if not excel_file_path.endswith(".xlsx"):
        excel_file_path += ".xlsx"

    # Convert the JSON data to a DataFrame (each dict value becomes a row)
    df = pd.DataFrame.from_dict(json_data, orient='index')

    # Write the DataFrame to an Excel file, without the index column
    df.to_excel(excel_file_path, index=False)


## Citations from Semantic Scholar:

In [4]:
from selenium import webdriver
from bs4 import BeautifulSoup

def get_soup_selentium(url):
    """Open ``url`` in Chrome through Selenium and return the page as soup.

    While the page source contains "Human Verification" the function polls
    once per second (printing a notice) so the check can be solved by hand in
    the opened browser window. The browser is always closed before the
    function returns or raises.

    Args:
        url: Address of the page to load.

    Returns:
        A ``BeautifulSoup`` object of the page source, or ``None`` if the
        parsed text still contains "Human Verification".
    """
    # Set up the Selenium webdriver
    driver = webdriver.Chrome()  # You'll need to have ChromeDriver installed and in your PATH
    try:
        # Open the webpage
        driver.get(url)

        # NOTE(review): per the Selenium docs an implicit wait only sets the
        # timeout for element lookups; no lookup is made here, so this
        # presumably does not pause for the page to render. Confirm whether
        # an explicit wait is needed.
        driver.implicitly_wait(10)

        # Get the page source after JavaScript has executed
        page_source = driver.page_source
        while "Human Verification" in page_source:
            print("Human verification required")
            time.sleep(1)
            page_source = driver.page_source
    finally:
        # Close the browser even if Selenium raised (e.g. the window was
        # closed by hand), so no Chrome/ChromeDriver process is left behind.
        driver.quit()

    # Now, you can use BeautifulSoup to parse the page source
    soup = BeautifulSoup(page_source, 'html.parser')
    if "Human Verification" in soup.text:
        return None

    return soup

# soup = get_soup_selentium(base_url)

In [5]:
# Citations collected so far. The scraping loop appends to this list.
citation_list = []

# First page of the citing-papers listing to scrape (note the trailing "page=1").
base_url = (
    "https://www.semanticscholar.org/paper/"
    "Bias-in-Bios%3A-A-Case-Study-of-Semantic-Bias-in-a-De-Arteaga-Romanov/"
    "c4afa2b3eda95a1194313394901e0e96e24cefaa"
    "?sort=is-influential&page=1"
)


In [7]:
# Note: the browser sometimes crashes mid-run. base_url is only advanced, and
# citation_list only extended, once a page has been parsed completely, so you
# can just rerun this cell and it will continue with the page it stopped on.
MAX_LOAD_RETRIES = 3     # consecutive failed page loads tolerated before giving up
RETRY_DELAY_SECONDS = 5  # pause between two attempts at loading the same page


def _text_or_none(tag):
    """Return the text of a BeautifulSoup tag, or None if the tag is missing."""
    return tag.text if tag is not None else None


failed_loads = 0
while True:
    soup = get_soup_selentium(base_url)
    if soup is None:
        failed_loads += 1
        print("Error: could not load page")
        if failed_loads >= MAX_LOAD_RETRIES:
            print("Giving up after", failed_loads, "failed attempts for", base_url)
            break
        time.sleep(RETRY_DELAY_SECONDS)
        continue
    failed_loads = 0

    citation_page = soup.find('div', class_='paper-detail-content-card result-page')
    citations = (
        citation_page.find_all('div', class_='cl-paper-row citation-list__paper-row')
        if citation_page is not None else []
    )
    if not citations:
        # Presumably we are past the last page (or the page did not render);
        # stop here instead of requesting ever higher page numbers forever.
        print("No citations found on", base_url, "- stopping")
        break

    page_entries = []
    for citation in citations:
        title = _text_or_none(citation.find('h3', class_='cl-paper-title'))
        if title is not None:
            title = title.strip()

        # Get authors as one comma-separated string
        authors = citation.find('span', class_='cl-paper-authors')
        if authors is not None:
            authors = ", ".join(author.text for author in authors.find_all('a'))

        # Get metadata and year
        paper_fos = _text_or_none(citation.find("span", class_= "cl-paper-fos"))
        paper_venue = _text_or_none(citation.find('span', class_='cl-paper-venue'))
        year = _text_or_none(citation.find("span", class_= "cl-paper-pubdates"))

        # Get the link to the paper on semantic scholar (reader view)
        link_paper = None
        stats_item = citation.find('li', class_='cl-paper-stats__item')
        link_tag = stats_item.find("a") if stats_item is not None else None
        if link_tag is not None and link_tag.get('href'):
            link_paper = ("https://www.semanticscholar.org" + link_tag['href']).replace("/paper", "/reader")

        # Get the number of times this paper cites the original paper
        num_exerpts = _text_or_none(citation.find("button", {"data-heap-type":"excerpt"}))

        # Get the "highly influenced" marker text, if the paper has one
        highly_influenced = _text_or_none(citation.find("span", class_="cl-paper-stats__hideable-text"))

        # Get either the TLDR or the abstract - whichever is available
        tldr = citation.find("div", class_="tldr-abstract-replacement text-truncator")
        if tldr is not None:
            tldr = tldr.text.replace("TLDR", "TLDR: ")
        else:
            abstract = citation.find("div", class_="cl-paper-abstract")
            if abstract is not None:
                tldr = "Abstract:" + abstract.text
                print("Found abstract:", tldr)

        page_entries.append({
            'title': title,
            'authors': authors,
            'tldr': tldr,
            'year': year,
            'paper_link':link_paper,
            'paper_fos':paper_fos,
            'paper_venue':paper_venue,
            'num_exerpts':num_exerpts,
            'highly_influenced':highly_influenced
        })
        print("new entry:", page_entries[-1])

    # Commit the whole page at once so that a crash while parsing does not
    # leave partial results that would be duplicated on the rerun.
    citation_list.extend(page_entries)
    print("len now:", len(citation_list))
    base_url = get_nextpage_url(base_url, semantic_schol=True)
    

new entry: {'title': 'Fair Text Classification with Wasserstein Independence', 'authors': "Thibaud Leteno, Antoine Gourru, Charlotte Laclau, R'emi Emonet, Christophe Gravier", 'tldr': "TLDR: This paper presents a novel method for mitigating biases in neural text classification, agnostic to the model architecture, that takes inspiration from adversarial training to induce Wasserstein independence between representations learned to predict the authors' target label and the ones learning to predict some sensitive attribute.Expand", 'year': '2023', 'paper_link': 'https://www.semanticscholar.org/reader/354d12991f358e12107abbe0b763ac05fa90f957?sort=is-influential#citing-papers', 'paper_fos': 'Computer Science', 'paper_venue': None, 'num_exerpts': '15 Excerpts', 'highly_influenced': 'Highly Influenced'}
len now: 1
new entry: {'title': 'JAB: Joint Adversarial Prompting and Belief Augmentation', 'authors': 'Ninareh Mehrabi, Palash Goyal, Rahul Gupta', 'tldr': 'TLDR: A joint framework in which a

NoSuchWindowException: Message: no such window: target window already closed
from unknown error: web view not found
  (Session info: chrome=113.0.5672.126)
Stacktrace:
#0 0x560864849133 <unknown>
#1 0x56086457d966 <unknown>
#2 0x560864557cec <unknown>
#3 0x5608645dec0f <unknown>
#4 0x5608645f1c2b <unknown>
#5 0x5608645d99a3 <unknown>
#6 0x5608645ae46a <unknown>
#7 0x5608645af55e <unknown>
#8 0x560864808cae <unknown>
#9 0x56086480c8fe <unknown>
#10 0x560864815f20 <unknown>
#11 0x56086480d923 <unknown>
#12 0x5608647e0c0e <unknown>
#13 0x560864830b08 <unknown>
#14 0x560864830c97 <unknown>
#15 0x560864841113 <unknown>
#16 0x7f9b68766609 start_thread


In [8]:
# Show how far the scrape got: the URL stored in base_url and the number of
# citations collected so far, one per line.
print(base_url, len(citation_list), sep="\n")

https://www.semanticscholar.org/paper/Bias-in-Bios%3A-A-Case-Study-of-Semantic-Bias-in-a-De-Arteaga-Romanov/c4afa2b3eda95a1194313394901e0e96e24cefaa?sort=is-influential&page=7
60


In [63]:
# Bind a second name to the collected citations.
# NOTE(review): this is an alias, not a copy. Both names refer to the same
# list object, so in-place changes made through either name are visible
# through both. Use list(citation_list) if an independent snapshot is intended.
citation_list2 = citation_list

In [9]:
# Base name (without extension) shared by the JSON and the Excel output file
filename = "biasbios_semantic5_temp"

# Write the JSON file first, then reuse the returned dict for the Excel export
citation_json = store_citation_list(citation_list, f"{filename}.json")
json_to_excel(citation_json, f"{filename}.xlsx")