Google Scholar and Semantic Scholar scraping. 

In [1]:
# Packages
from bs4 import BeautifulSoup
import requests
import pandas as pd
import regex as re


# Request to Google Scholar
# NOTE(review): the braces around the search term are sent as part of the query;
# they look like a leftover template placeholder -- confirm they are intended.
url = 'https://scholar.google.com/scholar?hl=en&as_sdt=0%2C5&q={Computational+Propaganda}&btnG='
# A timeout keeps the cell from hanging forever on a stalled connection.
response = requests.get(url, timeout=30)
# Fail loudly on HTTP errors instead of parsing an error page, which would
# otherwise produce empty result frames in the cells below.
response.raise_for_status()

# Parse the HTML content
soup = BeautifulSoup(response.content, 'html.parser')

In [2]:
results = []
# Get the title of each result heading (h2.gs_rt).
# The title is the text of the heading's link; when a heading has no <a> child,
# fall back to the heading's own text instead of failing on `None.text`.
for entry in soup.find_all("h2", attrs={"class": "gs_rt"}):
    title_node = entry.a if entry.a is not None else entry
    results.append({"title": title_node.get_text()})
# Naming the column explicitly keeps `title` present even when nothing matched.
dataframe1 = pd.DataFrame(results, columns=["title"])

In [3]:
# Get authors, year and source from each result's byline (div.gs_a).
# The byline is assumed to read "<authors> - <venue>, <year> - <source>"
# (TODO confirm against the live markup).

# Segments are separated by a hyphen surrounded by whitespace; `\s` also matches
# non-breaking spaces, and hyphens inside names or host names are left alone.
BYLINE_SEPARATOR = re.compile(r'\s+-\s+')
# A plausible publication year: a stand-alone four-digit number starting with 19 or 20.
PUBLICATION_YEAR = re.compile(r'\b(?:19|20)[0-9]{2}\b')


def parse_byline(text):
    """Split one byline string into its author, year and source parts.

    Parameters
    ----------
    text : str
        Text content of a single ``div.gs_a`` element.

    Returns
    -------
    dict
        ``{"author": ..., "year": ..., "journal": ...}``. ``author`` is the first
        segment (``None`` when the byline has no separator), ``journal`` is the
        last segment (the key name is kept for compatibility with the existing
        output columns), and ``year`` is the last year-like number found between
        them, as a string. Parts that cannot be found are ``None``.
    """
    segments = [segment.strip() for segment in BYLINE_SEPARATOR.split(text.strip())]
    author = segments[0] if len(segments) > 1 else None
    journal = segments[-1] or None
    # Prefer the venue segment(s) so digits in the author or source part are not
    # mistaken for the year; fall back to the whole byline when there is no venue.
    venue = ' '.join(segments[1:-1])
    year_candidates = PUBLICATION_YEAR.findall(venue) or PUBLICATION_YEAR.findall(text)
    # The year is expected at the end of the venue, so the last candidate wins.
    year = year_candidates[-1] if year_candidates else None
    return {"author": author, "year": year, "journal": journal}


results2 = [
    parse_byline(entry2.text)
    for entry2 in soup.find_all("div", attrs={"class": "gs_a"})
]
# Explicit columns fix the column order and keep the frame well-formed when
# the page yields no bylines.
dataframe2 = pd.DataFrame(results2, columns=["author", "year", "journal"])

dataframe2

Unnamed: 0,author,year,journal
0,"SC Woolley, P Howard",2017,ora.ox.ac.uk
1,"GDS Martino, S Cresci, A Barrón-Cedeño, S Yu…",2020,arxiv.org
2,LMN Neudert,2017,demtech.oii.ox.ac.uk
3,"SC Woolley, PN Howard",2016,par.nsf.gov
4,"SC Woolley, PN Howard",2018,books.google.com
5,"SC Woolley, D Guilbeault",2017,ora.ox.ac.uk
6,D Arnaudo,2017,ora.ox.ac.uk
7,"G Bolsover, P Howard",2017,liebertpub.com
8,S Sanovich,2017,ora.ox.ac.uk
9,R DiResta,2018,muse.jhu.edu


In [4]:
# Get the number of citations for each paper (one row per div.gs_ri).
# The count is read from the footer link whose text is "Cited by <N>" rather than
# from a fixed link position, so results with fewer or differently ordered footer
# links neither raise nor pick up a number from an unrelated link.
CITED_BY = re.compile(r'Cited by\s+([0-9]+)')

results3 = []
for paper in soup.find_all("div", class_="gs_ri"):
    citation_count = 0  # default when no "Cited by" link is present
    footer_links = [
        link
        for footer in paper.find_all("div", class_="gs_fl")
        for link in footer.find_all("a")
    ]
    for link in footer_links:
        cited = CITED_BY.search(link.get_text())
        if cited:
            citation_count = int(cited.group(1))
            break
    results3.append({"Number of citation": citation_count})
# Every value is an int, so uncited papers stay 0 instead of turning into NaN.
dataframe3 = pd.DataFrame(results3, columns=["Number of citation"])


In [5]:
# Place the title, byline and citation frames side by side (joined on the row
# index) and write the combined table to disk as tab-separated text.
publication_frames = [dataframe1, dataframe2, dataframe3]
dataframe = pd.concat(publication_frames, axis="columns")
dataframe.to_csv(path_or_buf='Google Scholar - publications', sep='\t')


In [6]:
# Collect the text of every list item inside the suggestion boxes and save
# the resulting one-column table as tab-separated text.
suggestion_boxes = soup.find_all("div", attrs={"class": "gs_qsuggest gs_qsuggest_regular"})
results_suggestion = [
    {"suggestion": list_item.text}
    for box in suggestion_boxes
    for list_item in box.find_all("li")
]
dataframe_suggestion = pd.DataFrame(results_suggestion)
dataframe_suggestion.to_csv(path_or_buf='Google Scholar - suggestions', sep='\t')
